forked from FR0ST1N/Seq2Seq-Chatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_utils.py
123 lines (98 loc) · 4.42 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
import tensorflow as tf
from six.moves import urllib
from tensorflow.python.platform import gfile
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")
def basic_tokenizer(sentence):
"""Very basic tokenizer: split the sentence into a list of tokens."""
words = []
for space_separated_fragment in sentence.strip().split():
words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
return [w for w in words if w]
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
tokenizer=None, normalize_digits=True):
if not gfile.Exists(vocabulary_path):
print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
vocab = {}
with gfile.GFile(data_path, mode="rb") as f:
counter = 0
for line in f:
line = tf.compat.as_bytes(line)
counter += 1
if counter % 100000 == 0:
print(" processing line %d" % counter)
tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
for w in tokens:
word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
if word in vocab:
vocab[word] += 1
else:
vocab[word] = 1
vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
print('>> Full Vocabulary Size :',len(vocab_list))
if len(vocab_list) > max_vocabulary_size:
vocab_list = vocab_list[:max_vocabulary_size]
with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
for w in vocab_list:
vocab_file.write(w + b"\n")
def initialize_vocabulary(vocabulary_path):
if gfile.Exists(vocabulary_path):
rev_vocab = []
with gfile.GFile(vocabulary_path, mode="rb") as f:
rev_vocab.extend(f.readlines())
rev_vocab = [line.strip() for line in rev_vocab]
vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
return vocab, rev_vocab
else:
raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True):
if tokenizer:
words = tokenizer(sentence)
else:
words = basic_tokenizer(sentence)
if not normalize_digits:
return [vocabulary.get(w, UNK_ID) for w in words]
# Normalize digits by 0 before looking words up in the vocabulary.
# return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words] #mark added .decode by Ken
return [vocabulary.get(w.decode('utf-8'), UNK_ID) for w in words] # added by Ken
def data_to_token_ids(data_path, target_path, vocabulary_path,
tokenizer=None, normalize_digits=True):
if not gfile.Exists(target_path):
print("Tokenizing data in %s" % data_path)
vocab, _ = initialize_vocabulary(vocabulary_path)
with gfile.GFile(data_path, mode="rb") as data_file:
with gfile.GFile(target_path, mode="wb") as tokens_file:
counter = 0
for line in data_file:
line = tf.compat.as_bytes(line)
counter += 1
if counter % 100000 == 0:
print(" tokenizing line %d" % counter)
token_ids = sentence_to_token_ids(line, vocab, tokenizer,
normalize_digits)
tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def prepare_custom_data(working_directory, train_enc, train_dec, enc_vocabulary_size, dec_vocabulary_size, tokenizer=None):
enc_vocab_path = os.path.join(working_directory, "vocab%d.enc" % enc_vocabulary_size)
dec_vocab_path = os.path.join(working_directory, "vocab%d.dec" % dec_vocabulary_size)
create_vocabulary(enc_vocab_path, train_enc, enc_vocabulary_size, tokenizer)
create_vocabulary(dec_vocab_path, train_dec, dec_vocabulary_size, tokenizer)
enc_train_ids_path = train_enc + (".ids%d" % enc_vocabulary_size)
dec_train_ids_path = train_dec + (".ids%d" % dec_vocabulary_size)
data_to_token_ids(train_enc, enc_train_ids_path, enc_vocab_path, tokenizer)
data_to_token_ids(train_dec, dec_train_ids_path, dec_vocab_path, tokenizer)
return (enc_train_ids_path, dec_train_ids_path, enc_vocab_path, dec_vocab_path)