-
Notifications
You must be signed in to change notification settings - Fork 0
/
dialog.py
executable file
·277 lines (261 loc) · 9.87 KB
/
dialog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import re
import nltk
class Dialog:
    """One conversation between two characters, plus shared vocabulary state.

    An instance stores the two character identifiers and the conversation
    text as a list of utterances, each utterance being a list of tokens.

    The class attributes below act as module-wide state: they are filled by
    ``resolve_data`` and ``load_word2ids`` and read by the id/word conversion
    helpers.  Id 0 is reserved for padding, 1 for ``TSTSTARTTST`` and 2 for
    ``TSTUNKNOWNTST``; corpus tokens start at 3.
    """

    # Longest utterance (in tokens) seen so far; resolve_data adds 3 to it
    # once loading completes.  Used as the padded length of id sequences.
    max_dialog_len = 0
    # Tokens kept in the vocabulary, most frequent first (set by load_word2ids).
    all_tokens = []
    all_text = []  # used for obtaining frequency distribution
    # token -> integer id (set by load_word2ids).
    word_id_dict = {}
    lemon = None  # it is the lemmatizer
    read_len = 700  # -1 to read until ends
    # nltk.FreqDist over all_text (set by load_word2ids).
    freq_dist = None
    vocab_size = 800
    # Lower-case contraction -> expansion.  Ambiguous contractions expand to
    # both readings separated by " / ".
    contractions = {
        "ain't": "am not / are not",
        "aren't": "are not / am not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "gon'na": "going to",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he had / he would",
        "he'd've": "he would have",
        "he'll": "he shall / he will",
        "he'll've": "he shall have / he will have",
        "he's": "he has / he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how has / how is",
        "i'd": "I had / I would",
        "i'd've": "I would have",
        "i'll": "I shall / I will",
        "i'll've": "I shall have / I will have",
        "i'm": "I am",
        "i've": "I have",
        "isn't": "is not",
        "it'd": "it had / it would",
        "it'd've": "it would have",
        "it'll": "it shall / it will",
        "it'll've": "it shall have / it will have",
        "it's": "it has / it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she had / she would",
        "she'd've": "she would have",
        "she'll": "she shall / she will",
        "she'll've": "she shall have / she will have",
        "she's": "she has / she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so as / so is",
        "that'd": "that would / that had",
        "that'd've": "that would have",
        "that's": "that has / that is",
        "there'd": "there had / there would",
        "there'd've": "there would have",
        "there's": "there has / there is",
        "they'd": "they had / they would",
        "they'd've": "they would have",
        "they'll": "they shall / they will",
        "they'll've": "they shall have / they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had / we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what shall / what will",
        "what'll've": "what shall have / what will have",
        "what're": "what are",
        "what's": "what has / what is",
        "what've": "what have",
        "when's": "when has / when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where has / where is",
        "where've": "where have",
        "who'll": "who shall / who will",
        "who'll've": "who shall have / who will have",
        "who's": "who has / who is",
        "who've": "who have",
        "why's": "why has / why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had / you would",
        "you'd've": "you would have",
        "you'll": "you shall / you will",
        "you'll've": "you shall have / you will have",
        "you're": "you are",
        "you've": "you have",
    }

    def __init__(self, character1, character2, text):
        """Store the two character identifiers and the tokenized utterances.

        Args:
            character1: identifier of the first speaker.
            character2: identifier of the second speaker.
            text: list of utterances, each a list of tokens.
        """
        self.character1 = character1
        self.character2 = character2
        self.text = text

    @staticmethod
    def _expand_contractions(text):
        """Return *text* with every whitespace-delimited contraction expanded.

        Matching is case-insensitive and applies to whole words only, so
        expanding "he's" can no longer corrupt "she's" the way a global
        ``str.replace`` did.  Whitespace between words is left untouched.
        """
        def expand(match):
            word = match.group(0)
            return Dialog.contractions.get(word.lower(), word)

        return re.sub(r'\S+', expand, text)

    @staticmethod
    def pre_text(text):
        """Tokenize *text* into a list of normalized tokens.

        Contractions are expanded first, then the text is split with
        ``nltk.word_tokenize``.  Numeric tokens are replaced by the
        placeholder ``'1.01'``; every other token is lemmatized.

        NOTE(review): some expansions contain a capital "I" while
        ``resolve_data`` lower-cases its input beforehand, so both "I" and
        "i" can end up in the vocabulary -- confirm whether that is intended.
        """
        # Allow callers such as sent2id to run before resolve_data has
        # installed the lemmatizer.
        if Dialog.lemon is None:
            Dialog.lemon = nltk.WordNetLemmatizer()

        text = Dialog._expand_contractions(text)
        result_tokens = []
        for token in nltk.word_tokenize(text):
            if token.isnumeric():
                # replace any numeric token with 1.01 to reduce vocab size
                result_tokens.append('1.01')
            else:
                result_tokens.append(Dialog.lemon.lemmatize(token))
        return result_tokens

    @staticmethod
    def resolve_data(conversation_path, lines_path, vocab_size=800):
        """Load conversations and return them as a list of ``Dialog`` objects.

        Both files are expected to use ``' +++$+++ '`` as field separator
        (presumably the Cornell Movie-Dialogs layout -- verify against the
        data files).  In the lines file the first field is the line id and
        the last field the utterance; in the conversation file the fourth
        field lists the line ids (``L<digits>``) of one conversation.

        Side effects on class state: sets ``lemon`` and ``vocab_size``,
        appends every token to ``all_text``, raises ``max_dialog_len`` to the
        longest utterance seen and finally adds 3 to it.  This state is not
        reset between calls, so repeated calls accumulate.

        Args:
            conversation_path: path of the conversation file.
            lines_path: path of the lines file.
            vocab_size: stored in ``Dialog.vocab_size`` for load_word2ids.

        Returns:
            List of ``Dialog`` instances, one per conversation line read.

        Raises:
            KeyError: a conversation references a line id that is missing
                from the lines file.
        """
        dialog_list = []
        line_pairs = {}
        Dialog.lemon = nltk.WordNetLemmatizer()
        Dialog.vocab_size = vocab_size

        with open(lines_path, encoding="utf8", errors='ignore', mode="r") as lines_file:
            for line in lines_file:
                parts = line.split(' +++$+++ ')
                line_pairs[parts[0]] = str(parts[-1]).lower()

        # Same decoding policy as the lines file, so loading does not depend
        # on the platform's default encoding.
        with open(conversation_path, encoding="utf8", errors='ignore', mode="r") as conversation_file:
            for count, line in enumerate(conversation_file):
                # read_len == -1 disables the cutoff.
                if Dialog.read_len != -1 and count > Dialog.read_len:
                    break
                parts = line.split(' +++$+++ ')
                if len(parts) < 4:
                    # Blank or malformed line: there is no id list to parse.
                    continue
                text_ids = re.findall(r'(L[0-9]+)', parts[3])
                texts = []
                for text_id in text_ids:
                    text_tokens = Dialog.pre_text(line_pairs[text_id])
                    texts.append(text_tokens)
                    if len(text_tokens) > Dialog.max_dialog_len:
                        Dialog.max_dialog_len = len(text_tokens)
                    Dialog.all_text.extend(text_tokens)
                dialog_list.append(Dialog(parts[0], parts[1], texts))
                if count % 100 == 0:
                    print("Load text resources, line NO." + str(count))

        print("Movie Conversation Loading Complete!")
        Dialog.max_dialog_len = Dialog.max_dialog_len + 3
        return dialog_list

    @staticmethod
    def load_word2ids():
        """Build the vocabulary and the token -> id mapping from ``all_text``.

        Keeps the ``vocab_size + 2`` most frequent tokens, prints the share
        of corpus tokens they cover, and assigns them ids starting at 3 in
        frequency order.  Updates ``freq_dist``, ``all_tokens`` and
        ``word_id_dict``.
        """
        Dialog.freq_dist = nltk.FreqDist(Dialog.all_text)
        most_common = Dialog.freq_dist.most_common(Dialog.vocab_size + 2)
        Dialog.all_tokens = [token for token, _ in most_common]

        known = sum(freq for _, freq in most_common)
        total = sum(Dialog.freq_dist.values())
        # to measure how many words are Not replaced by UNKNOWN
        # (an empty corpus reports 0.0 instead of dividing by zero)
        known_rate = float(known) / float(total) if total else 0.0
        print("Vocabulary Known Rate is: " + str(known_rate))

        Dialog.word_id_dict = {
            token: index + 3 for index, token in enumerate(Dialog.all_tokens)
        }
        Dialog.word_id_dict['TSTSTARTTST'] = 1
        Dialog.word_id_dict['TSTUNKNOWNTST'] = 2
        # leave 0 as the empty token for padding

    @staticmethod
    def pair_qa_ids(dialog_list):
        """Return consecutive utterance pairs as padded id sequences.

        For every dialog, each utterance is paired with the one following
        it; both are converted with ``tokens2id``.

        Returns:
            ``(dialog_x, dialog_y)`` where ``dialog_y[i]`` is the reply to
            ``dialog_x[i]``.
        """
        dialog_x = []
        dialog_y = []
        for dialog in dialog_list:
            for question, answer in zip(dialog.text, dialog.text[1:]):
                dialog_x.append(Dialog.tokens2id(question))
                dialog_y.append(Dialog.tokens2id(answer))
        return dialog_x, dialog_y

    @staticmethod
    def pair_qa(dialog_list):
        """Return consecutive utterance pairs as (copied) token lists.

        Same pairing as ``pair_qa_ids`` but without id conversion or padding.

        Returns:
            ``(dialog_x, dialog_y)`` where ``dialog_y[i]`` is the reply to
            ``dialog_x[i]``.
        """
        dialog_x = []
        dialog_y = []
        for dialog in dialog_list:
            for question, answer in zip(dialog.text, dialog.text[1:]):
                dialog_x.append(list(question))
                dialog_y.append(list(answer))
        return dialog_x, dialog_y

    @staticmethod
    def id2word(id):
        """Return the token whose id equals *id*.

        Values below 0.000001 (the padding id 0 and anything negative) map
        to the empty string; an id that is not in ``word_id_dict`` maps to
        ``'ERROR'``.  The parameter keeps its original name for keyword
        callers even though it shadows the builtin.
        """
        if id < 0.000001:
            return ''
        # Linear scan with ``==`` so float-valued ids still match.
        for word, word_id in Dialog.word_id_dict.items():
            if id == word_id:
                return word
        return 'ERROR'

    @staticmethod
    def word2id(word):
        """Return the id of *word*.

        An empty word maps to 0 (padding); a word missing from
        ``word_id_dict`` maps to the id of ``TSTUNKNOWNTST``.
        """
        if len(word) == 0:
            return 0
        if word in Dialog.word_id_dict:
            return Dialog.word_id_dict[word]
        return Dialog.word_id_dict['TSTUNKNOWNTST']

    @staticmethod
    def sent2id(sent):
        """Tokenize the raw string *sent* and return its padded id sequence."""
        return Dialog.tokens2id(Dialog.pre_text(sent))

    @staticmethod
    def id2sent(sent):
        """Join the words for the ids in *sent* with spaces, skipping id 0."""
        return ' '.join(Dialog.id2word(i) for i in sent if i != 0)

    @staticmethod
    def tokens2id(sent):
        """Convert a token list to ids, right-padded with 0.

        The result has ``max_dialog_len`` entries when the input is shorter;
        a longer input is returned in full (it is not truncated).
        """
        id_sent = [Dialog.word2id(word) for word in sent]
        pad = [0] * Dialog.max_dialog_len
        pad[:len(id_sent)] = id_sent
        return pad