This repository has been archived by the owner on Jul 27, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 729
/
utils.py
59 lines (50 loc) · 1.77 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import sklearn.datasets
import numpy as np
import re
import collections
import random
from sklearn import metrics
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
def clearstring(string):
string = re.sub('[^A-Za-z0-9 ]+', '', string)
string = string.split(' ')
string = filter(None, string)
string = [y.strip() for y in string if y.strip() not in english_stopwords]
string = ' '.join(string)
return string.lower()
def separate_dataset(trainset, ratio = 0.5):
datastring = []
datatarget = []
for i in range(len(trainset.data)):
data_ = trainset.data[i].split('\n')
data_ = list(filter(None, data_))
data_ = random.sample(data_, int(len(data_) * ratio))
for n in range(len(data_)):
data_[n] = clearstring(data_[n])
datastring += data_
for n in range(len(data_)):
datatarget.append(trainset.target[i])
return datastring, datatarget
def build_dataset(words, n_words):
count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
count.extend(collections.Counter(words).most_common(n_words - 1))
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
data = list()
unk_count = 0
for word in words:
index = dictionary.get(word, 0)
if index == 0:
unk_count += 1
data.append(index)
count[0][1] = unk_count
reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reversed_dictionary
def str_idx(corpus, dic, maxlen, UNK = 3):
X = np.zeros((len(corpus), maxlen))
for i in range(len(corpus)):
for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
X[i, -1 - no] = dic.get(k, UNK)
return X