# data_eng_utils.py
import json
import os
import re

import numpy as np
import scipy.linalg
import spacy
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# If the spaCy model is missing, install it first:
#   python -m spacy download en_core_web_sm
# Regex patterns used for text cleaning
email_pat = r"[\w.+-]+@[a-z\d-]+\.[a-z\d.-]+"
punct_pat = r"[,._@?\\$&*%:/|<>;()=+#\"'\[\]{}-]"
num_pat = r"\d+(?:\.\d+)?"
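
# Quick illustrative checks (example strings, not from the project data):
#   re.sub(email_pat, ' ', 'mail a.b@test.com now')  # -> 'mail   now'
#   re.sub(num_pat, ' ', 'version 1.2, build 30')    # -> 'version  , build  '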
# Define a function to remove email addresses, HTML tags, numbers and punctuation from the text
def preText(text):
    # Make the text unicase (lower)
    text = str(text).lower()
    # Normalize backticks to apostrophes before punctuation is stripped
    text = text.replace('`', "'")
    # Remove HTML tags while the angle brackets are still present
    text = re.sub(r'<.*?>', ' ', text)
    # Remove email addresses
    text = re.sub(email_pat, ' ', text, flags=re.IGNORECASE)
    # Remove all numbers (integers and decimals)
    text = re.sub(num_pat, ' ', text)
    # Replace all punctuation with blank space
    text = re.sub(punct_pat, ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    # Collapse runs of whitespace to a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
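
# A quick sanity check (illustrative input, not from the project data):
#   preText("Contact <b>me</b> at john.doe@example.com, ref #42!")
#   -> 'contact me at ref'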
# Initialize the spaCy 'en' small model, keeping only the components needed for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Define a function to lemmatize the descriptions, dropping the '-PRON-'
# placeholder that spaCy v2 emits for pronouns
def lemmatizer(sentence):
    # Parse the sentence using the loaded 'en' model object `nlp`
    doc = nlp(sentence)
    return " ".join(token.lemma_ for token in doc if token.lemma_ != '-PRON-')
# Fit KMeans for each candidate cluster count and collect elbow/silhouette diagnostics
def clusters(X, iters):
    sse = []
    scores = []
    for i in iters:
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(X)
        sse.append(kmeans.inertia_)
        scores.append(silhouette_score(X, kmeans.labels_))
    return sse, scores
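
# Typical use: sweep a range of k, then pick the elbow in `sse` or the peak in
# `scores` (the range below is illustrative, not from the original code):
#   sse, scores = clusters(X, range(2, 11))
#   best_k = range(2, 11)[int(np.argmax(scores))]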
# Setting up paths to load Word2Vec data
DATA_PATH = 'DATA'
PROCESSED_DATA_PATH = os.path.join(DATA_PATH, 'PROCESSED_DATA')
WORD_2_VEC_PATH = os.path.join(PROCESSED_DATA_PATH, 'word2vec')
word_vecs_reload = KeyedVectors.load(os.path.join(WORD_2_VEC_PATH, 'word2vec.wordvectors'), mmap='r')
# Load the word -> probability mapping (assumed here to be stored as JSON,
# since the file is opened in text mode and used as a dict below)
with open(os.path.join(WORD_2_VEC_PATH, 'dict_wordprob'), 'r') as f:
    dict_word_prob = json.load(f)
# Compute a weighted average vector for a sentence: each word vector is scaled
# by a / (a + p(word)), so frequent words contribute less to the mean
def get_average_vector_sent(sent, dict_sent={}, a=0.0001,
                            word_vecs_reload=word_vecs_reload,
                            dict_word_prob=dict_word_prob):
    av_vect = []
    for word in sent.split():
        # Skip words missing from the vocabulary or the probability table
        if word not in word_vecs_reload:
            print('word not in word_vecs_reload.keys')
            continue
        if word not in dict_word_prob.keys():
            print('word not in dict_word_prob.keys')
            continue
        vec = word_vecs_reload[word] * (a / (a + dict_word_prob[word]))
        av_vect.append(vec)
    mean = np.mean(av_vect, axis=0)
    dict_sent[sent] = mean
    return mean
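
# This weighting is the first step of the SIF ("smooth inverse frequency")
# sentence-embedding scheme; e.g., assuming every word is in the vocabulary:
#   vec = get_average_vector_sent('data engineering pipeline')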
# Build the matrix of sentence vectors for a list of sentences
def get_df_vectors(sents):
    dict_sent = {}
    means = []
    for sent in sents:
        mean = get_average_vector_sent(sent, dict_sent)
        means.append(mean)
    return np.array(means).squeeze()
# Remove the common component from the sentence vectors: subtract each vector's
# projection onto the first left singular vector of the stacked matrix
def get_final_vectors(sents):
    pre_array = get_df_vectors(sents).T
    # First left singular vector of the (features x sentences) matrix
    u = scipy.linalg.svd(pre_array, full_matrices=False)[0][:, :1]
    return pre_array - u.dot(u.T).dot(pre_array)
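
# End-to-end sketch (the sentences and the range of k are illustrative
# assumptions, not from the original code):
#   raw = ["First ticket description ...", "Second ticket description ..."]
#   cleaned = [lemmatizer(preText(t)) for t in raw]
#   X = get_final_vectors(cleaned).T   # transpose back to (sentences x features)
#   sse, scores = clusters(X, range(2, 6))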