-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathkd.py
65 lines (51 loc) · 1.87 KB
/
kd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#importing libraries
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import gensim
from nltk.stem import WordNetLemmatizer
from sklearn.neighbors import KDTree
import nltk
import time
import joblib
# Record the wall-clock start time; it is subtracted from the end time when
# the runtime is printed at the bottom of the script.
start = time.time()
# Load the cleaned dataset into a DataFrame.
# NOTE(review): despite its name, DATA_DIR holds the path of a JSON file, and
# the backslash separators look Windows-specific — confirm before running on
# another platform.
DATA_DIR = r'.\Results\datasetcleaned.json'
df = pd.read_json(DATA_DIR)
# Seed NumPy's global random number generator.
np.random.seed(2018)
# Fetch the 'wordnet' resource through NLTK's downloader; presumably required
# by the lemmatizer created below — confirm.
nltk.download('wordnet')
# Lemmatizer instance shared by the pre-processing helpers that run before LDA.
wnl = WordNetLemmatizer()
def lemmatize_stemming(text):
    """Return the result of lemmatizing ``text`` with the module-level ``wnl``.

    Only ``wnl.lemmatize`` is called with its default arguments; despite the
    function name, no separate stemming step is applied here.
    """
    return wnl.lemmatize(text)
def preprocess(text):
    """Tokenize ``text`` and return the kept tokens as one space-separated string.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is not in gensim's ``STOPWORDS`` set and is at least three
    characters long; each kept token is passed through ``lemmatize_stemming``.

    Every kept token is followed by a single space, so a non-empty result ends
    with a trailing space and an input with no kept tokens yields ``""``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return "".join(
        lemmatize_stemming(word) + " "
        for word in gensim.utils.simple_preprocess(text)
        if not (word in stopwords or len(word) < 3)
    )
# Pre-process the combined text (name + description) of every record.
data_words = df['combined'].map(preprocess)
print(data_words)

# Feature extraction using CountVectorizer.
vectorizer = CountVectorizer(max_df=0.90, min_df=1, stop_words='english')
# fit_transform learns the vocabulary and builds the document-term matrix in
# one call, so the corpus is no longer fitted twice (once by fit, once by
# fit_transform) as it was before.
data_vectorized = vectorizer.fit_transform(data_words)
joblib.dump(vectorizer, r'.\Models\CVfitted.pkl')
print("vectorizer fitted")

# Number of topics for the LDA model.
no_topics = 10
lda = LatentDirichletAllocation(n_components=no_topics, random_state=0, learning_method='online')
# Fit once and keep the document-topic matrix. Calling fit() and then
# fit_transform() trained the model twice; with the fixed random_state the
# second run only repeated the first.
lda_output = lda.fit_transform(data_vectorized)
# fit() returns the estimator itself, so the fitted model is simply `lda`;
# the original name is kept as an alias.
lda_fitted = lda

# Store the fitted LDA estimator and the document-topic matrix.
joblib.dump(lda_fitted, r'.\Models\ldafitted.pkl')
joblib.dump(lda_output, r'.\Models\ldamodel.pkl')
print("lda model made")

# K-D tree built over the document-topic matrix produced by LDA.
tree = KDTree(lda_output, leaf_size=2)
joblib.dump(tree, r'.\Models\finaltree.pkl')

# Print the total runtime in seconds.
end = time.time()
print(end - start)