import os
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
from typing import List
project_dir = r'D:\GitHub\Projects\Analysis_of_Delivery_Companies_Reviews'
os.chdir(project_dir)
from helpers.utilities import get_ratings_mapping, split_ratings_col
from processing import text_processing as tp
from texteda import (most_common_words,
                     plot_most_common_words,
                     plot_wordcloud,
                     compute_bigrams,
                     plot_bigrams)
from nltk.corpus import stopwords
processed_pages_file = os.path.join(project_dir, 'processed_pages.txt')
reviews_base_file = os.path.join(project_dir, 'reviews.csv')
ratings_dict = get_ratings_mapping()
base_df = pd.read_csv(reviews_base_file, sep=',')
stopwords_ls = stopwords.words('english')
stpw_charlist = ['\'d', '\'m', '\'s', '\'ve', '\'re', '\'ll', 'n\'t', '’']
common_delivery_words = ['delivery', 'deliver', 'driver', 'order', 'uber',
                         'stuart', 'deliveroo', 'food', 'use', 'get', 'service',
                         'customer', 'refund']
stopwords_ls.extend(stpw_charlist)
stopwords_ls.extend(common_delivery_words)
# See the distribution of the number of reviews per company
base_df['Company'].value_counts()
# Drop duplicate rows
base_df.drop_duplicates(inplace=True)
# Cast columns to the appropriate data types
base_df['Date'] = pd.to_datetime(base_df['Date'], format="%Y-%m-%d %H:%M", errors='coerce')
base_df['Rating'] = base_df['Rating'].apply(lambda row: split_ratings_col(row)[0]).astype(int)
base_df['Rating_Text'] = base_df['Rating'].apply(lambda row: ratings_dict[row])
# Are there reviewers that have submitted more than one review?
# reviewers_multiple = base_df['Reviewer_Id'].value_counts()
# f = base_df[base_df['Reviewer_Id']==reviewers_multiple.index[0]]
# Transform dataset
# Delete rows where the review is empty
# base_df = base_df[base_df['Review'].notna()]
# Wherever Review is empty, replace it with Title
base_df.loc[base_df['Review'].isnull(), 'Review'] = base_df['Title']
# Split review in tokens and remove punctuation, stopwords
base_df['Review_Tokens_Clean'] = base_df['Review'].apply(lambda row: tp.tokenize_and_clean(text=row, stopwords_ls=stopwords_ls))
# Lemmatize the tokens
base_df['Review_Tokens_Lemma'] = base_df['Review_Tokens_Clean'].apply(lambda row: tp.lemmatize(text=row, pos_type='n'))
base_df['Review_Tokens_Lemma'] = base_df['Review_Tokens_Lemma'].apply(lambda row: tp.lemmatize(text=row, pos_type='a'))
base_df['Reviews_Clean'] = base_df['Review_Tokens_Lemma'].apply(lambda row: ' '.join([x for x in row]))
base_df['Review_Bigram'] = base_df['Review_Tokens_Lemma'].apply(lambda row: tp.sentence_to_ngram_tokens(text=row, ngram_size=2))
base_df['Review_Bigram_Sentence'] = base_df['Review_Bigram'].apply(lambda row: ' '.join([x for x in row]))
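# Illustrative sketch only: tp.tokenize_and_clean, tp.lemmatize and
# tp.sentence_to_ngram_tokens are project-specific helpers, but the pipeline
# above is assumed to behave roughly like this plain-NLTK version (the exact
# token format, e.g. '_'-joined bigram tokens, is an assumption):
from nltk import word_tokenize, ngrams
from nltk.stem import WordNetLemmatizer
_review = "The driver never arrived and an item was missing"
_tokens = [t.lower() for t in word_tokenize(_review)
           if t.isalpha() and t.lower() not in stopwords_ls]
_lemmas = [WordNetLemmatizer().lemmatize(t, pos='n') for t in _tokens]
_bigrams = ['_'.join(bg) for bg in ngrams(_lemmas, 2)]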
# Exploratory Data Analysis
most_common_words(base_df, text_col='Review_Bigram_Sentence', n_most_common=10)
plot_most_common_words(base_df, n_most_common=10, text_col='Review_Bigram_Sentence')
plot_wordcloud(base_df, text_col='Review_Bigram_Sentence')
compute_bigrams(base_df, text_col='Reviews_Clean')
plot_bigrams(input_df=base_df, text_col='Reviews_Clean', top_n=10)
'''
From the bigrams plot we can infer that there are 3 main topics in the data:
1) The order arrived but an item was missing
2) The items were received very late
3) The order never arrived
Customer-service related phrases might be adding noise to the data, since in
all three cases the customer would most likely try to contact customer service.
'''
# Deliveroo
most_common_words(base_df[base_df['Company'] == 'Deliveroo'],
                  text_col='Review_Bigram_Sentence',
                  n_most_common=10)
# LDA
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_df=0.7,
                             min_df=1,
                             max_features=6000)
'''
This creates a sparse matrix where each row is a document and each column
is a word. The value at position [i, j] is the count of how many times
word j appears in document i.
'''
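# Minimal toy illustration of that document-term matrix (not part of the
# analysis itself):
_toy_vec = CountVectorizer()
_toy_matrix = _toy_vec.fit_transform(['late order missing item',
                                      'order never arrived',
                                      'late late delivery'])
print(_toy_vec.get_feature_names())  # the columns, i.e. the vocabulary
print(_toy_matrix.toarray())         # one row per document, word counts as values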
# apply transformation
cv = vectorizer.fit_transform(base_df['Review_Bigram_Sentence']) #.toarray()
# tf_feature_names tells us what word each column in the matrix represents
cv.shape # (15407, 800)
def countVectorizer_to_dict(vectorizer: CountVectorizer,
                            matrix: csr_matrix) -> dict:
    """Map each vocabulary term to its total count across all documents."""
    # note: newer scikit-learn versions replace get_feature_names()
    # with get_feature_names_out()
    feature_names = vectorizer.get_feature_names()
    counts = np.asarray(matrix.sum(axis=0))[0]
    return dict(zip(feature_names, counts))
word_counts = countVectorizer_to_dict(vectorizer=vectorizer, matrix=cv)
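# For example, the ten most frequent terms across the whole corpus:
sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]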
from sklearn.decomposition import LatentDirichletAllocation
number_of_topics = 3
lda_model = LatentDirichletAllocation(n_components=number_of_topics,
                                      max_iter=10,
                                      random_state=45,
                                      n_jobs=-1,
                                      verbose=1)  # random state for reproducibility
# Fit data to model
lda_model.fit(cv) # (15349, 17697) i.e. 15349 documents (rows), and 17697 words (columns)
# lda_model.fit_transform(term_freq[1:2])
'''
fit() learns the model in place; fit_transform() (commented out above) returns
an NxM matrix where N is the number of samples (documents) and M is the number
of topics, i.e. the probability of each document belonging to each topic.
'''
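# For instance, the per-document topic mixture can be obtained with transform();
# each row sums to (approximately) 1:
doc_topic = lda_model.transform(cv)
doc_topic.shape  # (number_of_documents, number_of_topics)
doc_topic[0]     # topic probabilities of the first review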
lda_model.components_[0]
'''
This gives the weight of each word for a specific topic (here topic 0).
components_ has shape number_of_topics x number_of_words.
'''
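# Normalising each row of components_ gives the word probability
# distribution per topic (each row then sums to 1):
topic_word_dist = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
topic_word_dist.shape  # (number_of_topics, number_of_words)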
lda_model.exp_dirichlet_component_
lda_model.get_params()
def get_word_weights_per_topic(model, feature_names: List[str], sort=True):
    """Return, for every topic, a list of (word, weight) pairs."""
    word_weights_per_topic = []
    for i, topic in enumerate(model.components_):
        weights = list(zip(feature_names, topic))
        if sort:
            weights = sorted(weights, key=lambda x: x[1], reverse=True)
        word_weights_per_topic.append([i, weights])
    return word_weights_per_topic
t = get_word_weights_per_topic(lda_model, feature_names=vectorizer.get_feature_names())
t[0][1][0:5]
def show_top_words_per_topic(model, feature_names: List[str], num_top_words: int):
    """Print the num_top_words highest-weighted words for every topic."""
    word_weights = get_word_weights_per_topic(model, feature_names)
    for i, weights in word_weights:
        print('Topic {0} : {1}'.format(i, weights[0:num_top_words]))
show_top_words_per_topic(lda_model, feature_names=vectorizer.get_feature_names(), num_top_words=10)
# Gensim
import gensim
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(base_df['Review_Tokens_Lemma'])
# Create Corpus
texts = base_df['Review_Tokens_Lemma']
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])
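# The same bag-of-words entry in human-readable (token, count) form:
print([(id2word[word_id], count) for word_id, count in corpus[0][:10]])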
from pprint import pprint
# number of topics
num_topics = 3
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the keywords for each of the 3 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus] # We can view each documents distribution over each topic
len(doc_lda)
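# e.g. the topic distribution of a single review (LdaMulticore inherits
# get_document_topics from LdaModel):
print(lda_model.get_document_topics(corpus[0]))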
import pyLDAvis.gensim
import pickle
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results', 'ldavis_prepared_' + str(num_topics))
# # this is a bit time consuming - set the condition below to True
# # if you want to run the visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared