# product_w2v_visual.py
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from scipy import sparse
import pickle
# load the pretrained word2vec vectors; the pickle holds a dict-like mapping
# of word -> 300-d numpy vector (see the .keys() lookup below)
with open('word2vec_model', 'rb') as handle:
    model = pickle.load(handle)
vocab = model.keys()
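# Quick sanity check (a hypothetical snippet, assuming the pickle holds a
# plain dict of word -> numpy vector): pick an arbitrary word and confirm
# its vector length matches the num_features=300 used throughout below.
sample_word = next(iter(vocab))
print(sample_word, len(model[sample_word]))  # expected: <some word> 300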
plotly.offline.init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")
data = pd.read_pickle('pickels/16k_apperal_data_preprocessed')
# note: despite the "idf" name, CountVectorizer produces raw term counts,
# so the 'weighted' average below weights each word vector by its count in
# the title rather than by a true idf value
idf_title_vectorizer = CountVectorizer()
idf_title_features = idf_title_vectorizer.fit_transform(data['title'])
# some of the brand values are missing;
# replace NaN with the placeholder string "Not given"
data['brand'].fillna(value="Not given", inplace=True)
# replace spaces with hyphens so multi-word values stay single tokens for CountVectorizer
brands = [x.replace(" ", "-") for x in data['brand'].values]
types = [x.replace(" ", "-") for x in data['product_type_name'].values]
colors = [x.replace(" ", "-") for x in data['color'].values]
#bottleneck_features_train = np.load('16k_data_cnn_features.npy')
asins = np.load('16k_data_cnn_feature_asins.npy')
#bottleneck_features_train = sparse.csr_matrix(bottleneck_features_train)
asins = list(asins)
# the 16K dataset was already loaded above; keep the asin column order for reference
df_asins = list(data['asin'])
#bottleneck_vectorizer = CountVectorizer()
#bottleneck_features = bottleneck_vectorizer.fit_transform(bottleneck_features_train)
type_vectorizer = CountVectorizer()
type_features = type_vectorizer.fit_transform(types)
brand_vectorizer = CountVectorizer()
brand_features = brand_vectorizer.fit_transform(brands)
color_vectorizer = CountVectorizer()
color_features = color_vectorizer.fit_transform(colors)
color_features_extra = hstack((color_features,type_features)).tocsr()
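# A minimal illustration of the hstack step above (toy matrices, not the real
# features): stacking two sparse matrices side by side yields one matrix whose
# column count is the sum of the inputs' columns.
_a = sparse.csr_matrix(np.eye(3))
_b = sparse.csr_matrix(np.ones((3, 2)))
print(hstack((_a, _b)).shape)  # (3, 5)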
# one-hot encode the asin strings; these feed the wVisualAsins term below in
# place of the (commented-out) CNN bottleneck features
asins_vectorizer = CountVectorizer()
asins = asins_vectorizer.fit_transform(asins)
# both should be scipy sparse matrices
print(type(color_features_extra))
print(type(asins))
#print(type(bottleneck_features_train))
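# Note (assumption): idf_w2v_brand below indexes this asins matrix with the
# same doc_id it uses for data, which presumes the row order of
# '16k_data_cnn_feature_asins.npy' matches the row order of data; a quick
# check would be: list(np.load('16k_data_cnn_feature_asins.npy')) == df_asins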
def build_avg_vec(sentence, num_features, doc_id, m_name):
    # sentence: title of the apparel
    # num_features: length of the word2vec vectors, here 300
    # doc_id: row index of the title in idf_title_features
    # m_name: model type, takes two values
    #   if m_name == 'avg', we add model[word], the w2v representation of each word
    #   if m_name == 'weighted', we weight each model[word] by its count in the
    #   title (the value stored in idf_title_features)
    featureVec = np.zeros((num_features,), dtype="float32")
    # initialize a vector of 300 zeros and add each word's vector to it
    nwords = 0
    for word in sentence.split():
        nwords += 1
        if word in vocab:
            if m_name == 'weighted' and word in idf_title_vectorizer.vocabulary_:
                featureVec = np.add(featureVec, idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[word]] * model[word])
            elif m_name == 'avg':
                featureVec = np.add(featureVec, model[word])
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    # returns the average vector of the given sentence, shape (300,)
    return featureVec
doc_id = 0
w2v_title_weight = []
# build a weighted w2v vector representation for every title
for i in data['title']:
    w2v_title_weight.append(build_avg_vec(i, 300, doc_id, 'weighted'))
    doc_id += 1
# w2v_title_weight = np.array(#docs in corpus, 300); each row corresponds to a doc
w2v_title_weight = np.array(w2v_title_weight)
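# Sanity check (hypothetical): one 300-d row per title in the corpus.
print(w2v_title_weight.shape)  # expected: (len(data), 300)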
def get_word_vec(sentence, doc_id, m_name):
    # sentence: title of the apparel
    # doc_id: document id in our corpus
    # m_name: model type, takes two values
    #   if m_name == 'avg', we append model[word], the w2v representation of each word
    #   if m_name == 'weighted', we weight each model[word] by its count in the title
    vec = []
    for i in sentence.split():
        if i in vocab and m_name == 'weighted' and i in idf_title_vectorizer.vocabulary_:
            vec.append(idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[i]] * model[i])
        elif i in vocab and m_name == 'avg':
            vec.append(model[i])
        else:
            # words missing from the word2vec vocabulary (or, in 'weighted' mode,
            # from the title vectorizer's vocabulary) get a zero vector, so the
            # rows of the output stay aligned with the words of the title
            vec.append(np.zeros(shape=(300,)))
    # returns a numpy array of shape (#words in title, 300), 300 = len(model[word]);
    # each row is the (weighted/avg) word2vec representation of one word in the sentence
    return np.array(vec)
def get_distance(vec1, vec2):
    # vec1 = np.array(#words in title1, 300); each row is the 300-d vector of one word
    # vec2 = np.array(#words in title2, 300); each row is the 300-d vector of one word
    final_dist = []
    # for each vector in vec1, compute the euclidean distance to every vector in vec2
    for i in vec1:
        dist = []
        for j in vec2:
            # np.linalg.norm(i - j) is the euclidean distance between vectors i and j
            dist.append(np.linalg.norm(i - j))
        final_dist.append(np.array(dist))
    # final_dist has shape (#words in title1, #words in title2);
    # final_dist[i, j] = euclidean distance between word i of title1 and word j of title2
    return np.array(final_dist)
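# A toy check of get_distance (made-up data, not real titles): a 2-word
# "title" against a 3-word one should give a 2 x 3 matrix of distances.
_t1 = np.random.rand(2, 300)
_t2 = np.random.rand(3, 300)
print(get_distance(_t1, _t2).shape)  # (2, 3)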
def display_img(url, ax, fig):
    # download the apparel image from its url
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    # display it on the given axes in the notebook
    ax.imshow(img)
def heat_map_w2v_brand(sentence1, sentence2, url, doc_id1, doc_id2, df_id1, df_id2, m_name):
    # sentence1: title1, the input apparel
    # sentence2: title2, the recommended apparel
    # url: apparel image url
    # doc_id1: document id of the input apparel
    # doc_id2: document id of the recommended apparel
    # df_id1: index of document1 in the data frame
    # df_id2: index of document2 in the data frame
    # m_name: takes two values, 1. 'avg' 2. 'weighted'
    # s1_vec = np.array(#words in title1, 300); each row is the (weighted/avg) vector of one word
    s1_vec = get_word_vec(sentence1, doc_id1, m_name)
    # s2_vec = np.array(#words in title2, 300); each row is the (weighted/avg) vector of one word
    s2_vec = get_word_vec(sentence2, doc_id2, m_name)
    # s1_s2_dist has shape (#words in title1, #words in title2);
    # s1_s2_dist[i, j] = euclidean distance between words i and j
    s1_s2_dist = get_distance(s1_vec, s2_vec)
    data_matrix = [['Asin', 'Brand', 'Color', 'Product type'],
                   [data['asin'].loc[df_id1], brands[doc_id1], colors[doc_id1], types[doc_id1]],  # input apparel's features
                   [data['asin'].loc[df_id2], brands[doc_id2], colors[doc_id2], types[doc_id2]]]  # recommended apparel's features
    colorscale = [[0, '#1d004d'], [.5, '#f2e5ff'], [1, '#f2e5d1']]  # colors for the column headings
    # create a table from data_matrix and plot it with plotly
    table = ff.create_table(data_matrix, index=True, colorscale=colorscale)
    plotly.offline.iplot(table, filename='simple_table')
    # divide the figure into a 25 x 15 grid
    gs = gridspec.GridSpec(25, 15)
    fig = plt.figure(figsize=(25, 5))
    # the first 10 columns hold the heatmap
    ax1 = plt.subplot(gs[:, :-5])
    # plot the heat map of the pairwise word distances
    ax1 = sns.heatmap(np.round(s1_s2_dist, 6), annot=True)
    # x axis labels: the recommended apparel's title words
    ax1.set_xticklabels(sentence2.split())
    # y axis labels: the input apparel's title words
    ax1.set_yticklabels(sentence1.split())
    # title: the recommended apparel's full title
    ax1.set_title(sentence2)
    # the last 5 columns hold the product image
    ax2 = plt.subplot(gs[:, 10:])
    # no grid lines or axis ticks for the image
    ax2.grid(False)
    ax2.set_xticks([])
    ax2.set_yticks([])
    # download and display the image
    display_img(url, ax2, fig)
    plt.show()
def idf_w2v_brand(doc_id, widf, wBrand, wColor, wVisualAsins, num_results):
    # doc_id: apparel's id in the given corpus
    # widf: weight for the w2v title features
    # wBrand: weight for the brand features
    # wColor: weight for the color features
    # wVisualAsins: weight for the visual (asin) features
    # pairwise_dist will store the distance from the input apparel to all remaining apparels;
    # sklearn's pairwise_distances uses the euclidean metric by default
    # http://scikit-learn.org/stable/modules/metrics.html
    idf_w2v_dist = pairwise_distances(w2v_title_weight, w2v_title_weight[doc_id].reshape(1, -1))
    brand_feat_dist = pairwise_distances(brand_features, brand_features[doc_id])
    color_feat_dist = pairwise_distances(color_features_extra, color_features_extra[doc_id])
    # bottle_feat_dist = pairwise_distances(bottleneck_features_train, bottleneck_features_train[doc_id])
    asins_feat_dist = pairwise_distances(asins, asins[doc_id])
    # weighted average of the four distances
    pairwise_dist = ((widf * idf_w2v_dist) + (wBrand * brand_feat_dist) + (wColor * color_feat_dist) + (wVisualAsins * asins_feat_dist)) / float(widf + wBrand + wColor + wVisualAsins)
    # np.argsort returns the indices of the num_results smallest distances
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    # pdists stores the num_results smallest distances
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    # data frame indices of the num_results closest apparels
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        heat_map_w2v_brand(data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], indices[0], indices[i], df_indices[0], df_indices[i], 'weighted')
        print('ASIN :', data['asin'].loc[df_indices[i]])
        print('Brand :', data['brand'].loc[df_indices[i]])
        print('euclidean distance from input :', pdists[i])
        print('=' * 125)
idf_w2v_brand(12566, 5, 5, 5, 1, 20)
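# The call above weights the title, brand, and color distances equally (5 each)
# and down-weights the asin-based visual signal (1). A hypothetical variant
# that leans more heavily on the title semantics:
# idf_w2v_brand(12566, 10, 2, 2, 1, 20)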
# in each heat map above, cell (i, j) contains the euclidean distance between
# word i of the input title and word j of the recommended title