# recommender.py
# Forked from manasakalaimalai/stylesynth.ai (134 lines, 113 loc, 4.86 KB at capture time).
import pandas as pd
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import BM25
# Request the NLTK resources used further down in this module: the
# 'stopwords' corpus (read through nltk.corpus.stopwords) and 'punkt'
# (presumably the tokenizer data behind nltk.word_tokenize -- confirm for the
# installed NLTK version).
# NOTE(review): these calls sit at module top level, so they run on every
# import of this file.
nltk.download('stopwords')
nltk.download('punkt')
class OutfitRecommender:
    """Content-based outfit recommender backed by a CSV catalogue.

    Typical flow: ``process()`` loads and cleans the catalogue,
    ``get_normalized_corpus()`` prepares the text, one of the ``get_*``
    methods turns it into a document-by-document similarity table, and
    ``outfit_recommendation()`` reads the nearest neighbours out of that table.

    All row references handed out or accepted by this class are *positional*
    (0-based position in ``self.df``), not pandas index labels.
    """

    # English stop words, loaded once on first use and shared by all instances
    # so the corpus file is not re-read for every document.
    _stop_words = None

    def __init__(self, filename, columns, t_column, d_column, f_column, c_column, b_column):
        """Store the catalogue location and the names of the columns to use.

        Args:
            filename: Path (or anything ``pd.read_csv`` accepts) of the CSV.
            columns: Column names to keep when the CSV is loaded.
            t_column: Name of the title column.
            d_column: Name of the description column.
            f_column: Name of the fabric column.
            c_column: Name of the color column.
            b_column: Name of the brand column.
        """
        self.filename = filename
        self.columns = columns
        self.title_column = t_column
        self.description_column = d_column
        self.fabric_column = f_column  # fabric column
        self.color_column = c_column  # color column
        self.brand_column = b_column  # brand column
        self.df = None  # populated by process()

    def process(self, show=True):
        """Load the CSV, build the searchable description and clean the rows.

        The description becomes ``"<title>. <description>"`` (a missing
        description counts as empty). Rows that still contain a missing value
        in any kept column are dropped, then exact duplicate rows are dropped.

        Args:
            show: Unused; kept so existing callers keep working.

        Returns:
            The cleaned DataFrame, also stored on ``self.df``.
        """
        # Work on an explicit copy: assigning into a column of a plain
        # selection is chained assignment and is unreliable under pandas
        # copy-on-write.
        frame = pd.read_csv(self.filename)[self.columns].copy()
        description = frame[self.description_column].fillna('').map(str)
        frame[self.description_column] = frame[self.title_column] + '. ' + description
        self.df = frame.dropna().drop_duplicates()
        return self.df

    def show_df_records(self, n=5):
        """Return the first ``n`` rows of the processed catalogue."""
        return self.df.head(n)

    def show_info_details(self):
        """Print the DataFrame summary; returns whatever ``DataFrame.info`` returns."""
        return self.df.info()

    def __normalize(self, d):
        """Strip punctuation, lowercase, tokenize and drop English stop words.

        Args:
            d: One raw document string.

        Returns:
            The remaining tokens joined by single spaces.
        """
        stop_words = OutfitRecommender._stop_words
        if stop_words is None:
            stop_words = frozenset(nltk.corpus.stopwords.words('english'))
            OutfitRecommender._stop_words = stop_words
        # The flags must be passed by keyword: the 4th positional argument of
        # re.sub is ``count``, so passing them positionally capped the number
        # of substitutions instead of setting the flags.
        d = re.sub(r'[^a-zA-Z0-9\s]', '', d, flags=re.I | re.A)
        d = d.lower().strip()
        return ' '.join(t for t in nltk.word_tokenize(d) if t not in stop_words)

    def get_normalized_corpus(self, tokens=False):
        """Normalize every description in the catalogue.

        Args:
            tokens: When true, return each document as a list of tokens
                instead of a single string.

        Returns:
            A 1-D NumPy array with one entry per row of ``self.df``: strings
            by default, token lists (object dtype) when ``tokens`` is true.
        """
        normalized = [self.__normalize(d) for d in self.df[self.description_column]]
        if not tokens:
            return np.array(normalized, dtype=str)
        # Token lists have different lengths, so fill an object array slot by
        # slot; np.array() on ragged input is an error on recent NumPy.
        tokenized = np.empty(len(normalized), dtype=object)
        for position, document in enumerate(normalized):
            tokenized[position] = nltk.word_tokenize(document)
        return tokenized

    def _search_column(self, column, term):
        """Return ``(position, title)`` for rows whose ``column`` contains ``term`` as a word."""
        titles = self.df[self.title_column].values
        cells = self.df[column].values
        return [
            (position, title)
            for position, (title, cell) in enumerate(zip(titles, cells))
            if term in str(cell).split()
        ]

    # NOTE(review): the three attribute searches were originally all named
    # ``search_outfits_by_brand`` (so only the last survived) and compared
    # against an undefined name. They are rebuilt here as one search per
    # attribute column, matching on whole, case-sensitive words -- confirm this
    # is the intended behaviour.
    def search_outfits_by_fabric(self, term='fabric'):
        """Find outfits whose fabric column contains the word ``term``.

        Returns:
            A list of ``(position, title)`` tuples.
        """
        return self._search_column(self.fabric_column, term)

    def search_outfits_by_color(self, term='color'):
        """Find outfits whose color column contains the word ``term``.

        Returns:
            A list of ``(position, title)`` tuples.
        """
        return self._search_column(self.color_column, term)

    def search_outfits_by_brand(self, term='brand'):
        """Find outfits whose brand column contains the word ``term``.

        Returns:
            A list of ``(position, title)`` tuples.
        """
        return self._search_column(self.brand_column, term)

    def get_features(self, norm_corpus):
        """Fit a TF-IDF model (unigrams and bigrams, ``min_df=2``) on the corpus.

        Returns:
            The fitted document-term matrix returned by ``fit_transform``.
        """
        tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
        return tf_idf.fit_transform(norm_corpus)

    def get_vector_cosine(self, tfidf_array):
        """Return the pairwise cosine similarities of the rows as a DataFrame."""
        return pd.DataFrame(cosine_similarity(tfidf_array))

    def get_bm25_weights(self, corpus):
        """Score every document against the whole corpus with BM25.

        Args:
            corpus: Sequence of documents in the form the BM25 model expects
                (presumably token lists -- confirm against the BM25 module).

        Returns:
            A DataFrame with one row of scores per document; empty when the
            corpus is empty.
        """
        if len(corpus) == 0:
            return pd.DataFrame()
        # ``BM25`` is imported as a module, and a module cannot be called.
        # NOTE(review): assumes the module exposes a class of the same name;
        # falls back to the imported object itself if it does not.
        bm25_factory = getattr(BM25, 'BM25', BM25)
        bm25 = bm25_factory(corpus)
        # Guard the average against an empty idf table.
        avg_idf = (
            sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
            if bm25.idf else 0.0
        )
        return pd.DataFrame([bm25.get_scores(doc, avg_idf) for doc in corpus])

    def get_bert_weights(self, corpus):
        """Embed the corpus with ``bert-base-nli-mean-tokens`` and compare documents.

        Returns:
            A DataFrame of pairwise cosine similarities between the embeddings.
        """
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        vectors = model.encode(corpus)
        return pd.DataFrame(cosine_similarity(vectors))

    # search outfits based on fabric, color, brand
    def search_outfits_by_term(self, term='outfit'):
        """Find outfits whose title contains the exact word ``term``.

        Titles are split on single spaces and compared case-sensitively. Each
        matching outfit is reported once, even if the word repeats in its title.

        Returns:
            A list of ``(position, title)`` tuples.
        """
        outfits = self.df[self.title_column].values
        return [
            (position, outfit)
            for position, outfit in enumerate(outfits)
            if term in outfit.split(' ')
        ]

    # returns outfit recommendations for each product category
    def outfit_recommendation(self, index, vector, n):
        """Return the titles of the ``n`` outfits most similar to one outfit.

        Args:
            index: Position of the query outfit (row of ``vector``).
            vector: Square similarity DataFrame, e.g. from
                ``get_vector_cosine``; row/column order must match ``self.df``.
            n: Number of recommendations wanted.

        Returns:
            A NumPy array of at most ``n`` titles, most similar first. The
            query outfit itself is never included.
        """
        similarities = vector.iloc[index].to_numpy()
        own_position = index if index >= 0 else index + len(similarities)
        ranked = np.argsort(-similarities, kind='stable')
        # Remove the query row explicitly rather than assuming it sorts first:
        # another outfit with an equal score could otherwise push the query
        # itself into the results.
        ranked = ranked[ranked != own_position][:max(n, 0)]
        outfits = self.df[self.title_column].to_numpy()
        return outfits[ranked]