# sentiment_buckets.py
import pandas as pd
import pandas_gbq
import configparser
import warnings
from google.oauth2 import service_account
from utils import get_taskparams, load_data, send_email
from utils1 import (flatten_metrics, get_mapping_dict, mmap_subcat, glove_model,
                    within_category_deviation, level1_estimate,
                    pos_based_catagorization)
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
config = configparser.ConfigParser()
config.read('variables.ini')
task_params = get_taskparams(config['Default']['sheet_id'], config['Default']['sheet_name'])
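# variables.ini is expected to carry a [Default] section with sheet_id and
# sheet_name pointing at the Google Sheet that holds the per-task parameters.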
#testing
#xls = pd.ExcelFile('abc.xlsx')
#df = pd.read_excel(xls, 'fan')
class Sentiment_Buckets:
    """Assign a level-1 category bucket to each row of a sentiment-analysis table."""

    def __init__(self, data):
        self.data = data

    def get_review_buckets(self):
        self.data.Entity = self.data.Entity.astype(str)
        sadf = self.data.copy()
        # Level-1 categories: map entities via the keyword dictionary
        mm = get_mapping_dict(sheet_id=mapping_sheet_id, sheet_name='level1')
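        # mm is assumed to map lowercase keyword substrings to level-1 buckets,
        # e.g. {'battery': 'Performance1', 'remote': 'Features1'} (illustrative
        # values only; the real dictionary lives in the mapping sheet).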
        def mmap(x):
            # Return the category of the first mapping keyword contained in the entity.
            for k in mm.keys():
                if k in str(x).lower():
                    return mm[k]
            return 'NA'
        sadf['Categories'] = sadf.Entity.apply(mmap)
        sadf['Categories'] = sadf['Categories'].fillna('NA')
        #sadf.loc[sadf.Entity == 'NA'].Categories = 'IR'
        # Estimate the remaining level-1 categories from the general mapping
        sadf.Entity = sadf.Entity.apply(lambda x: x if x == 'NA' else x.lower())
        mm_general = get_mapping_dict(sheet_id=mapping_sheet_id, sheet_name='level1_general')
        dev_df = within_category_deviation(glovedf, mm_general)
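        # Assumption: within_category_deviation scores how tightly each
        # category's keywords cluster in GloVe space, and level1_estimate /
        # pos_based_catagorization use those deviations when picking the
        # closest category for an unmapped entity or review (see utils1).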
        def predict_level1_category(x, level1_dict):
            # Fall back to 'NA' when the entity cannot be estimated (e.g. no embedding).
            try:
                return level1_estimate(glovedf, x, level1_dict, dev_df=dev_df)[1]
            except Exception:
                return 'NA'
        # Categories for entities that the level-1 dictionary could not map
        sadf['Categories'] = sadf.apply(lambda x: predict_level1_category(x['Entity'], mm_general) if (x['Categories'] == 'NA') and (x['Entity'] != 'NA') else x['Categories'], axis=1)
        # Categories for reviews in which no entity was detected
        sadf['Categories'] = sadf.apply(lambda x: pos_based_catagorization(glovedf, x['Reviews'], mm_general, dev_df=dev_df) if x['Entity'] == 'NA' else x['Categories'], axis=1)
        # Categories for reviews where only one entity was detected, i.e. salience == 1
        sadf['Categories'] = sadf.apply(lambda x: pos_based_catagorization(glovedf, x['Reviews'], mm_general, dev_df=dev_df) if (str(x['Salience']) == '1') and (x['Categories'] in ['NA', 'IR']) else x['Categories'], axis=1)
        # Level-2 categories (currently disabled)
        #sadf = mmap_subcat(sadf)
        # Merge numbered variants, e.g. Features1 and Features2 -> Features
        sadf['Categories'] = sadf['Categories'].apply(lambda x: str(x)[:-1] if x not in ('NA', 'IR') else x)
        return sadf
    def run(self):
        sadf = self.get_review_buckets()
        sadf = sadf.fillna("").astype(str)  # fill missing values before casting to str
        credentials = service_account.Credentials.from_service_account_file(out_key_file)
        try:
            pandas_gbq.to_gbq(sadf, destination_table=destination_table, project_id=out_project_id, credentials=credentials, if_exists='replace')
            return sadf
        except Exception as e:
            # Send an email if the upload fails
            send_email(from_email=task_params.loc[task_id, 'From Email'],
                       from_email_pass=task_params.loc[task_id, 'From Email Password'],
                       to_email=task_params.loc[task_id, 'To Email'],
                       subject='Error in Uploading file(s)',
                       body_text=f'There was an error in uploading the file to {destination_table}: {e}')
            print('--Error--\n')
            print(e)
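# Minimal usage sketch (illustrative; assumes the module-level globals that the
# __main__ block below defines, such as mapping_sheet_id, glovedf, out_key_file,
# out_project_id and destination_table, are already set):
#   dft = df[df.Product_Category == 'fan']   # 'fan' is a hypothetical product type
#   buckets = Sentiment_Buckets(dft)
#   bucketed = buckets.run()                 # uploads to BigQuery and returns the frame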
if __name__ == '__main__':
    task_id_list = task_params.index.to_list()
    model_path = 'C:/Users/ADMIN/Desktop/sentiment analysis module/glove.6B.100d.txt'
    glovedf = glove_model(model_path)  # load the pre-trained GloVe embeddings
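    # glove.6B.100d.txt is the 100-dimensional variant of the Stanford NLP
    # 'glove.6B' release (trained on Wikipedia 2014 + Gigaword 5); glove_model
    # is assumed to load it into a token-indexed DataFrame of vectors.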
    for task_id in task_id_list:
        mapping_sheet_id = task_params.loc[task_id, 'Mapping Sheet']
        sa_key_file = task_params.loc[task_id, 'Sentiment Analysis Key File']
        sa_project_id = task_params.loc[task_id, 'Sentiment Analysis Project ID']
        sa_dataset = task_params.loc[task_id, 'Sentiment Analysis Dataset']
        sa_table = task_params.loc[task_id, 'Sentiment Analysis Table']
        out_key_file = task_params.loc[task_id, 'Sentiment Analysis Key File']
        out_project_id = task_params.loc[task_id, 'Sentiment Analysis Project ID']
        out_dataset = task_params.loc[task_id, 'Sentiment Analysis Dataset']
        out_table_suffix = task_params.loc[task_id, 'Sentiment Analysis Table Suffix']
        df = load_data(key_path=sa_key_file, project_id=sa_project_id, dataset=sa_dataset, t=sa_table)  # import the sentiment analysis table
        print(df.shape)
        df = df.sort_values(by='Review_ID')
        df = flatten_metrics(df)
        #df = df.head(20)
        #rr_tables = task_params.loc[task_id, 'Raw Reviews Tables'].split(',')
        #sa_tables = [t + '_' + sa_table_suffix for t in rr_tables]
        distinct_product_types = df.Product_Category.unique()
        for t in distinct_product_types:
            print(t)
            table = t + '_' + out_table_suffix
            destination_table = f'{out_dataset}.{table}'
            dft = df[df.Product_Category == t]
            obj = Sentiment_Buckets(dft)
            print(obj.data.head())
            print(" ")
            obj.run()
            print(task_params.loc[task_id, 'Client'], f'{t} Bucketing Done')
        print('All', task_params.loc[task_id, 'Client'], 'Done')
    print('--Done--')