-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
182 lines (165 loc) · 7.64 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import praw
import datetime
import numpy
import pandas
import os
import time
import matplotlib.pyplot as plt
from dotenv_config import Config
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import TimestampType
from pyspark.sql.types import FloatType
from pyspark.sql.types import StringType
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql.functions import desc
from pandas.plotting import table
#NEED THIS TO GET TO REDDIT
config = Config('.env')
#SET UP THE REDDIT CONNECTION
r = praw.Reddit(user_agent='reddit-neoliberal-bot-py',
client_id=os.environ['CLIENT_ID'],
client_secret=os.environ['CLIENT_SECRET'],
password=os.environ['REDDIT_PASS'],
username=os.environ['REDDIT_USER'])
#FOR SOME REASON THIS ISN'T WORKING...
#r = praw.Reddit(user_agent='reddit-neoliberal-bot-py',
# client_id=config('NEOLIBERAL_CLIENT_ID'),
# client_secret=config('NEOLIBERAL_CLIENT_SECRET'),
# password=config('NEOLIBERAL_REDDIT_PASS'),
# username=config('NEOLIBERAL_REDDIT_USER'))
#READ COMMENTS FROM DISCUSSION THREAD
def handle_post(submission, post_date):
spark = start_spark()
comment_list = submission.comments.replace_more(limit=None)
schema_1 = gen_schema_1()
schema_2 = gen_schema_2()
rdd = spark.sparkContext.parallelize(comment_list)
comment_data_frame_1 = spark.createDataFrame(rdd,schema_1)
comment_data_frame_2 = spark.createDataFrame(rdd,schema_2)
graph_most_upvoted_comments(comment_data_frame_2)
graph_comment_count_per_time(comment_data_frame_1)
file_path = os.path.join(ROOT_DIR, 'discussion_thread_data/'+str(post_date)+'_neolib_dt_analytics.pdf')
plt.savefig(file_path)
create_post(file_path, post_date)
spark.stop()
#GENERATE SCHEMA 1
def gen_schema_1():
return StructType([
StructField('body', StringType(), True),
StructField('created_utc', FloatType(), True)
])
#GENERATE SCHEMA 2
def gen_schema_2():
return StructType([
StructField('body', StringType(), True),
StructField('score', IntegerType(), True),
StructField('author', StructType([
StructField('name', StringType(), True)
]), True)
])
#START SPARK
def start_spark():
return SparkSession\
.builder\
.appName('DiscussionThreadAnalyzer')\
.getOrCreate()
#CREATE POST WITH IMAGE
def create_post(file_path, post_date):
submission = r.subreddit('neoliberal').submit_image(str(post_date)+' DT Analysis', file_path)
submission.reply(POST_DESCRIPTION)
return
#GRAPH NUMBER OF COMMENTS OVER TIME
def graph_comment_count_per_time(comments_date_time_data_frame):
udf_as_time_str = udf(as_time_str, StringType())
comments_date_time_data_frame_converted = comments_date_time_data_frame.withColumn('datetime', udf_as_time_str('created_utc'))
comments_time = comments_date_time_data_frame_converted.groupBy('datetime')
pandas_comments_time = comments_time.count().orderBy('datetime').toPandas()
ax0 = plt.subplot2grid((8,3), (0,0), rowspan=1, colspan=3)
pandas_comments_time.plot(ax=ax0, kind='line', x='datetime', y='count')
ax0.set_xlabel('time (EST)')
ax0.set_ylabel('comments')
ax0.set_title('DT Comments per Hour')
find_real_neolib_hours(comments_date_time_data_frame_converted, pandas_comments_time, udf_as_time_str)
#DOES THE COMMENT CONTAIN A "REAL NEOLIBERAL HOURS" KEY WORD?
def contains_rnh(comment):
rnh_root_words = ['alcohol', 'depress', 'whiskey', 'sad', 'wine', 'suicid', 'beer', 'alone', 'tequila', 'lonely', 'vodka', 'antisocial', 'drugs', 'cry', 'weed', 'anxiety', 'gin', 'unhappy', 'cocktail', 'stress']
for word in rnh_root_words:
if word in str(comment):
return True
return False
#DETERMINE WHEN REAL NEOLIBERAL HOURS BEGIN
def find_real_neolib_hours(comments_by_time, pandas_comments_time, udf_as_time_str):
udf_contains_rnh = udf(contains_rnh, BooleanType())
comments_by_time = comments_by_time.filter(udf_contains_rnh(comments_by_time['body'])).groupBy('datetime').count().orderBy('datetime').toPandas()
c = ['rnh comment ratio', 'time']
ratios = numpy.divide(comments_by_time['count'], pandas_comments_time['count'])
ratios.replace([numpy.inf, -numpy.inf], numpy.nan).dropna()
pandas_comments_time.replace([numpy.inf, -numpy.inf], numpy.nan).dropna()
matrix = list(zip(ratios, comments_by_time['datetime']))
pandas_ratios_time = pandas.DataFrame(data=matrix, columns=c)
ax1 = plt.subplot2grid((8,3), (2,0), rowspan=1, colspan=3)
pandas_ratios_time.plot(ax=ax1, kind='line', x='time', y='rnh comment ratio')
ax1.set_xlabel('time (EST)')
ax1.set_ylabel('ratio of comments about alcohol or depression per comments per hour')
ax1.set_title('DT Real Neoliberal Hours')
#CONVERT TIMESTAMP
def as_time_str(x):
return time.ctime(x)[11:-10]+'00'
#GET NAME FIELD
def get_name(x):
return str(x)[10:-2]
#GRAPH MOST UPVOTED COMMENTS
def graph_most_upvoted_comments(commentsDataFrame):
udf_get_name = udf(get_name, StringType())
comments_data_frame_converted = commentsDataFrame.withColumn('author', udf_get_name('author'))
comments_by_score = comments_data_frame_converted.orderBy(desc('score')).limit(10).toPandas()
comments_by_score['body'] = comments_by_score['body'].str.wrap(140)
comments_by_score.rename(columns={'body': 'comment'})
comments_by_score.set_index('body')
ax2 = plt.subplot2grid((8,3), (4,0), rowspan=4, colspan=3, frame_on=False)
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax2.axis('off')
tab = table(ax2, comments_by_score, loc='upper center', cellLoc='left')
tab.auto_set_font_size(False)
tab.set_fontsize(17)
cell_dict=tab.get_celld()
for i in range(11):
cell_dict[(i,0)].set_width(0.9)
cell_dict[(i,0)].set_height(0.1)
cell_dict[(i,1)].set_width(0.06)
cell_dict[(i,1)].set_height(0.1)
cell_dict[(i,2)].set_width(0.2)
cell_dict[(i,2)].set_height(0.1)
cell_dict[(0,0)].set_height(0.03)
cell_dict[(0,1)].set_height(0.03)
cell_dict[(0,2)].set_height(0.03)
#LINK USER TO NEOLIBERAL POSTS
def fetch_discussion_thread():
for submission in r.subreddit('neoliberal').new(limit=None):
if 'neoliberal-shill-bot' == submission.author.name:
return
post_date = datetime.date.fromtimestamp(submission.created_utc)
if 'discussion thread' in submission.title.lower() and today-post_date>datetime.timedelta(hours=12):
handle_post(submission, post_date)
return
#LETS GET THIS THING STARTED
POST_DESCRIPTION = ('Hi! I\'m a bot. I run daily data analytics on the previous day\'s Discussion Thread. The first graph in my post shows the traffic in the DT over time. \n\n'+
'With the second graph, I am attempting to learn when exactly "real neoliberal hours" begin. The y axis is the ratio of the sum of comments containing alcohol/drugs/depression related words per hour, over the total comments per hour. \n\n'+
'The table at the bottom shows the most upvoted comments.\n I hope you found this informative.' )
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
today = datetime.date.today()
plt.rcParams['figure.figsize'] = (30,100)
plt.rc('font', size=17)
plt.rc('axes', titlesize=17)
plt.rc('axes', labelsize=17)
plt.rc('xtick', labelsize=17)
plt.rc('ytick', labelsize=17)
plt.rc('legend', fontsize=12)
plt.rc('figure', titlesize=24)
fetch_discussion_thread()