getFBData.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import date
from bs4 import BeautifulSoup
import pandas as pd
# read database credentials from a .env file
from dotenv import load_dotenv
load_dotenv()
import os
# connect to the Amazon RDS MySQL server
# (the mysql+pymysql URL below requires the PyMySQL driver: pip install pymysql)
import sqlalchemy
DBHOST = os.getenv("DBHOST")
DBUSER = os.getenv("DBUSER")
DBPASS = os.getenv("DBPASS")
DBDATABASE = os.getenv("DBDATABASE")
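# The four values above are read from a local .env file. A minimal sketch of
# that file, with placeholder values (the real credentials are not in this repo):
#
#   DBHOST=your-instance.xxxxxxxx.rds.amazonaws.com
#   DBUSER=admin
#   DBPASS=changeme
#   DBDATABASE=politicmotion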
engine = sqlalchemy.create_engine(f'mysql+pymysql://{DBUSER}:{DBPASS}@{DBHOST}:3306/{DBDATABASE}')
# create_engine() is lazy and always returns an Engine object, so test the
# connection explicitly instead of checking the object's truthiness
try:
    with engine.connect():
        print("Connected to MySQL successfully!")
except Exception as exc:
    print(f"Oops, could not connect to MySQL: {exc}")
def listToString(s):
    """Join a list of strings with '|' so it fits in a single DB column."""
    return "|".join(s)
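# e.g. (illustrative values) listToString(['152 Like', '12 Haha']) -> '152 Like|12 Haha'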
# Chrome options for headless scraping
option = webdriver.ChromeOptions()
option.add_argument('--no-sandbox')
option.add_argument('--headless')
option.add_argument('--disable-dev-shm-usage')
option.add_argument('--disable-notifications')
# Find post links on a Facebook page
# (a fresh Chrome driver is created before each scrape block below)
def FindLinks(url, n):
    """Open a Facebook page, scroll n times to load more posts, and return the post permalinks."""
    Links = []
    driver.get(url)
    time.sleep(3)  # wait a few seconds for the page to render
    def dismiss(by, value):
        # click a pop-up's close button if it is present; ignore it otherwise
        try:
            driver.find_element(by, value).click()
        except NoSuchElementException:
            pass
    dismiss(By.CLASS_NAME, 'layerCancel')  # bypass the "we got an error" message box
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scroll down
    for i in range(n):  # sleep between scrolls so the next block of posts can load
        dismiss(By.CLASS_NAME, 'layerCancel')
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(3)
    dismiss(By.XPATH, '//a[@id="expanding_cta_close_button"]')  # bypass the register/login prompt
    soup = BeautifulSoup(driver.page_source, "html.parser")
    posts = soup.findAll('div', {'class': 'clearfix y_c3pyo2ta3'})
    for post in posts:
        dismiss(By.CLASS_NAME, 'layerCancel')
        # keep each post's permalink, dropping the query string
        Links.append('https://www.facebook.com' + post.find('a', {'class': '_5pcq'}).attrs['href'].split('?', 1)[0])
    return Links
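# Example: FindLinks(url='https://www.facebook.com/nytimes/', n=10) scrolls through
# roughly ten screens of the page's timeline and returns a list like
# ['https://www.facebook.com/nytimes/posts/<post-id>', ...]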
AllPost = []
def PostContent(soup, source):
    """Parse one post page and append its fields to the global AllPost list."""
    AllReaction = []
    FullPost = {}
    userContent = soup.find('div', {'class': '_5pcr userContentWrapper'})
    try:
        Content = userContent.find('div', {'class': '_5pbx userContent _3576'}).text
        PosterInfo = userContent.find('div', {'class': 'l_c3pyo2v0u _5eit i_c3pynyi2f clearfix'})
    except AttributeError:
        Content = "No Content Found"
        PosterInfo = "No PosterInfo"
    try:
        Time = PosterInfo.find('abbr').attrs['title']
    except Exception:
        Time = "No Time"
    try:
        moodInfo = userContent.find('span', {'class': '_1n9r _66lh'})
    except AttributeError:
        moodInfo = "No moodInfo"
    try:
        # each reaction count is exposed through the link's aria-label
        Reactions = moodInfo.find_all('span', {'class': '_1n9k'})
        for Reaction in Reactions:
            ReactionNum = Reaction.find('a').attrs['aria-label']
            AllReaction.append(ReactionNum)
    except Exception:
        pass  # no reactions found; AllReaction stays empty
    # big (main) title of the linked article
    try:
        bigTitle = userContent.find('div', {'class': 'mbs _6m6 _2cnj _5s6c'}).text
    except AttributeError:
        bigTitle = "No Big Title"
    # small (sub) title of the linked article
    try:
        smallTitle = userContent.find('div', {'class': '_6m7 _3bt9'}).text
    except AttributeError:
        smallTitle = "No Small Title"
    FullPost['Link'] = driver.current_url
    FullPost['Time'] = Time
    FullPost['Content'] = Content
    FullPost['Reaction'] = listToString(AllReaction)
    FullPost['Source'] = source
    FullPost['SavedDate'] = date.today()
    FullPost['bigTitle'] = bigTitle
    FullPost['smallTitle'] = smallTitle
    AllPost.append(FullPost)
    return AllPost
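# Each appended record looks like this (illustrative values):
#   {'Link': 'https://www.facebook.com/nytimes/posts/<post-id>',
#    'Time': '<timestamp from the abbr title attribute>',
#    'Content': '<post text>', 'Reaction': '152 Like|12 Haha',
#    'Source': 'nytimes', 'SavedDate': datetime.date(2021, 1, 1),
#    'bigTitle': '<article headline>', 'smallTitle': '<article subhead>'}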
# ============================================================================ ADD new news
# ---------------------------------------------------------------------------- Get new New York Times posts
driver = webdriver.Chrome(options=option)
AllPost = []
NYTimeLinks = FindLinks(url='https://www.facebook.com/nytimes/', n=10)
for Link in NYTimeLinks:
    print(Link)
    driver.get(Link)  # open the post page so the soup below sees the expanded content
    soup = BeautifulSoup(driver.page_source, "html.parser")
    PostContent(soup, "nytimes")
# build the DataFrame and rename columns to match the fb_rawdata table
Facebookdf = pd.DataFrame(AllPost)
try:
    Facebookdf.columns = ['post_link', 'post_time', 'content', 'reaction', 'post_source', 'saved_date', 'title', 'small_title']
    print(f"Facebookdf len before: {len(Facebookdf)}")
except ValueError:
    pass  # no posts were scraped, so there are no columns to rename
# collect the titles already stored so only new posts get inserted
existingTitles = []
with engine.begin() as conn:
    results = conn.execute(sqlalchemy.text('SELECT DISTINCT title FROM politicmotion.fb_rawdata;'))
    rows = results.fetchall()
    for i in rows:
        try:
            existingTitles.append(''.join(i))  # each row is a one-element tuple of the title
        except TypeError:
            pass  # skip NULL titles
# drop rows whose title already exists in the table, then append the rest
try:
    RowsToDelete = Facebookdf['title'].isin(existingTitles)
    Facebookdf = Facebookdf[~RowsToDelete]
    print(f"Facebookdf len after removing existing titles: {len(Facebookdf)}")
    Facebookdf.to_sql(
        'fb_rawdata',
        con=engine,
        index=False,
        if_exists='append'
    )
except KeyError:
    pass  # 'title' column missing because nothing was scraped
driver.quit()  # end this browser session before starting the next one
# ---------------------------------------------------------------------------- Get new Fox News posts
driver = webdriver.Chrome(options=option)
AllPost = []
FoxNewsLinks = FindLinks(url='https://www.facebook.com/FoxNews/', n=10)
for Link in FoxNewsLinks:
    print(Link)
    driver.get(Link)  # open the post page so the soup below sees the expanded content
    soup = BeautifulSoup(driver.page_source, "html.parser")
    PostContent(soup, "foxnews")
# build the DataFrame and rename columns to match the fb_rawdata table
Foxnewsdf = pd.DataFrame(AllPost)
try:
    Foxnewsdf.columns = ['post_link', 'post_time', 'content', 'reaction', 'post_source', 'saved_date', 'title', 'small_title']
    print(f"Foxnewsdf len before: {len(Foxnewsdf)}")
except ValueError:
    pass  # no posts were scraped, so there are no columns to rename
# collect the titles already stored so only new posts get inserted
existingTitles = []
with engine.begin() as conn:
    results = conn.execute(sqlalchemy.text('SELECT DISTINCT title FROM politicmotion.fb_rawdata;'))
    rows = results.fetchall()
    for i in rows:
        try:
            existingTitles.append(''.join(i))  # each row is a one-element tuple of the title
        except TypeError:
            pass  # skip NULL titles
# drop rows whose title already exists in the table, then append the rest
try:
    RowsToDelete = Foxnewsdf['title'].isin(existingTitles)
    Foxnewsdf = Foxnewsdf[~RowsToDelete]
    print(f"Foxnewsdf len after removing existing titles: {len(Foxnewsdf)}")
    Foxnewsdf.to_sql(
        'fb_rawdata',
        con=engine,
        index=False,
        if_exists='append'
    )
except KeyError:
    pass  # 'title' column missing because nothing was scraped
driver.quit()
print("Done getting data from FB")