-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_raw_html.py
104 lines (64 loc) · 2.36 KB
/
get_raw_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# purpose: given a dataset of URLs, attempt to retrieve the content at that address
# does not do encoding or file type checks (ie, can download and add a .zip file)
# post-process to drop error codes as relevant
import pandas as pd
import datetime
import requests
import random
import time
# given a URL, attempt to retrieve the XML file
def fetchXML(url):
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
# in case the dataset lacks protocol, attach http in front
if url[0:7] != "http://" and url[0:8] != "https://":
url = "http://"+url
try:
r = requests.get(url, headers=headers, timeout=10)
except:
print("failure: " + url)
return 0, None
if r.status_code != 200:
status = r.status_code
xml = r.content
print("Irregular status code")
else:
status = 200
time.sleep(10)
xml = r.content
return status, xml
def parser(inputdf, urls_list, fails_list, tries):
for url in urls_list:
print("trying: ", url)
temp_status, temp_xml = fetchXML(url)
if temp_status == 200 or tries == 3:
new_row = {'full_url':url, 'status':temp_status, 'xml':temp_xml,'datetime':datetime.datetime.now()}
inputdf = inputdf.append(new_row, ignore_index=True)
else:
fails_list.append(url)
return inputdf
def main():
# append URL-XML pair to dataframe
# This makes a new dataframe (linear)
df1 = pd.DataFrame(columns = ['full_url', 'status','xml','datetime'])
df2 = pd.DataFrame(columns = ['full_url', 'status','xml','datetime'])
df3 = pd.DataFrame(columns = ['full_url', 'status','xml','datetime'])
filename = 'justalexa.csv'
inputdf = pd.read_csv(filename, usecols=['full_url'])
urls_list = inputdf['full_url'].values.tolist()
fails_list = []
fails_list2 = []
df1 = parser(df1, urls_list, fails_list, 1)
df1.to_csv('output_temp.csv', encoding='utf-8', index=False)
# this saves the first wave
df2 = parser(df2, fails_list, fails_list2, 2)
df3 = parser(df3, fails_list2, fails_list2, 3)
frames = [df1, df2, df3]
resultdf = pd.concat(frames)
#print(type(result))
output = 'outputxml.csv'
resultdf.to_csv(output, encoding='utf-8', index=False)
print("failed:")
print(fails_list)
if __name__ == "__main__":
main()