fetch_page.py
import gzip
import json
import os
import sys
from io import BytesIO

import requests


class GetSource:
    def __init__(self):
        self.dataset = 'CC-MAIN-2018-13'

    def get_html(self, url, dataset=None):
        if dataset is None:
            dataset = self.dataset
        # Query the Common Crawl index for capture records of this URL
        resp = requests.get('http://index.commoncrawl.org/{0}-index?url={1}&output=json'.format(dataset, url))
        pages = [json.loads(x) for x in resp.text.strip().split('\n')]
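        # Each response line is one JSON capture record. Illustrative shape
        # (field values here are made up, not real index output):
        #   {"urlkey": "com,example)/", "timestamp": "20180320000000",
        #    "url": "http://example.com/", "status": "200",
        #    "filename": "crawl-data/CC-MAIN-2018-13/.../....warc.gz",
        #    "offset": "1234", "length": "5678"}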
        # Multiple pages may have been found - we're only interested in one
        page = pages[0]
        # We need to calculate the start and the end of the relevant byte range
        # (each WARC file is composed of many small GZIP files stuck together)
        offset, length = int(page['offset']), int(page['length'])
        offset_end = offset + length - 1
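        # e.g. offset=1000 and length=500 select bytes 1000-1499 inclusive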
        # We'll get the file via HTTPS so we don't need to worry about S3 credentials
        # Getting the file on S3 is equivalent however - you can request a Range
        prefix = 'https://commoncrawl.s3.amazonaws.com/'
        # We can then use the Range header to ask for just this set of bytes
        resp = requests.get(prefix + page['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
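        # Equivalent request with curl (illustrative; <filename> comes from the
        # index record): curl -r <offset>-<offset_end> https://commoncrawl.s3.amazonaws.com/<filename>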
        # The page is stored compressed (gzip) to save space
        # We can extract it using the gzip library
        raw_data = BytesIO(resp.content)
        f = gzip.GzipFile(fileobj=raw_data)
        # The decompressed data is a single WARC record: the WARC headers, the
        # HTTP response headers, and the HTTP body, separated by blank lines
        data = f.read()
        warc, header, response = data.strip().split(b'\r\n\r\n', 2)
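        # Illustrative layout of the decompressed record (abridged):
        #   WARC/1.0
        #   WARC-Type: response
        #   ...
        #
        #   HTTP/1.1 200 OK
        #   Content-Type: text/html
        #   ...
        #
        #   <!DOCTYPE html>...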
        # Persist the HTTP body; URLs may contain '/', so replace path
        # separators to form a valid filename
        os.makedirs('pagesource', exist_ok=True)
        with open('pagesource/' + url.replace('/', '_'), 'wb') as out:
            out.write(response)
if __name__ == '__main__':
    get_source = GetSource()
    urls_file = sys.argv[1]
    # Input file: one "index,url" pair per line
    with open(urls_file, 'r') as f:
        for line in f:
            index, url = [field.strip() for field in line.split(',')[:2]]
            print('[{0}] Fetching {1}'.format(index, url), end='')
            try:
                get_source.get_html(url)
                print(' Done')
            except Exception:
                print(' Failed')
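
# Example usage (assumes a CSV file, e.g. "urls.csv", with lines of the form
# "1,example.com/"):
#   python fetch_page.py urls.csv
# Each fetched HTTP body is written to a file under pagesource/.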