run_crawler.py
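"""Command-line entry point for the crawler.

Every non-dunder attribute of crawler.config.Config is exposed as a CLI
option: SEED_TARGETS_LIST is a required positional argument, all other
settings become optional --lowercase flags, and list-valued settings
accept multiple space-separated values. Seed URLs are read from the
SEED_TARGETS_LIST file (one URL per line), the crawl loop runs until no
work remains, and the stored results are post-processed at the end.

Example invocation (the seed file and output folder names are illustrative):

    python run_crawler.py seeds.txt --storage_folder ./crawl_output
"""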
import time
import json
import argparse
from post_process import post_process
from crawler.config import config_desc, Config as config
from crawler.Logger import Logger
from crawler.DocMgr import DocMgr
from crawler.URLMgr import URLMgr
from crawler.Progress import Progress
from crawler.constants import CrawlResult as CR
from crawler.CrawlerMgr import CrawlerMgr


def main():
    # required arguments
    req_args = ['SEED_TARGETS_LIST']
    # build arguments parser
    parser = argparse.ArgumentParser()
    for name in dir(config):
        if not name.startswith("__"):
            value = getattr(config, name)
            _type = type(value)
            help_text = config_desc[name]
            if _type is list:
                default = ' '.join(map(str, value))
                kwargs = {'type': type(value[0]), 'nargs': '+'}
            else:
                default = str(value)
                kwargs = {'type': _type}
            if name in req_args:
                parser.add_argument(name, **dict(kwargs, help='%s (Example: %s)' % (help_text, default)))
            else:
                parser.add_argument('--' + name.lower(), **dict(kwargs, dest=name, help='%s (Default: %s)' % (help_text, default)))
    # retrieve arguments and override Config defaults with any values the user supplied
    args = parser.parse_args()
    for name in vars(args):
        value = getattr(args, name)
        if value is not None:
            setattr(config, name, value)
    # build objects
    logger = Logger(config)
    url_mgr = URLMgr(config, logger)
    doc_mgr = DocMgr(config, logger)
    crawler_mgr = CrawlerMgr(config, logger, doc_mgr)
    progress = Progress(config, logger, url_mgr, doc_mgr, crawler_mgr)
    # add seed urls (one URL per line in the seed targets file)
    with open(config.SEED_TARGETS_LIST, 'r') as f:
        for url_str in f.read().splitlines():
            url_mgr.set(url_str)
    # start crawling
    doc_mgr.start_doc_parsers()
    crawler_mgr.start_crawlers()
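    # main crawl loop: while work remains, feed URLs from the URL manager to
    # the crawler workers and fold parsed results back into the URL manager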
    while progress.active:
        progress.print()
        for url in url_mgr.get():
            crawler_mgr.add_to_queue(url)
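        # handle parsed results: a successful parse yields a newly discovered
        # link with its anchor text, NEED_RETRY re-queues the same URL, and
        # anything else deactivates the URL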
        for result, url, url_str, anchor_text in doc_mgr.get_parsed(n=300):
            if result == CR.SUCCESS:
                url_mgr.set(url_str, anchor_text, parent_URL=url)
            elif result == CR.NEED_RETRY:
                url_mgr.set(url)
            else:
                url_mgr.deactive_url(url)
        logger.save_to_disk()
    progress.print(force=True)
    print('\n')
    # end crawling: stop the workers and wait for their processes to exit
    doc_mgr.stop_doc_parsers()
    crawler_mgr.stop_crawlers()
    while doc_mgr.num_running_process or crawler_mgr.num_running_process:
        time.sleep(.5)
    logger.save_to_disk(force=True)


if __name__ == '__main__':
    main()
    post_process(config.STORAGE_FOLDER)