crawler.py
#!/usr/bin/python
# Web crawler for Fakebook: logs in with the supplied credentials, follows
# links on the cs5700.ccs.neu.edu domain, and collects five secret flags.
from parse_webpage import FakeBookParser
from PacketTransfer import PacketTransfer
import re
import sys


# Class representing the web crawler.
class Crawler:
    def __init__(self):
        self.secretFlag = []   # secret flags collected so far
        self.visited = set()   # URLs that have already been crawled
        self.allLinks = []     # frontier of URLs still to be crawled
    # The main crawl function, called when the application is started.
    def crawl(self, username, password):
        connect = PacketTransfer(username, password, logging_level=10)
        welcome_page = connect.login()
        parse = FakeBookParser()
        parse.feed(welcome_page)
        links_dictionary = parse.links
        self.add_link(links_dictionary)
        # Keep crawling until a total of 5 flags have been found.
        while len(self.secretFlag) < 5:
            if len(self.allLinks) == 0:
                break
            if self.allLinks[0] in self.visited:
                self.allLinks.pop(0)
            else:
                page_parser = FakeBookParser()
                url = self.allLinks[0]
                dictionary = connect.send_request_message(url)
                # Handle a 500 server error by logging in again and retrying the request.
                while dictionary is None or 500 in dictionary.keys():
                    connect.login()
                    dictionary = connect.send_request_message(url)
                if self.handle_requests(dictionary):
                    page = dictionary[200]
                    page_parser.feed(page)
                    self.add_link(page_parser.links)
                    self.visited.add(self.allLinks.pop(0))
        for flag in self.secretFlag:
            print(flag[0])
    # Function for handling the different HTTP status codes.
    def handle_requests(self, dictionary):
        if dictionary.get(200):
            return True
        elif dictionary.get(301):
            # Redirect: mark the current URL visited and queue the target if it
            # stays on the allowed domain.
            self.visited.add(self.allLinks.pop(0))
            if self.check_valid_url(dictionary[301]):
                self.allLinks.append(dictionary[301].split("http://cs5700.ccs.neu.edu")[1])
        elif dictionary.get(403):
            # Forbidden: drop the URL.
            self.visited.add(self.allLinks.pop(0))
        else:
            # Any other status: move the URL to the back of the queue and retry later.
            self.allLinks.append(self.allLinks.pop(0))
        return False
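
    # Note: the response dictionaries used above are assumed to map an HTTP
    # status code to its payload, e.g. {200: "<html>..."} for a page body or
    # {301: "http://cs5700.ccs.neu.edu/fakebook/..."} for a redirect target.
    # That is how this file indexes them; PacketTransfer's exact contract is
    # not shown here.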

    # Function for checking that a URL is valid, so the crawler only follows
    # links within the specified domain.
    def check_valid_url(self, url):
        reg_list = re.compile(r'^http://cs5700\.ccs\.neu\.edu/fakebook').findall(url)
        if len(reg_list) > 0:
            return True
        return False
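
    # Illustrative (assumed) examples of the domain check:
    #   check_valid_url("http://cs5700.ccs.neu.edu/fakebook/123/")  -> True
    #   check_valid_url("http://www.example.com/fakebook/")         -> False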

    # Add the links parsed from a page to the frontier and record any secret
    # flag found on that page.
    def add_link(self, dictionary):
        for key in dictionary:
            if key != 'secret_key':
                for link in dictionary[key]:
                    if link not in self.visited:
                        self.allLinks.append(link)
        if len(dictionary['secret_key']) > 0:
            self.secretFlag.append(dictionary['secret_key'])


if __name__ == '__main__':
    obj = Crawler()
    nuid = sys.argv[1]
    password = sys.argv[2]
    obj.crawl(str(nuid), str(password))
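
# Example invocation (a sketch; the NUID and password below are placeholders,
# and parse_webpage.py / PacketTransfer.py are assumed to sit next to this file):
#
#   ./crawler.py 001234567 hunter2
#
# The crawler logs in to Fakebook, follows on-domain links until five secret
# flags have been collected, and prints each flag on its own line.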