-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_webpage.py
39 lines (33 loc) · 1.49 KB
/
parse_webpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from HTMLParser import HTMLParser
import re
class FakeBookParser(HTMLParser):
    """Collect profile links, friend-list links and secret flags from a page.

    This class inherits from HTMLParser. After ``feed()`` has been called,
    the results are available in ``self.links``, a dict with three keys:

    * ``"profiles_url_list"`` -- set of hrefs matching ``/fakebook/<digits>/``.
    * ``"friend_list_pages"`` -- set of hrefs containing ``/friends/``.
    * ``"secret_key"`` -- list of text chunks found directly after an
      ``<h2 class="secret_flag">`` start tag.
    """

    # Compiled once for the class instead of on every matching anchor tag.
    _PROFILE_URL_RE = re.compile(r'^\/fakebook\/\d*\/$')

    def __init__(self):
        # Explicit base-class call (rather than super()) keeps this working
        # with the Python 2 HTMLParser module this file imports from.
        HTMLParser.__init__(self)
        # True only between a secret_flag <h2> start tag and the next text
        # chunk delivered to handle_data.
        self.is_last_tag_secret_flag = False
        self.links = {
            "profiles_url_list": set(),
            "friend_list_pages": set(),
            "secret_key": [],
        }

    def handle_starttag(self, tag, attrs):
        """Record interesting anchor hrefs and arm the secret-flag capture.

        ``tag`` is the tag name and ``attrs`` is the list of
        ``(name, value)`` pairs supplied by HTMLParser.
        """
        attr = dict(attrs)

        if tag == "a":
            # An anchor may have no href at all, or an href without a value;
            # both yield None here, and a substring test against None would
            # raise TypeError and abort parsing of the whole page.
            href = attr.get('href')
            if href:
                if "/friends/" in href:
                    self.links["friend_list_pages"].add(href)
                if "/fakebook/" in href:
                    match = self._PROFILE_URL_RE.match(href)
                    if match:
                        self.links["profiles_url_list"].add(match.group(0))

        # The flag text itself arrives later through handle_data, so only
        # remember that the tag just opened was a secret_flag heading.
        if tag == "h2" and attr.get("class") == "secret_flag":
            self.is_last_tag_secret_flag = True

    def handle_data(self, data):
        """Store ``data`` as a secret key if it follows a secret_flag <h2>."""
        if self.is_last_tag_secret_flag:
            self.links["secret_key"].append(data)
            # Only the first text chunk after the heading is captured.
            self.is_last_tag_secret_flag = False