-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwikidotparser.py
116 lines (97 loc) · 4.75 KB
/
wikidotparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
from collections import defaultdict
from iso3166 import normalize_country
# Prefix put in front of site-relative photo paths (those starting with
# '/local--files') to turn them into absolute URLs.
photos_base_url = 'http://openglidernetwork.wdfiles.com'

# Heading line of the form "++ <text> [[<tag>..." terminated by a newline.
heading_pattern = re.compile(r'\+\+ (?P<text>.*) ?\[\[(?P<tag>.*)\n')

# One table row describing a receiver. The pattern is compiled with
# re.VERBOSE, so the line breaks and indentation inside it are ignored and
# the whole expression matches a single "||"-separated line:
#   || || [[# <aprsname>]]... ||<desc>||<photos>||...||<contact>||
receiver_pattern = re.compile(r"""\|\|\ \|\|\ ?\[\[\#\ (?P<aprsname>.*)\]\](?:.*)
    \|\|(?P<desc>.*)
    \|\|(?P<photos>.*)
    \|\|(?:.*)
    \|\|(?P<contact>.*)\|\|""", re.MULTILINE | re.VERBOSE)

# Single-bracket link "[<target> <label>]" (optionally "[*<target> <label>]").
wikidot_link_pattern = re.compile(r"\[\*?([^\[\]\ ]*)\ ([^\[\]]*)\]")

# Same link shape with named groups, used for the photos column.
photos_pattern = re.compile(r'\[\*?(?P<photo_url>[^\s\[\]]*)\s+(?P<name>[^\]]*)\]')

# URL ending in one of the listed file extensions (case-insensitive).
image_url_pattern = re.compile(r".*\.(svg|jpeg|pdf|apng|mng|jpg|png|gif)$", re.IGNORECASE)

# Plausible e-mail address. The dot in front of the top-level domain is
# escaped so that it matches a literal '.'; unescaped it matched any character
# and let dot-less domains such as "user@host" pass the check.
mail_address_pattern = re.compile(r'^[a-z0-9+\-_%.]+@[a-z0-9+\-_%.]+\.[a-z]{2,}$', re.IGNORECASE)

# Triple-bracket mailto link: "[[[mailto:<email><subject>| <name>]]]".
contact_mail_pattern = re.compile(r'\[\[\[mailto:(?P<email>[^?\ ]*)(?P<subject>.*)\|\ *(?P<name>.*)\]\]\]')

# Triple-bracket http(s) link: "[[[<url>|<name>]]]".
contact_url_pattern = re.compile(r'\[\[\[(?P<url>http.*)\|(?P<name>.*)\]\]\]')

# Link to the '/contact' page, in one of three shapes (VERBOSE pattern,
# whitespace below is ignored):
#   [/contact <name0>] / [/contact <name1>]
#   [/contact <name0>] / <name2>
#   [/contact <name0>]
contact_intern_pattern = re.compile(r"""\[\/contact\ (?P<name0>\S*)
    (
        \]\ \/\ \[\/contact\ (?P<name1>.*)\] |
        \]\ \/\ (?P<name2>.*) |
        \]
    )""", re.MULTILINE | re.VERBOSE)
def parse_description_links(text):
    """Strip wikidot link markup from *text* and collect the link targets.

    Every match of ``wikidot_link_pattern`` is recorded as a dict with the
    keys ``'rel'`` (the link label) and ``'href'`` (the link target), and the
    markup is replaced in the text by the bare label.

    Returns a ``(text, links)`` tuple: the rewritten text and the list of
    link dicts in order of appearance.
    """
    collected = []
    # finditer works on the string object passed in here; rebinding ``text``
    # inside the loop does not disturb the iteration.
    for hit in wikidot_link_pattern.finditer(text):
        target, label = hit.groups()
        collected.append({'rel': label, 'href': target})
        # Only the first remaining occurrence of this exact markup is swapped.
        text = text.replace(hit.group(0), label, 1)
    return text, collected
def parse_contact(raw):
    """Extract contact name, e-mail address and links from a contact cell.

    The cell is checked, in this order of precedence, for a mailto link, an
    http link, a link to the wiki page '/contact', and finally plain text.

    Returns a ``(contact_details, links)`` tuple where ``contact_details`` is
    a dict with the keys ``'name'`` and ``'email'`` (empty strings when
    unknown) and ``links`` is a list of link dicts.

    NOTE(review): the http-link entry uses the key 'ref' whereas the other
    parsers in this file emit 'rel' -- confirm which one consumers expect.
    """
    mail_hit = contact_mail_pattern.search(raw)
    if mail_hit:
        address = mail_hit.group('email')
        if not mail_address_pattern.match(address):
            # The mailto target is not a usable address; keep the name only.
            address = ''
        return {'name': mail_hit.group('name'), 'email': address}, []

    url_hit = contact_url_pattern.search(raw)
    if url_hit:
        # A hyperlink yields no contact details, just a link entry.
        hyperlink = {'ref': 'contact', 'href': url_hit.group('url')}
        return {'name': '', 'email': ''}, [hyperlink]

    intern_hit = contact_intern_pattern.search(raw)
    if intern_hit:
        # Join whichever of the name groups took part in the match.
        found_names = [part for part in intern_hit.groupdict().values()
                       if part is not None]
        return {'name': ' / '.join(found_names), 'email': ''}, []

    # No link markup: whatever is left after dropping brackets and pipes is
    # taken as the name (possibly the empty string).
    plain = ''.join(ch for ch in raw if ch not in '[]|').strip()
    return {'name': plain, 'email': ''}, []
def parse_photo_links(raw):
    """Split the links of a photos cell into photo URLs and other links.

    For every match of ``photos_pattern`` in *raw*:

    * a target starting with '/local--files' is prefixed with
      ``photos_base_url`` and counted as a photo,
    * a target matching ``image_url_pattern`` is counted as a photo as is,
    * anything else becomes a link dict with the keys ``'href'`` and
      ``'rel'`` (the link label).

    Returns a ``(photos, links)`` tuple of two lists.
    """
    photos, links = [], []
    for hit in photos_pattern.finditer(raw):
        url = hit.group('photo_url')
        if url.startswith('/local--files'):
            # Site-relative upload: make it absolute.
            photos.append(photos_base_url + url)
        elif image_url_pattern.match(url):
            photos.append(url)
        else:
            links.append({'href': url, 'rel': hit.group('name')})
    return photos, links
def parse_receiver_list(page):
    """Parse the wiki source of the receiver list into a list of receivers.

    The page is split into lines; each heading line (``heading_pattern``)
    sets the country for the lines that follow it, and every line matching
    ``receiver_pattern`` yields one receiver.

    Returns a list of dicts with the keys ``'callsign'``, ``'description'``,
    ``'photos'``, ``'links'``, ``'contact'``, ``'email'`` and ``'country'``.
    Lines that appear before the first heading are filed under the country
    ``'None'`` (a string).
    """
    country = 'None'
    data = defaultdict(list)
    # Separate lines by heading (country). Line endings are kept because
    # heading_pattern requires the trailing newline.
    for line in page.splitlines(True):
        heading = re.search(heading_pattern, line)
        if heading:
            country = normalize_country(heading.group('text').strip().lower())
        else:
            data[country].append(line)

    receivers = []
    # Parse lines
    for country, lines in data.items():
        for line in lines:
            # Undo the HTML entities found in the page source: drop
            # non-breaking spaces, then decode ampersands and double quotes.
            line = (line.replace("&nbsp;", "")
                        .replace("&amp;", '&')
                        .replace("&quot;", '"'))
            match = re.match(receiver_pattern, line)
            if not match:
                continue
            description, desc_links = parse_description_links(match.group('desc').strip())
            photos, photo_links = parse_photo_links(match.group('photos'))
            contact_details, contact_links = parse_contact(match.group('contact'))
            receivers.append({'callsign': match.group('aprsname'),
                              'description': description,
                              'photos': photos,
                              'links': desc_links + photo_links + contact_links,
                              'contact': contact_details['name'],
                              'email': contact_details['email'],
                              'country': country})
    return receivers