-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_parse.py
69 lines (52 loc) · 1.61 KB
/
test_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import traceback
from urllib import quote
from dateutil.parser import parse
from datetime import timedelta
import requests
import pdb
import json
import codecs
import re
import csv
from bs4 import BeautifulSoup
from extract import get_submission_wikicode_link
from extract import get_submission_wikicode
import sys
# Reload the sys module, then set the interpreter-wide default string
# encoding to UTF-8.
# NOTE(review): the bare `reload` builtin and `sys.setdefaultencoding` are
# presumably Python 2 only -- confirm before running this file on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Regex fragments for pulling tokens out of schedule wikitext.  Raw strings
# are used so the backslashes reach the regex engine untouched.

# Text enclosed in double square brackets: [[...]]
p = r"\[\[(.*)\]\]"
# Text enclosed in triple single quotes: '''...'''
s = r"'''(.*)'''"
# A line starting with "|" that carries class="<one of the listed values>".
# The alternatives are grouped; without the group, "|" splits the WHOLE
# pattern, so a bare "unconference" anywhere in a line would match.
event_pattern = r'^\|.*class\s*="(?:presentation|unconference|workshop|keynote|posters|logistics)"'
# Argument of a {{TNT|...}} template call.
l = r"\{\{TNT\|(.*)\}\}"
# Any word containing "breakout" / "Breakout".
b = r"(\w*[Bb]reakout\w*)"
# Combined pattern: first alternative that matches wins, tried in the order
# s, p, l, b (capture groups 1-4 respectively).
data = s + "|" + p + "|" + l + "|" + b
def traverse_schedule(schedule):
    """Yield every item of *schedule* in order.

    Args:
        schedule: Any iterable (the rest of this file passes lists of
            text lines).

    Yields:
        Each item of *schedule*, unchanged.
    """
    for line in schedule:
        yield line
    # The generator ends simply by falling off the end.  An explicit
    # ``raise StopIteration()`` here is turned into RuntimeError on
    # Python 3.7+ (PEP 479) and is redundant on Python 2.
def get_url(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up.  Without
            a timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The undecoded response body (``response.content``).
    """
    response = requests.get(url, timeout=timeout)
    return response.content
def get_schedule(html_doc):
    """Return the text of the first ``<textarea>`` in *html_doc*, split into lines.

    Args:
        html_doc: HTML markup to parse (parsed with the "lxml" parser).

    Returns:
        A list of the textarea's text lines, without line terminators.

    Raises:
        ValueError: If the document contains no ``<textarea>`` element.
    """
    soup = BeautifulSoup(html_doc, "lxml")
    schedule = soup.find("textarea")
    if schedule is None:
        # find() yields None when nothing matches; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError("no <textarea> element found in document")
    return schedule.get_text().splitlines()
def get_data(line):
    """Split *line* on "|" and return every field except the last one.

    A line without any "|" therefore yields an empty list.
    """
    cells = line.split("|")
    # split() always returns at least one element, so pop() cannot fail.
    cells.pop()
    return cells
def test_patterns():
    """Download the Friday programme edit page and print each pattern hit.

    Every line of the page's textarea is searched with the combined
    ``data`` regex; for each line that matches, the full matched text is
    printed.  Performs a live HTTP request.
    """
    # The Saturday and Sunday pages use the same URL with
    # "Programme/Saturday" / "Programme/Sunday" as the title.
    friday = "https://wikimania2017.wikimedia.org/w/index.php?title=Programme/Friday&action=edit"
    html_doc = get_url(friday)
    schedule = get_schedule(html_doc)
    # Compile once, outside the loop.
    data_pattern = re.compile(data)
    for line in schedule:
        result = data_pattern.search(line)
        if result:
            # Parenthesised single-argument form: same output on Python 2,
            # and valid syntax on Python 3.
            print(result.group(0))
def main():
    """Script entry point: run the pattern check against the live page."""
    test_patterns()


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()