forked from amakelov/parlamentaren-kontrol
-
Notifications
You must be signed in to change notification settings - Fork 1
/
craw_stenograms.py
342 lines (303 loc) · 15.3 KB
/
craw_stenograms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import datetime
import os.path
import re
import xlrd
import openpyxl
from pk_db import db, cur
from pk_logging import logging, logger_workaround
from pk_namedtuples import *
from pk_tools import urlopen, canonical_party_name, StenogramsHTMLParser
##############################################################################
# Excel Parsing
##############################################################################
logger_excel = logging.getLogger('excel_parser')
logger_excel_lib = logging.getLogger('excel_parser_lib')
class ExcelWarnings(object):
def write(self, string):
s = string.strip()
if s:
logger_excel_lib.warning(s)
excel_warnings = ExcelWarnings()
class XLSFile(object):
def __init__(self, filename):
self.book = xlrd.open_workbook(filename, logfile=excel_warnings)
self.sheet = self.book.sheet_by_index(0)
@property
def nrows(self):
return self.sheet.nrows
@property
def ncols(self):
return self.sheet.ncols
def cell_value(self, rowx, colx):
return self.sheet.cell_value(rowx=rowx, colx=colx)
def close(self):
pass
class XLSXFile(object):
def __init__(self, filename):
self.wb = openpyxl.load_workbook(filename=filename, read_only=True)
self.ws = self.wb.active
self.rows = list(self.ws.rows)
@property
def nrows(self):
if self.rows[-1]==(): # Seems to be consistently empty
return len(self.rows)-1
return len(self.rows)
@property
def ncols(self):
return len(self.rows[2]) # The one that usually contains the vote headers
def cell_value(self, rowx, colx):
try:
return self.rows[rowx][colx].value
except IndexError:
return ''
def close(self):
self.wb.close()
def xlsopen(filename):
if filename.endswith('xls'):
return XLSFile(filename)
elif filename.endswith('xlsx'):
return XLSXFile(filename)
else:
raise ValueError('Bad excel file extension')
registration_marker = 'РЕГИСТРАЦИЯ'
vote_marker = 'ГЛАСУВАНЕ'
def parse_excel_by_name(filename):
"""
Parse excel files with vote statistics by representative.
Assumptions
===========
The .xls file starts with two lines we don't care about. All remaining lines
contain the following fields, from left to right:
- representative name
- two fields we skip
- representative's party
- undefined number of fields containing stuff about how the
representative voted.
"""
# XXX Workarounds
# Correct spelling errors in names of MPs.
def MP_name_spellcheck(name):
tr_dict = {'МАРИЯНА ПЕТРОВА ИВАНОВА-НИКОЛОВА': 'МАРИАНА ПЕТРОВА ИВАНОВА-НИКОЛОВА',
'ВЕНЦЕСЛАВ ВАСИЛЕВ ВЪРБАНОВ': 'ВЕНЦИСЛАВ ВАСИЛЕВ ВЪРБАНОВ',
'АЛЕКСАНДЪР СТОЙЧЕВ СТОЙЧЕВ': 'АЛЕКСАНДЪР СТОЙЧЕВ СТОЙКОВ',
'СЛАВЧО ПЕНЧЕВ БИНЕВ': 'СЛАВИ ПЕНЧЕВ БИНЕВ',
'ДЕНИЦА ЗЛАТКОВА КАРАДЖОВА': 'ДЕНИЦА ЗЛАТКОВА ЗЛАТЕВА'}
if name in tr_dict:
logger_workaround.warning("Spelling error: %s" % name)
return tr_dict[name]
return name
# Remove unregistered MPs.
def filter_names(*args):
to_filter_out = ['МИХАИЛ ВЛАДИМИРОВ ВЛАДОВ', 'НИКОЛАЙ НАНКОВ НАНКОВ', 'ЕЛЕНА СТЕФАНОВА АКСИЕВА', 'ГЕОРГИ ДОБРЕВ ЕЛЕНКОВ']
zip_args = list(zip(*args))
filtered = list(filter(lambda a: a[0] not in to_filter_out, zip_args))
if len(filtered) != len(zip_args):
logger_workaround.warning("An MP was filtered out of the by-names list, because they are not registered as an MP.")
return zip(*filtered)
return args
# XXX End of Workarounds.
# Translate the registration and vote markers.
tr_reg = {'О':'absent', 'П':'present', 'Р':'manually_registered'}
tr_vote = {'+':'yes', '-':'no', '=':'abstain', '0':'absent'}
sheet = xlsopen(filename)
cols = sheet.ncols
rows = sheet.nrows
names = [MP_name_spellcheck(' '.join(sheet.cell_value(rowx=row, colx=0).upper().split()))
for row in range(2, rows)]
parties = [canonical_party_name(sheet.cell_value(rowx=row, colx=3).strip().upper())
for row in range(2, rows)]
reg_sessions = []
vote_sessions = []
for col in range(4, cols):
values = [sheet.cell_value(rowx=row, colx=col) for row in range(2, rows)]
if all(v in tr_reg.keys() for v in values):
reg_sessions.append([tr_reg[v] for v in values])
elif all(v in tr_vote.keys() for v in values):
vote_sessions.append([tr_vote[v] for v in values])
elif all(v=='' for v in values):
logger_excel.warning("Empty column %s found in the by_names excell file. Skipping it."%col)
else:
logger_excel.error("Strange column %s found in the by_names excell file. Skipping it."%col)
vote_sessions = list(zip(*vote_sessions))
if len(reg_sessions) > 1:
logger_excel.warning("There are more than one registration for this stenogram.")
elif len(reg_sessions) != 1:
raise ValueError('No registrations detected in the by-names file.')
return filter_names(names, parties, reg_sessions[-1], vote_sessions)
def parse_excel_by_party(filename):
"""
Parse excel files with vote statistics by party.
One excel file per stenogram.
Assumptions
===========
- The number of parties per vote is always the same.
- For each session, there is a line containing either `vote_marker` or
`registration_marker`, and that gives the kind of the session. There is
only one registration per stenogram.
- After this line, there are two lines we don't care about, and the next
parties_count consecutive lines contain the vote/presence statistics by party.
"""
sheet = xlsopen(filename)
rows = sheet.nrows
sessions = []
assert registration_marker in sheet.cell_value(rowx=1,colx=0), "No registration marker"
assert 'ПГ' in sheet.cell_value(rowx=2,colx=0), "No ПГ marker"
assert 'Общо' in sheet.cell_value(rowx=3,colx=0), "No Общо marker"
row = 4
parties_count = 0
while row < rows:
cell = sheet.cell_value(rowx=row,colx=0)
if vote_marker in cell or registration_marker in cell: # Some stenograms have more than one registrations
break
if 'НЕЗ' in cell: # Not all stenograms include a complete list of parties (or list totals for independents)
parties_count += 1
break
row += 1
parties_count += 1
else:
logger_to_db.info("Only a single registration and no votes detected in party xls file %s"%filename)
row = 0
while row < rows:
first = sheet.cell_value(rowx=row, colx=0)
if registration_marker in first:
row += 2
per_party_dict = {}
for i in range(parties_count):
row += 1
party = canonical_party_name(sheet.cell_value(rowx=row, colx=0).strip().upper())
present = int(sheet.cell_value(rowx=row, colx=1))
expected = int(sheet.cell_value(rowx=row, colx=2))
per_party_dict[party] = reg_stats_per_party_tuple(present, expected)
reg_by_party_dict = per_party_dict
elif vote_marker in first:
description = first.split(vote_marker)[-1].strip()
time, description = description.split('по тема')
time = datetime.datetime.strptime(time[-6:], '%H:%M ')
description = description.strip()
row += 2
votes_by_party_dict = {}
for i in range(parties_count):
row += 1
party = canonical_party_name(sheet.cell_value(rowx=row, colx=0).strip().upper())
yes = int(sheet.cell_value(rowx=row, colx=1))
no = int(sheet.cell_value(rowx=row, colx=2))
abstained = int(sheet.cell_value(rowx=row, colx=3))
total = int(sheet.cell_value(rowx=row, colx=4))
votes_by_party_dict[party] = vote_stats_per_party_tuple(yes, no, abstained, total)
sessions.append(session_tuple(description, time, None, votes_by_party_dict))
row += 1
sheet.close()
return reg_by_party_dict, sessions
##############################################################################
# Parse and save to disc.
##############################################################################
logger_to_db = logging.getLogger('stenograms_to_db')
stenogram_IDs = []
assemblies_and_ids = [(7,41),(50,42),(51,43),(52,44)] # XXX Hardcoded values
for internal_id, assembly_number in assemblies_and_ids:
stenogram_IDs += [(i, 'https://www.parliament.bg/bg/plenaryst/ns/%d/ID/'%internal_id+i)
for i in map(str.strip, open('craw_data/IDs_plenary_stenograms_%d'%assembly_number).readlines())]
cur.execute("""SELECT original_url FROM stenograms""")
urls_already_in_db = set(_[0] for _ in cur.fetchall())
cur.execute("""SELECT party_name FROM parties""")
forces_already_in_db = set(_[0] for _ in cur.fetchall())
all_parties = set(forces_already_in_db)
logger_to_db.info("Parties already in db %s" % all_parties)
for i, (ID, original_url) in enumerate(stenogram_IDs):
if original_url in urls_already_in_db:
continue
if ID == '5660': # XXX Workaround malformated html file: fixed in 5661.
logger_workaround.warning('Workaround for ID 5660.')
continue
problem_by_name = False
problem_by_party = False
logger_to_db.info("Parsing stenogram %s - %d of %d." % (ID, i+1, len(stenogram_IDs)))
try:
filename = './craw_data/stenograms/%s.html'%ID
f = open(filename)
complete_stenogram_page = f.read()
parser = StenogramsHTMLParser(complete_stenogram_page)
date_string = parser.date.strftime('%d%m%y')
except Exception as e:
logger_to_db.error("Parsing problem with ID %s. %s"%(ID,str(e)))
continue
try:
filename = './craw_data/stenograms/%s-%s-g.xls'%(ID,date_string)
if not os.path.isfile(filename):
filename += 'x'
if not os.path.isfile(filename):
if not parser.votes_indices:
logger_to_db.info("No party excel file was found for ID %s because no votes happened according to stenogram."%(ID))
else:
logger_to_db.error("No party xls file for ID %s - check it was downloaded"%ID)
problem_by_party = True
else:
reg_by_party_dict, sessions = parse_excel_by_party(filename)
# Check for new parties
size = len(all_parties)
old_parties = set(all_parties)
all_parties.update(set(reg_by_party_dict.keys()))
if size!=len(all_parties):
new_parties = all_parties-old_parties
logger_to_db.info("Detected new parties %s"%new_parties)
cur.executemany("""INSERT INTO parties VALUES (%s)""",
[(_,) for _ in new_parties])
except Exception as e:
logger_to_db.error("Problem with party excel file ID %s due to %s"%(ID,repr(e)))
problem_by_party = True
try:
filename = './craw_data/stenograms/%s-%s-i.xls'%(ID,date_string)
if ID == '2766': # XXX Workaround malformated excel file.
logger_workaround.warning('Using the workaround for ID 2766.')
filename = 'workarounds/iv050712_ID2766_line32-33_workaround.xls'
if not os.path.isfile(filename):
filename += 'x'
if not os.path.isfile(filename):
if not parser.votes_indices:
logger_to_db.info("No MP name excel file was found for ID %s because no votes happened according to stenogram."%(ID))
else:
logger_to_db.error("No MP name xls file for ID %s - check it was downloaded"%ID)
problem_by_name = True
else:
mp_names, mp_parties, mp_reg_session, mp_vote_sessions = parse_excel_by_name(filename)
except Exception as e:
logger_to_db.error("Problem with MP name excel file ID %s due to %s"%(ID,repr(e)))
problem_by_name = True
if parser.votes_indices and not (problem_by_name or problem_by_party):
if len(sessions) != len(parser.votes_indices):
logger_to_db.warning("The detected votes in the stenogram text do not match the votes from the parties xls file")
if mp_vote_sessions:
if len(mp_vote_sessions[0]) != len(parser.votes_indices):
logger_to_db.warning("The detected votes in the stenogram text do not match the votes from the MP xls file")
if len(sessions) != len(mp_vote_sessions[0]):
logger_to_db.error("The detected sessions in the MP xls file do not match the sessions from the parties xls file")
else:
if parser.votes_indices:
logger_to_db.warning("The detected votes in the stenogram text do not match the votes from the MP xls file")
if sessions:
logger_to_db.error("The detected sessions in the MP xls file do not match the sessions from the parties xls file")
if problem_by_name or problem_by_party:
cur.execute("""INSERT INTO stenograms VALUES (%s, %s, %s, %s, %s)""",
(parser.date, parser.data_list, parser.votes_indices, True, original_url))
else:
try:
assert all([len(sessions) == len(mp_votes) for mp_votes in mp_vote_sessions]), "Not all sessions are recorded."
cur.execute("""INSERT INTO stenograms VALUES (%s, %s, %s, %s, %s)""",
(parser.date, parser.data_list, parser.votes_indices, False, original_url))
cur.executemany("""INSERT INTO party_reg VALUES (%s, %s, %s, %s)""",
((k, parser.date, v.present, v.expected) for k,v in reg_by_party_dict.items()))
cur.executemany("""INSERT INTO vote_sessions VALUES (%s, %s, %s, %s)""",
((parser.date, i, s.description, s.time) for i, s in enumerate(sessions)))
cur.executemany("""INSERT INTO party_votes VALUES (%s, %s, %s, %s, %s, %s, %s)""",
((party, parser.date, i, votes.yes, votes.no, votes.abstained, votes.total)
for i, s in enumerate(sessions)
for party, votes in s.votes_by_party_dict.items()))
cur.executemany("""INSERT INTO mp_reg VALUES (%s, %s, %s, %s)""",
((name, party, parser.date, reg) for name, party, reg in zip(mp_names, mp_parties, mp_reg_session)))
cur.executemany("""INSERT INTO mp_votes VALUES (%s, %s, %s, %s, %s)""",
((name, party, parser.date, i, v)
for name, party, votes in zip(mp_names, mp_parties, mp_vote_sessions)
for i, v in enumerate(votes)))
except Exception as e:
logger_to_db.error("Writting to db failed on stenogram %s due to %s" % (ID, str(e)))
db.commit()