prep2dep.py
#! /usr/bin/env python3
import re
import os

from text2law import get_args


def read(inp):
    """Read each input file, normalise no-break spaces and yield (text, basename) pairs."""
    for fl in inp:
        with open(fl, encoding='utf-8') as f:
            yield f.read().replace(u'\xa0', u' ').replace(" ", " "), os.path.basename(fl)


def tokmod(txt):
    """
    :param txt: tokenized input
    Merges sentence blocks that the tokenizer separated even though they form
    a single sentence in the original legislation text.
    """
    txtls = [sent.split("\n") for sent in txt.split("\n\n")]
    newls = [txtls[0]]
    for sent in txtls[1:]:
        for char in sent[0]:
            # sent[0][0] in "§(" would be wrong, because paragraphs also start with "("
            # (len(first_word) > 1 and first_word.isupper()) or ... -> problem with "ÖTM utasítása" in out_ut_...
            if char and (sent[0][0].islower() or sent[0][0] == "§"):
                newls[-1].extend(sent)
                break
        else:
            newls.append(sent)
    return "\n\n".join(["\n".join(sent) for sent in newls])
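
# A minimal illustration of tokmod on a made-up tokenized snippet (not taken from
# the corpus): a block whose first line starts with a lowercase letter or "§" is
# merged back into the previous block, any other block stays a separate sentence.
#
# >>> tokmod("A\ntörvény\n\ncélja\naz\n\n§\t1")
# 'A\ntörvény\ncélja\naz\n§\t1'
# >>> tokmod("Első\nmondat\n\nMásodik\nmondat")
# 'Első\nmondat\n\nMásodik\nmondat'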


def replmatch(match):
    """
    :param match: a regex match object
    :return: the replacement string
    On a match, splits the sentence at the match, which carries no end mark,
    by inserting an extra blank line into the input text, so one sentence
    becomes two.
    """
    endmark = match.group(1)
    replwith = match.group(2)[1:] if match.group(2)[0] == "\n" else match.group(2)
    return endmark + "\n\n" + replwith
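
# A minimal sketch of how replmatch is meant to be used with re.sub (the toy pattern
# and text below are made up, only to show the grouping): group(1) is the token that
# ends the previous sentence, group(2) is re-emitted as the start of a new,
# blank-line-separated block.
#
# >>> toy = re.compile(r"(;)(\ntovábbá)")
# >>> toy.sub(replmatch, "első rész;\ntovábbá második rész")
# 'első rész;\n\ntovábbá második rész'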


def process(inps, outp):
    """
    :param inps: input files
    :param outp: output folder
    Splits sentences along paragraph and enumeration boundaries.
    Overly long sentences are split to avoid memory errors and excessive
    running time during the later dependency parsing.
    """
    # txtlists = []  # for the verification below
    pat_paragraph = re.compile(r"""
        ([^\n])(\n\d+\.\t.+?\n
        §\t.+?\n
        (?:\(\t.+?\n
        \d+\)\t.+?\n)?
        [A-ZÖÜÓŐÚÉÁŰÍ])
        """, re.VERBOSE)
    pat_num_listing = re.compile(r"""
        ("\s*\\n\s*")\n
        (\d{1,3}\.\t.+?\n
        [^§)])
        """, re.VERBOSE)
    pat_abc_listing = re.compile(r"""
        ([^\n])(\n[a-z]{1,3}
        (?:(?:\)\t.+?\n)|(?:\t.+?\n\)\t.+?\n))
        (?!pont))
        """, re.VERBOSE)
    pat_rom_w_dot = re.compile(r"""
        ([^\n]\n[;:.,]\t.+?)\n
        ([IVXLCDM]+\.\t.+?\n
        [A-ZÖÜÓŐÚÉÁŰÍa-zöüóőúéáűí])
        """, re.VERBOSE)
    pat_dot_col = re.compile(r"""
        ([^\n]\n[;]\t.+?)\n
        ([a-zöüóőúéáűí]+[^.)])
        """, re.VERBOSE | re.IGNORECASE)
    for inp in inps:
        forparse, fl = tokmod(inp[0]), inp[1]
        forparse = pat_paragraph.sub(replmatch, forparse)
        forparse = pat_num_listing.sub(replmatch, forparse)
        forparse = pat_abc_listing.sub(replmatch, forparse)
        forparse = pat_rom_w_dot.sub(replmatch, forparse)
        forparse = pat_dot_col.sub(replmatch, forparse)
        with open(os.path.join(outp, fl), "w", encoding="utf-8", newline="\n") as f:
            f.write(forparse)
        # test check: remaining long sentences
        # txtls = [sent.split("\n") for sent in [sent for sent in forparse.split("\n\n")]]
        # txtlists.append(txtls)
    # with open("long_sents2.txt", "w", encoding="utf-8") as f:
    #     for txtls in txtlists:
    #         for sent in txtls:
    #             if len(sent) > 100:
    #                 print("###################\n", "\n".join(sent), file=f)
    # end of test check
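
# A hedged usage sketch of process() (the file names below are hypothetical): read()
# yields (text, basename) pairs and process() writes the re-segmented text of each
# file into the output folder under the same base name.
#
# >>> os.makedirs("out_prep", exist_ok=True)
# >>> process(read(["corpus/sample_law.txt"]), "out_prep")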


def main():
    args = get_args()
    inp = read(args['files'])
    os.makedirs(args['dir'], exist_ok=True)
    process(inp, args['dir'])


if __name__ == "__main__":
    main()