features_m.py
#!/usr/bin/python3
import re
import xml.etree.ElementTree as ET
chrono_list = ["also", "anfangs", "anno", "bald", "beizeiten", "bekanntlich", "bereits", "bisher", "bislang", "dadrauf",
"dadurch", "daher", "damals", "danach", "damit", "dann", "darauf", "daraufhin", "davor", "dazwischen",
"demnach", "demnächst",
"dereinst", "derweil", "doch", "drauf", "eben", "ehemals", "einmal", "einst", "einstmals", "einstweilen",
"erwartungsgemäß", "ferner", "folglich", "früher", "gerade", "gleich", "grad", "gleichwohl",
"infolgedessen", "indes", "jedoch",
"jüngst", "just", "künftig", "kürzlich", "längst", "letztens", "letztlich", "letzthin", "letztmals",
"neulich", "schließlich", "seitdem", "seither", "sodann", "somit", "später", "späterhin", "vordem",
"vorgestern", "gestern", "vorher",
"vorhin", "vormals", "weiter", "weiters", "wodurch", "wogegen", "womit", "wonach", "zeither", "zuerst",
"zugleich", "zuletzt", "überdies"]


# features
def li_chronologically_structured(text, tags):
    """compares each token with a word list of chronologically structuring terms, returns BOOL"""
    for word in tags:
        # `word` is a (token, tag) pair here, so word[0] is the token string
        if word[0].lower() in chrono_list:
            return True
    return False


def gp_past_proportion(text, tags):
    """counts all verbs conjugated in past tense and divides by the total number of verbs, returns FLOAT"""
    allverbcount = 0
    pastverbcount = 0
    for word, tag in tags:
        if tag["pos"][0] == "V":
            allverbcount += 1
            if "tense" in tag["attributes"] and tag["attributes"]["tense"] == "Past":
                pastverbcount += 1
    return pastverbcount / allverbcount if allverbcount != 0 else -1


def gb_contains_past(text, tags):
    """searches for verbs conjugated in past tense, returns True if at least one verb is in past tense (BOOL)"""
    for word, tag in tags:
        if tag["pos"][0] == "V" and "tense" in tag["attributes"] and tag["attributes"]["tense"] == "Past":
            return True
    return False


# Futur II is currently not detected and still needs to be implemented;
# it is also an open question what the denominator should be.
def gp_future_proportion(text, tags):
    """counts all Futur I constructions (Futur II is not yet detected) and divides by the number of finite verbs, returns FLOAT"""
    hilfsv_toggle = False
    allverbs = 0
    future_constructions = 0
    for word, tag in tags:
        if tag["pos"][0] == "V":
            if tag["pos"] == "VFIN":
                allverbs += 1
            elif hilfsv_toggle and tag["pos"] == "VINF":
                future_constructions += 1
                hilfsv_toggle = False
            elif hilfsv_toggle:
                hilfsv_toggle = False
            # an auxiliary whose first letter is "w" serves as a crude check for forms of "werden"
            iswerden = "type" in tag["attributes"] and tag["attributes"]["type"] == "Aux" and word[0].lower() == "w"
            ispast = "tense" in tag["attributes"] and tag["attributes"]["tense"] == "Past"
            if iswerden and not ispast:
                hilfsv_toggle = True
    return future_constructions / allverbs if allverbs != 0 else -1


def gb_contains_future(text, tags):
    """searches for future constructions, returns True if at least one construction was found (BOOL)"""
    hilfsv_toggle = False
    partizip2_toggle = False
    for word, tag in tags:
        try:  # print the token and its tag to help debugging if the "attributes" key is missing
            tag["attributes"]
        except KeyError:
            print(word, tag)
        if hilfsv_toggle and not partizip2_toggle and tag["pos"] == "VINF":  # matched: Futur I
            return True
        elif (hilfsv_toggle and partizip2_toggle and tag["pos"] == "VINF"
              and "type" in tag["attributes"] and tag["attributes"]["type"] == "Aux"):  # matched: Futur II
            return True
        elif (hilfsv_toggle and not partizip2_toggle and tag["pos"] == "VPP"
              and "subtype" in tag["attributes"] and tag["attributes"]["subtype"] == "Psp"):
            partizip2_toggle = True
        elif (tag["attributes"] is not None and "type" in tag["attributes"]
              and tag["attributes"]["type"] == "Aux" and word[0].lower() == "w"):
            hilfsv_toggle = True
        elif word == ".":
            hilfsv_toggle = False
            partizip2_toggle = False
    return False


def gb_contains_non_present(text, tags):
    """combines gb_contains_future and gb_contains_past, returns BOOL"""
    return gb_contains_future(text, tags) or gb_contains_past(text, tags)


# renamed to match the correct English term
def gp_subj_proportion(text, tags):
    """counts all verbs in the subjunctive mood and divides by the total number of verbs, returns FLOAT"""
    allverbcount = 0
    subjverbcount = 0
    for word, tag in tags:
        if tag["pos"][0] == "V":
            allverbcount += 1
            if "mood" in tag["attributes"] and tag["attributes"]["mood"] == "Subj":
                subjverbcount += 1
    return subjverbcount / allverbcount if allverbcount != 0 else -1


def gb_contains_thirdpers(text, tags):
    """searches for verbs and pronouns in the 3rd person, returns BOOL"""
    for word, tag in tags:
        if tag["pos"][0] == "V" and "person" in tag["attributes"] and tag["attributes"]["person"] == "3":
            return True
        elif tag["pos"] == "PRO" and "person" in tag["attributes"] and tag["attributes"]["person"] == "3":
            return True
    return False


def gp_thirdpers_proportion(text, tags):
    """counts all verbs and pronouns in the 3rd person and divides by all verbs and pronouns, returns FLOAT"""
    allverbpron = 0
    thirdpers = 0
    for word, tag in tags:
        if tag["pos"] == "PRO" or tag["pos"][0] == "V":
            allverbpron += 1
            verbisthird = tag["pos"][0] == "V" and "person" in tag["attributes"] and tag["attributes"]["person"] == "3"
            proisthird = tag["pos"] == "PRO" and "person" in tag["attributes"] and tag["attributes"]["person"] == "3"
            if verbisthird or proisthird:
                thirdpers += 1
    return thirdpers / allverbpron if allverbpron != 0 else -1


def gp_exclamation_proportion(text, tags):
    """counts all "!" and "?" tokens and divides by all tokens, returns FLOAT"""
    alltokens = len(tags)
    punct = 0
    for word, tag in tags:
        if word == "!" or word == "?":
            punct += 1
    return punct / alltokens if alltokens != 0 else -1


def gp_sym_proportion(text, tags):
    """counts all tokens tagged as special characters and divides by all tokens, returns FLOAT"""
    # spec_list = re.findall(r"[^a-zA-Z0-9]", text)
    symcount = 0
    for word, tag in tags:
        if tag["pos"] == "SYM":
            symcount += 1
    return symcount / len(tags) if len(tags) != 0 else -1


def gp_adj_proportion(text, tags):
    """counts all adjectives and divides by all tokens, returns FLOAT"""
    adj_counter = 0
    for word, tag in tags:
        if tag["pos"] == "ADJA" or tag["pos"] == "ADJD":
            adj_counter += 1
    return adj_counter / len(tags) if len(tags) != 0 else -1


def gp_noun_proportion(text, tags):
    """counts all tokens tagged as nouns and divides by all tokens, returns FLOAT"""
    noun_counter = 0
    for word, tag in tags:
        if tag["pos"] == "N":
            noun_counter += 1
    return noun_counter / len(tags) if len(tags) != 0 else -1


def gp_ne_proportion(text, tags):
    """counts all proper names and divides by all tokens (dividing by all nouns could be worth a try), returns FLOAT"""
    ne_counter = 0
    for word, tag in tags:
        if tag["pos"] == "N":
            if "type" in tag["attributes"] and tag["attributes"]["type"] == "Name":
                ne_counter += 1
    return ne_counter / len(tags) if len(tags) != 0 else -1


def gp_pron_proportion(text, tags):
    """counts all pronouns (maybe restrict them if this does not work properly) and divides by all tokens, returns FLOAT"""
    pron_counter = 0
    for word, tag in tags:
        if tag["pos"] == "PRO":
            pron_counter += 1
    return pron_counter / len(tags) if len(tags) != 0 else -1


def li_contains_verbs_location(text, tags):
    """checks whether a location-related verb is contained, returns BOOL"""
    liste = _getlist("verbs_location", "./data/verben.Lokation.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def li_contains_adj_time(text, tags):
    """checks whether a time-related adjective is contained, returns BOOL"""
    liste = _getlist("adj_time", "./data/adj.Zeit.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def li_contains_noun_event(text, tags):
    """checks whether an event-related noun is contained, returns BOOL"""
    liste = _getlist("noun_event", "./data/nomen.Geschehen.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def li_contains_noun_group(text, tags):
    """checks whether a group-related noun is contained, returns BOOL"""
    liste = _getlist("noun_group", "./data/nomen.Gruppe.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def li_contains_noun_communication(text, tags):
    """checks whether a communication-related noun is contained, returns BOOL"""
    liste = _getlist("noun_communication", "./data/nomen.Kommunikation.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def li_contains_nouns_time(text, tags):
    """checks whether a time-related noun is contained, returns BOOL"""
    liste = _getlist("nouns_time", "./data/nomen.Zeit.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def li_contains_nouns_location(text, tags):
    """checks whether a location-related noun is contained, returns BOOL"""
    liste = _getlist("nouns_location", "./data/nomen.Ort.xml")
    for word, tag in tags:
        if tag["lemma"] in liste:
            return True
    return False


def _getlist(varName, fileName):
    """loads the list of <orthForm> entries from an XML file and caches it in globals() under varName"""
    if varName in globals():
        foolist = globals()[varName]
    else:
        et = ET.parse(fileName)
        root = et.getroot()
        elist = root.findall(".//orthForm")
        foolist = [e.text for e in elist]
        globals()[varName] = foolist
    return foolist
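

# Editor's note: _getlist expects each XML file under ./data/ (presumably GermaNet
# category exports, given the file names) to contain <orthForm> elements whose text is
# a single word form; everything else is ignored because only ".//orthForm" is queried.
# A minimal file that would work looks roughly like this (illustrative only):
#
#   <synsets>
#     <synset>
#       <orthForm>Haus</orthForm>
#       <orthForm>Gebäude</orthForm>
#     </synset>
#   </synsets>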


if __name__ == "__main__":
    import pickle
    import inspect
    import sys

    # load the pickled test fixture: first the tag list, then the raw text
    with open("test.test", "rb") as testfile:
        tags = pickle.load(testfile)
        text = pickle.load(testfile)
    res = []
    # apply every feature function defined in this module to the test data
    # (helpers such as _getlist are skipped because they take different arguments)
    functions = [obj for name, obj in inspect.getmembers(sys.modules[__name__])
                 if inspect.isfunction(obj) and not name.startswith("_")]
    for f in functions:
        res.append((f.__name__, f(text, tags)))
    for elem in res:
        print(elem[0], elem[1])
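

# Editor's note: the "test.test" fixture used above is just two pickled objects in
# sequence -- the tag list first, then the raw text. A compatible fixture could be
# created roughly like this (illustrative sketch using the assumed _EXAMPLE_TAGS from
# the top of this file, not the project's real test data):
#
#     import pickle
#     with open("test.test", "wb") as f:
#         pickle.dump(_EXAMPLE_TAGS, f)       # tags first ...
#         pickle.dump("Gestern ging er.", f)  # ... then the text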