-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathconllxdataset.py
104 lines (84 loc) · 2.18 KB
/
conllxdataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Source: https://github.com/bastings/parser.
Edited slightly.
"""
import io
import os
from tokens import XToken
# Placeholder values for the artificial root entry that
# start_conllx_example_dict() places at position 0 of every example.
ROOT_TOKEN = '<root>'  # stored in the 'form' column
ROOT_TAG = 'ROOT'  # stored in both the 'cpos' and 'pos' columns
ROOT_LABEL = '-root-'  # stored in the 'deprel' column
def empty_conllx_example_dict():
    """Return a fresh CoNLL-X example that holds no tokens.

    Returns:
        dict: Maps each of the ten column names to its own new, empty list.
        Keys are inserted in column order ('id' first, 'pdeprel' last).
    """
    columns = (
        'id', 'form', 'lemma', 'cpos', 'pos',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    )
    # A comprehension creates a distinct list per column, so appending to one
    # column never affects another.
    return {column: [] for column in columns}
def start_conllx_example_dict():
    """Return a fresh CoNLL-X example that already contains the root entry.

    Returns:
        dict: Maps each of the ten column names to a new one-element list
        holding the root entry's value for that column. 'id' and 'head' hold
        the integer 0, 'form' holds ROOT_TOKEN, 'cpos' and 'pos' hold
        ROOT_TAG, 'deprel' holds ROOT_LABEL, and every other column holds '_'.
        Keys are inserted in column order ('id' first, 'pdeprel' last).
    """
    columns = (
        'id', 'form', 'lemma', 'cpos', 'pos',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    )
    # Root values, listed in the same order as `columns`.
    root_values = (
        0, ROOT_TOKEN, '_', ROOT_TAG, ROOT_TAG,
        '_', 0, ROOT_LABEL, '_', '_',
    )
    return {column: [value] for column, value in zip(columns, root_values)}
def conllx_reader(f):
    """Yield one example dictionary per sentence read from ``f``.

    Every example starts as the dictionary returned by
    ``start_conllx_example_dict()`` (a single root entry at position 0). Each
    non-blank line is split on whitespace and appends one string value to
    every column list. A blank line closes the current sentence.

    An example is only yielded when at least one token line was read for it,
    so a blank line at the end of the input, runs of consecutive blank lines,
    and an empty input do not produce root-only examples.

    Args:
        f: Iterable of text lines, e.g. an open text file.

    Yields:
        dict: Column name -> list of values; index 0 of each list is the root
        entry and the remaining entries are the strings read from ``f``.

    Raises:
        AssertionError: If a non-blank line does not split into exactly ten
            whitespace-separated fields.
    """
    columns = (
        'id', 'form', 'lemma', 'cpos', 'pos',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    )

    ex = start_conllx_example_dict()
    # The fresh example already holds the root entry, so the length of its
    # lists cannot tell us whether a sentence was read; track it explicitly.
    has_tokens = False

    for line in f:
        line = line.strip()

        if not line:
            # Sentence boundary: emit only if real tokens were collected.
            if has_tokens:
                yield ex
                ex = start_conllx_example_dict()
                has_tokens = False
            continue

        parts = line.split()
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is kept for callers.
        if len(parts) != len(columns):
            raise AssertionError("invalid conllx line: %s" % line)

        for column, value in zip(columns, parts):
            ex[column].append(value)
        has_tokens = True

    # Possible last sentence without a blank line after it.
    if has_tokens:
        yield ex
class ConllXDataset:
    """A CoNLL-X data file loaded fully into memory.

    Instance attributes (both set by ``__init__``):
        examples: List of the example dictionaries produced by
            ``conllx_reader`` for the file.
        tokens: One list per example, containing an ``XToken`` for each
            position in that example.
    """

    def __init__(self, path):
        """Read the CoNLL-X file at ``path`` and build its tokens.

        Arguments:
            path (str): Path to the data file. A leading ``~`` is expanded,
                and the file is decoded as UTF-8.
        """
        filename = os.path.expanduser(path)
        with io.open(filename, encoding="utf8") as f:
            self.examples = list(conllx_reader(f))

        sentences = []
        for example in self.examples:
            # Transpose the column lists into per-position rows. Each row's
            # values are passed to XToken positionally, in the dictionary's
            # key order.
            rows = zip(*example.values())
            sentences.append([XToken(*row) for row in rows])
        self.tokens = sentences