dataParser.py
"""
---Parsing Tool---
This script allows the user to parse a Reuters XML file.
This script requires that 'xmljson' be installed within the Python
environment you are running this script in.
"""
import logging
import os
from xml.etree.ElementTree import fromstring
from xmljson import parker as pr
logger = logging.getLogger(__name__)
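# Expected input layout (a sketch inferred from the parsing code below; the sample
# values are illustrative only): each file is a Reuters-21578 style document whose
# first line is a DOCTYPE declaration, followed by <REUTERS> elements roughly like:
#
#   <REUTERS ...>
#     <TOPICS><D>cocoa</D></TOPICS>
#     <PLACES><D>usa</D></PLACES>
#     <PEOPLE></PEOPLE><ORGS></ORGS><EXCHANGES></EXCHANGES><COMPANIES></COMPANIES>
#     <TEXT>
#       <TITLE>...</TITLE>
#       <DATELINE>NEW YORK, Feb 26 -</DATELINE>
#       <BODY>...</BODY>
#     </TEXT>
#   </REUTERS>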
class DataParser:
    """Parses a directory of Reuters XML files into a list of labelled articles."""

    def __init__(self, directory_path, is_test_data=False):
        self.data_path = directory_path
        self.is_test_data = is_test_data

    def parse_data(self, is_test=False):
        """
        :param is_test: boolean. True if the articles are part of the test set.
        :return: list of dictionaries with 'text', 'labels' and 'dateline' keys,
                 keeping only articles that have both labels and non-empty text.
        """
        final_data = []
        for root, dirs, files in os.walk(self.data_path, topdown=False):
            for name in files:
                try:
                    # Drop articles that have no labels or no text.
                    data = list(filter(lambda x: x['labels'] != [] and x['text'] != '',
                                       DataParser._parsing(os.path.join(root, name), is_test)))
                    logger.debug("number of articles in file %s: %s", name, len(data))
                    final_data = final_data + data
                except UnicodeDecodeError:
                    # Skip files that the default codec cannot decode.
                    continue
        logger.info("Total number of articles parsed: %s", len(final_data))
        return final_data
    @staticmethod
    def _create_labels(article):
        """
        :param article: (ordered dictionary) a single parsed REUTERS element.
        :return: flat list of labels based on the label fields present in the article.
        """
        labels = []
        label_types = ["TOPICS", "PLACES", "PEOPLE", "ORGS", "EXCHANGES", "COMPANIES"]
        for label_type in label_types:
            if article[label_type] is not None:
                if article[label_type]['D'] is not None and article[label_type]['D'] != "":
                    labels.append(article[label_type]['D'])
        # Flatten the label list, since a field with several <D> tags parses to a list.
        flat_labels = []
        for sublist in labels:
            if isinstance(sublist, list):
                for item in sublist:
                    flat_labels.append(item)
            else:
                flat_labels.append(sublist)
        return flat_labels
    @staticmethod
    def _get_dateline(article):
        """
        :param article: (ordered dictionary) a single parsed REUTERS element.
        :return: the dateline location (text before the first comma), or "" if missing.
        """
        if 'DATELINE' in article['TEXT']:
            return article['TEXT']['DATELINE'].split(',')[0]
        else:
            return ""
    @staticmethod
    def _get_text(article, is_test=False):
        """
        :param article: (ordered dictionary) a single parsed REUTERS element.
        :param is_test: if True, the text is built without the 'TITLE' attribute.
        :return: the text of the received article.
        """
        text_to_return = ""
        if not is_test:
            if 'TITLE' in article['TEXT']:
                text_to_return = text_to_return + " " + article['TEXT']['TITLE']
        if 'BODY' in article['TEXT']:
            text_to_return = text_to_return + " " + article['TEXT']['BODY']
        return text_to_return
    @staticmethod
    def _parsing(file_path, test):
        """
        :param file_path: path of a single Reuters file to parse.
        :param test: if True, articles are parsed without labels.
        :return: list of dictionaries with 'labels', 'text' and 'dateline' keys.
        """
        with open(file_path) as f:
            f.readline()  # skip the first line (the DOCTYPE declaration)
            # Wrap the content in a single root element so it parses as one XML document.
            raw_data = "<xml>" + f.read() + "</xml>"
        # Strip '&' and '#' so unescaped entities do not break xml.etree parsing.
        raw_data = raw_data.replace('&', "").replace('#', "")
        data_dict = pr.data(fromstring(raw_data), preserve_root=True)
        if test:
            data = [{"labels": "", "text": DataParser._get_text(article, test),
                     "dateline": DataParser._get_dateline(article)}
                    for article in data_dict['xml']['REUTERS']]
        else:
            data = [{"labels": DataParser._create_labels(article),
                     "text": DataParser._get_text(article, test),
                     "dateline": DataParser._get_dateline(article)}
                    for article in data_dict['xml']['REUTERS']]
        return data
# Example manual checks:
# print(DataParser('test').parse_data(False)[0])
# print(DataParser('test').parse_data(False)[223])
# for elem in DataParser('test').parse_data(False):
#     print(elem)
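
# A minimal usage sketch (not part of the original script). It assumes the Reuters
# SGML/XML files live in a local directory named "data"; that directory name and the
# logging setup are illustrative only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = DataParser("data")
    articles = parser.parse_data(is_test=False)
    print("parsed {} articles".format(len(articles)))
    if articles:
        print(articles[0]["labels"], articles[0]["dateline"])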