-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQuery_cleaning.py
41 lines (35 loc) · 1.24 KB
/
Query_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re

try:
    from xml.etree import cElementTree as et
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9; ElementTree exposes
    # the same API and picks up the C accelerator on its own.
    from xml.etree import ElementTree as et
# Cleaned query text, keyed by the stripped DOCNO text of each entry.
q_dict = {}

# Import the query file.
q_file = 'cacm.query.txt'
# Read through a context manager so the handle is closed deterministically
# instead of being left open until garbage collection.
with open(q_file, 'r') as q_handle:
    q_content = q_handle.read()

# Convert it into an XML format for cleaning: wrap everything in a single
# <ROOT> element, open a <QUERY> element right after every </DOCNO>, and
# close it just before every </DOC>.
# NOTE(review): presumably each entry looks like
# <DOC><DOCNO>id</DOCNO>free text</DOC> -- confirm against the data file.
q_content = "<ROOT>\n" + q_content + "\n</ROOT>"
q_content = q_content.replace("</DOCNO>", "</DOCNO>\n<QUERY>")
xml_content_string = q_content.replace("</DOC>", "</QUERY>\n</DOC>")
path_r = et.fromstring(xml_content_string)
# Normalise every query under the parsed root and store the cleaned text in
# q_dict under the query's id.
for query in path_r:
    query_id = query.find('DOCNO').text.strip()
    # An empty <QUERY> element has text None; treat it as an empty query.
    quer = query.find('QUERY').text or ""
    quer = quer.lower().replace("\n", " ")
    quer = re.sub(' +', ' ', quer).strip()
    # Replace every character outside the allow-list (digits, ASCII letters,
    # ',', '-', '.', ':', '\' and '$') with a space.
    quer = re.sub(r"[^0-9A-Za-z,-.:\\$]", " ", quer)
    # Blank out '$', ',', '%', ':', '.' and '-' unless the next character is
    # a digit.
    # NOTE(review): the leading (?!\d) can never fail because the character
    # class contains no digits; a look-behind (?<!\d) may have been intended.
    # Confirm before changing the pattern.
    quer = re.sub(r"(?!\d)[$,%:.-](?!\d)", " ", quer)
    # Drop hyphens hanging off either end of a token. The original loop
    # called str.replace() and discarded the result, so nothing was stripped.
    tokens = (token.strip('-') for token in quer.split())
    quer = ' '.join(token for token in tokens if token)
    q_dict[query_id] = quer
# Echo each cleaned query, prefixed with its id, to standard output.
for query_key, cleaned_text in q_dict.items():
    report_line = "".join(
        ("Query after cleaning ", query_key, " : ", cleaned_text)
    )
    print(report_line)
# Write the output to the file, one "<query id>\t<cleaned query>" line per
# entry of q_dict. The context manager flushes and closes the file even if a
# write raises, which the manual open()/close() pair did not guarantee.
with open("Lucene-proj/src/cleaned_query.txt", 'w', encoding='utf-8') as file:
    for qid, cleaned in q_dict.items():
        file.write(qid + "\t" + cleaned + "\n")