-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathmakeidf.py
54 lines (51 loc) · 2.43 KB
/
makeidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import math
import re
import sys
# Upper bounds on how many leading terms of a query / passage are considered.
MAX_QUERY_TERMS = 80
MAX_DOC_TERMS = 400

# Raw string literals: '\s' inside a plain literal is an invalid escape
# sequence (a SyntaxWarning on recent Python versions).
regex_drop_char = re.compile(r'[^a-z0-9\s]+')
regex_multi_space = re.compile(r'\s+')


def _tokenize(text, max_terms):
    """Lower-case *text*, replace every run of characters other than a-z,
    0-9 and whitespace with a space, and return at most *max_terms* of the
    leading whitespace-separated terms."""
    cleaned = regex_multi_space.sub(' ', regex_drop_char.sub(' ', text.lower()))
    return cleaned.strip().split()[:max_terms]


def _collect_vocabulary(path, column, df):
    """Register every term found in tab-separated *column* of the file at
    *path* in *df* with a document frequency of zero.

    Lines that have no such column (e.g. blank lines) are skipped.
    """
    with open(path, encoding='utf-8', mode='r') as reader:
        for line in reader:
            cols = line.split('\t')
            if len(cols) <= column:
                continue
            for term in _tokenize(cols[column], MAX_QUERY_TERMS):
                df[term] = 0


def _count_document_frequencies(path, df):
    """Scan the passage collection at *path* and, for each term already in
    *df*, count the passages (column 1) that contain it at least once.

    Returns the number of passages scanned. Lines without a column 1 are
    skipped and not counted.
    """
    n_docs = 0
    with open(path, encoding='utf-8', mode='r') as reader:
        for line in reader:
            cols = line.split('\t')
            if len(cols) < 2:
                continue
            # set(): a term counts once per passage, however often it repeats.
            for term in set(_tokenize(cols[1], MAX_DOC_TERMS)):
                if term in df:
                    df[term] += 1
            n_docs += 1
    return n_docs


def _write_idf_files(df, n_docs):
    """Write 'idf.tsv' (term, log(n_docs / df)) and 'idf.norm.tsv' (term,
    idf / log(n_docs)) into the current working directory.

    Terms that never occur in the collection get an IDF of 0 in 'idf.tsv';
    only terms with a strictly positive IDF are written to 'idf.norm.tsv'.
    """
    # log(n_docs) is the largest IDF any term can reach (df == 1), so the
    # normalised scores fall in (0, 1].  With fewer than two passages no
    # term has a positive IDF, so the fallback value is never divided by.
    denom = math.log(n_docs) if n_docs > 1 else 1.0
    with open('idf.tsv', encoding='utf-8', mode='w') as writer, \
            open('idf.norm.tsv', encoding='utf-8', mode='w') as norm_writer:
        for term, count in df.items():
            idf = math.log(n_docs / count) if count > 0 else 0
            writer.write('{}\t{}\n'.format(term, idf))
            if idf > 0:
                norm_writer.write('{}\t{}\n'.format(term, idf / denom))


def main(argv=None):
    """Build IDF tables for the query vocabulary.

    *argv* follows the ``sys.argv`` layout: program name, train triples
    file, dev rerank file, eval rerank file, passage collection file.
    The vocabulary is taken from column 0 of the train triples and column 2
    of the two rerank files; document frequencies come from column 1 of the
    passage collection.

    Returns 0 on success, or -1 (after printing the usage line) when the
    number of arguments is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 5:
        print("Usage: makeidf.py <train_triples_filename> <dev_rerank_filename> <eval_rerank_filename> <passage_collections_filename>")
        return -1

    train_path, dev_path, eval_path, collection_path = argv[1:5]
    df = {}
    _collect_vocabulary(train_path, 0, df)
    _collect_vocabulary(dev_path, 2, df)
    _collect_vocabulary(eval_path, 2, df)
    # The normaliser must use the passage count from the collection, not the
    # line count of any of the query files.
    n_docs = _count_document_frequencies(collection_path, df)
    _write_idf_files(df, n_docs)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))