-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathidf_weighting.py
41 lines (35 loc) · 1.11 KB
/
idf_weighting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import sys
import math
import collections
def create_idf_weights(embeddings_path, corpus_path):
    """Compute smoothed IDF weights for the words listed in an embeddings file.

    Every line of ``corpus_path`` is treated as one document and tokenized on
    whitespace.  A word is counted once per document regardless of how often
    it occurs there.  Blank lines still count as (empty) documents.

    Every line of ``embeddings_path`` that has more than two
    whitespace-separated tokens contributes one entry, keyed by its first
    token.  Shorter lines are skipped (presumably a
    "<vocabulary size> <dimensions>" header line -- confirm against the
    embedding format in use).

    The weight of a word is ``log(N / (1 + df))`` where ``N`` is the number of
    corpus lines and ``df`` is the number of corpus lines containing the word.
    The added 1 keeps the denominator non-zero for words that never occur in
    the corpus; a word that occurs in every document gets a negative weight.

    Both files are opened in text mode without an explicit encoding.

    Args:
        embeddings_path: Path to the embeddings file (one word per line,
            the word being the first token).
        corpus_path: Path to the corpus file (one document per line).

    Returns:
        A ``collections.OrderedDict`` mapping each word to its float weight,
        ordered by the word's first appearance in the embeddings file.

    Raises:
        ValueError: If the corpus has no lines while the embeddings file
            contains at least one usable line (log of zero is undefined).
    """
    document_frequency = collections.Counter()
    document_count = 0
    with open(corpus_path) as corpus_file:
        for document in corpus_file:
            # set() so that repeated words count once per document.
            document_frequency.update(set(document.split()))
            document_count += 1

    idf = collections.OrderedDict()
    with open(embeddings_path) as embeddings_file:
        for line in embeddings_file:
            line_parts = line.split()
            if len(line_parts) <= 2:
                continue
            if document_count == 0:
                # Fail with a readable message instead of math.log's
                # "math domain error"; raised only once a weight is needed.
                raise ValueError(
                    "cannot compute IDF weights: corpus file %r contains "
                    "no documents" % (corpus_path,)
                )
            word = line_parts[0]
            # Counter returns 0 for unseen words without inserting them.
            n = 1.0 + document_frequency[word]
            idf[word] = math.log(document_count / n)
    return idf
if __name__ == "__main__":
    # Command line: <embeddings_path> <corpus_path> <output_path>.
    # Writes one "<word>\t<weight>" line per word to output_path, in the
    # order returned by create_idf_weights.
    if len(sys.argv) < 4:
        # Exit with a usage hint rather than an IndexError traceback.
        sys.exit(
            "usage: " + sys.argv[0]
            + " <embeddings_path> <corpus_path> <output_path>"
        )
    # Extra trailing arguments are ignored.
    embeddings_path, corpus_path, output_path = sys.argv[1:4]
    weights = create_idf_weights(embeddings_path, corpus_path)
    with open(output_path, 'w') as f:
        for word, score in weights.items():
            f.write(word + "\t" + str(score) + "\n")