-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathword_similarities.py
38 lines (32 loc) · 946 Bytes
/
word_similarities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pickle
import numpy as np
import cupy as cp
import time
import sys
# Print, for every word in the vocabulary, its most cosine-similar words.
#
# Input:  make_word_vector/index_meta.pkl -- a pickled mapping of
#         index -> {'word': <str>, 'vec': <vector>} (the only keys read here).
# Output: for each word, the word itself followed by a list of
#         (cosine similarity, word) pairs, best first, then the elapsed time.

# Short-run switch: with `--bench` on the command line the loop stops after
# 12 words. NOTE(review): the original referenced `bench` without defining it
# anywhere in the visible file, which raised NameError on the 12th word; a
# command-line flag is assumed because `sys` was imported but unused -- confirm.
bench = '--bench' in sys.argv[1:]

# How many of the best-scoring entries are printed per word.
# NOTE(review): presumably the word itself plus 30 neighbours -- confirm.
TOP_N = 31

# NOTE(security): unpickling executes arbitrary code; only load trusted files.
with open('make_word_vector/index_meta.pkl', 'rb') as fp:
    index_meta = pickle.load(fp)

arrays = []
word_index = {}
index_word = {}
# Row bookkeeping, so that the matrix row of a word never depends on the
# pickle's keys happening to be 0..n-1 in insertion order.
row_of_index = {}  # index key -> row position in `arrays` / `x_all`
row_words = []     # row position -> word
for index, meta in index_meta.items():
    word = meta['word']
    row_of_index[index] = len(arrays)
    arrays.append(meta['vec'])
    row_words.append(word)
    word_index[word] = index
    index_word[index] = word

# One matrix holding every vector, plus the norm of each row, computed once.
x_all = cp.array(arrays)
x_allnorm = cp.linalg.norm(x_all, axis=(1,))

start = time.time()
for e, (word, index) in enumerate(word_index.items()):
    print(word)
    # Slice the row out of the matrix already on the device instead of
    # uploading the same vector again from host memory.
    x_word = x_all[row_of_index[index]]
    # Dot product of this word's vector with every row.
    x_wa = (x_word * x_all).sum(axis=1)
    norm = cp.linalg.norm(x_word) * x_allnorm
    # A zero-length vector would give 0 * inf = nan, and nan makes the sort
    # below unreliable; with the divisor forced to 1 its similarity is 0.
    norm[norm == 0] = 1.0
    cossims = x_wa * norm**-1
    # Pair every similarity with the word stored in the same row.
    weight_term = list(zip(cossims.tolist(), row_words))
    # Stable descending sort: ties keep row order, as with a negated key.
    topn = sorted(weight_term, key=lambda pair: pair[0], reverse=True)[:TOP_N]
    print(topn)
    if e > 10 and bench:
        break
# NOTE(review): indentation was lost in the source; the timing report is
# assumed to run once, after the loop -- confirm.
print('elapsed', time.time() - start)