# lstm_proj.py
# coding: utf-8
import pickle
import numpy as np
import scipy.stats as meas
from collections import OrderedDict
import time
import theano
from theano import config
import theano.tensor as T
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
# from sentences import *
# from random import sample
import random
from sentences_proj import prepare_data as prepare_data
from sentences_proj import embed_w2v as embed
import pandas as pd
# from sentences import expand as expand # Useless in the later tasks
# Directory of the pickled data sets; in the visible code it is only referenced
# by the commented-out loader below.
data_path = "./data/"
# test = pickle.load(open(data_path + "semtest.p",'rb')) # add by liuenda
# Snapshot of the module namespace at this point; getpl2() forwards it as the
# `options` argument of lstm_layer2().
options = locals().copy()
# Seed both RNGs used in this file (random.sample for shuffling,
# np.random for weight initialisation).
random.seed(1234)
np.random.seed(1234)
def cal_similarity(a, b):
    """Return exp(-L1 distance) between `a` and every row of `b`.

    Called by find_ranking() to score one projection against a whole set.

    Args:
        a: numpy array that broadcasts against `b` (find_ranking passes one
            projection row).
        b: 2-D numpy array; the L1 norm is taken along axis 1.

    Returns:
        1-D numpy array holding one similarity per row of `a - b`; a value
        of 1.0 means the two vectors are identical.
    """
    diff = np.linalg.norm(a - b, 1, axis=1)
    return np.exp(-diff)
def find_ranking(projection1, projection2):
    """Rank every true pair (i, i) among all candidates in `projection2`.

    For each row i of `projection1` the similarity to every row of
    `projection2` is computed with cal_similarity(); the rank of candidate i
    (its real partner) among those similarities is recorded.

    Args:
        projection1: sequence of projection vectors (by the original comment,
            the JP side).
        projection2: 2-D array of projection vectors (by the original comment,
            the EN->JP side). Must have at least as many rows as
            `projection1`, because row i is looked up for query i.

    Returns:
        (sim_results, rank_results): a list with one similarity array per
        query, and a list with the rank of the true partner per query
        (1 = most similar).
    """
    sim_results = []
    rank_results = []
    for i, proj1 in enumerate(projection1):
        sim = cal_similarity(proj1, projection2)
        # Descending rank, so the most similar candidate gets rank 1.
        rank = pd.Series(sim).rank(ascending=False)[i]
        sim_results.append(sim)
        rank_results.append(rank)
    return sim_results, rank_results
def find_top(rank_results, top):
    """Count how many ranks are within the best `top` positions.

    Args:
        rank_results: sequence of ranks, one per evaluated pair (as returned
            by find_ranking()).
        top: threshold; a rank counts when it is <= `top`.

    Returns:
        Number of entries of `rank_results` that are <= `top`.
    """
    ranks = pd.Series(rank_results)
    return (ranks <= top).sum()
# ---------------------------------------------
def _p(pp, name):
    """Build a parameter key by joining prefix `pp` and `name` as 'pp_name'."""
    return '%s_%s' % (pp, name)
def numpy_floatX(data):
    """Convert `data` to a numpy array of Theano's configured float dtype."""
    return np.asarray(data, dtype=config.floatX)
# Module-level dropout switch (Theano shared scalar, initially 0.): getpl2()
# hands it to dropout_layer(); the LSTM methods set it to 1. before a training
# step and to 0. before evaluation.
use_noise = theano.shared(numpy_floatX(0.))
# NOT used
def zipp(params, tparams):
    """Push the values in `params` into the matching shared variables.

    Args:
        params: mapping name -> value.
        tparams: mapping name -> object exposing set_value(); must contain
            every key of `params`.
    """
    # .items() instead of the Python-2-only .iteritems().
    for kk, vv in params.items():
        tparams[kk].set_value(vv)
def unzip(zipped):
    """Pull the current values out of a mapping of shared variables.

    Args:
        zipped: mapping name -> object exposing get_value().

    Returns:
        OrderedDict name -> value, in the iteration order of `zipped`.
    """
    new_params = OrderedDict()
    # .items() instead of the Python-2-only .iteritems().
    for kk, vv in zipped.items():
        new_params[kk] = vv.get_value()
    return new_params
def init_tparams(params):
    """Wrap every weight array in a Theano shared variable.

    Args:
        params: mapping name -> numpy array (the W/U/b weights).

    Returns:
        OrderedDict name -> theano shared variable named after its key, in
        the iteration order of `params`.
    """
    tparams = OrderedDict()
    # .items() instead of the Python-2-only .iteritems().
    for kk, pp in params.items():
        tparams[kk] = theano.shared(pp, name=kk)
    return tparams
def get_layer(name):
    """Look up `name` in the module-level `layers` table and return the entry.

    NOTE(review): no `layers` object is defined in the visible part of this
    file and nothing visible calls this function; unless `layers` is provided
    elsewhere, calling it raises NameError.
    """
    fns = layers[name]
    return fns
def genm(mu, sigma, n1, n2):
    """Generate an (n1, n2) matrix of normally distributed random numbers.

    Args:
        mu: mean of the distribution.
        sigma: standard deviation of the distribution.
        n1, n2: number of rows and columns.

    Returns:
        numpy array of shape (n1, n2) drawn from numpy's global RNG.
    """
    return np.random.normal(mu, sigma, (n1, n2))
def getlayerx(d, pref, n, nin):
    """Create and initialise the U, W and b parameters of one LSTM unit.

    Example: newp = getlayerx(newp, '1lstm1', 50, 300)

    Args:
        d: dictionary that receives the parameters; updated in place.
        pref: key prefix; entries are stored as '<pref>_U', '<pref>_W' and
            '<pref>_b'.
        n: number of hidden units (U is (4n, n), b has 4n entries).
        nin: input dimension (W is (4n, nin)).

    Returns:
        The same dictionary `d`, for chaining.
    """
    # Mean and standard deviation of the normal initialisation.
    mu = 0.0
    sigma = 0.2

    # Recurrent weights: four stacked (n, n) blocks, one per gate slice.
    # The draw order (U blocks, then W blocks, then b) is kept so a fixed seed
    # yields the same weights as before.
    U = np.concatenate([genm(mu, sigma, n, n) for _ in range(4)]) / np.sqrt(n)
    U = np.array(U, dtype=np.float32)

    # Input weights: four stacked (n, nin) blocks, scaled by (n * nin) ** 0.25.
    W = (np.concatenate([genm(mu, sigma, n, nin) for _ in range(4)])
         / np.sqrt(np.sqrt(n * nin)))
    W = np.array(W, dtype=np.float32)

    # Biases of all four slices are drawn together ...
    b = np.random.uniform(-0.5, 0.5, size=(4 * n,))
    # ... then the second slice (read as the forget gate by lstm_layer2) is
    # set to 1.5.
    b[n:n * 2] = 1.5

    d[_p(pref, 'U')] = U
    d[_p(pref, 'W')] = W
    d[_p(pref, 'b')] = b.astype(config.floatX)
    return d
def creatrnnx():
    """Create the initial parameter dictionary for the two LSTM branches.

    Both branches ('1lstm1' and '2lstm1') get 50 hidden units and a
    200-dimensional input.

    Returns:
        OrderedDict with the U/W/b entries of '1lstm1' followed by those of
        '2lstm1'.
    """
    newp = OrderedDict()
    newp = getlayerx(newp, '1lstm1', 50, 200)
    newp = getlayerx(newp, '2lstm1', 50, 200)
    return newp
def dropout_layer(state_before, use_noise, rrng, rate):
    """Build the symbolic dropout expression for `state_before`.

    Args:
        state_before: symbolic activations to apply dropout to.
        use_noise: symbolic switch; non-zero selects the masked branch.
        rrng: symbolic random mask multiplied element-wise into
            `state_before` when noise is on.
        rate: drop rate; with noise off the activations are scaled by
            (1 - rate) instead.

    Returns:
        Symbolic expression `state_before * rrng` when `use_noise` is
        non-zero, else `state_before * (1 - rate)`.
    """
    proj = tensor.switch(use_noise,
                         state_before * rrng,
                         state_before * (1 - rate))
    return proj
# Usage example:
# proj11 = getpl2(emb11, '1lstm1', mask11, False, rrng, 50, tnewp)[-1]
def getpl2(prevlayer, pre, mymask, used, rrng, size, tnewp):
    """Build one LSTM layer and optionally put dropout on its output.

    Args:
        prevlayer: symbolic input of the layer.
        pre: parameter prefix of the layer inside `tnewp` (e.g. '1lstm1').
        mymask: symbolic mask forwarded to lstm_layer2().
        used: when true, dropout (rate 0.5) is applied to the layer output.
        rrng: symbolic random mask used by the dropout branch.
        size: number of hidden units of the layer.
        tnewp: mapping of shared parameters.

    Returns:
        Symbolic hidden states for every time step, as returned by
        lstm_layer2() (callers index [-1] for the last step).
    """
    proj = lstm_layer2(tnewp, prevlayer, options,
                       prefix=pre,
                       mask=mymask, nhd=size)
    if used:
        print("Added dropout")
        # NOTE(review): the only visible caller builds `rrng` with the shape
        # of the input embedding, while `proj` is the LSTM output; confirm the
        # shapes agree before enabling `used`.
        proj = dropout_layer(proj, use_noise, rrng, 0.5)
    return proj
def lstm_layer2(tparams, state_below, options, prefix='lstm', mask=None, nhd=None):
    """Build the symbolic forward pass of one LSTM layer with theano.scan.

    Example: lstm_layer2(tnewp, emb11, options, prefix='1lstm1',
                         mask=mask11, nhd=50)

    Args:
        tparams: mapping holding the shared variables '<prefix>_U',
            '<prefix>_W' and '<prefix>_b'.
        state_below: symbolic input; axis 0 is iterated as time and, for a
            3-D input, axis 1 is taken as the number of samples.
        options: not used by this function; kept so existing callers work.
        prefix: parameter prefix of this layer.
        mask: symbolic mask scanned in step with `state_below`; each step is
            indexed as m_[:, None], i.e. one value per sample. Required.
        nhd: number of hidden units, i.e. the width of each gate slice.

    Returns:
        Symbolic hidden states of every time step (scan's stacked `h`
        outputs); callers take [-1] for the final state.

    Raises:
        AssertionError: if `mask` is None.
    """
    # Number of scan iterations = length of the leading (time) axis.
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        # Return the n-th block of `dim` columns along the last axis.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        # One LSTM time step. m_: mask, x_: pre-computed input term,
        # h_ / c_: hidden and cell state of the previous step.
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')].T)
        preact += x_
        # NOTE: x_ already contains the bias (added below before the scan),
        # so the bias enters the pre-activation twice. Kept as is, because
        # changing it would alter the behaviour of already trained weights.
        preact += tparams[_p(prefix, 'b')]

        # Slice order: input gate, forget gate, output gate, candidate cell.
        i = tensor.nnet.sigmoid(_slice(preact, 0, nhd))
        f = tensor.nnet.sigmoid(_slice(preact, 1, nhd))
        o = tensor.nnet.sigmoid(_slice(preact, 2, nhd))
        c = tensor.tanh(_slice(preact, 3, nhd))

        c = f * c_ + i * c
        # Where the mask is 0 the previous cell / hidden state is carried over.
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        # Two outputs per step: hidden state and cell state.
        return [h, c]

    # Input contribution for all time steps at once: state_below . W^T + b.
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')].T) +
                   tparams[_p(prefix, 'b')].T)

    dim_proj = nhd
    # Run _step over all time steps, starting from all-zero hidden and cell
    # states of shape (n_samples, dim_proj). scan stacks the per-step outputs
    # along a new leading axis: hvals are the hidden states, cvals the cell
    # states.
    [hvals, cvals], updates = theano.scan(_step,
                                          sequences=[mask, state_below],
                                          outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                                     n_samples,
                                                                     dim_proj),
                                                        tensor.alloc(numpy_floatX(0.),
                                                                     n_samples,
                                                                     dim_proj)],
                                          name=_p(prefix, '_layers'),
                                          n_steps=nsteps)
    return hvals
# IN USE
def adadelta(lr, tparams, grads, emb11, mask11, emb21, mask21, y, cost):
    """Compile the two Theano functions of an Adadelta-style update.

    Args:
        lr: symbolic learning rate. It is an input of the returned
            `f_update` but does not appear in the update expressions
            (hence on_unused_input='ignore').
        tparams: ordered mapping name -> shared parameter.
        grads: symbolic gradients, in the same order as `tparams`.
        emb11, mask11, emb21, mask21, y: symbolic inputs of the cost.
        cost: symbolic cost.

    Returns:
        (f_grad_shared, f_update): `f_grad_shared(emb11, mask11, emb21,
        mask21, y)` returns the cost and stores the gradients and their
        running squares; `f_update(lr)` then applies the parameter step.
    """
    # Zero-initialised shared buffers shaped like each parameter.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    # Store the fresh gradients and update the running average of g**2.
    zgup = list(zip(zipped_grads, grads))
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([emb11, mask11, emb21, mask21, y], cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    # Step direction: -RMS(previous updates) / RMS(gradients) * gradient.
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
# NOT USED
def sgd(lr, tparams, grads, emb11, mask11, emb21, mask21, y, cost):
    """Compile the two Theano functions of plain stochastic gradient descent.

    Args:
        lr: symbolic learning rate, input of the returned `f_update`.
        tparams: ordered mapping name -> shared parameter.
        grads: symbolic gradients, in the same order as `tparams`.
        emb11, mask11, emb21, mask21, y: symbolic inputs of the cost.
        cost: symbolic cost.

    Returns:
        (f_grad_shared, f_update): `f_grad_shared(...)` returns the cost and
        stores the gradients; `f_update(lr)` applies `p - lr * g` to every
        parameter.
    """
    # Shared buffers that keep the gradients between the two calls.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = list(zip(gshared, grads))

    f_grad_shared = theano.function([emb11, mask11, emb21, mask21, y], cost,
                                    updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]

    f_update = theano.function([lr], [], updates=pup,
                               name='sgd_f_update')

    return f_grad_shared, f_update
# NOT USED
def rmsprop(lr, tparams, grads, emb11, mask11, emb21, mask21, y, cost):
    """Compile the two Theano functions of an RMSprop-style update.

    Args:
        lr: symbolic learning rate. It is an input of the returned
            `f_update` but does not appear in the update expressions
            (hence on_unused_input='ignore').
        tparams: ordered mapping name -> shared parameter.
        grads: symbolic gradients, in the same order as `tparams`.
        emb11, mask11, emb21, mask21, y: symbolic inputs of the cost.
        cost: symbolic cost.

    Returns:
        (f_grad_shared, f_update): `f_grad_shared(...)` returns the cost and
        updates the stored gradients and their running first and second
        moments; `f_update(lr)` applies the momentum step to the parameters.
    """
    # Zero-initialised shared buffers shaped like each parameter.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = list(zip(zipped_grads, grads))
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([emb11, mask11, emb21, mask21, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    # Momentum buffer; the gradient is normalised by the running estimate of
    # its standard deviation, sqrt(E[g^2] - E[g]^2 + 1e-4).
    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.items()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    # udn[1] is the new value of the momentum buffer.
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
class LSTM():
    """Two-branch LSTM model that scores a sentence pair by exp(-L1 distance).

    Each side of a pair is run through its own LSTM branch ('1lstm1' and
    '2lstm1'); the final hidden states are compared with an L1 norm and the
    similarity is exp(-distance). Both branches start from identical weights
    and receive identical (averaged) gradients.

    Attributes set by __init__:
        W: forwarded to embed(..., 'en', W=W).
        maxlen: forwarded to prepare_data().
        model_name: file name used for loading / saving weights.
        tnewp: OrderedDict of Theano shared parameters.
        f2sim, f_proj11, f_cost: compiled Theano functions.
        f_grad_shared, f_update: compiled training functions (only when
            training=True).
    """

    def __init__(self, nam, W, maxlen=0, load=False, training=False):
        """Build the symbolic graph and compile the Theano functions.

        Args:
            nam: model file name; read as a pickle when `load` is true and
                used as the stem of the per-epoch checkpoints.
            W: value passed to embed() for the 'en' side.
            maxlen: value passed to prepare_data().
            load: when true, replace the random initial weights by the
                pickled weights stored in `nam`.
            training: when true, also build the gradients and the Adadelta
                update functions.
        """
        self.W = W
        self.maxlen = maxlen
        self.model_name = nam

        # Create the parameters (W, U, b) of both LSTM units with Gaussian
        # initialisation, then copy branch 1 over branch 2 so that both start
        # from the same weights. list() keeps the loop valid on Python 3.
        newp = creatrnnx()
        for key in list(newp.keys()):
            if key[0] == '1':
                newp['2' + key[1:]] = newp[key]

        # Symbolic inputs: target scores, the two masks and the two 3-D
        # float32 embedding tensors.
        y = T.vector('y', dtype=config.floatX)
        mask11 = T.matrix('mask11', dtype=config.floatX)
        mask21 = T.matrix('mask21', dtype=config.floatX)
        emb11 = T.ftensor3('emb11')
        emb21 = T.ftensor3('emb21')

        # Load pre-trained weights if requested.
        if load:
            # NOTE: unpickling can execute arbitrary code; only load model
            # files from a trusted source.
            with open(nam, 'rb') as handle:
                newp = pickle.load(handle)

        # Convert the weight dictionary into Theano shared variables.
        self.tnewp = init_tparams(newp)

        # Random binary mask for the (currently disabled) dropout branch.
        trng = RandomStreams(1234)
        rate = 0.5
        rrng = trng.binomial(emb11.shape, p=1 - rate, n=1, dtype=emb11.dtype)

        # Build both LSTM branches without dropout (`used` = False). Only the
        # hidden state of the last scan step is kept; earlier steps are
        # discarded.
        proj11 = getpl2(emb11, '1lstm1', mask11, False, rrng, 50, self.tnewp)[-1]
        proj21 = getpl2(emb21, '2lstm1', mask21, False, rrng, 50, self.tnewp)[-1]

        # Similarity = exp(-L1 distance), clipped away from 0 and 1.
        dif = (proj21 - proj11).norm(L=1, axis=1)
        s2 = T.exp(-dif)
        sim = T.clip(s2, 1e-7, 1.0 - 1e-7)
        lr = tensor.scalar(name='lr')  # learning rate
        # Targets are rescaled with (y - 1) / 4 before being compared with
        # the similarity; the cost is the mean squared error.
        ys = T.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7)
        cost = T.mean((sim - ys) ** 2)

        self.f2sim = theano.function([emb11, mask11, emb21, mask21], sim,
                                     allow_input_downcast=True)
        self.f_proj11 = theano.function([emb11, mask11], proj11,
                                        allow_input_downcast=True)
        self.f_cost = theano.function([emb11, mask11, emb21, mask21, y], cost,
                                      allow_input_downcast=True)

        # Prepare back-propagation and gradient descent.
        if training:
            # Gradients of the cost w.r.t. every parameter, in the order of
            # self.tnewp (branch 1 first, then branch 2).
            gradi = tensor.grad(cost, wrt=list(self.tnewp.values()))
            # Combine the gradient of each branch-1 parameter with that of
            # its branch-2 counterpart and give both branches the same
            # gradient, so the two LSTMs stay identical.
            # NOTE(review): the sum is divided by 4.0, although the original
            # comment describes an average (/ 2.0); kept unchanged.
            # `//` keeps the index an integer on Python 3 as well.
            half = len(gradi) // 2
            averaged = [(gradi[i] + gradi[i + half]) / 4.0 for i in range(half)]
            grads = averaged + averaged

            self.f_grad_shared, self.f_update = adadelta(
                lr, self.tnewp, grads, emb11, mask11, emb21, mask21, y, cost)

    def _embed_lists(self, x1, x2, count):
        """Embed the first `count` items of both sides, pair by pair."""
        ls = []   # embeddings of the x1 side ('en')
        ls2 = []  # embeddings of the x2 side ('jp')
        for j in range(count):
            ls.append(embed(x1[j], 'en', W=self.W))
            ls2.append(embed(x2[j], 'jp'))
        return ls, ls2

    def _stack(self, embedded):
        """Stack per-sample embeddings along a new last axis, then swap the last two axes."""
        return np.swapaxes(np.dstack(embedded), 1, 2)

    def _embed_batch(self, x1, x2, count):
        """Embed both sides and return the two stacked input tensors (emb1, emb2)."""
        ls, ls2 = self._embed_lists(x1, x2, count)
        return self._stack(ls), self._stack(ls2)

    def _log_tops(self, history, counts):
        """Append the top-k counts to `history` and print them."""
        for top_key in self.top_keys:
            history[top_key].append(counts[top_key])
            print("top- %s = %s : %s"
                  % (top_key, history[top_key], counts[top_key]))

    def _record_metrics(self, train, correct, test_correct, keep_ranks):
        """Evaluate MSE and ranking on all data sets, store and print them.

        `keep_ranks` controls whether the per-sample ranks are appended to
        self.rank / self.rank_test (done after every epoch, but not for the
        evaluation before training).
        """
        # chkterr2() is used instead of get_mse() because it works in chunks
        # and therefore needs less memory, at the price of being slower.
        # Do not use the `cst` of the training loop here: that is only the
        # cost of the last batch. Predictions and targets are compared on
        # the scale rescaled by * 4 + 1, so the MSE is divided by 4 ** 2 to
        # bring it back to the similarity scale.
        cst_all = self.chkterr2(train)[0] / 16
        self.mse.append(cst_all)
        cst_test = self.chkterr2(test_correct)[0] / 16
        self.mse_test.append(cst_test)
        cst_train = self.chkterr2(correct)[0] / 16
        self.mse_train.append(cst_train)

        print("Training error: %s" % cst_all)
        print("Training_correct error %s" % cst_train)
        print("Testing_correct error: %s" % cst_test)

        # Ranking and top-1/5/10 information for the training pairs.
        rank_results_train, n_tops = self.evaluate2(correct, tops=self.top_keys)
        if keep_ranks:
            self.rank.append(rank_results_train)
        self._log_tops(self.tops, n_tops)
        print("Discription of evaluation (ranking) for training data:")
        print(pd.Series(rank_results_train).describe())

        # Ranking and top-1/5/10 information for the testing pairs.
        rank_results_test, n_tops_test = self.evaluate2(test_correct,
                                                        tops=self.top_keys)
        if keep_ranks:
            self.rank_test.append(rank_results_test)
        self._log_tops(self.tops_test, n_tops_test)
        print("Discription of evaluation (ranking) for testing data:")
        print(pd.Series(rank_results_test).describe())

    def train_lstm(self, train, max_epochs, correct, test_correct, batchsize=64):
        """Train the model and record evaluation metrics after every epoch.

        Args:
            train: training samples; shuffled and split into mini-batches.
            max_epochs: number of passes over `train`.
            correct: samples used for the "training" MSE and ranking
                evaluation.
            test_correct: samples used for the "testing" MSE and ranking
                evaluation.
            batchsize: mini-batch size. Earlier versions always overwrote
                this argument with 64; the argument is now honoured and its
                default is 64, so calls relying on the default behave as
                before.

        Side effects:
            Fills self.mse, self.mse_train, self.mse_test, self.rank,
            self.rank_test, self.tops and self.tops_test, sets
            self.time_saver to the duration of the last epoch and writes one
            checkpoint '<model_name>_<epoch>.p' per epoch.
        """
        print("Training")
        print("the length of the training data is  %s" % len(train))
        print("Batchsize = %s" % batchsize)
        print("max_epochs = %s" % max_epochs)
        lrate = 0.0001  # passed to f_update; the Adadelta step does not use it
        freq = 0        # number of parameter updates performed so far
        dfreq = 21      # display frequency (in updates)

        self.mse = []        # MSE on `train`
        self.rank = []
        self.mse_test = []   # MSE on `test_correct`
        self.mse_train = []  # MSE on `correct`
        self.rank_test = []
        self.top_keys = [1, 5, 10]
        self.tops = dict((key, []) for key in self.top_keys)
        self.tops_test = dict((key, []) for key in self.top_keys)

        print("Before trianing, the error is:")
        self._record_metrics(train, correct, test_correct, keep_ranks=False)

        # eidx -> index of the epoch
        for eidx in range(0, max_epochs):
            sta = time.time()
            print("")
            print("Epoch %s ..." % eidx)
            num = len(train)
            # A fresh random permutation of the sample indices per epoch.
            rnd = random.sample(range(num), num)

            # i -> start index of the batch, stop -> end index (exclusive)
            for i in range(0, num, batchsize):
                stop = min(i + batchsize, num)
                q = [train[rnd[z]] for z in range(i, stop)]

                # The masks for the LSTM are prepared together with the data;
                # len(x1) is the number of documents in the batch.
                x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
                freq += 1
                use_noise.set_value(1.)
                emb1, emb2 = self._embed_batch(x1, x2, len(x1))

                cst = self.f_grad_shared(emb2, mas2, emb1, mas1, y2)
                self.f_update(lrate)
                if np.mod(freq, dfreq) == 0:
                    print("Epoch  %s Update  %s Cost  %s" % (eidx, freq, cst))

            # Evaluation after the epoch.
            self._record_metrics(train, correct, test_correct, keep_ranks=True)

            # Save the present weights.
            self.save_model(name=self.model_name + "_" + str(eidx) + ".p")
            sto = time.time()
            self.time_saver = sto - sta
            print("epoch took: %s" % self.time_saver)

    def chkterr2(self, mydata):
        """Evaluate the model on `mydata` in chunks of 256 samples.

        Working in chunks is slower than a single call but avoids running out
        of memory.

        Returns:
            (mse, pearson_r, spearman_rho) between the predictions rescaled
            with `sim * 4 + 1` and the targets from prepare_data().
        """
        chunk = 256
        num = len(mydata)
        px = []  # predictions
        yx = []  # targets
        use_noise.set_value(0.)
        for i in range(0, num, chunk):
            q = [mydata[j] for j in range(i, min(i + chunk, num))]
            x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
            emb1, emb2 = self._embed_batch(x1, x2, len(q))
            pred = self.f2sim(emb1, mas1, emb2, mas2) * 4.0 + 1.0
            for z in range(len(q)):
                yx.append(y2[z])
                px.append(pred[z])
        px = np.array(px)
        yx = np.array(yx)
        return (np.mean(np.square(px - yx)),
                meas.pearsonr(px, yx)[0],
                meas.spearmanr(yx, px)[0])

    def predict_similarity(self, sa, sb):
        """Return the similarity predicted for the single pair (sa, sb)."""
        # The third element is a dummy target; it is not used by f2sim.
        q = [[sa, sb, 0]]
        x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
        use_noise.set_value(0.)
        emb1, emb2 = self._embed_batch(x1, x2, len(x1))
        return self.f2sim(emb1, mas1, emb2, mas2)

    def save_model(self, type='pikcle', name=None):
        """Pickle the current parameter values.

        Args:
            type: not used; kept (including its default) for compatibility.
            name: target file name; defaults to self.model_name.

        Side effects:
            Stores the extracted values in self.new_params and writes them to
            `name`.
        """
        if name is None:
            name = self.model_name
        print("Saving the model to %s" % name)
        self.new_params = unzip(self.tnewp)
        print("saving the model...")
        with open(name, 'wb') as handle:
            pickle.dump(self.new_params, handle)

    def evaluate(self, data):
        """Rank every true pair among all candidates using f2sim directly.

        For every sample i the x1-side item is compared with the x2-side
        items of all samples, and the rank of its real partner is recorded.

        Returns:
            List with one rank per sample (1 = most similar).
        """
        x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)
        use_noise.set_value(0.)
        n_samples = len(data)
        ls, ls2 = self._embed_lists(x1, x2, n_samples)

        # The candidate side is the same for every query, so build it once.
        emb2 = self._stack(ls2)

        rank_results = []
        for i in range(n_samples):
            # Repeat the i-th x1 item (and its mask column) n_samples times so
            # that it is compared with every candidate of the x2 side.
            # NOTE: mas1 / mas2 hold one column per sample, hence the
            # transpose.
            ref_ls = [ls[i]] * n_samples
            ref_mas1 = np.array([mas1[:, i], ] * n_samples).T
            emb1 = self._stack(ref_ls)
            pred = self.f2sim(emb1, ref_mas1, emb2, mas2)
            rank = pd.Series(pred).rank(ascending=False)[i]
            rank_results.append(rank)
            print("the round %s rank: %s" % (i, rank))
        return rank_results

    def seq2vec(self, data):
        """Project both sides of every pair in `data` with the LSTM.

        Returns:
            (list_projection1, list_projection2): outputs of f_proj11 for the
            x1 side and for the x2 side. Both sides go through the same
            compiled function (branch 1).
        """
        x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)
        use_noise.set_value(0.)
        emb1, emb2 = self._embed_batch(x1, x2, len(data))
        list_projection1 = self.f_proj11(emb1, mas1)
        list_projection2 = self.f_proj11(emb2, mas2)
        return list_projection1, list_projection2

    def evaluate2(self, data, tops):
        """Rank the true pairs of `data` and count the top-k hits.

        Args:
            data: samples to evaluate.
            tops: iterable of thresholds, e.g. [1, 5, 10].

        Returns:
            (rank_results, n_tops): the rank of every true pair and a dict
            threshold -> number of pairs ranked within that threshold.
        """
        projection1, projection2 = self.seq2vec(data)
        sim_results, rank_results = find_ranking(projection1, projection2)
        n_tops = {}
        for top in tops:
            n_tops[top] = find_top(rank_results, top)
        return rank_results, n_tops

    def get_mse(self, data):
        """Return the cost (f_cost) of the whole of `data` in a single call.

        Faster than chkterr2(), but needs enough memory to process all
        samples at once.
        """
        x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)
        use_noise.set_value(0.)
        emb1, emb2 = self._embed_batch(x1, x2, len(data))
        return self.f_cost(emb1, mas1, emb2, mas2, y2)