-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathtest_bilm.py
82 lines (72 loc) · 2.82 KB
/
test_bilm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import tensorflow as tf
import os
from bilm import Batcher, BidirectionalLanguageModel, weight_layers
def set_cuda_visible_devices(is_train, device_id='2'):
    """Restrict CUDA to the given GPU(s) via ``CUDA_VISIBLE_DEVICES``.

    Args:
        is_train: When truthy, additionally print the list of local
            devices reported by TensorFlow's ``device_lib``.
        device_id: Value written to ``CUDA_VISIBLE_DEVICES``. Anything
            that is not already a string (e.g. an int) is converted with
            ``str``. Defaults to ``'2'``, the value that used to be
            hard-coded here, so existing callers are unaffected.

    Returns:
        Always ``True``.
    """
    # NOTE(review): presumably this has to run before TensorFlow first
    # initializes its GPU devices for the setting to take effect - confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    if is_train:
        # Imported lazily: only needed for the optional device listing.
        from tensorflow.python.client import device_lib
        print(device_lib.list_local_devices())
    return True
# Select the GPU and print the devices TensorFlow can see.
set_cuda_visible_devices(True)

# ---------------------------------------------------------------------------
# Load resources
# ---------------------------------------------------------------------------
# Directory that holds the pretrained ELMo vocabulary, options and weights.
datadir = os.path.join('embeddings')
vocab_file, options_file, weight_file = (
    os.path.join(datadir, filename)
    for filename in (
        'elmo_vocab.txt',
        'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
        'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5',
    )
)

# The Batcher maps tokenized text to character ids; 50 is the same
# word_length used for the last axis of the input placeholder.
batcher = Batcher(vocab_file, 50)

# Build the biLM graph from the pretrained options and weights.
bilm = BidirectionalLanguageModel(options_file, weight_file)
# ---------------------------------------------------------------------------
# Build graph
# ---------------------------------------------------------------------------
# Character-id input to the biLM; the last axis is word_length = 50.
char_ids_shape = (None, None, 50)
question_character_ids = tf.placeholder('int32', shape=char_ids_shape)

# Ops that compute the LM embeddings for the placeholder input.
question_embeddings_op = bilm(question_character_ids)

# ELMo is a weighted average of the internal biLM layers. Two separate
# weightings are created, in this order: 'input' first, then 'output'.
elmo_question_input, elmo_question_output = (
    weight_layers(layer_name, question_embeddings_op, l2_coef=0.0)
    for layer_name in ('input', 'output')
)
print(elmo_question_input['weighted_op'].get_shape())
# ---------------------------------------------------------------------------
# Prepare input
# ---------------------------------------------------------------------------
tokenized_question = [['What', 'are', 'biLMs', 'useful', 'for', '?']]

# Character ids for the batch: (batch_size, sentence_length, word_length).
# Converted to nested lists so that padding rows can be appended below.
question_ids = batcher.batch_sentences(tokenized_question).tolist()
print('length = ', len(question_ids[0]))
print(question_ids)

# Pad the first sentence with all-zero rows up to max_sentence_length + 2.
# NOTE(review): the extra 2 presumably accounts for begin/end-of-sentence
# entries added by the Batcher - confirm against bilm.
max_sentence_length = 10
first_sentence = question_ids[0]
padding_rows = max_sentence_length - len(first_sentence) + 2
first_sentence.extend([0] * 50 for _ in range(padding_rows))
print('length = ', len(question_ids[0]))
print(question_ids)
# ---------------------------------------------------------------------------
# Compute ELMo embedding
# ---------------------------------------------------------------------------
with tf.Session() as sess:
    # Variables must be initialized once before running inference.
    sess.run(tf.global_variables_initializer())

    # Evaluate the ELMo representation (input side only, for simplicity).
    # The fetch is a one-element list, so the result is a one-element list
    # whose item is (batch_size, sentence_length, model_dim).
    fetches = [elmo_question_input['weighted_op']]
    feeds = {question_character_ids: question_ids}
    elmo_question_input_ = sess.run(fetches, feed_dict=feeds)
    print(elmo_question_input_)

    # Check padding: print each position of the first sentence with the
    # length of its vector and the vector itself.
    for position, vector in enumerate(elmo_question_input_[0][0]):
        print(position, len(vector), vector)
##### General usage #####
"""
1. Start from 'tokenized_question', the real input text.
2. Compute elmo_question_input_.
3. Concatenate the GloVe embedding with elmo_question_input_.
4. Compute the contextual encoding (via an LSTM or a Transformer encoder).
5. Concatenate the contextual encoding with elmo_question_output_.
"""