# https://www.tensorflow.org/tutorials/load_data/text?hl=ko
import collections
import pathlib
import re
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
dataset = utils.get_file('stack_overflow_16k.tar.gz', data_url, untar = True,
                         cache_dir = 'stack_overflow', cache_subdir = '')
dataset_dir = pathlib.Path(dataset).parent
print(list(dataset_dir.iterdir()))  # added print() to display the directory contents
train_dir = dataset_dir/'train'
print(list(train_dir.iterdir()))
sample_file = train_dir/'python/1755.txt'
with open(sample_file) as f:
    print(f.read())
batch_size = 32
seed = 42
raw_train_ds = preprocessing.text_dataset_from_directory(train_dir, batch_size = batch_size,
                                                         validation_split = 0.2, subset = 'training', seed = seed)
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(10):
        print("Question : ", text_batch.numpy()[i][:100], '...')
        print("Label : ", label_batch.numpy()[i])
for i, label in enumerate(raw_train_ds.class_names):
    print("Label", i, "corresponds to ", label)
raw_val_ds = preprocessing.text_dataset_from_directory(train_dir, batch_size = batch_size,
                                                       validation_split = 0.2, subset = 'validation', seed = seed)
test_dir = dataset_dir/'test'
raw_test_ds = preprocessing.text_dataset_from_directory(test_dir, batch_size = batch_size)
VOCAB_SIZE = 10000
binary_vectorize_layer = TextVectorization(max_tokens = VOCAB_SIZE, output_mode = 'binary')
MAX_SEQUENCE_LENGTH = 250
int_vectorize_layer = TextVectorization(max_tokens = VOCAB_SIZE, output_mode = 'int',
                                        output_sequence_length = MAX_SEQUENCE_LENGTH)
# Call adapt to fit the state of the preprocessing layers to the dataset;
# this builds an index mapping strings to integers.
train_text = raw_train_ds.map(lambda text, labels: text)
binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)
def binary_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return binary_vectorize_layer(text), label
def int_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return int_vectorize_layer(text), label
# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", first_label)
print("'binary' vectorized question:", binary_vectorize_text(first_question, first_label)[0])
print("'int' vectorized question:", int_vectorize_text(first_question, first_label)[0])
print("1289 ---> ", int_vectorize_layer.get_vocabulary()[1289])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[313])
print("Vocabulary size : {}".format(len(int_vectorize_layer.get_vocabulary())))
binary_train_ds = raw_train_ds.map(binary_vectorize_text)
binary_val_ds = raw_val_ds.map(binary_vectorize_text)
binary_test_ds = raw_test_ds.map(binary_vectorize_text)
int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)
# Configure the datasets for performance
AUTOTUNE = tf.data.experimental.AUTOTUNE
def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size = AUTOTUNE)
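# Added here for completeness: the upstream tutorial also applies
# configure_dataset() to the vectorized datasets before training.
# cache() + prefetch() only affect input-pipeline performance, not results.
binary_train_ds = configure_dataset(binary_train_ds)
binary_val_ds = configure_dataset(binary_val_ds)
binary_test_ds = configure_dataset(binary_test_ds)
int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)
int_test_ds = configure_dataset(int_test_ds)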
# Train the models
binary_model = tf.keras.Sequential([layers.Dense(4)])
binary_model.compile(loss = losses.SparseCategoricalCrossentropy(from_logits = True),
                     optimizer = 'adam', metrics = ['accuracy'])
history = binary_model.fit(binary_train_ds, validation_data = binary_val_ds, epochs = 10)
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero = True),
        layers.Conv1D(64, 5, padding = "valid", activation = 'relu', strides = 2),
        layers.GlobalMaxPooling1D(),
        layers.Dense(num_labels)
    ])
    return model
# vocab_size is VOCAB_SIZE + 1 since 0 is used additionally for padding.
int_model = create_model(vocab_size = VOCAB_SIZE + 1, num_labels = 4)
int_model.compile(loss = losses.SparseCategoricalCrossentropy(from_logits = True),
                  optimizer = 'adam', metrics = ['accuracy'])
history = int_model.fit(int_train_ds, validation_data = int_val_ds, epochs = 5)
# Compare the two models
print("Linear model on binary vectorized data : ")
binary_model.summary()
print("ConvNet model on int vectorized data:")
int_model.summary()
binary_loss, binary_accuracy = binary_model.evaluate(binary_test_ds)
int_loss, int_accuracy = int_model.evaluate(int_test_ds)
print("Binary model accuracy : {:2.2%}".format(binary_accuracy))
print("Int model accuracy : {:2.2%}".format(int_accuracy))
# Export the model
export_model = tf.keras.Sequential([binary_vectorize_layer, binary_model,
                                    layers.Activation('sigmoid')])
export_model.compile(loss = losses.SparseCategoricalCrossentropy(from_logits = False),
                     optimizer = 'adam', metrics = ['accuracy'])
# Test it with 'raw_test_ds', which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print("Accuracy: {:2.2%}".format(binary_accuracy))
def get_string_labels(predicted_scores_batch):
    predicted_int_labels = tf.argmax(predicted_scores_batch, axis = 1)
    predicted_labels = tf.gather(raw_train_ds.class_names, predicted_int_labels)
    return predicted_labels
inputs = ["how do I extract keys from a dict into a list?",  # python
          "debug public static void main(string[] args) {...}",  # java
          ]
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)
for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label: ", label.numpy())
# Example 2: Predict the author of Iliad translations
# Download the dataset
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']
for name in FILE_NAMES:
    text_dir = utils.get_file(name, origin = DIRECTORY_URL + name)
parent_dir = pathlib.Path(text_dir).parent
list(parent_dir.iterdir())
# Load the dataset
def labeler(example, index):
    return example, tf.cast(index, tf.int64)
labeled_data_sets = []
for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(str(parent_dir/file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)
BUFFER_SIZE = 50000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE, reshuffle_each_iteration = False)
for text, label in all_labeled_data.take(10):
    print("Sentence: ", text.numpy())
    print("Label: ", label.numpy())
# Prepare the dataset for training
tokenizer = tf_text.UnicodeScriptTokenizer()
def tokenize(text, unused_label):
    lower_case = tf_text.case_fold_utf8(text)
    return tokenizer.tokenize(lower_case)
tokenized_ds = all_labeled_data.map(tokenize)
for text_batch in tokenized_ds.take(5):
    print("Tokens: ", text_batch.numpy())
tokenized_ds = configure_dataset(tokenized_ds)
vocab_dict = collections.defaultdict(lambda: 0)
for toks in tokenized_ds.as_numpy_iterator():
    for tok in toks:
        vocab_dict[tok] += 1
vocab = sorted(vocab_dict.items(), key = lambda x: x[1], reverse = True)
vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])
keys = vocab
values = range(2, len(vocab) + 2) # reserve 0 for padding, 1 for OOV(Out-Of-Vocabulary)
init = tf.lookup.KeyValueTensorInitializer(keys, values, key_dtype = tf.string,
                                           value_dtype = tf.int64)
num_oov_buckets = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)
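# Quick sanity check (added here, not part of the tutorial; the example tokens
# below are arbitrary): frequent in-vocabulary tokens should map to small ids
# starting at 2, while an unseen token falls into the single OOV bucket.
example_tokens = tf.constant([b'the', b'of', b'zzz_token_not_in_vocab'])
print("Example token ids: ", vocab_table.lookup(example_tokens).numpy())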
def preprocess_text(text, label):
    standardized = tf_text.case_fold_utf8(text)
    tokenized = tokenizer.tokenize(standardized)
    vectorized = vocab_table.lookup(tokenized)
    return vectorized, label
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())
all_encoded_data = all_labeled_data.map(preprocess_text)
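# Optional inspection (added here, not part of the tutorial): each element is now
# a variable-length vector of token ids plus an int64 label, which is why
# padded_batch is used below.
print(all_encoded_data.element_spec)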
# Split the dataset into training and validation sets
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example:", sample_labels[0])
vocab_size += 2
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)
# Train the model
model = create_model(vocab_size = vocab_size, num_labels = 3)
model.compile(optimizer = 'adam', loss = losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics = ['accuracy'])
history = model.fit(train_data, validation_data = validation_data, epochs = 3)
loss, accuracy = model.evaluate(validation_data)
print("Loss : ", loss)
print("Accuracy : {:2.2%}".format(accuracy))
# Export the model
preprocess_layer = TextVectorization(max_tokens = vocab_size, standardize = tf_text.case_fold_utf8,
                                     split = tokenizer.tokenize, output_mode = 'int',
                                     output_sequence_length = MAX_SEQUENCE_LENGTH)
preprocess_layer.set_vocabulary(vocab)
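# Optional sanity check (added here, not part of the tutorial): running the
# exported preprocessing layer on the earlier sample sentence should give
# in-vocabulary ids >= 2, with id 1 reserved for OOV tokens and each row
# padded/truncated to MAX_SEQUENCE_LENGTH.
print("Exported preprocessing of sample: ",
      preprocess_layer(tf.expand_dims(example_text, -1))[0][:10].numpy())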
export_model = tf.keras.Sequential([preprocess_layer, model, layers.Activation('sigmoid')])
export_model.compile(loss = losses.SparseCategoricalCrossentropy(from_logits = False),
                     optimizer = 'adam', metrics = ['accuracy'])
# Create a test dataset of raw strings
test_ds = all_labeled_data.take(VALIDATION_SIZE).batch(BATCH_SIZE)
test_ds = configure_dataset(test_ds)
loss, accuracy = export_model.evaluate(test_ds)
print("Loss : ", loss)
print("Accuracy : {:2.2%}".format(accuracy))
# Run inference on new data
inputs = ["Join'd to th' Ionians with their flowing robes,",  # Label 1
          "the allies, and his armour flashed about him so that he seemed to all",  # Label 2
          "And with loud clangor of his arms he fell",  # Label 0
          ]
predicted_scores = export_model.predict(inputs)
predicted_labels = tf.argmax(predicted_scores, axis = 1)
for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label : ", label.numpy())
# Download more datasets using TensorFlow Datasets (TFDS)
# Use most of the 'train' split for training and hold out the rest for validation.
train_ds = tfds.load('imdb_reviews', split = 'train[:80%]', batch_size = BATCH_SIZE, shuffle_files = True,
                     as_supervised = True)
val_ds = tfds.load('imdb_reviews', split = 'train[80%:]', batch_size = BATCH_SIZE, shuffle_files = True,
                   as_supervised = True)
for review_batch, label_batch in val_ds.take(1):
    for i in range(5):
        print("Review : ", review_batch[i].numpy())
        print("Label : ", label_batch[i].numpy())
# Prepare the dataset for training
vectorize_layer = TextVectorization(max_tokens = VOCAB_SIZE, output_mode = 'int',
                                    output_sequence_length = MAX_SEQUENCE_LENGTH)
# Make a text-only dataset (without labels), then call adapt
train_text = train_ds.map(lambda text, labels: text)
vectorize_layer.adapt(train_text)
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)
# Configure datasets for performance as before
train_ds = configure_dataset(train_ds)
val_ds = configure_dataset(val_ds)
# Train the model
model = create_model(vocab_size = VOCAB_SIZE + 1, num_labels = 1)
model.summary()
model.compile(loss = losses.BinaryCrossentropy(from_logits = True), optimizer = 'adam',
              metrics = ['accuracy'])
history = model.fit(train_ds, validation_data = val_ds, epochs = 3)
loss, accuracy = model.evaluate(val_ds)
print("Loss : ", loss)
print("Accuracy : {:2.2%}".format(accuracy))
# Export the model
export_model = tf.keras.Sequential([vectorize_layer, model, layers.Activation('sigmoid')])
# This model has a single sigmoid output, so binary (not sparse categorical) crossentropy is the matching loss.
export_model.compile(loss = losses.BinaryCrossentropy(from_logits = False),
                     optimizer = 'adam', metrics = ['accuracy'])
# 0 --> negative review
# 1 --> positive review
inputs = [
    "This is a fantastic movie.",
    "This is a bad movie.",
    "This movie was so bad that it was good.",
    "I will never say yes to watching this movie.",
]
predicted_scores = export_model.predict(inputs)
predicted_labels = [int(round(x[0])) for x in predicted_scores]
for input, label in zip(inputs, predicted_labels):
    print("Question : ", input)
    print("Predicted label : ", label)