final4_swahili.py
# -*- coding: utf-8 -*-
"""final4_swahili.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1qN1KWrh24ZCv-v5j3fMaPrB00if-O7BY
Adapted from https://keras.io/examples/generative/lstm_character_level_text_generation/
"""
import random

import numpy as np
import tensorflow
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.models import Sequential
path = "/content/sw-train.txt"
with open(path, "r", encoding="utf-8") as file_content:
    content = file_content.read()

# Keep only the first 4M characters to bound memory usage.
text = content[:4000000]
print("Corpus length:", len(text))
characters = sorted(set(text))
print("Total characters:", len(characters))
char_indices = {c: i for i, c in enumerate(characters)}
indices_char = {i: c for i, c in enumerate(characters)}
# Cut the text into semi-redundant sequences of maxlen characters.
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))
# One-hot encode inputs and targets. The np.bool alias was removed in
# NumPy 1.24+, so use the builtin bool as the dtype.
x = np.zeros((len(sentences), maxlen, len(characters)), dtype=bool)
y = np.zeros((len(sentences), len(characters)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
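# Optional sanity check (not in the original script): every timestep of every
# input sequence should have exactly one active character, as should every
# one-hot target row.
assert (x.sum(axis=-1) == 1).all()
assert (y.sum(axis=-1) == 1).all()
print("x shape:", x.shape, "y shape:", y.shape)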
model = Sequential(
    [
        Input(shape=(maxlen, len(characters))),
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(128),
        Dense(len(characters), activation="softmax"),
    ]
)
model.compile(loss="categorical_crossentropy", optimizer="adam")
model.summary()
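# EarlyStopping is imported above but never used. A minimal sketch of how it
# could replace the fixed-epoch loop below (an assumption, not part of the
# original training setup; it stops once training loss plateaus):
#
#   early_stop = EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)
#   model.fit(x, y, batch_size=4096, epochs=40, callbacks=[early_stop])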
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, rescaled by temperature."""
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
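# Illustration (not in the original script): low temperature sharpens the
# distribution toward the most likely character, high temperature flattens it.
# With the toy distribution below, temperature 0.2 almost always returns
# index 1, while temperature 1.5 spreads picks across all three indices.
demo_preds = np.array([0.1, 0.6, 0.3])
print([sample(demo_preds, 0.2) for _ in range(5)])
print([sample(demo_preds, 1.5) for _ in range(5)])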
epochs = 40
batch_size = 4096
for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(characters)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Note: the model was already trained on all of x, so this split does not give
# a held-out test set; the score below is effectively training accuracy.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
y_pred = model.predict(X_test, verbose=0)

# Compare the predicted next character (argmax over the softmax output)
# against the true one-hot target. This replaces the original element-wise
# thresholding loop, which could mark several positions as 1 on ties.
print(accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1)))