# bpe_util.py
import json
import os
import shutil
import subprocess
import sys
from collections import OrderedDict
def learn_bpe(num_operations, vocabulary_threshold, bpe_model_overwrite=False):
    """Build the joint zh/ja vocabulary from the BPE-encoded training corpora.

    Only the "build dictionary" stage is active: ``./text/zh_train.bpe`` and
    ``./text/ja_train.bpe`` are concatenated into ``./text/zh_ja_train.bpe``,
    and ``build_vocab.py`` is then run with that file and
    ``./bpe_model/vocab_{num_operations}_zh_ja`` as its two arguments.

    The learn-BPE/apply-BPE stage and the JSON dictionary merge are disabled
    and kept below as comments for reference.

    Args:
        num_operations: Number of BPE merge operations; used here only to
            name the vocabulary path handed to ``build_vocab.py``.
        vocabulary_threshold: Only referenced by the disabled apply-BPE
            stage; currently unused.
        bpe_model_overwrite: Only referenced by the disabled learn-BPE
            stage; currently unused.

    Raises:
        OSError: If an input corpus cannot be read or the combined corpus
            cannot be written.
        subprocess.CalledProcessError: If ``build_vocab.py`` exits with a
            non-zero status.
    """
    # --- Disabled stage: learn the BPE model and apply it to the corpora ---
    # print("Learn BPE")
    # if not os.path.exists("./bpe_model/"):
    #     os.makedirs("bpe_model")
    # if os.path.exists(f"./bpe_model/zh_ja_{num_operations}_bpe.model") and not bpe_model_overwrite:
    #     print(f"Found ./bpe_model/zh_ja_{num_operations}_bpe.model")
    # else:
    #     os.system(f"subword-nmt learn-bpe -v -s {num_operations} < ./text/bpe_train.txt > ./bpe_model/zh_ja_{num_operations}_bpe.model")
    # print("Apply BPE")
    # if vocabulary_threshold is None:
    #     os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model < ./text/zh_segment.txt > ./text/zh_train.bpe")
    #     os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model < ./text/ja_segment.txt > ./text/ja_train.bpe")
    #     #os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model < ./text/zh_segment_test.txt > ./text/zh_test.bpe")
    #     os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model < ./text/ja_segment_test.txt > ./text/ja_test.bpe")
    # else:
    #     os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model --vocabulary-threshold {vocabulary_threshold} < ./text/zh_segment.txt > ./text/zh_train.bpe")
    #     os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model --vocabulary-threshold {vocabulary_threshold} < ./text/ja_segment.txt > ./text/ja_train.bpe")
    #     #os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model --vocabulary-threshold {vocabulary_threshold} < ./text/zh_segment_test.txt > ./text/zh_test.bpe")
    #     os.system(f"subword-nmt apply-bpe -c ./bpe_model/zh_ja_{num_operations}_bpe.model --vocabulary-threshold {vocabulary_threshold} < ./text/ja_segment_test.txt > ./text/ja_test.bpe")

    print("Build dictionary")
    #os.system(f"python build_dictionary.py ./text/zh_train.bpe ./text/ja_train.bpe")

    # Rebuild the combined corpus from scratch on every call. Appending to an
    # existing file (the former `cat ... >>`) duplicated the data each time
    # the script was re-run. The inputs are opened before the output so a
    # missing corpus fails without truncating a previous combined file.
    # Copying raw bytes keeps the exact concatenation semantics of `cat`.
    with open("./text/zh_train.bpe", "rb") as zh_file, \
            open("./text/ja_train.bpe", "rb") as ja_file, \
            open("./text/zh_ja_train.bpe", "wb") as combined:
        shutil.copyfileobj(zh_file, combined)
        shutil.copyfileobj(ja_file, combined)

    # The vocabulary path lives under ./bpe_model; make sure it exists since
    # the stage that used to create it is disabled.
    os.makedirs("./bpe_model", exist_ok=True)

    # Argument list instead of a shell string (no quoting/injection issues),
    # the running interpreter instead of whatever `python` is on PATH, and
    # check=True so a failing build_vocab.py is not silently ignored.
    subprocess.run(
        [
            sys.executable,
            "build_vocab.py",
            "./text/zh_ja_train.bpe",
            f"./bpe_model/vocab_{num_operations}_zh_ja",
        ],
        check=True,
    )

    # --- Disabled stage: merge per-language JSON dictionaries ---
    # print("Build zh_ja dictionary")
    # zh_bpe_json = json.load(open('./text/zh_train.bpe.json', 'r', encoding="utf-8"))
    # ja_bpe_json = json.load(open('./text/ja_train.bpe.json', 'r', encoding="utf-8"))
    # zh_dict = [word for word, index in zh_bpe_json.items()]
    # ja_dict = [word for word, index in ja_bpe_json.items()]
    # zh_ja_dict = set(zh_dict + ja_dict)
    # zh_ja_dict = zh_ja_dict - set(['<EOS>', '<GO>', '<UNK>'])
    # worddict = OrderedDict()
    # worddict['<EOS>'] = 0
    # worddict['<GO>'] = 1
    # worddict['<UNK>'] = 2
    # # FIXME We shouldn't assume <EOS>, <GO>, and <UNK> aren't BPE subwords.
    # for ii, ww in enumerate(zh_ja_dict):
    #     worddict[ww] = ii + 3
    # # Save word2id, id2word as json
    # word2id = {word: index for index, word in enumerate(worddict)}
    # id2word = {index: word for index, word in enumerate(worddict)}
    # with open('./text/word2id.json', 'w', encoding='utf-8') as f:
    #     json.dump(word2id, f, indent=2, ensure_ascii=False)
    # with open('./text/id2word.json', 'w', encoding='utf-8') as f:
    #     json.dump(id2word, f, indent=2, ensure_ascii=False)
def main(num_operations=50000, vocabulary_threshold=None):
    """Assemble the BPE training text and hand off to ``learn_bpe``.

    Reads every line of ``./text/zh_segment.txt`` followed by every line of
    ``./text/ja_segment.txt``, strips surrounding whitespace from each line,
    and writes them (Chinese first, then Japanese, one per line) to
    ``./text/bpe_train.txt``. ``learn_bpe`` is then called with
    ``bpe_model_overwrite=True``.

    Args:
        num_operations: Forwarded unchanged to ``learn_bpe``.
        vocabulary_threshold: Forwarded unchanged to ``learn_bpe``.
    """
    # Load both corpora completely before the output file is opened, so a
    # missing input fails before bpe_train.txt is created or truncated.
    corpus = []
    for source_path in ("./text/zh_segment.txt", "./text/ja_segment.txt"):
        with open(source_path, "r", encoding="utf-8") as src:
            corpus.extend(src.readlines())

    with open("./text/bpe_train.txt", "w", encoding="utf-8") as sink:
        sink.writelines(raw.strip() + "\n" for raw in corpus)

    learn_bpe(num_operations, vocabulary_threshold, bpe_model_overwrite=True)
# Run the pipeline with its default settings only when this file is executed
# as a script, not when it is imported.
if __name__ == "__main__":
    main()