-
Notifications
You must be signed in to change notification settings - Fork 2
/
clean_data.py
33 lines (29 loc) · 1.06 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from src.pci_crackdown_functions import *
# Data-preparation driver: one proc_embedding call, then one proc_data call
# per corpus. The calls run in this order: embeddings, Tiananmen, HK2014,
# HK2019.

# Locations shared by several of the calls below.
_embedding_dir = "data/output/embeddings"
_embedding_pkl = 'data/output/embeddings/embedding.pkl'
_tokenizer_pkl = 'data/output/embeddings/tokenizer.pkl'
_output_dir = 'data/output/'

# NOTE(review): presumably this step produces the embedding.pkl and
# tokenizer.pkl files that the proc_data calls read from the same
# directory -- confirm against src.pci_crackdown_functions.
proc_embedding(
    input_file="data/input/sgns_renmin_Word+Character+Ngram/sgns_renmin_Word+Character+Ngram.txt",
    output_path=_embedding_dir,
)

# One row per corpus: (input pickle, create_training_sample flag, output name).
# Only the first corpus is processed with create_training_sample = 1.
_corpora = (
    ('data/input/Tiananmen_sentences.pkl', 1, "tam.pkl"),
    ('data/input/HK2014_sentences.pkl', 0, "prediction_data_HK2014.pkl"),
    ('data/input/HK2019_sentences.pkl', 0, "prediction_data_HK2019.pkl"),
)

for _sentences_pkl, _training_flag, _result_name in _corpora:
    proc_data(
        data_path=_sentences_pkl,
        embedding_path=_embedding_pkl,
        tokenizer_path=_tokenizer_pkl,
        create_training_sample=_training_flag,
        output_path=_output_dir,
        filename=_result_name,
    )