-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_csv.py
executable file
·79 lines (70 loc) · 1.94 KB
/
create_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python3
import random
import os
# NOTE(review): MIN_NUM and MAX_NUM are not referenced by any code visible
# in this file -- confirm they are unused before removing them.
MIN_NUM = 1
MAX_NUM = 15
# Source text file that main() reads and splits into sentences.
INPUT = "input.txt"
# NOTE(review): OUTPUT is not referenced by any code visible in this file.
OUTPUT = "output.txt"
# Final result: main() writes the shuffled sentence lines here.
MIXED = "russian_corpus.csv"
# Intermediate file: main() appends one line per sentence to it, filters and
# shuffles it into MIXED, then deletes it.
OUT_CSV = "output.csv"
def read_data(file):
    """Return the entire contents of *file* decoded as UTF-8.

    Args:
        file: Path of the text file to read.

    Returns:
        The whole file as a single ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(file, "r", encoding="utf-8") as f:
        return f.read()
def check_dots(data):
    """Locate the first period in *data*.

    Args:
        data: Text to search.

    Returns:
        The zero-based index of the first ``'.'`` in *data*, or ``-1`` when
        *data* contains no period.
    """
    first_dot = data.find('.')
    return int(first_dot)
def write_to_file(file, data):
    """Append *data* to *file* and report whether anything was written.

    The file is opened in append mode, so it is created when missing and
    existing content is preserved.

    Args:
        file: Path of the file to append to.
        data: Text to append.

    Returns:
        ``1`` if at least one character was written, ``0`` otherwise
        (which happens when *data* is empty).

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # NOTE(review): the file is opened with the platform default encoding,
    # whereas read_data() decodes UTF-8 explicitly -- confirm this is intended.
    # The context manager closes the handle even if write() raises.
    with open(file, "a") as f:
        written = f.write(data)
    return 1 if written > 0 else 0
def remove_empty_lines(input, output):
    """Copy *input* to *output*, dropping lines that contain only whitespace.

    *input* is read completely before *output* is opened for writing, so the
    two paths may refer to the same file (in-place filtering).

    Args:
        input: Path of the file to read.
        output: Path of the file to (over)write with the surviving lines.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Both handles are managed by ``with`` so they are closed on any error.
    with open(input, "r") as fh:
        keep = [line for line in fh if not line.isspace()]
    with open(output, "w") as fh:
        fh.writelines(keep)
def randomize_lines_in_file(inputfile, outputfile):
    """Write the lines of *inputfile* to *outputfile* in random order.

    All lines are read into memory before the output is opened, so the two
    paths may refer to the same file.

    Args:
        inputfile: Path of the file whose lines are shuffled.
        outputfile: Path of the file to (over)write with the shuffled lines.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(inputfile, 'r') as source:
        lines = source.readlines()

    # Only the final line can lack a terminator.  Without this fix-up it
    # would be glued to whichever line ends up after it once shuffled.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    random.shuffle(lines)

    with open(outputfile, 'w') as target:
        target.writelines(lines)
def replace_symbols(data):
    """Normalise sentence terminators in *data* to a plain period.

    Replacements are applied in this order:

    1. ``!`` becomes ``.``
    2. the horizontal ellipsis character (U+2026) becomes ``.``
    3. the three-character sequence period-space-period collapses to one
       period
    4. ``?`` becomes ``.``

    Because step 3 runs before step 4, a period-space-period sequence that
    only arises from a converted ``?`` is not collapsed.

    Args:
        data: Text to normalise.

    Returns:
        A new string with the replacements applied.
    """
    data = data.replace('!', '.')
    # The previous code searched for the literal texts "\u0xE2", "\u0x80" and
    # "\u0xA6" (backslash included), which never occur in decoded text.
    # E2 80 A6 is the UTF-8 byte sequence of U+2026, so match that character.
    data = data.replace('\u2026', '.')
    data = data.replace('. .', '.')
    data = data.replace('?', '.')
    return data
def main():
    """Build the shuffled sentence file ``MIXED`` from the text in ``INPUT``.

    Steps:

    1. Read ``INPUT`` and normalise terminators with ``replace_symbols``.
    2. Repeatedly cut the text at the next period and append one line per
       sentence to ``OUT_CSV``: the sentence, a tab, a space, and the
       sentence length in characters.  Only spaces are stripped between
       sentences, so newlines from the input stay part of the sentence text.
    3. Drop whitespace-only lines from ``OUT_CSV``, write its lines in
       random order to ``MIXED``, and delete ``OUT_CSV``.

    Text after the last period is discarded.  If no intermediate file exists
    after step 2 (nothing was extracted), the function returns without
    touching ``MIXED``.
    """
    # Kept as a module-level global for backward compatibility; nothing in
    # the visible code reads it outside this function.
    global Data_size

    data = replace_symbols(read_data(INPUT))
    Data_size = len(data)
    while Data_size > 1:
        data = data.strip(" ")
        dot_pos = data.find('.')
        if dot_pos == -1:
            # No terminator left.  Without this guard nothing is consumed and
            # the loop would append a bogus zero-length record forever.
            break
        sentence = data[:dot_pos + 1]
        write_to_file(OUT_CSV, sentence + "\t " + str(len(sentence)) + "\n")
        data = data[dot_pos + 1:].strip(" ")
        Data_size = len(data)

    if not os.path.exists(OUT_CSV):
        # Nothing was written, so there is nothing to filter or shuffle;
        # leave any existing MIXED file untouched.
        return

    remove_empty_lines(OUT_CSV, OUT_CSV)
    randomize_lines_in_file(OUT_CSV, MIXED)
    os.remove(OUT_CSV)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()