-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreader.py
31 lines (30 loc) · 1.05 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
'''
Created on 1 aug. 2018
@author: Dragos2811
'''
import re
#print(re.split(r'(?![/.])\W', "Mr.Jones says This is@a&test example_cool man+right more/fun 43.35"))
def read_data(fname):
    '''Read a text file and split it into word and separator tokens.

    The text is split on every non-word character except '/', '.', '-'
    and '"', so strings such as "Mr.Jones", "43.35" or "a/b" stay in one
    piece. The separators themselves (spaces, newlines, '@', ...) are kept
    as individual tokens; empty strings produced by adjacent separators
    are dropped.

    Args:
        fname: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        A list of non-empty string tokens in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    '''
    with open(fname, 'r') as f:
        text = f.read()
    # Raw string: "\-" and "\W" are regex escapes, not string escapes, and a
    # plain literal triggers an invalid-escape warning on modern Python.
    # The capture group makes re.split return the separators as tokens too.
    pieces = re.split(r'((?![/.\-"])\W)', text)
    return [piece for piece in pieces if piece]
#print(len(list(set(read_data("gen_html/1.html")))))
def build_dataset(x, y):
    '''Collect the tokens of gen_html/<i>.html for every i in range(x, y).

    Args:
        x: First file index (inclusive).
        y: Last file index (exclusive).

    Returns:
        A tuple (all_words, all_data_as_array):
        all_words is the list of distinct tokens seen across all files,
        sorted so that its order is the same on every run;
        all_data_as_array is the list of every token, in reading order,
        with the files concatenated by ascending index.
        Both lists are empty when the range is empty.
    '''
    vocabulary = set()
    all_data_as_array = []
    for i in range(x, y):
        tokens = read_data("gen_html/%s.html" % (i))
        all_data_as_array.extend(tokens)
        vocabulary.update(tokens)
    # Sorting matters: plain list(set(...)) order changes between interpreter
    # runs (string hash randomization), which would make any index mapping
    # derived from this vocabulary irreproducible.
    all_words = sorted(vocabulary)
    return all_words, all_data_as_array
def create(x, y):
    '''Build the token/index lookup tables for files x (inclusive) to y (exclusive).

    Delegates to build_dataset(x, y) and indexes the vocabulary it returns
    by position.

    Returns:
        A tuple (char_to_ix, ix_to_char, vocab_size, data):
        char_to_ix maps each vocabulary entry to its position,
        ix_to_char maps each position back to its entry,
        vocab_size is the number of vocabulary entries,
        data is the token list returned by build_dataset.
        Despite the "char" naming, the entries are whatever tokens
        build_dataset yields.
    '''
    vocabulary, tokens = build_dataset(x, y)
    size = len(vocabulary)
    # Forward table: entry -> position; reverse table: position -> entry.
    entry_to_position = dict(zip(vocabulary, range(size)))
    position_to_entry = dict(enumerate(vocabulary))
    return entry_to_position, position_to_entry, size, tokens