forked from rajin/weather_prediction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_manipulation.py
156 lines (124 loc) · 6.14 KB
/
data_manipulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
# coding: utf-8
# #### Data Manipulation
import datetime
from datetime import datetime
import h5py
import numpy as np
#### Function to just give some printing on the screen during conversion of file formats
def fn_print(string):
    """Emit a timestamped progress line of the form `-- <msg> : HH:MM:SS --`."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print("\n-- ", string, ": ", timestamp, "--")
def fn_load_data(path_in, no_import, start_line):
    """Load a contiguous slice of samples from a radar text file.

    Each line of the file is expected as ``<id>,<label>,<v1 v2 v3 ...>``,
    where the space-separated integer block flattens one sample of shape
    (T=15, H=4, Y=101, X=101) = 612,060 values.

    Parameters:
        path_in: path to the comma/space formatted text file.
        no_import: number of lines (samples) to read.
        start_line: 0-based index of the first line to read.

    Returns:
        (np_id, np_label, np_data) where np_id is an array of id strings,
        np_label an array of float labels, and np_data has shape
        (no_import, 15, 4, 101, 101).
    """
    id_no = []
    label = []
    data = []
    end_line = start_line + no_import  # hoisted loop-invariant bound
    # scan through file line by line
    with open(path_in) as infile:
        for i, line in enumerate(infile):
            if i % 500 == 0:
                fn_print("considering line:" + str(i))
            if i < start_line:
                continue
            if i >= end_line:
                break
            temp = line.split(",")
            id_no.append(str(temp[0]))
            label.append(float(temp[1]))
            # map(int, ...) is faster than a Python-level comprehension here
            data.append(list(map(int, temp[2].split(" "))))
    # convert to numpy arrays
    np_id = np.array(id_no)
    np_label = np.array(label)
    np_data = np.array(data)
    # clear memory before the big reshape to keep peak RSS down
    del id_no, label, data
    # single reshape suffices (the original flattened first, then reshaped);
    # shape is passed positionally because the `newshape=` keyword is
    # deprecated in modern NumPy
    T, H, Y, X = 15, 4, 101, 101
    np_data = np.reshape(np_data, (-1, T, H, Y, X), order='C')
    return np_id, np_label, np_data
## Function to append to an HDF5 file, since keeping everything in memory would probably freeze the computer.
def fn_h5_append(h5f_name, name_in, data_in):
h5f = h5py.File(h5f_name, 'a')
h5f.create_dataset(name_in, data=data_in)
h5f.close()
# We can now convert all the files to HDF5 using the functions written above.
########## converting testA to HDF5 using the functions given above
step_size = 500
path_in = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/testA.txt'
h5f_name = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/data_testA.h5'
# start a brand-new HDF5 file holding an (initially empty) 'test_A' group
h5f = h5py.File(h5f_name, 'w')
grp = h5f.create_group('test_A')
h5f.close()
# walk the text file in chunks of step_size lines; each chunk is stored as
# its own trio of datasets (ids / labels / data) under the test_A group
for i in np.arange(0, 2000, step_size):
    fn_print("convert to h5, outter loop:" + str(i))
    np_train_id, np_train_label, np_train_data = fn_load_data(path_in, start_line=i, no_import=step_size)
    chunk_tag = str(i) + "_to_" + str(i + step_size - 1)
    # HDF5 cannot store Python unicode ids directly, so encode to ascii bytes
    ascii_id = [name.encode("ascii", "ignore") for name in np_train_id.tolist()]
    fn_h5_append(h5f_name, '/test_A/test_id_' + chunk_tag, ascii_id)
    fn_h5_append(h5f_name, '/test_A/test_label_' + chunk_tag, np_train_label)
    fn_h5_append(h5f_name, '/test_A/test_data_' + chunk_tag, np_train_data)
    del np_train_id, np_train_label, np_train_data
########## converting testB to HDF5
path_in = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/testB.txt'
h5f_name = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/data_testB.h5'
# fresh HDF5 file with an (initially empty) 'test_B' group
h5f = h5py.File(h5f_name, 'w')
grp = h5f.create_group('test_B')
h5f.close()
# same chunked conversion as testA, targeting the test_B group
for i in np.arange(0, 2000, step_size):
    fn_print("convert to h5, outter loop:" + str(i))
    np_train_id, np_train_label, np_train_data = fn_load_data(path_in, start_line=i, no_import=step_size)
    chunk_tag = str(i) + "_to_" + str(i + step_size - 1)
    # encode ids to ascii bytes for HDF5 storage
    ascii_id = [name.encode("ascii", "ignore") for name in np_train_id.tolist()]
    fn_h5_append(h5f_name, '/test_B/test_id_' + chunk_tag, ascii_id)
    fn_h5_append(h5f_name, '/test_B/test_label_' + chunk_tag, np_train_label)
    fn_h5_append(h5f_name, '/test_B/test_data_' + chunk_tag, np_train_data)
    del np_train_id, np_train_label, np_train_data
########## converting train set to HDF5 using only 4,000 samples instead of the total 10,000
step_size = 500
path_in = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/train.txt'
h5f_name = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/train.h5'
# fresh HDF5 file with an (initially empty) 'train' group
h5f = h5py.File(h5f_name, 'w')
grp = h5f.create_group('train')
h5f.close()
# chunked conversion of the first 4,000 training lines into the train group
for i in np.arange(0, 4000, step_size):
    fn_print("convert to h5, outter loop:" + str(i))
    np_train_id, np_train_label, np_train_data = fn_load_data(path_in, start_line=i, no_import=step_size)
    chunk_tag = str(i) + "_to_" + str(i + step_size - 1)
    # encode ids to ascii bytes for HDF5 storage
    ascii_id = [name.encode("ascii", "ignore") for name in np_train_id.tolist()]
    fn_h5_append(h5f_name, '/train/train_id_' + chunk_tag, ascii_id)
    fn_h5_append(h5f_name, '/train/train_label_' + chunk_tag, np_train_label)
    fn_h5_append(h5f_name, '/train/train_data_' + chunk_tag, np_train_data)
    del np_train_id, np_train_label, np_train_data
############# creating a validation set from the training set
#### Uses rows 5000-5499 of train.txt, a 500-sample sequential slice that does
#### not overlap with the 0-3999 slice used for the reduced training set.
path_in = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/train.txt'
h5f_name = '/047efbea-741c-4d9b-90a7-128e39c9b91e1/data_new/val.h5'
h5f = h5py.File(h5f_name, 'w')
grp = h5f.create_group('val')
h5f.close()
step_size = 500
for i in np.arange(5000, 5500, step_size):
    fn_print(("convert to h5, outter loop:" + str(i)))
    np_train_id, np_train_label, np_train_data = fn_load_data(path_in, start_line=i, no_import=step_size)
    # BUG FIX: datasets were previously written under '/train/...' although
    # the group created above is 'val' — h5py silently auto-created a /train
    # group and left 'val' empty. Write into '/val/...' instead.
    filename_id = ('/val/val_id_' + str(i) + "_to_" + str(i + step_size - 1))
    filename_label = ('/val/val_label_' + str(i) + "_to_" + str(i + step_size - 1))
    filename_data = ('/val/val_data_' + str(i) + "_to_" + str(i + step_size - 1))
    # encode ids to ascii bytes for HDF5 storage
    ascii_id = [n.encode("ascii", "ignore") for n in np_train_id.tolist()]
    fn_h5_append(h5f_name, filename_id, ascii_id)
    fn_h5_append(h5f_name, filename_label, np_train_label)
    fn_h5_append(h5f_name, filename_data, np_train_data)
    del np_train_id, np_train_label, np_train_data
# We now have the data in the format we wanted, we can focus on some data preparation and the modelling part of the work.