-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocessing.py
76 lines (67 loc) · 2.23 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 29 15:16:44 2020
@author: Akshat
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Relative path to the dataset CSV.
# NOTE(review): no visible code reads this module-level name (every function
# takes its own ``fname`` argument) -- confirm before relying on or removing it.
fname = "../shell_data/clean_dataset.csv"
def flat_preprocess(fname):
    """Standardise individual rows of the dataset and save train/test/valid splits.

    Reads the CSV at ``fname``, drops rows that contain missing values,
    removes the ``original_index`` column and z-scores every remaining column
    with that column's own mean and standard deviation.

    Side effects:
        * ``std_flat.npy`` and ``mean_flat.npy`` are written to the current
          working directory (per-column statistics).
        * ``train.npy``, ``test.npy`` and ``valid.npy`` are written under
          ``../flat_preprocess/``.

    Args:
        fname: Path of the CSV file to load. It must contain an
            ``original_index`` column.

    Returns:
        None.
    """
    df = pd.read_csv(fname).dropna()
    del df["original_index"]

    # Per-column statistics, persisted so the same scaling can be re-applied.
    u = df.mean()
    s = df.std()
    np.save("std_flat.npy", s.to_numpy())
    np.save("mean_flat.npy", u.to_numpy())

    # DataFrame/Series arithmetic aligns the statistics with the columns, so
    # one vectorised expression replaces a Python-level apply over every row.
    output = (df - u) / s

    # TODO filter out all anomalies
    # First split keeps 70 % of the rows for train+valid; the second keeps
    # 70 % of that remainder for training.
    train, test = train_test_split(output, train_size=0.7)
    train, valid = train_test_split(train, train_size=0.7)

    np.save("../flat_preprocess/train.npy", train.to_numpy())
    np.save("../flat_preprocess/test.npy", test.to_numpy())
    np.save("../flat_preprocess/valid.npy", valid.to_numpy())
def getIndex(df):
    """Group row positions into runs of consecutive ``original_index`` values.

    A run continues while each row's ``original_index`` is exactly one more
    than the previous row's; any other step starts a new run.

    Args:
        df: DataFrame with a numeric ``original_index`` column.

    Returns:
        A list of lists of integer row positions (0-based, positional, not
        index labels), one inner list per run, in row order. Every row
        belongs to exactly one run, including the rows of the final run.
        An empty frame yields an empty list.
    """
    original = df["original_index"].to_numpy()
    row_count = len(original)
    if row_count == 0:
        return []

    # A new run begins at every position whose value does not follow its
    # predecessor by exactly 1. Comparing neighbours directly means the first
    # run is anchored on the first row's own value rather than on 0.
    run_starts = np.flatnonzero(np.diff(original) != 1) + 1

    # np.split returns every segment, so the trailing run is kept as well.
    return [run.tolist() for run in np.split(np.arange(row_count), run_starts)]
def temporalise(indices, time_steps):
    """Return every contiguous window of ``time_steps`` items from ``indices``.

    Windows are produced with a stride of one, in order. When ``indices`` is
    shorter than ``time_steps`` the result is an empty list.
    """
    window_count = len(indices) + 1 - time_steps
    windows = []
    for start in range(window_count):
        stop = start + time_steps
        windows.append(indices[start:stop])
    return windows
def remove_anom(df):
anoms = [10634, 36136, 57280, 57618, 60545, 63144, 118665, 128524, 131118]
predicate = lambda x, y: abs(x + 3 - y) > 4
for anom in anoms:
df = df[predicate(df["original_index"], anom)]
return df
def preprocess(fname, time_steps=10):
    """Build standardised fixed-length windows and save train/test/valid splits.

    Pipeline:
        1. Load the CSV, drop rows with missing values and remove the rows
           around known anomalies (``remove_anom``).
        2. Group the remaining rows into runs of consecutive
           ``original_index`` values (``getIndex``) and cut every run into
           overlapping windows of ``time_steps`` rows (``temporalise``), so
           no window straddles a gap.
        3. Z-score every feature column with its own mean and standard
           deviation.
        4. Stack the windows into one array of shape
           ``(n_windows, time_steps, n_features)`` and split it randomly.

    Side effects:
        * ``std.npy`` and ``mean.npy`` are written to the current working
          directory (per-column statistics).
        * ``train.npy``, ``test.npy`` and ``valid.npy`` are written under
          ``../preprocess/``.

    Args:
        fname: Path of the CSV file to load. It must contain an
            ``original_index`` column.
        time_steps: Number of consecutive rows per window (default 10).

    Returns:
        Tuple ``(u, s)`` of pandas Series holding the per-column mean and
        standard deviation used for the scaling.
    """
    df = remove_anom(pd.read_csv(fname).dropna())
    periods = getIndex(df)
    del df["original_index"]

    # Flatten the per-run window lists into a single list of windows.
    windows = [
        window
        for period in periods
        for window in temporalise(period, time_steps)
    ]

    u = df.mean()
    s = df.std()
    np.save("std.npy", s.to_numpy())
    np.save("mean.npy", u.to_numpy())

    # Vectorised column-wise standardisation (pandas aligns u and s with the
    # columns), converted once to a plain float array.
    scaled = ((df - u) / s).to_numpy(dtype=float)

    # Every window is a list of row positions, so a single integer-array
    # gather builds the (n_windows, time_steps, n_features) tensor; the
    # feature count comes from the data instead of a hard-coded constant.
    # The reshape keeps the index array two-dimensional when there are no
    # windows at all.
    positions = np.asarray(windows, dtype=int).reshape(-1, time_steps)
    arr = scaled[positions]

    # 80 % of the windows go to train+valid, then 80 % of those to training.
    train, test = train_test_split(arr, train_size=0.8)
    train, valid = train_test_split(train, train_size=0.8)

    np.save("../preprocess/train.npy", train)
    np.save("../preprocess/test.npy", test)
    np.save("../preprocess/valid.npy", valid)

    # The module-level caller unpacks two values from this function.
    return u, s
#%%
# The marker above looks like an IDE cell delimiter (e.g. Spyder) -- it is an
# ordinary comment to the interpreter.
# Module-level call: runs the windowed preprocessing on the dataset whenever
# this file is executed or imported, and unpacks the result into ``u`` and ``s``.
u, s = preprocess("../shell_data/clean_dataset.csv")