-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathOpen_conversion_data.py
78 lines (62 loc) · 2.03 KB
/
Open_conversion_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import csv
from math import sqrt
## Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty records.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of string fields produced by ``csv.reader``.
    """
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded inside quoted fields are preserved instead of being translated.
    with open(filename, 'r', newline='') as file:
        return [row for row in csv.reader(file) if row]
## Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of string values to floats, in place.

    Args:
        dataset: List of rows (mutable sequences) to modify.
        column: Index of the column to convert.

    Leading and trailing whitespace is stripped from each value before it
    is passed to ``float``.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())
## Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Each distinct value in the column is assigned an integer starting at 0.
    Codes are handed out in order of first appearance, so the mapping is
    the same on every run (iterating a set of strings is not, because of
    hash randomization).

    Args:
        dataset: List of rows (mutable sequences) to modify.
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to its integer code.
    """
    lookup = {}
    # First pass: build the complete mapping before touching any row.
    for row in dataset:
        value = row[column]
        if value not in lookup:
            lookup[value] = len(lookup)
    # Second pass: overwrite each value with its code.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
##### Normalize Data ###########
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Return the ``[min, max]`` pair of every column in *dataset*.

    The number of columns is taken from the first row.

    Args:
        dataset: Non-empty list of rows of comparable values.

    Returns:
        A list holding one ``[min_value, max_value]`` list per column.
    """
    columns = (
        [record[idx] for record in dataset]
        for idx in range(len(dataset[0]))
    )
    return [[min(values), max(values)] for values in columns]
# Normalize the dataset, except the last column, which holds the classification values
def Normalize_Dataset(dataset, minmax):
    """Min-max scale every column except the last one, in place.

    Each value becomes ``(value - min) / (max - min)`` using the column's
    entry in *minmax*. The last column of each row is left untouched.

    Args:
        dataset: List of rows (mutable sequences of numbers) to modify.
        minmax: Per-column ``[min, max]`` pairs, indexed like the rows.
    """
    for row in dataset:
        for i in range(len(row) - 1):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # A constant column has zero range; map it to 0.0 rather than
            # raising ZeroDivisionError.
            row[i] = (row[i] - low) / span if span else 0.0
#### Standardize Data ######
# calculate column means
def column_means(dataset):
    """Return the arithmetic mean of every column in *dataset*.

    The number of columns is taken from the first row.

    Args:
        dataset: Non-empty list of rows of numbers.

    Returns:
        A list with one mean per column.
    """
    row_count = float(len(dataset))
    return [
        sum([record[idx] for record in dataset]) / row_count
        for idx in range(len(dataset[0]))
    ]
# calculate column standard deviations
def column_stdevs(dataset, means):
    """Return the sample standard deviation of every column in *dataset*.

    For each column the squared deviations from the given mean are summed,
    divided by ``len(dataset) - 1`` and square-rooted.

    Args:
        dataset: List of rows of numbers; the first row sets the column count.
        means: Per-column means, indexed like the rows.

    Returns:
        A list with one standard deviation per column.
    """
    squared_sums = [
        sum([pow(record[idx] - means[idx], 2) for record in dataset])
        for idx in range(len(dataset[0]))
    ]
    degrees = float(len(dataset) - 1)
    return [sqrt(total / degrees) for total in squared_sums]
# Standardize the dataset
def Standardize_Dataset(dataset, means, stdevs):
    """Standardize every column of *dataset*, in place.

    Each value becomes ``(value - mean) / stdev`` using the column's entries
    in *means* and *stdevs*. All columns are processed, including the last.

    Args:
        dataset: List of rows (mutable sequences of numbers) to modify.
        means: Per-column means, indexed like the rows.
        stdevs: Per-column standard deviations, indexed like the rows.
    """
    for row in dataset:
        for i in range(len(row)):
            spread = stdevs[i]
            # A constant column has zero standard deviation; map it to 0.0
            # rather than raising ZeroDivisionError.
            row[i] = (row[i] - means[i]) / spread if spread else 0.0