-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOne Click DataCleanerAlgorithm.py
244 lines (214 loc) · 10.2 KB
/
One Click DataCleanerAlgorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import numpy as np
import pandas as pd
import re
#Domain knowledge of your dataset is highly recommended before using this algorithm
#Assumes that your csv file has a single row of defined header (first row of the file)
cleanedFilepath = '/path/to//CleanedData.csv'#path to export your new file to
#filepath = '/home/a0x0bc1/Downloads/CMC_24h_Gainers_Starting_07_10_24 mod.csv'
filepath = '/path/to/input.csv'#your input csv file
csvFile = pd.read_csv(filepath)#module-level DataFrame the whole script reads and mutates
print(csvFile.dtypes)#prints the inferred datatype of each col before cleaning
print(type(csvFile))#prints <class 'pandas.core.frame.DataFrame'>
#To fill empty cells
def fillNullCells(workingCol):
    """Coerce a column to numeric and fill its missing cells by interpolation.

    Non-numeric values become NaN first (errors='coerce'); NaN gaps are then
    filled with interpolate(limit_direction='both') so leading AND trailing
    runs of NaN are filled too.  The original only patched the very first
    cell (with the column median), leaving any further leading NaNs unfilled.

    Returns the filled numeric Series; a column that is entirely NaN is
    returned unchanged (there is nothing to interpolate from).
    """
    workingCol = pd.to_numeric(workingCol, errors='coerce')
    #interpolation suits time-series-like data; 'both' also back-fills
    #leading gaps, which a plain interpolate() leaves as NaN
    return workingCol.interpolate(limit_direction='both')
#To turn alphabets to lowercase
def turnAlphaLower(workingCol):
    """Normalise a column of strings: strip non-alphanumerics and lowercase.

    Fixes the original implementation, which rebound the loop variable (a
    no-op on the Series) and wrote its results into the module-level
    DataFrame through the global `col` -- so the caller's pipeline never saw
    the cleaned values.  This version is side-effect free and returns the
    transformed Series.

    Per element: NaN stays NaN; every character that is not a letter or a
    digit is removed; values that strip down to nothing become NaN; purely
    numeric remainders are kept as-is; everything else is lowercased.
    """
    def _clean(elmt):
        if pd.isna(elmt):
            return elmt  #keep empty cells empty
        #str() also handles non-string cells in an object column
        stripped = re.sub('[^a-zA-Z0-9]+', '', str(elmt))
        if stripped == '':
            return np.nan  #was pure punctuation/whitespace: nothing usable
        try:
            float(stripped)
            return stripped  #numeric content: digits have no case anyway
        except ValueError:
            return stripped.lower()
    return workingCol.map(_clean)
#To map each nonnumerical elemt to a num
def mapElmtToNum(workingCol):
    """Replace every non-numeric value in a column with a unique integer code.

    Codes start at max(existing numeric values)+1 so they never collide with
    real numbers already in the column, or at 1 when the column holds no
    numbers at all.  Lookups use the same stripped key as collection -- the
    original collected *stripped* keys but mapped the *unstripped* values, so
    the mapping never matched and every non-numeric cell silently became NaN.

    Returns a numeric Series with remaining gaps filled by fillNullCells.
    """
    def _catKey(elmt):
        #category key: the alphanumeric-stripped form, or None when the cell
        #is empty or parses as a number (numbers are kept as numbers)
        if pd.isna(elmt):
            return None
        stripped = re.sub('[^a-zA-Z0-9]+', '', str(elmt))
        try:
            float(stripped)
            return None
        except ValueError:
            return stripped
    keys = [k for k in (_catKey(e) for e in workingCol) if k is not None]
    nonNum = pd.unique(pd.Series(keys, dtype=object))
    #if nums are included in the col, start the codes above their max
    startNum = pd.to_numeric(workingCol, errors='coerce').max() + 1
    if pd.isna(startNum):  #all values are probably alphabets: start from 1
        startNum = 1
    mapped = {val: idx for idx, val in enumerate(nonNum, start=int(startNum))}
    def _encode(elmt):
        key = _catKey(elmt)
        return mapped[key] if key is not None else elmt
    workingCol = workingCol.map(_encode)
    workingCol = pd.to_numeric(workingCol, errors='coerce')
    return fillNullCells(workingCol)
#To determine for categorical col
def isCategorical(workingCol):
    """Heuristic test for a categorical column.

    A column counts as categorical when at least 40% of its unique values
    occur two or more times.  Returns True for categorical, None otherwise
    (callers test the result with `is True` / `is None`, so None -- not
    False -- is the negative answer).  An empty or all-NaN column has no
    unique values and is treated as not categorical; the original raised
    ZeroDivisionError in that case.
    """
    numUniks = workingCol.nunique()  #total num of unique (non-NaN) vals
    if numUniks == 0:
        return None  #nothing to classify: empty or all-NaN col
    numTwiceUniques = (workingCol.value_counts() >= 2).sum()  #vals repeating at least twice
    #you can adjust your threshold for a categorical col here
    if (numTwiceUniques / numUniks) >= 0.4:
        return True
    else:
        return None  #This col is not cleaned
#To map Categorical vals to num
def mapCategorical(workingCol):
    """Clean a categorical column end-to-end.

    Lowercases/strips the values, then encodes them as numbers.  Returns the
    encoded Series, or None (column left uncleaned) when the column does not
    pass the categorical heuristic.
    """
    if isCategorical(workingCol) is not True:
        return None  #This col is not cleaned
    lowered = turnAlphaLower(workingCol)
    return mapElmtToNum(lowered)
#To clean a num col mixed with other chars
def cleanObjNumericalCol(workingCol):
    """Clean an object column that is mostly numeric.

    Returns True when the column actually looks categorical (the caller then
    encodes it instead), a cleaned numeric Series when its values are mostly
    unique (i.e. not categorical), or None when it is neither.

    The original stripped characters from the loop variable, which never
    modified the Series (rebinding a loop variable is a no-op), so junk like
    '$' or thousands separators survived and those cells coerced to NaN.
    This version really strips formatting characters, keeping digits, '.',
    '+', '-' and exponent letters so decimal values are preserved.
    """
    if isCategorical(workingCol):  #might be a categorical col
        return True
    numUniks = workingCol.nunique()
    numValues = workingCol.notna().sum()
    if numUniks >= 0.6 * numValues:  #mostly unique: numeric, not categorical
        def _stripJunk(elmt):
            if pd.isna(elmt):
                return elmt  #keep empty cells empty
            #drop everything that cannot be part of a number ($, commas, spaces, ...)
            return re.sub(r'[^0-9eE.+\-]', '', str(elmt))
        workingCol = workingCol.map(_stripJunk)
        workingCol = pd.to_numeric(workingCol, errors='coerce')
        return fillNullCells(workingCol)  #num col with some empty cells
    else:
        return None
#To count each elmt in each col
def countObjsCol(workingCol):
    """Classify every cell of a column and return the tallies.

    Returns (strCount, alphanumericCount, numCount, emptyCount, unknownCount):
    alphabetic-only, mixed letters+digits, numeric, NaN, and everything else
    (cells that strip down to nothing).  Each value is first reduced to its
    alphanumeric characters.  The original called re.sub on the raw element,
    so any non-string cell (e.g. an actual float in an object column) raised
    TypeError into a bare except and silently vanished from every count;
    converting with str() first counts it correctly.
    """
    strCount = 0
    alphanumericCount = 0
    numCount = 0
    emptyCount = 0
    unknownCount = 0
    for elmt in workingCol:  #itrs over each elmt in a single col
        if pd.isna(elmt):  #empty cell
            emptyCount += 1
            continue
        #keep only letters and digits; str() handles non-string cells
        stripped = re.sub('[^a-zA-Z0-9]+', '', str(elmt))
        if stripped.isalpha():  #letters only
            strCount += 1
            continue
        try:  #numbers (note: '1.5' strips to '15', still counted as numeric)
            float(stripped)
            numCount += 1
        except ValueError:
            if stripped.isalnum():  #letters mixed with digits
                alphanumericCount += 1
            else:
                unknownCount += 1  #stripped to nothing: pure junk
    return strCount, alphanumericCount, numCount, emptyCount, unknownCount
#Main Operation
#Walks every column: ints pass through, floats are gap-filled or dropped,
#object cols are cleaned as numeric / encoded as categorical / dropped,
#and anything else is deleted.  The cleaned frame is written to cleanedFilepath.
noInputCols = len(csvFile.columns)  #original width (was never defined: NameError at the summary)
newCols = []
deletedCols = []
for col in list(csvFile.columns):  #snapshot so in-place drops cannot disturb iteration
    workingCol = csvFile[col]
    if workingCol.dtype == int:  #Its perfect
        newCols.append(col)
        continue
    elif workingCol.dtype == float:  #Might need fixing
        strCount, alphanumericCount, numCount, emptyCount, unknownCount = countObjsCol(workingCol)
        if emptyCount >= 0.4 * len(workingCol):  #probably almost all cells are empty
            deletedCols.append(col)
            csvFile.drop(col, axis=1, inplace=True)
        else:
            newCols.append(col)
            csvFile[col] = fillNullCells(workingCol)
    elif workingCol.dtype == object:  #string-ish col: inspect its contents
        strCount, alphanumericCount, numCount, emptyCount, unknownCount = countObjsCol(workingCol)
        values = strCount + alphanumericCount + numCount
        if numCount >= 0.8 * values:  #numerical col mixed with few other chars
            cleaned = cleanObjNumericalCol(workingCol)  #call once (was evaluated up to 3 times)
            if cleaned is True:  #means it's categorical after all
                newCols.append(col)
                csvFile[col] = mapCategorical(workingCol)
            elif cleaned is None:  #neither numeric enough nor categorical
                deletedCols.append(col)
                csvFile.drop(col, axis=1, inplace=True)
            else:
                newCols.append(col)
                csvFile[col] = cleaned  #num col with some empty cells, now filled
        elif values >= 0.6 * len(workingCol):  #col is mostly valid (if mixed-up) data
            #To determine for Categorical col and then map
            if isCategorical(workingCol) is None:
                deletedCols.append(col)
                csvFile.drop(col, axis=1, inplace=True)  #not categorical so delete col
            else:
                newCols.append(col)
                csvFile[col] = mapCategorical(workingCol)
        else:  #mostly invalid data
            deletedCols.append(col)
            csvFile.drop(col, axis=1, inplace=True)
    else:
        deletedCols.append(col)
        csvFile.drop(col, axis=1, inplace=True)  #Del any other col
print(csvFile.dtypes)
print()
csvFile.to_csv(cleanedFilepath, index=False)
noOutputCols = len(csvFile.columns)
removedCols = noInputCols - noOutputCols
print(f'Total of {noOutputCols} output cols: {newCols}')
print()
print(f'Total of {removedCols} cols removed: {deletedCols}')
print()
print('Cleaned Data Exported Successfully')