# utils.py
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm

import constants as const

# Module-level cache shared by get_max_squared_diff and get_median below.
cache = {}


def vprint(verbose: bool, *args, **kwargs) -> None:
    """Prints if verbose=True, otherwise does nothing"""
    if verbose:
        print(*args, **kwargs)


def isnan(value: Any) -> bool:
    """Returns True if value is NaN, otherwise False"""
    # NaN is the only value that compares unequal to itself.
    return value != value
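
# Illustrative examples (not part of the original module):
#     isnan(float('nan'))  -> True
#     isnan(3)             -> False
#     isnan('abc')         -> False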


def remove_nan_rows(df: pd.DataFrame, threshold=const.MAX_NUM_NANS) -> pd.DataFrame:
    """
    Removes all rows from the dataset where the number of
    NaN columns is above threshold
    """
    number_nans = df.apply(lambda row: sum(map(isnan, row)), axis=1)
    return df[number_nans <= threshold]
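
# Illustrative sketch (toy data, not from this project): with threshold=1 only
# rows with more than one NaN are dropped, e.g.
#     df = pd.DataFrame({'a': [1, np.nan, np.nan], 'b': [2, 3, np.nan]})
#     remove_nan_rows(df, threshold=1)  # keeps rows 0 and 1, drops row 2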


def drop_bad_cols(df: pd.DataFrame) -> None:
    """Drops all COLS_TO_DROP from df in place"""
    df.drop(const.COLS_TO_DROP, axis=1, inplace=True, errors='ignore')


def round_to_nearest_hundred(num: float) -> int:
    """
    Rounds a numeric value to the nearest 100

    Example:
    >>> round_to_nearest_hundred(149.99999)
    100
    >>> round_to_nearest_hundred(150)
    200
    """
    return int(round(num, -2))


def get_max_squared_diff(train_df: pd.DataFrame, col: str) -> float:
    """
    Returns the squared difference between the 5th and 95th
    percentiles of col. The result is cached per column.
    """
    if col in cache:
        return cache[col]
    val1, val2 = train_df[col].quantile([0.05, 0.95])
    diff = val1 - val2
    diff_squared = diff * diff
    cache[col] = diff_squared
    return diff_squared
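
# Usage note (illustrative; 'price' is a hypothetical column): the spread is
# computed once and then served from the module-level cache, e.g.
#     get_max_squared_diff(train_df, 'price')  # computed and cached
#     get_max_squared_diff(train_df, 'price')  # cache hit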


def get_median(train_df: pd.DataFrame, col: str) -> float:
    """Returns the median of col in train_df, cached per column"""
    key = f'{col}-median'
    if key in cache:
        return cache[key]
    val = train_df[col].median()
    cache[key] = val
    return val


def has_nan_in_critial_col(row: pd.Series) -> bool:
    """
    Checks whether a row has at least one critical
    column with a NaN
    """
    return any(isnan(row[col]) for col in const.CRITICAL_COLS if col in row)


def get_top_k_most_similar(sim_df: pd.DataFrame, k=3000) -> pd.DataFrame:
    """
    Gets the indices of the top k most similar rows based on a
    similarity df with the scores. If sim_df has shape (N, M)
    then the output dataframe will have shape (N, k) where k << M.
    If k is `None`, all M columns are returned.
    """
    # Sort column positions by descending similarity. Wrapping the result in a
    # DataFrame keeps .iloc working regardless of what np.argsort returns for
    # the installed pandas version.
    indices = pd.DataFrame(np.argsort(-sim_df.values, axis=1), index=sim_df.index)
    if k is None:
        return indices
    return indices.iloc[:, :k]
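
# Illustrative example (toy similarity matrix, not project data):
#     sim = pd.DataFrame([[0.1, 0.9, 0.5],
#                         [0.7, 0.2, 0.4]])
#     get_top_k_most_similar(sim, k=2).values.tolist()  # -> [[1, 2], [0, 2]]
# Note that the values are column *positions* ranked by descending similarity.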


def create_weights_df(sim_df: pd.DataFrame, top_k: pd.DataFrame,
                      verbose=False) -> pd.DataFrame:
    """
    Creates a dataframe with the same shape as top_k where each
    row sums to 1 and the value in each entry denotes how much
    weight the corresponding index in top_k should be given
    """
    weights = []
    for i in tqdm(range(len(top_k)), disable=not verbose,
                  desc='Computing weights'):
        # Look up the similarity scores at the top-k column positions and
        # normalise them so each row sums to 1.
        ws = np.asarray(sim_df.iloc[i, np.array(top_k.iloc[i])], dtype=float)
        ws /= np.sum(ws)
        weights.append(ws)
    return pd.DataFrame(weights, index=top_k.index)
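
# Illustrative example (continuing the toy matrix above):
#     top_k = get_top_k_most_similar(sim, k=2)
#     weights = create_weights_df(sim, top_k)
#     weights.sum(axis=1)  # -> 1.0 per row (up to float rounding)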


def rmse(*args, **kwargs) -> float:
    """Returns the root mean squared error"""
    return mse(*args, **kwargs, squared=False)
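
# Quick sanity check (illustrative): rmse is the square root of sklearn's MSE.
# For y_true=[0, 0] and y_pred=[3, 4], MSE = (9 + 16) / 2 = 12.5, so
#     rmse([0, 0], [3, 4])  # -> ~3.5355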


def preds_to_csv(preds: np.ndarray, out: Path = const.CSV_PREDS_OUT) -> None:
    """
    Takes a 1D numpy array of predictions and optionally an out path and writes
    them to a csv file. Assumes that the predictions are given in the same
    order as the test set.
    """
    df = pd.DataFrame(preds, columns=['Predicted'])
    df.to_csv(out, index_label='Id')
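
# Illustrative output (hypothetical predictions): preds_to_csv(np.array([1.5, 2.0]))
# writes a csv of the form
#     Id,Predicted
#     0,1.5
#     1,2.0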


def get_make_model_dict(df_original: pd.DataFrame, replace_by_mean=False) -> dict:
    """
    Generates a dictionary that maps (make, model) to a nominal value

    Args:
        df_original (pd.DataFrame): Dataframe with at least 'title', 'make',
            'model' and 'price' columns
        replace_by_mean (bool, optional): If True, map each (make, model) to a
            normalised mean price instead of a price-bin index. Defaults to False.

    Returns:
        dict: Mapping from 'make model' strings to a nominal value
    """
    df = df_original.copy()
    split_titles = df.title.apply(str.lower).str.split(" |-")
    df.make = split_titles.str[0]
    df['make_model'] = df.apply(lambda x: x['make'] + ' ' + x['model'], axis=1)
    # Mean price per (make, model); numeric_only avoids errors on string columns.
    test = df.groupby('make_model').mean(numeric_only=True).reset_index()
    test = test.sort_values('price')[['make_model', 'price']]
    if not replace_by_mean:
        # Bucket each (make, model) into the index of its price bin.
        new_make_model_dict = dict()
        prev = 0
        for i, until in enumerate(const.MAKE_MODEL_BINS):
            subset = test[(test.price >= prev) & (test.price < until)]
            prev = until
            for key in subset.make_model.unique():
                new_make_model_dict[key] = i
        return new_make_model_dict
    # Otherwise map each (make, model) to its mean price, shifted by
    # MAKE_MODEL_PRICE_MIN, scaled by MAKE_MODEL_PRICE_MAX and discretised.
    test_min = const.MAKE_MODEL_PRICE_MIN
    test_max = const.MAKE_MODEL_PRICE_MAX
    test['price'] = test['price'].apply(lambda x: int(((x - test_min) / test_max) * 1000))
    make_dict_mean_norm = pd.Series(test.price.values, index=test.make_model).to_dict()
    return make_dict_mean_norm
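

if __name__ == '__main__':
    # Minimal smoke test (illustrative addition, not part of the original
    # module). It only exercises helpers that need no project constants.
    print('round_to_nearest_hundred(149.99999) =', round_to_nearest_hundred(149.99999))  # 100
    print('isnan(float("nan")) =', isnan(float('nan')))  # True

    sim = pd.DataFrame([[0.1, 0.9, 0.5],
                        [0.7, 0.2, 0.4]])
    top_k = get_top_k_most_similar(sim, k=2)
    weights = create_weights_df(sim, top_k)
    print('row sums of weights:', weights.sum(axis=1).tolist())  # ~[1.0, 1.0]

    print('rmse([0, 0], [3, 4]) =', rmse([0, 0], [3, 4]))  # ~3.5355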