-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_sim_df.py
41 lines (34 loc) · 1.24 KB
/
generate_sim_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import cleaner
import constants as const
import similarity as sim
def main():
    """Produce the similarity-replaced train and test CSV files.

    Steps, in order:

    1. Read the raw train and test CSVs from ``const.TRAIN_PATH`` and
       ``const.TEST_PATH`` and pass each through
       ``cleaner.clean_preliminary`` (the test set with ``is_test=True``).
    2. For the train set, call ``sim.compute_similarities`` with the cleaned
       frame as both arguments, pickle the result, and write the frame
       returned by ``sim.replace_nan_with_most_similar`` to CSV.
    3. Repeat for the test set, additionally handing the cleaned train frame
       to ``sim.replace_nan_with_most_similar`` as ``train_df``.

    Prints ``Done`` once both output files have been written.
    """
    # Load both raw files before any cleaning takes place.
    raw_train = pd.read_csv(const.TRAIN_PATH)
    raw_test = pd.read_csv(const.TEST_PATH)
    train = cleaner.clean_preliminary(raw_train)
    test = cleaner.clean_preliminary(raw_test, is_test=True)

    # --- TRAIN (NB: takes ~4 hours to run) ---
    # The similarity result is pickled before the replacement step runs.
    train_sim = sim.compute_similarities(train, train)
    train_sim.to_pickle(const.MOST_SIMILIAR_TRAIN_PATH)
    train_filled = sim.replace_nan_with_most_similar(
        main_df=train,
        sim_df=train_sim,
        verbose=True,
    )
    train_filled.to_csv(const.SIM_REPLACED_TRAIN)

    # --- TEST (NB: takes ~2.5 hours to run) ---
    # Same flow as above; the replacement step also receives the train frame.
    test_sim = sim.compute_similarities(test, test)
    test_sim.to_pickle(const.MOST_SIMILIAR_TEST_PATH)
    test_filled = sim.replace_nan_with_most_similar(
        main_df=test,
        train_df=train,
        sim_df=test_sim,
        verbose=True,
    )
    test_filled.to_csv(const.SIM_REPLACED_TEST)

    print('Done')
# Run the pipeline only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()