-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnoxdataset.py
98 lines (73 loc) · 2.76 KB
/
noxdataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
from pathlib import Path

import pandas as pd
def main():
    """
    Run the full pipeline: clean the rating data, then print the score
    summary, the overall top 10 and the top 5 of every year.
    """
    cleaned = NoxCleaner().clean()

    report = NoxStatistics(cleaned)
    report.simple()
    report.overall_top(10)
    report.annual_top(5)
class NoxCleaner:
    """
    Load the raw rating CSV, apply manual score corrections and write a
    cleaned, sorted copy of the data.

    The input file is read when the object is created; ``clean`` does the
    actual transformation and saves the result.
    """

    def __init__(
        self,
        input_path='input_data/noxrating.csv',
        override_path='input_data/score_override.json',
        output_path='results/dataset.csv',
    ):
        """
        Read the semicolon-separated rating file into ``self.df``.

        :param input_path: CSV file to load (``;`` is the separator).
        :param override_path: JSON object mapping an ``id`` value to the score
            that replaces the stored one; ``None`` skips the override step.
        :param output_path: CSV file written by ``clean``; ``None`` skips
            saving.
        """
        self.override_path = override_path
        self.output_path = output_path
        self.df = pd.read_csv(input_path, sep=';')

    def __score_override(self):
        """
        Override known false scores with manually checked scores.
        """
        if self.override_path is None:
            return

        with open(self.override_path, "r", encoding="utf-8") as data_file:
            overrides = json.load(data_file)

        # Manual override of known bad values, matched on the 'id' column.
        for yid, score in overrides.items():
            self.df.loc[self.df['id'] == yid, 'score'] = score

    def __save(self):
        """
        Write the current frame to ``output_path`` as CSV (index included).
        """
        if self.output_path is None:
            return

        target = Path(self.output_path)
        # to_csv does not create missing directories, so make sure the
        # parent exists instead of failing on a fresh checkout.
        target.parent.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(target)

    def clean(self):
        """
        Apply the score overrides, drop unscored rows, parse the dates, sort
        and save the result.

        :return: the cleaned frame, sorted by score and then date (both
            descending) with a fresh 0..n-1 index.
        """
        self.__score_override()

        # Drop rows without a score; other empty cells are kept.
        # The copy makes the column assignment below act on an independent frame.
        scored = self.df[self.df['score'].notna()].copy()

        # Parse dates to pandas datetimes; unparsable values become NaT.
        scored['date'] = pd.to_datetime(scored['date'], errors='coerce')

        # Sort descending on score, ties broken by date.
        self.df = scored.sort_values(
            by=['score', 'date'], ascending=False, ignore_index=True
        )

        # Save the result of the cleaning process.
        self.__save()
        return self.df
class NoxStatistics:
    """
    Print summary statistics and top lists for a rating frame.
    """

    def __init__(self, df):
        """
        :param df: frame providing the ``id``, ``score``, ``date`` and
            ``link`` columns used by the report methods.
        """
        self.df = df
        # True: print markdown tables (pandas needs the optional 'tabulate'
        # package for that); False: print plain-text tables.
        self.markdown_mode = True

    def simple(self):
        """
        Print the ``describe()`` summary of the score column.
        """
        print('Score Statistics:')
        print(self.df['score'].describe())

    def overall_top(self, top=10):
        """
        Print the ``top`` best-scored rows of the whole dataset.

        Rows are ordered by score, ties broken by date, both descending.
        The sorted frame is stored back in ``self.df``.

        :param top: number of rows to print.
        """
        self.df = self.df.sort_values(by=['score', 'date'], ascending=False)
        if self.markdown_mode:
            # The id column is dropped from the markdown table.
            print(self.df.drop(['id'], axis=1).head(n=top).to_markdown(index=False))
        else:
            print(self.df.head(n=top).to_string(index=False))

    def annual_top(self, top=10):
        """
        Print the ``top`` best-scored rows of every year, newest year first.

        Rows whose date cannot be parsed are left out. ``self.df`` itself is
        not modified.

        :param top: number of rows to print per year.
        """
        # Work on a copy so the date parsing and grouping are not saved.
        df = self.df.copy()
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

        # Drop undated rows before extracting the year: with NaT present,
        # dt.year becomes float and the heading would read e.g. "2021.0".
        df = df[df['date'].notna()]
        df_group = df.groupby(df['date'].dt.year.rename('year'))

        for year in sorted(df_group.groups, reverse=True):
            df_year = df_group.get_group(year).sort_values(by=['score'], ascending=False)
            best = df_year.head(n=top)
            if self.markdown_mode:
                # Wrap the raw link in an anchor tag, only for the rows shown.
                best = best.assign(link='<a href="' + best['link'] + '">YouTube</a>')
                print(f'### {int(year)}')
                print(best.drop(['id'], axis=1).to_markdown(index=False))
                print()
                print()
            else:
                print(best.to_string(index=False))
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()