-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpost_process.py
125 lines (90 loc) · 3.29 KB
/
post_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import csv
import os
def main():
"""
Sorting
Create top csv files
Create summary md file
"""
# Version
version = 3
# Sort the file
sort(version)
# Call function to create top csv files
no_of_rows = (100, 250, 500, 1000, 5000, 10000, 25000, 50000, 100000)
for n in no_of_rows:
top(n, version)
# Call function to create summary md file (only do if directory exists)
summary(version)
# SORT FUNCTIONS
def sort(version):
# paths
csv_path = f'data/output/{version}.csv'
output_path = f'data/output/filtered/{version}.csv'
# Read the CSV file into a list of dictionaries
data = []
with open(csv_path, 'r') as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
data.append(row)
# Sort the data based on the 'frequency' column
sorted_data = sorted(data, key=lambda x: int(x['frequency']), reverse=True)
# Write the sorted data to a new CSV file
fieldnames = ['word', 'frequency'] # Assuming the column names are 'word' and 'frequency'
with open(output_path, 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_data)
print("Sorting completed. Sorted file saved at:", output_path)
# TOP FUNCTIONS
def top(no_of_rows, version):
# paths
input_path = f'data/output/filtered/{version}.csv'
output_dir = f'output/v{version}'
output_path = f'{output_dir}/top_{no_of_rows}.csv'
# check if output folder exists
if not os.path.exists(output_dir):
os.makedirs(f'output/v{version}')
print("The new directory is created!")
# Call function
rows = read_n_rows(input_path, no_of_rows)
# Save to csv file
with open(output_path, 'w', newline='') as file:
writer = csv.writer(file)
for row in rows:
writer.writerow([row[0], row[1]])
print(f'Summary: {len(rows)} rows saved to {output_path}')
def read_n_rows(file_path, n):
rows = []
with open(file_path, 'r') as file:
reader = csv.reader(file)
for i, row in enumerate(reader):
if i >= n:
break
rows.append(row)
return rows
# SUMMARY FUNCTIONS
def summary(version):
summary = {}
summary['total_count'] = count_words_above_frequency(version, 0)
summary['freq5'] = count_words_above_frequency(version, 5)
summary['freq100'] = count_words_above_frequency(version, 100)
summary['freq1000'] = count_words_above_frequency(version, 1000)
# Create a md file
with open(f'output/v{version}/summary.md', 'w') as file:
summary_text = f'# Summary\n\nTotal count of words: {summary["total_count"]}\n\nFrequency > 5: {summary["freq5"]}\n\nFrequency > 100: {summary["freq100"]}\n\nFrequency > 1000: {summary["freq1000"]}'
file.write(summary_text)
def count_words_above_frequency(version, n):
# paths
csv_path = f'data/output/filtered/{version}.csv'
count = 0
with open(csv_path, 'r') as csv_file:
reader = csv.reader(csv_file)
next(reader) # Skip the header row
for row in reader:
frequency = int(row[1])
if frequency > n:
count += 1
return count
if __name__ == '__main__':
main()