-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
124 lines (82 loc) · 2.92 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv # ??
import json
from hashlib import md5
from operator import itemgetter

from csv_diff import load_csv, compare
from fuzzywuzzy import process
def sort_data(data):
    """Return a new, ascending list of the records in ``data``.

    Records are compared by their "PLZ" value first, then by "Straße",
    then by "Ort". The input itself is left untouched. A record that lacks
    one of these keys raises ``KeyError``.
    """
    ordered = list(data)
    ordered.sort(
        key=lambda record: (record["PLZ"], record["Straße"], record["Ort"])
    )
    return ordered
def remove_duplicates(data):
    """Return the items of ``data`` without duplicates, keeping first occurrences.

    Two items count as duplicates when their fingerprints match:

    * for dicts, the fingerprint is built from the key/value pairs sorted by
      key, so records that hold the same pairs in a different key order are
      recognised as equal;
    * for anything else, the fingerprint is ``str(item)``.

    The fingerprint text itself is stored in the lookup set; no digest is
    computed, so distinct items can never be merged by a hash collision.

    Note: nested dict values are compared through their ``repr``, so key
    order inside a nested dict still matters.

    Args:
        data: Iterable of items (typically record dicts).

    Returns:
        A new list with the original order preserved.
    """
    unique_data = []
    seen = set()

    for item in data:
        if isinstance(item, dict):
            # Sort the pairs so the fingerprint ignores insertion order
            pairs = sorted(item.items(), key=lambda pair: repr(pair[0]))
            fingerprint = ("dict", repr(pairs))
        else:
            fingerprint = ("other", str(item))

        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_data.append(item)

    return unique_data
def compare_csv(file1, file2):
    """Load two CSV files and return the difference between them.

    Both files are loaded with ``csv_diff.load_csv`` using the "Name" column
    as the row key, and the results are passed to ``csv_diff.compare``.
    The files are opened in context managers so both handles are closed even
    if loading or comparing raises.

    Args:
        file1: Path of the first CSV file.
        file2: Path of the second CSV file.

    Returns:
        The value returned by ``csv_diff.compare`` for the two files.
    """
    with open(file1) as first, open(file2) as second:
        return compare(
            load_csv(first, key="Name"),
            load_csv(second, key="Name"),
        )
# Keys of a parsed "key: value; key: value" cell whose values are converted to int
_INT_FIELDS = ("Schüler", "Lehrer", "Klassen")


def _parse_cell(value):
    """Turn a flattened cell string back into a dict or list where it looks like one."""
    if "; " in value and ": " in value:
        pairs = [part.split(": ", 1) for part in value.split("; ")]
        # Only treat the cell as a mapping when every segment is "key: value";
        # otherwise fall through instead of crashing on a malformed segment
        if all(len(pair) == 2 for pair in pairs):
            parsed = dict(pairs)
            for key, raw in parsed.items():
                if key in _INT_FIELDS:
                    parsed[key] = int(raw)
            return parsed

    if ", " in value:
        return value.split(", ")

    return value


def apply_changes(data, diff):
    """Apply added and removed records from ``diff`` to ``data`` and report changes.

    * Every record in ``diff["added"]`` has its name printed, its cell
      values parsed in place ("k: v; k: v" cells become dicts, ", "-separated
      cells become lists) and is then appended to ``data``. A cell that
      parses as a dict stays a dict even if it also contains ", ".
    * Every record in ``diff["removed"]`` has its name printed, and all
      records in ``data`` with a matching "Name" are dropped.
    * ``diff["changed"]`` is only printed (as JSON); it is not applied.

    Args:
        data: List of record dicts, each having a "Name" key. Added records
            are appended to this list in place.
        diff: Mapping with "added", "removed" and "changed" entries.

    Returns:
        The updated list. This is the same list object as ``data`` unless
        records were removed, in which case it is a new list.

    Raises:
        ValueError: If a "Schüler", "Lehrer" or "Klassen" value inside a
            parsed dict cell is not an integer literal.
    """
    added = diff["added"]
    removed = diff["removed"]
    changed = diff["changed"]

    # Check if items were added
    if added:
        print("Added:")

        for item in added:
            print(item["Name"])

            # Replacing values of existing keys is safe while iterating
            for key, value in item.items():
                item[key] = _parse_cell(value)

            data.append(item)

    # Check if items were removed
    if removed:
        print("\nRemoved:")

        removed_names = set()
        for item in removed:
            print(item["Name"])
            removed_names.add(item["Name"])

        # Filter once against all removed names instead of once per item
        data = [node for node in data if node["Name"] not in removed_names]

    # Check if items were changed
    if changed:
        print("\nChanged:")
        print(json.dumps(changed, ensure_ascii=False, indent=4))

    return data
def prepare_item(item):
    """Flatten list and dict values of a record into strings, in place.

    * A list value becomes its elements joined with ", ".
    * A dict value becomes "key: value" pairs joined with "; ".
    * Every other value is left as it is.

    Subclasses of ``list`` and ``dict`` are flattened as well, and list
    elements are converted with ``str`` (as dict values are), so lists that
    contain non-string elements do not raise.

    Args:
        item: Record dict. It is modified in place.

    Returns:
        The same ``item`` object, after flattening.
    """
    # Only values of existing keys are replaced, which is safe while iterating
    for key, value in item.items():
        if isinstance(value, list):
            item[key] = ", ".join(str(element) for element in value)
        elif isinstance(value, dict):
            item[key] = "; ".join(
                ": ".join((str(sub_key), str(sub_value)))
                for sub_key, sub_value in value.items()
            )

    return item
def print_row(data, file):
    """Write the records in ``data`` to ``file`` as CSV.

    The header row is taken from the keys of the first record. Each record
    is passed through ``prepare_item`` before its values are written, and
    rows are written with ``csv.QUOTE_NONNUMERIC`` quoting. Empty ``data``
    writes nothing instead of raising ``IndexError``.

    Note: ``prepare_item`` flattens list/dict values in place, so the
    records in ``data`` are modified by this call.

    Args:
        data: Sequence of record dicts.
        file: Writable text file object.
    """
    # Without a first record there is no header to derive
    if not data:
        return

    writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(data[0].keys())

    for item in data:
        writer.writerow(prepare_item(item).values())
def fuzzy_search(data, category, accuracy):
    """Print possible duplicates among the records in ``data``.

    The ``category`` value of each record is matched against the
    ``category`` values of all records using ``process.extractBests`` with
    ``accuracy`` as the score cutoff. Every hit after the first one is
    printed together with its score, under a heading that shows the
    record's "Name".

    Args:
        data: Sequence of record dicts having ``category`` and "Name" keys.
        category: Key whose values are compared.
        accuracy: Value passed as ``score_cutoff`` to ``extractBests``.
    """
    candidates = [entry[category] for entry in data]

    for entry in data:
        matches = process.extractBests(
            entry[category], candidates, score_cutoff=accuracy
        )

        # The top hit is skipped — presumably the entry matching itself
        # (NOTE(review): confirm); the rest are reported
        others = matches[1:]
        if not others:
            continue

        print('Possible duplicate(s) for "' + entry['Name'] + '":')

        for hit in others:
            print(hit[0] + ' (' + str(hit[-1]) + '%)')