# Prints the most common words in the sanction lists.
# Manually inspect the lists to find noise words (stop words), like "The", "abd", and other words
# that do not contribute to search results.
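#
# Usage sketch (assumes the XML files referenced below have already been
# downloaded into the eu/, un/ and ofac/ directories; the output file name
# is only an example):
#   python create_noiseword_list.py > candidate_noisewords.txt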

from collections import Counter

from eu import reader as eu_reader
from ofac import reader as ofac_reader
from un import reader as un_reader

import normalizer


def print_word_frequencies(stop_words, stop_words_short, key):
    # Print the (word, count) pairs for one subject type ("individuals" or
    # "entities"): first the normal-length words, then the short ones.
    print("\nNormal length", key, ":")
    for (word, count) in stop_words:
        print(word, count)
    print("\nShort length", key, ":")
    for (word, count) in stop_words_short:
        print(word, count)
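

# Example output shape for print_word_frequencies (the counts here are
# illustrative placeholders, not real frequencies):
#   Normal length individuals :
#   mohammed 42
#   ibrahim 37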


def find_most_common_words(sanction_entries, count):
    # Bucket normalized name parts by length: parts shorter than 2 characters
    # are ignored, and parts of 2-4 characters are counted separately from
    # longer ones.
    words = []
    short_words = []
    for sanction_entry in sanction_entries:
        (reference, list_subject) = sanction_entry
        (aliases, birthdates) = list_subject
        name_parts = normalizer.normalize_aliases(aliases)
        for name_part in name_parts:
            if len(name_part) < 2:
                continue  # ignored
            elif len(name_part) <= 4:
                short_words.append(name_part)
            else:
                words.append(name_part)
    # Count occurrences and keep the most frequent words in each bucket.
    word_counter = Counter(words)
    short_word_counter = Counter(short_words)
    stop_words = word_counter.most_common(count)
    stop_words_short = short_word_counter.most_common(count)
    return (stop_words, stop_words_short)
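
# Note: Counter.most_common returns (word, count) pairs sorted by descending
# frequency, e.g. Counter(["de", "la", "de"]).most_common(2) == [('de', 2), ('la', 1)].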


all_entities = []
all_persons = []

(persons, entities) = eu_reader.load_sanctions("eu/eu_global_full.xml")
all_entities.extend(entities.items())
all_persons.extend(persons.items())

(persons, entities) = un_reader.load_sanctions("un/consolidated.xml")
all_entities.extend(entities.items())
all_persons.extend(persons.items())

(persons, entities) = ofac_reader.load_sdn_sanctions("ofac/sdn_advanced.xml")
all_entities.extend(entities.items())
all_persons.extend(persons.items())

(persons, entities) = ofac_reader.load_consolidated_sanctions("ofac/cons_advanced.xml")
all_entities.extend(entities.items())
all_persons.extend(persons.items())
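
# Each collected item is a (reference, (aliases, birthdates)) tuple, matching
# the unpacking in find_most_common_words.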

count = 100
print("Loaded {} entities and {} persons from sanction lists".format(len(all_entities), len(all_persons)))
print(count, "most common words are:\n")

(stop_words, stop_words_short) = find_most_common_words(all_persons, count)
print_word_frequencies(stop_words, stop_words_short, "individuals")

# Inspect 2.5x as many candidate words for entities as for individuals.
(stop_words, stop_words_short) = find_most_common_words(all_entities, int(count * 2.5))
print_word_frequencies(stop_words, stop_words_short, "entities")

# stop_words and stop_words_short still hold the entity results from the last
# call above; cross-check them against the known business entity type names.
entity_words = set(word for (word, _count) in stop_words + stop_words_short)

business_entity_type_abbreviations = set()
with open('business_entity_type_names') as abbreviation_file:
    for entity_abbreviation in abbreviation_file:
        business_entity_type_abbreviations.add(entity_abbreviation.strip())

print("Words that are common in entity names and also appear among the business entity type abbreviations:")
for word in sorted(business_entity_type_abbreviations.intersection(entity_words)):
    print(word)