-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
71 lines (59 loc) · 2.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import argparse
from EntityNormalizer import EntityDictionary, normalize
"""
Command line tool for normalizing entities based on a dictionary.
The input file must contain one entity per line. The output file will contain the normalized entities, again, one per line.
If the entity does not produce any match in the dictionary, it will be normalized to [NO_MATCH]. If the entity is found
in the dictionary but the normalization is empty, it will be normalized to [NO_NORM_FOUND].
Command line arguments:
input: Input file path
output: Output file path
dictionary: Normalization dictionary file path
source: Surface form column from dictionary
target: Normalization column from dictionary
matching_threshold: Threshold of string similarity for the normalization to be accepted (default: 50) (Optional)
index: Use column indexes instead of names (Optional)
Example usage (with column names):
python main.py data/input.txt data/output.txt data/dictionary.csv surface_form_col normalization_col --matching_threshold 50
Example usage (with integer column indexes):
python main.py data/input.txt data/output.txt data/dictionary.csv 0 2 --index --matching_threshold 80
"""
# Text shown by ``--help``; RawTextHelpFormatter prints the line breaks as-is.
_DESCRIPTION = (
    "Command line tool for normalizing entities based on a dictionary.\n"
    "The input file must contain one entity per line. The output file will "
    "contain the normalized entities, again, one per line.\n"
    "If the entity does not produce any match in the dictionary, it will be "
    "normalized to [NO_MATCH]. If the entity is found in the dictionary but "
    "the normalization is empty, it will be normalized to [NO_NORM_FOUND]."
)


def build_parser():
    """Return the argument parser for the entity-normalization CLI.

    Positional arguments, in order: ``input``, ``output``, ``dictionary``,
    ``source``, ``target``. Optional arguments: ``--matching_threshold``
    (int, default 50) and the ``--index`` flag.
    """
    parser = argparse.ArgumentParser(
        description=_DESCRIPTION,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("input", type=str, help="Input file path")
    parser.add_argument("output", type=str, help="Output file path")
    parser.add_argument(
        "dictionary", type=str, help="Normalization dictionary file path"
    )
    parser.add_argument(
        "source", type=str, help="Surface form column from dictionary"
    )
    parser.add_argument(
        "target", type=str, help="Normalization column from dictionary"
    )
    parser.add_argument(
        "--matching_threshold",
        type=int,
        help="Threshold of string similarity for the normalization to be accepted (default: 50)",
        default=50,
    )
    parser.add_argument(
        "--index", action="store_true", help="Use column indexes instead of names"
    )
    return parser


def read_entities(path):
    """Read one entity per line from *path*.

    The line terminator is removed from every entity so it cannot leak into
    the string comparison done by the normalizer. Blank lines are kept (as
    empty strings) so that output line N still corresponds to input line N.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A list with one string per input line, without trailing newlines.
    """
    # Explicit encoding: the platform default is locale-dependent.
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def write_entities(path, entities):
    """Write *entities* to *path*, newline-separated, encoded as UTF-8.

    No newline is written after the last entity.

    Args:
        path: Path of the file to create or overwrite.
        entities: Iterable of strings to write, one per line.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(entities))


def main(argv=None):
    """Run the command line tool.

    Reads the entities from the input file, normalizes them against the
    dictionary and writes the result to the output file.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Parse input file
    text_entities = read_entities(args.input)

    # Normalize entities
    # NOTE(review): with --index the column identifiers are still passed as
    # strings ("0", "2"); presumably EntityDictionary converts them - confirm.
    normalization_dictionary = EntityDictionary(
        args.dictionary, args.source, args.target, index=args.index
    )
    normalized = normalize(
        text_entities,
        normalization_dictionary,
        matching_threshold=args.matching_threshold,
    )

    # Write output file
    write_entities(args.output, normalized)


if __name__ == "__main__":
    main()