-
Notifications
You must be signed in to change notification settings - Fork 5
/
experiment_3.py
86 lines (66 loc) · 2.98 KB
/
experiment_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Extrinsic evaluation on the i2b2 data.
Experiment 3 in the paper.
Set the boolean flag `perfect` to True if you want to try perfect chunking.
"""
import json
from itertools import chain
from conch.evaluation.extrinsic import eval_extrinsic
from conch.preprocessing.baseline import baseline
from conch.preprocessing.concept_vectors import create_concepts
from reach import Reach
from conch.conch import compose, reciprocal
from conch.evaluation.utils import to_conll
def _load_json(path):
    """Parse the JSON document stored at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes instead of being left to the garbage collector.
    It is read as UTF-8 rather than with the platform default encoding.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _values_by_sorted_key(mapping):
    """Return the values of *mapping* as a tuple, ordered by sorted key.

    Raises:
        IndexError: if *mapping* is empty.
    """
    return list(zip(*sorted(mapping.items())))[1]


def _predict_bio(data, data_bio, embeddings, concepts, labels, window):
    """Run ``compose`` on *data* and feed the result to ``eval_extrinsic``.

    Args:
        data: records passed unchanged to ``compose``.
        data_bio: per-record sequences; flattened into a single fresh list
            for every call.
        embeddings: value for the ``embeddings`` argument of ``compose``.
        concepts: concept vectors handed to ``eval_extrinsic``.
        labels: concept labels handed to ``eval_extrinsic``.
        window: value for the ``window`` argument of ``compose``.

    Returns:
        Whatever ``eval_extrinsic`` returns for these inputs.
    """
    r_phrases = compose(data,
                        window=window,
                        embeddings=embeddings,
                        context_function=reciprocal)
    # NOTE(review): 250 is passed positionally as the last argument of
    # eval_extrinsic; its meaning is not visible here -- confirm against
    # conch.evaluation.extrinsic before changing it.
    return eval_extrinsic(list(chain.from_iterable(data_bio)),
                          r_phrases,
                          concepts,
                          labels,
                          250)


if __name__ == "__main__":

    # Set this flag to true to replicate the perfect chunking setting
    # in experiment 3.
    perfect = False

    gold = _values_by_sorted_key(_load_json("data/test_gold.json"))

    # With perfect chunking the gold file doubles as the input data;
    # otherwise data/test_uima.json is used.
    data_path = "data/test_gold.json" if perfect else "data/test_uima.json"
    data = _values_by_sorted_key(_load_json(data_path))

    # Every record unpacks into two fields. From the input data only the
    # second field is used below; the first is discarded.
    txt, gold_bio = zip(*gold)
    _, data_bio = zip(*data)

    # NOTE(review): the embedding path is an empty string in this script;
    # presumably it must be replaced by the path to a local embedding file
    # before running -- confirm.
    embeddings = Reach.load("", unk_word="UNK")
    concept_reach = Reach.load_fast_format("data/concept_vectors")
    concept_labels = _load_json("data/concept_names2label.json")

    gold_bio = list(chain.from_iterable(gold_bio))

    # Never populated anywhere in this script, so the JSON file written at
    # the end contains an empty object.
    results_bio = {}

    # Same embeddings and concepts, two window sizes: 0 and 10.
    pred_bio_focus = _predict_bio(data,
                                  data_bio,
                                  embeddings,
                                  concept_reach,
                                  concept_labels,
                                  window=0)
    pred_bio_full = _predict_bio(data,
                                 data_bio,
                                 embeddings,
                                 concept_reach,
                                 concept_labels,
                                 window=10)

    # Baseline: embeddings and concepts are built from the gold text via
    # baseline() / create_concepts(); concept_labels is rebound here, so
    # the labels loaded from JSON above are no longer used past this point.
    txt = list(chain.from_iterable(txt))
    baseline_embeddings = baseline(txt, 10000)
    concept_baseline, concept_labels = create_concepts(baseline_embeddings,
                                                       include_np=True)
    pred_bio_baseline = _predict_bio(data,
                                     data_bio,
                                     baseline_embeddings,
                                     concept_baseline,
                                     concept_labels,
                                     window=0)

    # Close the output file explicitly so the JSON is flushed to disk.
    with open("results/knn_test_extrinsic.json", 'w') as handle:
        json.dump(results_bio, handle)

    to_conll(pred_bio_focus, gold_bio, "results/test_focus_model.conll")
    to_conll(pred_bio_full, gold_bio, "results/test_full_model.conll")
    to_conll(pred_bio_baseline, gold_bio, "results/test_baseline.conll")