forked from ShuyangCao/open-ended_question_ontology
-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_corenlp_output.py
87 lines (70 loc) · 3.09 KB
/
merge_corenlp_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import argparse
import json
from concurrent.futures import ProcessPoolExecutor
def convert_one(parsed_file):
    """Convert one parsed JSON document into a merged, document-level record.

    The sentences of the document are flattened into a single word list, and
    every coreference mention's ``startIndex``, ``endIndex`` and ``headIndex``
    is shifted by the number of words that precede the mention's sentence, so
    that all indices refer to the flattened document. The index values read
    from the file are shifted as-is; no change of index base is applied.

    Args:
        parsed_file: Path to a JSON file holding ``docId``, ``sentences``
            (each with ``tokens`` that carry a ``word``) and ``corefs``
            (a mapping whose values are lists of mentions).

    Returns:
        A dict with three keys:
            ``id``: the part of ``docId`` before its first ``.``.
            ``sents_parsed``: the sentence objects in document order.
            ``coref``: a dict with ``document`` (all words), ``clusters``
                (one list of ``[start, end]`` pairs per cluster) and
                ``clusters_heads`` (one list of head indices per cluster).

    Raises:
        KeyError: If one of the keys listed above is missing from the input.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default,
    # which would mis-decode non-ASCII tokens on some platforms.
    with open(parsed_file, encoding='utf-8') as f:
        parsed_output = json.load(f)

    doc_id = parsed_output['docId'].split('.')[0]

    sents_parsed = list(parsed_output['sentences'])

    # sentence_offsets[i] is the number of words before sentence i.
    sentence_offsets = []
    doc_words = []
    for sentence in sents_parsed:
        sentence_offsets.append(len(doc_words))
        doc_words.extend(tok['word'] for tok in sentence['tokens'])

    clusters = []
    clusters_heads = []
    for cluster in parsed_output['corefs'].values():
        spans = []
        heads = []
        for mention in cluster:
            # 'sentNum' is used minus one to index the per-sentence offsets.
            offset = sentence_offsets[mention['sentNum'] - 1]
            spans.append([mention['startIndex'] + offset,
                          mention['endIndex'] + offset])
            heads.append(mention['headIndex'] + offset)
        clusters.append(spans)
        clusters_heads.append(heads)

    coref = {
        'document': doc_words,
        'clusters': clusters,
        'clusters_heads': clusters_heads,
    }
    return {'id': doc_id, 'sents_parsed': sents_parsed, 'coref': coref}
def main():
    """Merge per-document parse files into ordered answer/question JSONL files.

    Command-line arguments (all positional):
        parsed_dir: Directory whose entries are each passed to ``convert_one``.
        order_file: Text file of paths; every line containing ``answer``
            contributes one id (last ``/``-separated component, cut at its
            first ``.``), and the ids define the output order.
        answer_out_file: Destination for the answer records.
        question_out_file: Destination for the question records.

    Entries of ``parsed_dir`` whose name contains ``answer`` are treated as
    answer documents; every other entry is treated as a question document.
    Each output file receives one JSON object per line, in the order given by
    ``order_file``.

    Raises:
        KeyError: If an id from ``order_file`` has no converted answer or
            question record.
    """
    parser = argparse.ArgumentParser()
    for positional in ('parsed_dir', 'order_file',
                       'answer_out_file', 'question_out_file'):
        parser.add_argument(positional)
    args = parser.parse_args()

    file_names = os.listdir(args.parsed_dir)

    with open(args.order_file) as order_handle:
        ordered_ids = [
            line.strip().split('/')[-1].split('.')[0]
            for line in order_handle
            if 'answer' in line
        ]

    answer_futures = []
    question_futures = []
    with ProcessPoolExecutor() as pool:
        for file_name in file_names:
            future = pool.submit(
                convert_one, os.path.join(args.parsed_dir, file_name))
            # Route on the file name: anything not an answer is a question.
            if 'answer' in file_name:
                answer_futures.append(future)
            else:
                question_futures.append(future)

        answer_records = [future.result() for future in answer_futures]
        question_records = [future.result() for future in question_futures]

    # Index the records by id so they can be emitted in order-file order.
    answers_by_id = {record['id']: record for record in answer_records}
    questions_by_id = {record['id']: record for record in question_records}

    ordered_answers = [answers_by_id[doc_id] for doc_id in ordered_ids]
    ordered_questions = [questions_by_id[doc_id] for doc_id in ordered_ids]

    with open(args.answer_out_file, 'w') as answer_out:
        for record in ordered_answers:
            answer_out.write(json.dumps(record) + '\n')

    with open(args.question_out_file, 'w') as question_out:
        for record in ordered_questions:
            question_out.write(json.dumps(record) + '\n')
# Run the merge only when this file is executed as a script, so importing the
# module does not trigger argument parsing or any file I/O.
if __name__ == '__main__':
    main()