-
Notifications
You must be signed in to change notification settings - Fork 3
/
question_rewrite.py
executable file
·163 lines (137 loc) · 5.27 KB
/
question_rewrite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
# Returns (question number, re-written question) pairs as either a dictionary
# or a list.
# Questions are re-written as the start of a declarative answer, so for:
# "Who is the President?" you get:
# "the President is "
# Not all questions lend themselves to simple rewrites; in that case an empty
# string is returned.
from align import sw_align
from difflib import SequenceMatcher as SequenceMatcher
import init
import chunker
import read_questions
import sys
# Large sentinel used as an "infinitely far" distance. sys.maxint only exists
# on Python 2, so fall back to sys.maxsize to keep the module importable on
# Python 3 as well.
MAX_INT = getattr(sys, "maxint", sys.maxsize)
def literal_question_distance(question, candidate):
    """Evaluates a candidate based on how close it is to the longest fragment
    of the question in the document

    question  -- the question text to look for in the candidate's document
    candidate -- sequence of (answer, doc_num, index, features, q_id); only
                 doc_num and index (a word position in the cleaned document)
                 are used here

    returns (distance, length of fragment); distance is 0 when the candidate
    lies inside the matched fragment
    """
    # Explicit unpacking replaces the Python-2-only tuple parameter (removed
    # by PEP 3113); it still raises unless exactly five items are supplied.
    _answer, doc_num, index, _features, _q_id = candidate

    doc = chunker.clean_punctuation(init.get_doc(doc_num))
    (start, _, length) = find_match(question, doc)
    end = start + length

    # Turn the word index into an approximate character offset (end of that
    # word, words re-joined with single spaces) so that it is comparable with
    # the character offsets returned by find_match.
    offset = len(" ".join(doc.split()[:index + 1]))

    if start <= offset <= end:
        return (0, length)
    return (min(abs(start - offset), abs(end - offset)), length)
def literal_rewrite_distance(question, candidate):
    """Same measure as literal_question_distance, applied to the re-written
    question

    The question is first passed through rewriteQuestion and the result is
    matched literally against the candidate's document.
    returns (distance, length of fragment)
    """
    rewritten = rewriteQuestion(question)
    return literal_question_distance(rewritten, candidate)
def align_question_distance(question, candidate):
    """Evaluates a candidate based on how close it is to the alignment of the
    question in the document

    question  -- the question text to align against the candidate's document
    candidate -- sequence of (answer, doc_num, index, features, q_id); only
                 doc_num and index (a word position in the cleaned document)
                 are used here

    returns (distance, score); distance is 0 when the candidate lies inside
    the aligned span of the document, score is the value reported by sw_align
    """
    # Explicit unpacking replaces the Python-2-only tuple parameter (removed
    # by PEP 3113); it still raises unless exactly five items are supplied.
    _answer, doc_num, index, _features, _q_id = candidate

    doc = chunker.clean_punctuation(init.get_doc(doc_num))
    # Only the score and the document-side start/end of the alignment are
    # needed; the remaining fields of the sw_align result are ignored.
    (score, _q, _d, (_q_start, d_start), (_q_end, d_end)) = sw_align(
        question, doc)

    # Turn the word index into an approximate character offset (end of that
    # word, words re-joined with single spaces).
    # NOTE(review): assumes d_start/d_end are character offsets into doc -
    # confirm against align.sw_align.
    offset = len(" ".join(doc.split()[:index + 1]))

    if d_start <= offset <= d_end:
        return (0, score)
    return (min(abs(d_start - offset), abs(d_end - offset)), score)
def align_rewrite_distance(question, candidate):
    """Same measure as align_question_distance, applied to the re-written
    question

    The question is first passed through rewriteQuestion and the result is
    aligned against the candidate's document.
    returns (distance, score)
    """
    rewritten = rewriteQuestion(question)
    return align_question_distance(rewritten, candidate)
def rewriteQuestion(question):
    """Attempts to re-write a question as the start of a declarative answer

    Takes a question as a string, and attempts to re-write it with simple
    rules built around the first " is " (or, failing that, " was ") in it,
    e.g. "Who is the President?" becomes "the President is ".
    Returns a possible re-write, or the empty string when neither verb occurs.
    """
    # Spaces around the verb keep words that merely contain "is"/"was" from
    # matching. "are" and "were" were tried too, but rarely gave a
    # grammatically correct rewrite.
    for verb in (" is ", " was "):
        verb_at = question.find(verb)
        if verb_at != -1:
            break
    else:
        return ""

    # Everything after the verb, without trailing whitespace or the "?".
    # The "?" is only removed when it is really there, so a question that
    # lacks one does not lose its last letter.
    remainder = question[verb_at + len(verb):].rstrip()
    if remainder.endswith("?"):
        remainder = remainder[:-1]

    # A trailing "-ed" word (VBN) gets the verb placed in front of it.
    # "Where was" questions sometimes need the verb between the NP and the
    # final word as well; this works on the test questions but could easily
    # be fooled by unseen ones.
    if remainder.endswith("ed") or question.startswith("Where was"):
        # rpartition gives an empty head when there is no space, so a
        # one-word remainder becomes verb + word instead of being mangled.
        head, _, last_word = remainder.rpartition(" ")
        return head + verb + last_word

    return remainder + verb
def unwrap_match(doc, match):
    """Returns the piece of doc covered by match, assuming doc was the first
    sequence ("a") given to the matcher that produced match"""
    begin = match.a
    end = begin + match.size
    return doc[begin:end]
def find_match(question, doc):
    """Locates the longest run of text shared by doc and question

    Both strings are lower-cased first, so the comparison ignores case.
    Returns the longest match as a Named Tuple (a, b, size): a is the offset
    in the lower-cased doc, b the offset in the lower-cased question and size
    the length of the shared run. Use unwrap_match(doc, match) to unwrap it.
    """
    lowered_doc = doc.lower()
    lowered_question = question.lower()
    matcher = SequenceMatcher(None, lowered_doc, lowered_question)
    return matcher.find_longest_match(
        0, len(lowered_doc), 0, len(lowered_question))
def rewriteQuestionsList():
    """Re-writes every question and returns the result as a list of lists

    The questions come from read_questions.read_questions_no_answers() and
    are re-written through rewriteQuestionsDict; every dictionary entry then
    becomes a two-item list [key, re-written question], like read_questions
    presents its data.
    """
    rewritten = rewriteQuestionsDict(
        read_questions.read_questions_no_answers())
    return [[number, text] for number, text in rewritten.items()]
def rewriteQuestionsDict(qList):
    """Re-writes a list of questions into a {question number: re-write} dict

    qList is a list of lists; the items of all inner lists are read in order
    and taken two at a time as (question number, question text). A trailing
    item without a partner is ignored.
    Returns a dictionary mapping each question number to whatever
    rewriteQuestion returns for its text.
    """
    # Flatten first: the pairing runs across inner-list boundaries.
    flat = [item for entry in qList for item in entry]
    # key: question number, value: question as a string
    questions = dict(zip(flat[0::2], flat[1::2]))
    # now go through the dict, rewriting if possible
    return dict(
        (number, rewriteQuestion(text)) for number, text in questions.items())
if __name__ == "__main__":
    # Manual smoke test: score one hand-picked candidate against a document.
    q_num = 209
    init.get_corpus(qNum=q_num)
    question = "Who is the inventor of the phonograph?"
    doc = "SJMN91-06010225"
    # A candidate is (answer, doc_num, index, features, q_id). All five items
    # are required because align_question_distance unpacks the whole tuple.
    candidate = (1, doc, 30, {}, q_num)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(align_question_distance(question, candidate))