# Phase2Run.py
import Indexer
import io
import math
import string
import re
import traceback
import snippetGeneration
# GLOBAL STATE (populated in main() from the Indexer module)
DOC_TOKEN_COUNT = {}
INVERTED_INDEX = {}
QUERY_ID = 0
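# INVERTED_INDEX maps each term to {doc_id: frequency of the term in that document}.
# DOC_TOKEN_COUNT maps each doc_id to the total number of tokens in that document.
# QUERY_ID holds the 1-based id of the query currently being processed.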
def average_doc_length():
# Returns the average document length for the documents in this input corpus
total_length = 0
for doc in DOC_TOKEN_COUNT:
total_length += DOC_TOKEN_COUNT[doc]
return float(total_length) / float(len(DOC_TOKEN_COUNT))
def read_relevance_info():
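    # Returns the documents judged relevant for the current QUERY_ID that also
    # appear in the indexed corpus. Assumed qrels line layout in cacm.rel.txt,
    # based on the fields read below: "<query_id> Q0 <doc_id> <relevance>".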
try:
relevant_docs = []
rel_docs_in_corpus = []
with io.open("cacm.rel.txt", 'r', encoding="utf-8") as relevance_file:
for line in relevance_file.readlines():
values = line.split()
if values and (values[0] == str(QUERY_ID)):
relevant_docs.append(values[2])
for doc_id in DOC_TOKEN_COUNT:
if doc_id in relevant_docs:
rel_docs_in_corpus.append(doc_id)
return rel_docs_in_corpus
    except Exception as e:
        print(traceback.format_exc(), e)
        # Fall back to an empty list so callers such as BM25_score can still run.
        return []
def relevant_doc_count(docs_with_term, relevant_docs):
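    # Counts how many of the documents containing the term are also in the relevant set.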
count = 0
for doc_id in docs_with_term:
if doc_id in relevant_docs:
count += 1
return count
def BM25_score(fetched_index, query_term_freq):
# Computing BM25 scores for all documents in the given index
# Returning a map of the document ids with their BM25 score
DOC_SCORE = {}
relevant_docs = read_relevance_info()
R = len(relevant_docs)
avdl = average_doc_length()
N = len(DOC_TOKEN_COUNT)
k1 = 1.2
k2 = 100
b = 0.75
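    # Per-term BM25 contribution for a document, as computed in the loop below:
    #   log( ((r + 0.5) / (R - r + 0.5)) / ((n - r + 0.5) / (N - n - R + r + 0.5)) )
    #     * ((k1 + 1) * f) / (K + f)
    #     * ((k2 + 1) * qf) / (k2 + qf)
    # with K = k1 * ((1 - b) + b * dl / avdl), where dl is the document length,
    # avdl the average document length, f the term frequency in the document,
    # qf the term frequency in the query, n the number of documents containing
    # the term, N the corpus size, r the number of relevant documents containing
    # the term, and R the number of relevant documents for this query.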
for query_term in query_term_freq:
qf = query_term_freq[query_term]
n = len(fetched_index[query_term])
if query_term in INVERTED_INDEX:
r = relevant_doc_count(INVERTED_INDEX[query_term], relevant_docs)
else:
r = 0
        for doc in fetched_index[query_term]:
            f = fetched_index[query_term][doc]
            # Default to a length of 0 if the document is missing from the count map.
            dl = DOC_TOKEN_COUNT.get(doc, 0)
            K = k1 * ((1 - b) + (b * (float(dl) / float(avdl))))
            relevance_part = math.log(((r + 0.5) / (R - r + 0.5)) / ((n - r + 0.5) / (N - n - R + r + 0.5)))
            k1_part = ((k1 + 1) * f) / (K + f)
            k2_part = ((k2 + 1) * qf) / (k2 + qf)
            # Accumulate the per-term contribution into this document's score.
            if doc in DOC_SCORE:
                DOC_SCORE[doc] += (relevance_part * k1_part * k2_part)
            else:
                DOC_SCORE[doc] = (relevance_part * k1_part * k2_part)
    # Return the map of document ids to BM25 scores; sorting is done by the caller.
return DOC_SCORE
def query_matching_index(query_term_freq):
# Fetching only those inverted lists from the index, that correspond to the query terms
fetched_index = {}
for term in query_term_freq:
if term in INVERTED_INDEX:
fetched_index[term] = INVERTED_INDEX[term]
else:
fetched_index[term] = {}
return fetched_index
def query_term_freq_map(query):
# Returns a map of query terms and their corresponding frequency in the query
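    # Example (illustrative): "parallel parallel algorithms" -> {'parallel': 2, 'algorithms': 1}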
query_terms = query.split()
query_term_freq = {}
for term in query_terms:
if term not in query_term_freq:
query_term_freq[term] = 1
else:
query_term_freq[term] += 1
return query_term_freq
def extract_queries_from_file():
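    # Reads cacm.query.txt and returns a list of lower-cased, cleaned query strings.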
extracted_queries = []
    with open("cacm.query.txt", 'r') as query_file:
        raw_queries = query_file.read()
while raw_queries.find('<DOC>') != -1:
query, raw_queries = extract_first_query(raw_queries)
extracted_queries.append(query.lower())
return extracted_queries
def extract_first_query(raw_queries):
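    # Splits off the first query record and returns (cleaned query, remaining text).
    # Assumed record layout in cacm.query.txt, based on the markers used below:
    #   <DOC>
    #   <DOCNO> 1 </DOCNO>
    #   ... query text ...
    #   </DOC>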
transformed_query = []
    query = raw_queries[raw_queries.find('</DOCNO>') + len('</DOCNO>'):raw_queries.find('</DOC>')]
query = str(query).strip()
query_terms = query.split()
for term in query_terms:
transformed_term = term.strip(string.punctuation)
transformed_term = re.sub(r'[^a-zA-Z0-9\-,.–]', '', str(transformed_term))
if transformed_term != '':
transformed_query.append(transformed_term)
query = " ".join(transformed_query)
    raw_queries = raw_queries[raw_queries.find('</DOC>') + len('</DOC>'):]
return query, raw_queries
def QLM_score(fetched_index, query_term_freq):
# Computes QLM scores for all documents in the given index
# Returns a map of the document ids with their QLM score
DOC_SCORE_QLM = {}
C = 0
lambda_value = 0.35
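    # Jelinek-Mercer smoothed query likelihood, scored in log space:
    #   score(D, Q) = sum over query terms q of log((1 - lambda) * fq / |D| + lambda * cq / |C|)
    # where fq is the frequency of q in document D, cq its frequency in the whole
    # collection, |D| the document length, and |C| the total collection length.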
# Initialize all docs with score = 0
for doc in DOC_TOKEN_COUNT:
DOC_SCORE_QLM[doc] = 0
C = C + DOC_TOKEN_COUNT[doc] # total number of words in collection
for query_term in query_term_freq:
cq = 0
for doc in fetched_index[query_term]:
cq = cq + fetched_index[query_term][doc] # total occurrence of query term in collection
for doc in fetched_index[query_term]:
D = DOC_TOKEN_COUNT[doc] # total number of words in doc
fq = fetched_index[query_term][doc] # total occurrence of query term in doc
first_part = float(1 - lambda_value) * fq / D
second_part = float(lambda_value) * cq / C
DOC_SCORE_QLM[doc] += math.log(first_part + second_part)
    # Return the map of document ids to QLM scores; sorting is done by the caller.
return DOC_SCORE_QLM
def tfidf_score(fetched_index):
    # Computes tf-idf scores for all documents in the given index
    # Returns a map of the document ids with their tf-idf score
DOC_SCORE_TFIDF = {}
tf_idf_dict = {}
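    # tf-idf per (term, document) pair, as computed below:
    #   tf  = frequency of the term in the document / document length
    #   idf = 1 + log(N / (df + 1)), with N = number of documents in the corpus
    #         and df = number of documents containing the term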
for term in fetched_index:
idf = 1.0 + math.log(float(len(DOC_TOKEN_COUNT)) / float(len(fetched_index[term].keys()) + 1))
for doc_id in fetched_index[term]:
tf = float(fetched_index[term][doc_id]) / float(DOC_TOKEN_COUNT[doc_id])
if term not in tf_idf_dict:
tf_idf_dict[term] = {}
tf_idf_dict[term][doc_id] = tf * idf
    for term in fetched_index:
        for doc in fetched_index[term]:
            # Accumulate each term's tf-idf weight into the document's total score.
            DOC_SCORE_TFIDF[doc] = DOC_SCORE_TFIDF.get(doc, 0) + tf_idf_dict[term][doc]
    # Return the map of document ids to tf-idf scores; sorting is done by the caller.
return DOC_SCORE_TFIDF
def get_doc_weight(doc, tf_idf_dict):
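    # Helper (currently unused here): total tf-idf weight of a document across all terms.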
doc_weight = 0
for term in tf_idf_dict:
if doc in tf_idf_dict[term]:
doc_weight += tf_idf_dict[term][doc]
return doc_weight
def main():
print("--- RETRIEVAL WITH SNIPPETS ---")
    # Generate the unigram index. Stopping is disabled by default, so pass False.
Indexer.unigram_index(False)
# Fetch the index generated.
global INVERTED_INDEX
INVERTED_INDEX = Indexer.INVERTED_INDEX
global DOC_TOKEN_COUNT
DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
# Read all queries.
queries = extract_queries_from_file()
global QUERY_ID
for query in queries:
QUERY_ID += 1
# Dictionary of query term frequency
query_term_freq = query_term_freq_map(query)
# Fetch the inverted indexes corresponding to the terms
# in the query.
fetched_index = query_matching_index(query_term_freq)
# Compute BM25 score for this query.
doc_scores = BM25_score(fetched_index, query_term_freq)
doc_list = []
# Extract top 100 docs, for which we need
# to generate snippets.
    sorted_scores = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
    for doc_id, score in sorted_scores[:100]:
        doc_list.append(doc_id)
# Printing snippets using query term highlighting onto console using snippetGeneration
snippetGeneration.snippet_generator(doc_list, query, INVERTED_INDEX)
print("Completed Retrieval for query : " + query)
print("\nEnd of Retrieval with Snippets.")
if __name__ == "__main__":
    main()