"""This module implements querying over text embeddings.
The key idea is to use boolean search to get some L0 candidates, then
find embeddings of top-N of those candidates and rerank them according
to cosine distance to query embedding.
"""
import argparse
import numpy as np
import os
import re
from embedder import Embedder, get_text_reduced
from merge_operations import not_and_postings
from query import Indexer
from scipy.spatial.distance import cosine
from typing import List, Tuple


def query_expand(query: str) -> Tuple[str, str]:
    """Get word tokens for the positive (OR) part and the NOT part.

    Args:
        query: Query string.

    Returns:
        Substrings of tokens that should be in results and that shouldn't.
    """
m = re.search(r" NOT", query)
if m:
q_pos = query[: m.start()]
q_neg = query[m.end():].strip("()")
else:
q_pos = query
q_neg = None
return q_pos, q_neg


def query_reduce(query: str) -> str:
    """Strip the query string of operator tokens (NOT and parentheses).

    Args:
        query: Query string.

    Returns:
        Query string without operator tokens.
    """
query = re.sub(r"NOT", "", query)
query = re.sub(r"[()]", "", query)
return query


def batch_embed(
    embedder: Embedder, texts: List[str], batch_size: int
) -> np.ndarray:
    """Compute embeddings in batches when they do not all fit in GPU memory.

    Args:
        embedder: Embedder with DistilBERT model.
        texts: List of song texts.
        batch_size: Batch size.

    Returns:
        Numpy array of shape (N, 768) with text embeddings.
    """
embeddings = np.zeros((len(texts), 768))
iters = len(texts) // batch_size
if len(texts) % batch_size > 0:
iters += 1
for i in range(iters):
batch = texts[i * batch_size: (i + 1) * batch_size]
emb_batch = embedder.embed(batch)
embeddings[i * batch_size: (i + 1) * batch_size] = emb_batch
return embeddings


def arg_parse() -> argparse.Namespace:
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="Querying with ML")
parser.add_argument(
"--root",
dest="root",
help="Lyrics root directory",
default="lyrics/",
type=str,
)
parser.add_argument(
"--index", dest="index", help="Index file", default="index", type=str
)
parser.add_argument(
"--q",
dest="query",
help="Query string. Syntax: 'word1 word2 NOT(word3 word4)'",
default="",
type=str,
)
parser.add_argument(
"--L0",
dest="l0_size",
help="How many hits from L0 are reranked with ML",
default=100,
type=int,
)
parser.add_argument(
"--L1",
dest="l1_size",
help="How many hits to show",
default=20,
type=int,
)
parser.add_argument(
"--bs",
dest="batch_size",
help="Batch size to use in L1, decrease if GPU rans out of memory",
default=100,
type=int,
)
return parser.parse_args()


def main():
args = arg_parse()
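    # Collect every lyrics file as a "<subdir>/<filename>" path relative to root.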
    docs = [
        os.path.join(d, f)
        for d in os.listdir(args.root)
        for f in os.listdir(os.path.join(args.root, d))
    ]
docs = sorted(docs)
index = Indexer(docs, args.index, args.root)
embedder = Embedder()

    # L0: boolean retrieval of candidates
q_pos, q_neg = query_expand(args.query)
    # AND together all positive tokens for the boolean L0 search
    q_pos_expand = re.sub(r" ", " AND ", q_pos)
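    # e.g. a positive part "word1 word2" becomes the boolean query "word1 AND word2"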
hits = index.query_boolean(q_pos_expand.split())
    # Remove documents that contain NOT-ed tokens
if q_neg:
for token in q_neg.split():
term = index.stemmer.stem(token)
try:
not_posting = index.tfidf(index.index[term])
except KeyError:
not_posting = []
hits = not_and_postings(not_posting, hits)
if not hits:
print("nothing found")
return
hits = sorted(hits, key=lambda item: item[1], reverse=True)
hits = hits[: args.l0_size]

    # L1: embedding-based reranking of the L0 candidates
doc_ids = [x[0] for x in hits]
filenames = [os.path.join(args.root, docs[i]) for i in doc_ids]
texts = [get_text_reduced(x, maxlen=512) for x in filenames]
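    # If all L0 candidates fit into a single batch, embed them in one pass;
    # otherwise fall back to batched embedding to limit GPU memory use.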
if args.batch_size >= args.l0_size:
embeddings = embedder.embed(texts)
else:
embeddings = batch_embed(embedder, texts, args.batch_size)
query_emb = embedder.embed([q_pos])[0]
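    # scipy's cosine() returns a distance (1 - cosine similarity), so sorting the
    # distances in ascending order puts the most similar documents first.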
dist_cos = [cosine(query_emb, e) for e in embeddings]
idx_cos = np.argsort(dist_cos)

    # Render the top results
q_red = query_reduce(args.query)
resorted = [doc_ids[i] for i in idx_cos]
    for i, doc_id in enumerate(resorted[: args.l1_size]):
        print("\n{}:".format(i))
        index.render_file(q_red.split(), docs[doc_id])
orig_pos = idx_cos[i]
print(
"\tL0 rank = {}; tf-idf = {:.3f}; cos-sim = {:.3f}".format(
orig_pos, hits[orig_pos][1], 1 - dist_cos[orig_pos]
)
)


if __name__ == "__main__":
main()