-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsemsearch.py
80 lines (50 loc) · 2.44 KB
/
semsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# S4 - Super Simple Semantic Search - a minimalistic demo of semantic search with BGE-small-en-v1.5
# Requirements:
# pip install FlagEmbedding pandas tqdm
from FlagEmbedding import FlagModel
# Load the embedding model once at startup; the same instance encodes both the documents and the queries.
# NOTE(review): use_fp16=True presumably selects half-precision inference (faster, slightly less precise) - confirm against the FlagEmbedding docs.
model = FlagModel('BAAI/bge-small-en-v1.5', use_fp16=True)
# First, import our data (describe3.csv)
import pandas as pd
import tqdm as tqdm
import time
# Load the source table; each row is one document to index.
data = pd.read_csv("describe3.csv")

# Column roles: the text that gets embedded, the identifier meant to be returned on a lookup
# (not referenced again in the visible script), and the new column that holds each row's vector.
col_document = "Description"
col_key = "ProductID"
col_embedding = "Embedding"

# Embed every document, one encode() call per row, showing a progress bar while we do so.
print(f'Embedding {len(data)} documents...')
# Iterate the document column directly: iterrows() would build a full Series for every row
# and does not preserve dtypes, while only this one column is needed.
# NOTE(review): a row with a missing description reaches encode() as NaN - confirm the CSV has none.
embeddings = [
    model.encode(document)
    for document in tqdm.tqdm(data[col_document], total=len(data))
]
data[col_embedding] = embeddings

# Optional: save the data with its embeddings to a new CSV file.
# NOTE(review): nothing in this script reads the file back, so embeddings are still recomputed
# at every startup; the vectors are presumably written in their string form - confirm they can
# be parsed before relying on this file as a cache.
data.to_csv("input_with_embeddings.csv", index=False)
# Now, let's define a function that will take a query and return the most similar items in our data.
def search(query, top_n=5):
    """Return the ``top_n`` documents most similar to ``query``.

    The query is embedded with the module-level ``model`` and scored against
    every vector stored in ``data[col_embedding]`` using a dot product; higher
    scores rank first. The time spent embedding the query and scoring the
    documents is printed.

    Args:
        query: Text to search for.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with the best-scoring rows of ``data`` in descending
        score order. The score is stored in a ``query_distance`` column and
        the original row labels are moved into a column by ``reset_index()``.
    """
    print(f'Searching for documents related to: {query}')

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    query_embedding = model.encode(query)

    # Score every document against the query (similarity = embeddings_1 @ embeddings_2.T).
    # NOTE(review): the dot product equals cosine similarity only if encode() returns
    # unit-length vectors - confirm for this model configuration.
    search_time = time.perf_counter()
    query_distances = data[col_embedding].apply(lambda x: query_embedding @ x.T)
    final_time = time.perf_counter()

    print(f" Embedded query in {search_time - start_time}")
    print(f" Nearest-neighbor search in {final_time - search_time}")
    print(f" Total time: {final_time - start_time}")

    # Rank the scores on their own and copy only the winning rows, instead of
    # copying and sorting the whole table on every query. Positions (not labels)
    # are used so the row selection is independent of the index labels of `data`.
    ranked = (
        query_distances.reset_index(drop=True)
        .sort_values(ascending=False)
        .head(top_n)
    )
    results = data.iloc[ranked.index.to_numpy()].copy()
    results["query_distance"] = ranked.to_numpy()
    return results.reset_index()
# Run one demo query first, then keep prompting until the user submits a blank line.
search_query = "Artwork depicting archery"
while search_query.strip():
    # Let's test our search function with a query.
    results = search(search_query, top_n=5)
    print(f"Results for query: {search_query}")
    print(results)
    print()
    print('What is your query? (blank to exit)')
    try:
        search_query = input()
    except EOFError:
        # stdin was closed (Ctrl-D or end of piped input): exit cleanly, like a blank query.
        search_query = ""
print("All done")