import os
import time
import pandas as pd
from rag_vectorstore import similarity_search_doc, multi_similarity_search_doc
from LogSetup import logger
from rag_ragas import retriever_evaluation
from StipVectorStore import StipVectorStore
from StipKnowledgeBase import load_suswiki, load_wikipedia, load_50_qa_dataset
from StipEmbedding import StipEmbedding
#### Experiment configuration
EMBED_MODELS = [StipEmbedding("gte").embed_model, StipEmbedding("uae").embed_model, StipEmbedding("bge").embed_model]
CHUNK_SIZE = 200
CHUNK_OVERLAP_SCALE = 0.1
TOP_KS = [1, 2, 3]
INDEX_DISTANCES = ["l2", "cosine", "ip"]
VECTORSTORES = (StipVectorStore("faiss"), StipVectorStore("chroma"))
QUESTION_DATASET = load_50_qa_dataset()['train']
#### Load Knowledge Bases
suswiki_kb = load_suswiki()
wikipedia_kb = load_wikipedia()
#%% retriever: create ALL wikipedia vector stores
# # Get the total number of iterations
# total_iterations = len(VECTORSTORES) * len(EMBED_MODELS) * len(INDEX_DISTANCES)
# iteration = 0
# for vector_str in VECTORSTORES:
#     print(f"Processing VECTORSTORE: {vector_str.vectorstore_name}")
#     for embed_model in EMBED_MODELS:
#         print(f"  Using EMBED_MODEL: {embed_model.model_name}")
#         for index_dist in INDEX_DISTANCES:
#             iteration += 1
#             print(f"    Applying INDEX_DISTANCE: {index_dist}")
#             print(f"    Starting iteration {iteration} of {total_iterations}...")
#             print(f"    start creating {vector_str.vectorstore_name} vectorstore for wikipedia with embed model {embed_model.model_name} and index distance {index_dist}")
#             wikipedia_vectorstore = vector_str.create_vectorstore(wikipedia_kb, embed_model, CHUNK_SIZE, CHUNK_OVERLAP_SCALE, index_dist)
#             print(f"    success creating vectorstore for wikipedia with embed model {embed_model.model_name} and index distance {index_dist}")
#             logger.info(f"success creating vectorstore for wikipedia with embed model {embed_model.model_name} and index distance {index_dist}")
#             print(f"    Finished iteration {iteration} of {total_iterations}.")
#         print(f"  Finished processing EMBED_MODEL: {embed_model.model_name}")
#     print(f"Finished processing VECTORSTORE: {vector_str.vectorstore_name}")
#%% use multiprocessing for the vectorstore creation
# from multiprocessing import Pool
#
# def multi_create_vectorstore(vector_str, kb, embed_model, chunk_size, chunk_overlap_scale, index_dist):
#     print(f"start creating {vector_str.vectorstore_name} vectorstore for wikipedia with embed model {embed_model.model_name} and index distance {index_dist}")
#     wikipedia_vectorstore = vector_str.create_vectorstore(kb, embed_model, chunk_size, chunk_overlap_scale, index_dist)
#     print(f"success creating vectorstore for wikipedia with embed model {embed_model.model_name} and index distance {index_dist}")
#     logger.info(f"success creating vectorstore for wikipedia with embed model {embed_model.model_name} and index distance {index_dist}")
#     return wikipedia_vectorstore
#
# def multi_create_vectorstore_wrapper(args):
#     return multi_create_vectorstore(*args)
#
# def multi_create_vectorstore_pipeline(vector_strs, kb, embed_models, chunk_size, chunk_overlap_scale, index_dists):
#     # build one argument tuple per (vectorstore, embedding, distance) combination
#     args = []
#     for vector_str in vector_strs:
#         for embed_model in embed_models:
#             for index_dist in index_dists:
#                 args.append((vector_str, kb, embed_model, chunk_size, chunk_overlap_scale, index_dist))
#     print(f"running {len(args)} vectorstore creations in parallel")
#     # run the creations in a process pool
#     pool = Pool()
#     results = pool.map(multi_create_vectorstore_wrapper, args)
#     pool.close()
#     pool.join()
#     return results
#
# test_pipeline = multi_create_vectorstore_pipeline(VECTORSTORES, wikipedia_kb, EMBED_MODELS, CHUNK_SIZE, CHUNK_OVERLAP_SCALE, INDEX_DISTANCES)
#%%
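# Instantiate the vector stores and embedding models used for the manual run below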
faiss_vs = StipVectorStore("faiss")
chroma_vs = StipVectorStore("chroma")
bge_embedding = StipEmbedding("bge").embed_model
uae_embedding = StipEmbedding("uae").embed_model
gte_embedding = StipEmbedding("gte").embed_model
index_dist1 = "l2"
index_dist2 = "cosine"
index_dist3 = "ip"
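# Build a single Chroma vector store from the wikipedia knowledge base (bge embeddings, L2 distance)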
vector_str = chroma_vs.create_vectorstore(wikipedia_kb, bge_embedding, CHUNK_SIZE, CHUNK_OVERLAP_SCALE, index_dist1)
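#%% next step (sketch): retrieval and evaluation
# A minimal, commented-out sketch of how the created store might feed the imported
# retrieval/evaluation helpers (similarity_search_doc, retriever_evaluation) against
# the 50-question dataset. The argument orders, return types, and output path below
# are assumptions for illustration only; see rag_vectorstore and rag_ragas for the
# actual signatures.
# for top_k in TOP_KS:
#     contexts = similarity_search_doc(vector_str, QUESTION_DATASET, top_k)   # hypothetical argument order
#     eval_df = retriever_evaluation(contexts, QUESTION_DATASET, top_k)       # hypothetical argument order / return type
#     eval_df.to_csv(f"eval_chroma_bge_l2_top{top_k}.csv", index=False)       # hypothetical output path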