From 04fc07281e270d1ac0d8818ab0e95212bd2ac86c Mon Sep 17 00:00:00 2001 From: MJ Rossetti Date: Thu, 14 Dec 2023 22:48:05 -0500 Subject: [PATCH] Refactor --- README.md | 6 +- app/rows_processor.py | 59 +++++++++++++ app/submissions_retriever.py | 164 ++++++++++++++--------------------- 3 files changed, 128 insertions(+), 101 deletions(-) create mode 100644 app/rows_processor.py diff --git a/README.md b/README.md index 8c9c807..fedec28 100644 --- a/README.md +++ b/README.md @@ -92,10 +92,12 @@ python -m app.submissions_processor ### Document Retrieval -Find relevant content in submission files (uses text embeddings model to find relevant documents): +Designate the homework questions (hard-coded in "app/prompts" directory for now). + +Find the most relevant content from the submission files for answering each of the homework questions (currently uses the lower-cost text embeddings model "text-embedding-ada-002" from OpenAI to find relevant documents): ```sh -python -m app.submissions_retriever +DOCS_LIMIT=5 python -m app.submissions_retriever # DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.submissions_retriever ``` diff --git a/app/rows_processor.py b/app/rows_processor.py new file mode 100644 index 0000000..35f2b76 --- /dev/null +++ b/app/rows_processor.py @@ -0,0 +1,59 @@ + +from functools import cached_property + +from langchain.document_loaders import DataFrameLoader + +from app.cell import Cell +from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD + + +# hacky class for allowing us to process documents from a number of rows +# ... 
instead of reading from a given filepath +# todo: refactor and use mixins maybe +class RowsDocumentProcessor(DocumentProcessor): + """Processes a collection of row documents.""" + + #def __init__(self, rows_df, filepath, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD, file_id=None): + # super().__init__(filepath=filepath, chunk_overlap=chunk_overlap, chunk_size=chunk_size, verbose=verbose, similarity_threshold=similarity_threshold, file_id=file_id) + # self.rows_df = rows_df.copy() + # print("ROWS:", len(self.rows_df)) + + def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD): + + self.rows_df = rows_df.copy() + self.filename = rows_df["filename"].unique()[0] # take the first, they should all be the same + self.file_id = rows_df["file_id"].unique()[0] # take the first, they should all be the same + + self.chunk_overlap = int(chunk_overlap) + self.chunk_size = int(chunk_size) + + self.embeddings_model_name = "text-embedding-ada-002" + #self.faiss_index = self.filepath.upper().replace(".IPYNB", "") + "_FAISS_INDEX" + self.similarity_threshold = float(similarity_threshold) + + self.verbose = bool(verbose) + if self.verbose: + print("---------------------") + print("FILENAME:", self.filename) + print("ROWS:", len(self.rows_df)) + + + # OVERWRITE PARENT METHODS WE DON'T NEED + + @cached_property + def docs(self): + return [] + + @cached_property + def doc(self): + return None + + # OVERWRITE PARENT METHOD TO GET CELLS STRAIGHT FROM THE ROWS DATAFRAME: + + @cached_property + def cells(self): + loader = DataFrameLoader(self.rows_df, page_content_column="page_content") + docs = loader.load() + # wrap docs in cell class, to stay consistent with parent method + docs = [Cell(page_content=doc.page_content, metadata=doc.metadata) for doc in docs] + return docs # cell_docs diff --git a/app/submissions_retriever.py 
b/app/submissions_retriever.py index 86a6e09..ef1893f 100644 --- a/app/submissions_retriever.py +++ b/app/submissions_retriever.py @@ -2,65 +2,21 @@ import os from functools import cached_property -from langchain.document_loaders import DataFrameLoader from dotenv import load_dotenv +from pandas import DataFrame from app import RESULTS_DIRPATH -from app.cell import Cell from app.submissions_processor import SubmissionsProcessor -from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD +from app.document_processor import CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD +from app.rows_processor import RowsDocumentProcessor +from app.prompts import STUDENT_QUERY +from app.prompts.homework_4 import HOMEWORK_QUESTIONS load_dotenv() - -class RowsDocumentProcessor(DocumentProcessor): - """Processes a collection of row documents.""" - - #def __init__(self, rows_df, filepath, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD, file_id=None): - # super().__init__(filepath=filepath, chunk_overlap=chunk_overlap, chunk_size=chunk_size, verbose=verbose, similarity_threshold=similarity_threshold, file_id=file_id) - # self.rows_df = rows_df.copy() - # print("ROWS:", len(self.rows_df)) - - def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD): - - self.rows_df = rows_df.copy() - self.filename = rows_df["filename"].unique()[0] # take the first, they should all be the same - self.file_id = rows_df["file_id"].unique()[0] # take the first, they should all be the same - - self.chunk_overlap = int(chunk_overlap) - self.chunk_size = int(chunk_size) - - self.embeddings_model_name = "text-embedding-ada-002" - #self.faiss_index = self.filepath.upper().replace(".IPYNB", "") + "_FAISS_INDEX" - self.similarity_threshold = float(similarity_threshold) - - self.verbose = bool(verbose) - if self.verbose: - 
print("---------------------") - print("FILENAME:", self.filename) - print("ROWS:", len(self.rows_df)) - - - # OVERWRITE PARENT METHODS WE DON'T NEED - - @cached_property - def docs(self): - return [] - - @cached_property - def doc(self): - return None - - # OVERWRITE PARENT METHOD TO GET CELLS STRAIGHT FROM THE ROWS DATAFRAME: - - @cached_property - def cells(self): - loader = DataFrameLoader(self.rows_df, page_content_column="page_content") - docs = loader.load() - # wrap docs in cell class, to stay consistent with parent method - docs = [Cell(page_content=doc.page_content, metadata=doc.metadata) for doc in docs] - return docs # cell_docs +UNIQUE_ONLY = bool(os.getenv("UNIQUE_ONLY", default="true").lower() == "true") +DOCS_LIMIT = os.getenv("DOCS_LIMIT") def get_relevant_records(retriever, query, query_id, filename=None, file_id=None, verbose=True): @@ -104,60 +60,70 @@ def get_relevant_records(retriever, query, query_id, filename=None, file_id=None return records -if __name__ == "__main__": - #import pandas as pd - #pd.set_option('display.max_colwidth', 0) +class SubmissionsRetriever: + + def __init__(self, unique_only=UNIQUE_ONLY, similarity_threshold=SIMILARITY_THRESHOLD, docs_limit=DOCS_LIMIT, + #retrieval_strategy="chunks", + chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, + homework_questions=HOMEWORK_QUESTIONS, + ): - from pandas import DataFrame + self.unique_only = unique_only + self.similarity_threshold = float(similarity_threshold) + self.retrieval_strategy = "chunks" + self.chunk_size = chunk_size # honor the argument instead of the module default + self.chunk_overlap = chunk_overlap # honor the argument instead of the module default - from app.prompts import STUDENT_QUERY - from app.prompts.homework_4 import HOMEWORK_QUESTIONS + self.homework_questions = homework_questions - UNIQUE_ONLY = bool(os.getenv("UNIQUE_ONLY", default="true").lower() == "true") - DOCS_LIMIT = os.getenv("DOCS_LIMIT") + self.docs_limit = docs_limit # honor the argument; the default comes from os.getenv, so None or a string + if self.docs_limit: + self.docs_limit = int(self.docs_limit) - # TODO: convert into a class + self.queries_csv_filepath = 
os.path.join(RESULTS_DIRPATH, f"queries_similarity_{self.similarity_threshold}_chunks_{self.chunk_size}_{self.chunk_overlap}.csv") + self.queries_df = None # DataFrame() - unique_only = UNIQUE_ONLY - similarity_threshold = SIMILARITY_THRESHOLD - retrieval_strategy = "chunks" - chunk_size = CHUNK_SIZE - chunk_overlap = CHUNK_OVERLAP - docs_limit = DOCS_LIMIT - if docs_limit: - docs_limit = int(docs_limit) - queries_csv_filepath = os.path.join(RESULTS_DIRPATH, f"queries_similarity_{similarity_threshold}_chunks_{chunk_size}_{chunk_overlap}.csv") - sp = SubmissionsProcessor() - sp.perform() + def perform(self): + sp = SubmissionsProcessor() + sp.perform() - cells_df = sp.cells_df.copy() - print("ALL CELLS:", len(cells_df)) - if unique_only: - cells_df = cells_df[ cells_df["dup_content"] == False ] - print("UNIQUE CELLS:", len(cells_df)) + cells_df = sp.cells_df.copy() + print("ALL CELLS:", len(cells_df)) + if self.unique_only: + cells_df = cells_df[ cells_df["dup_content"] == False ] + print("UNIQUE CELLS:", len(cells_df)) - submission_filenames = cells_df["filename"].unique() - print("SUBMISSIONS:", len(submission_filenames)) + submission_filenames = cells_df["filename"].unique() + print("SUBMISSIONS:", len(submission_filenames)) - records = [] - submission_filenames = submission_filenames[0:docs_limit] if docs_limit else submission_filenames - for filename in submission_filenames: - print("---------------------") - print(filename) - - rows_df = cells_df[ cells_df["filename"] == filename ] - dp = RowsDocumentProcessor(rows_df=rows_df, chunk_size=chunk_size, chunk_overlap=chunk_overlap) # similarity_threshold=similarity_threshold - #text_retriever = dp.text_compression_retriever - base_retriever = dp.make_retriever(cell_type="TEXT", storage_strategy=retrieval_strategy) - compression_retriever = dp.make_compression_retriever(base_retriever=base_retriever, similarity_threshold=similarity_threshold) - - records += get_relevant_records(retriever=compression_retriever, 
query=STUDENT_QUERY, query_id="STUDENT INFO", filename=filename, file_id=dp.file_id) - - for query_id, query in HOMEWORK_QUESTIONS: - records += get_relevant_records(compression_retriever, query=query, query_id=query_id, filename=filename, file_id=dp.file_id) - - queries_df = DataFrame(records) - #queries_df["cell_id"] = queries_df["cell_id"].astype(int) - #queries_df["chunk_id"] = queries_df["chunk_id"].astype(int) # these were showing up as floats, because there are NaNs? - queries_df.to_csv(queries_csv_filepath, index=False) + records = [] + submission_filenames = submission_filenames[0:self.docs_limit] if self.docs_limit else submission_filenames + for filename in submission_filenames: + print("---------------------") + print(filename) + rows_df = cells_df[ cells_df["filename"] == filename ] + dp = RowsDocumentProcessor(rows_df=rows_df, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap) # similarity_threshold=similarity_threshold + #text_retriever = dp.text_compression_retriever + base_retriever = dp.make_retriever(cell_type="TEXT", storage_strategy=self.retrieval_strategy) + compression_retriever = dp.make_compression_retriever(base_retriever=base_retriever, similarity_threshold=self.similarity_threshold) + + records += get_relevant_records(retriever=compression_retriever, query=STUDENT_QUERY, query_id="STUDENT INFO", filename=filename, file_id=dp.file_id) + # todo: designate which query gets which response model, so we can treat the student query the same as the homework queries + for query_id, query in self.homework_questions: + records += get_relevant_records(compression_retriever, query=query, query_id=query_id, filename=filename, file_id=dp.file_id) + + self.queries_df = DataFrame(records) + #queries_df["cell_id"] = queries_df["cell_id"].astype(int) + #queries_df["chunk_id"] = queries_df["chunk_id"].astype(int) # these were showing up as floats, because there are NaNs? 
+ self.queries_df.to_csv(self.queries_csv_filepath, index=False) + + + + +if __name__ == "__main__": + + + + sr = SubmissionsRetriever() + sr.perform()