From 04fc07281e270d1ac0d8818ab0e95212bd2ac86c Mon Sep 17 00:00:00 2001
From: MJ Rossetti <s2t2@users.noreply.github.com>
Date: Thu, 14 Dec 2023 22:48:05 -0500
Subject: [PATCH] Refactor

---
 README.md                    |   6 +-
 app/rows_processor.py        |  59 +++++++++++++
 app/submissions_retriever.py | 164 ++++++++++++++---------------------
 3 files changed, 128 insertions(+), 101 deletions(-)
 create mode 100644 app/rows_processor.py

diff --git a/README.md b/README.md
index 8c9c807..fedec28 100644
--- a/README.md
+++ b/README.md
@@ -92,10 +92,12 @@ python -m app.submissions_processor
 
 ### Document Retrieval
 
-Find relevant content in submission files (uses text embeddings model to find relevant documents):
+Designate the homework questions (hard-coded in "app/prompts" dir for now).
+
+Find the most relevant content from the submission files for answering each of the homework questions (currently uses the lower-cost text embeddings model "text-embedding-ada-002" from OpenAI to find relevant documents):
 
 ```sh
-python -m app.submissions_retriever
+DOCS_LIMIT=5 python -m app.submissions_retriever
 
 # DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.submissions_retriever
 ```
diff --git a/app/rows_processor.py b/app/rows_processor.py
new file mode 100644
index 0000000..35f2b76
--- /dev/null
+++ b/app/rows_processor.py
@@ -0,0 +1,59 @@
+
+from functools import cached_property
+
+from langchain.document_loaders import DataFrameLoader
+
+from app.cell import Cell
+from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD
+
+
+# hacky class for allowing us to process documents from a number of rows
+# ... instead of reading from a given filepath
+# todo: refactor and use mixins maybe
+class RowsDocumentProcessor(DocumentProcessor):
+    """Document processor whose cells are built from dataframe rows instead of a file."""
+
+    # NOTE: rows_df is expected to hold the cells of a single submission and
+    # must provide the "filename", "file_id" and "page_content" columns that
+    # are read below. The first unique filename / file_id is used, so an empty
+    # rows_df fails at the [0] lookup in __init__.
+
+    def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD):
+        """Copy the rows and store the settings directly; super().__init__() is not called."""
+        self.rows_df = rows_df.copy()
+        self.filename = rows_df["filename"].unique()[0] # take the first, they should all be the same
+        self.file_id = rows_df["file_id"].unique()[0] # take the first, they should all be the same
+
+        self.chunk_overlap = int(chunk_overlap)
+        self.chunk_size = int(chunk_size)
+
+        self.embeddings_model_name = "text-embedding-ada-002"
+        #self.faiss_index = self.filepath.upper().replace(".IPYNB", "") + "_FAISS_INDEX"
+        self.similarity_threshold = float(similarity_threshold)
+
+        self.verbose = bool(verbose)
+        if self.verbose:
+            print("---------------------")
+            print("FILENAME:", self.filename)
+            print("ROWS:", len(self.rows_df))
+
+
+    # OVERRIDE PARENT PROPERTIES WE DON'T NEED (there is no file to load here)
+
+    @cached_property
+    def docs(self):
+        return []
+
+    @cached_property
+    def doc(self):
+        return None
+
+    # OVERRIDE PARENT PROPERTY TO GET CELLS STRAIGHT FROM THE ROWS DATAFRAME:
+    @cached_property
+    def cells(self):
+        """Load one document per row of rows_df and wrap each one in a Cell."""
+        loader = DataFrameLoader(self.rows_df, page_content_column="page_content")
+        docs = loader.load()
+        # wrap docs in cell class, to stay consistent with parent method
+        docs = [Cell(page_content=doc.page_content, metadata=doc.metadata) for doc in docs]
+        return docs # cell_docs
diff --git a/app/submissions_retriever.py b/app/submissions_retriever.py
index 86a6e09..ef1893f 100644
--- a/app/submissions_retriever.py
+++ b/app/submissions_retriever.py
@@ -2,65 +2,21 @@
 import os
 from functools import cached_property
 
-from langchain.document_loaders import DataFrameLoader
 from dotenv import load_dotenv
+from pandas import DataFrame
 
 from app import RESULTS_DIRPATH
-from app.cell import Cell
 from app.submissions_processor import SubmissionsProcessor
-from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD
+from app.document_processor import CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD
+from app.rows_processor import RowsDocumentProcessor
+from app.prompts import STUDENT_QUERY
+from app.prompts.homework_4 import HOMEWORK_QUESTIONS
 
 
 load_dotenv()
 
-
-class RowsDocumentProcessor(DocumentProcessor):
-    """Processes a collection of row documents."""
-
-    #def __init__(self, rows_df, filepath, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD, file_id=None):
-    #    super().__init__(filepath=filepath, chunk_overlap=chunk_overlap, chunk_size=chunk_size, verbose=verbose, similarity_threshold=similarity_threshold, file_id=file_id)
-    #    self.rows_df = rows_df.copy()
-    #    print("ROWS:", len(self.rows_df))
-
-    def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD):
-
-        self.rows_df = rows_df.copy()
-        self.filename = rows_df["filename"].unique()[0] # take the first, they should all be the same
-        self.file_id = rows_df["file_id"].unique()[0] # take the first, they should all be the same
-
-        self.chunk_overlap = int(chunk_overlap)
-        self.chunk_size = int(chunk_size)
-
-        self.embeddings_model_name = "text-embedding-ada-002"
-        #self.faiss_index = self.filepath.upper().replace(".IPYNB", "") + "_FAISS_INDEX"
-        self.similarity_threshold = float(similarity_threshold)
-
-        self.verbose = bool(verbose)
-        if self.verbose:
-            print("---------------------")
-            print("FILENAME:", self.filename)
-            print("ROWS:", len(self.rows_df))
-
-
-    # OVERWRITE PARENT METHODS WE DON'T NEED
-
-    @cached_property
-    def docs(self):
-        return []
-
-    @cached_property
-    def doc(self):
-        return None
-
-    # OVERWRITE PARENT METHOD TO GET CELLS STRAIGHT FROM THE ROWS DATAFRAME:
-
-    @cached_property
-    def cells(self):
-        loader = DataFrameLoader(self.rows_df, page_content_column="page_content")
-        docs = loader.load()
-        # wrap docs in cell class, to stay consistent with parent method
-        docs = [Cell(page_content=doc.page_content, metadata=doc.metadata) for doc in docs]
-        return docs # cell_docs
+UNIQUE_ONLY = bool(os.getenv("UNIQUE_ONLY", default="true").lower() == "true")
+DOCS_LIMIT = os.getenv("DOCS_LIMIT")
 
 
 def get_relevant_records(retriever, query, query_id, filename=None, file_id=None, verbose=True):
@@ -104,60 +60,70 @@ def get_relevant_records(retriever, query, query_id, filename=None, file_id=None
     return records
 
 
-if __name__ == "__main__":
-    #import pandas as pd
-    #pd.set_option('display.max_colwidth', 0)
+class SubmissionsRetriever:
+
+    def __init__(self, unique_only=UNIQUE_ONLY, similarity_threshold=SIMILARITY_THRESHOLD, docs_limit=DOCS_LIMIT,
+                 chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP,
+                 homework_questions=HOMEWORK_QUESTIONS,
+                 ):
+        """Store the retrieval settings; each argument falls back to the module-level default of the same name."""
 
-    from pandas import DataFrame
+        self.unique_only = unique_only
+        self.similarity_threshold = float(similarity_threshold)
+        self.retrieval_strategy = "chunks"
+        self.chunk_size = chunk_size # use the caller's value, not the module constant
+        self.chunk_overlap = chunk_overlap # use the caller's value, not the module constant
 
-    from app.prompts import STUDENT_QUERY
-    from app.prompts.homework_4 import HOMEWORK_QUESTIONS
+        self.homework_questions = homework_questions
 
-    UNIQUE_ONLY = bool(os.getenv("UNIQUE_ONLY", default="true").lower() == "true")
-    DOCS_LIMIT = os.getenv("DOCS_LIMIT")
+        self.docs_limit = docs_limit # use the caller's value; the env default is a string or None
+        if self.docs_limit: # falsy (None, "", 0) means no limit
+            self.docs_limit = int(self.docs_limit)
 
-    # TODO: convert into a class
+        self.queries_csv_filepath = os.path.join(RESULTS_DIRPATH, f"queries_similarity_{self.similarity_threshold}_chunks_{self.chunk_size}_{self.chunk_overlap}.csv")
+        self.queries_df = None # assigned a DataFrame by perform()
 
-    unique_only = UNIQUE_ONLY
-    similarity_threshold = SIMILARITY_THRESHOLD
-    retrieval_strategy = "chunks"
-    chunk_size = CHUNK_SIZE
-    chunk_overlap = CHUNK_OVERLAP
-    docs_limit = DOCS_LIMIT
-    if docs_limit:
-        docs_limit = int(docs_limit)
-    queries_csv_filepath = os.path.join(RESULTS_DIRPATH, f"queries_similarity_{similarity_threshold}_chunks_{chunk_size}_{chunk_overlap}.csv")
 
-    sp = SubmissionsProcessor()
-    sp.perform()
+    def perform(self):
+        """Collect the relevant records for every query per submission, then write them to self.queries_csv_filepath."""
+        sp = SubmissionsProcessor()
+        sp.perform()
 
-    cells_df = sp.cells_df.copy()
-    print("ALL CELLS:", len(cells_df))
-    if unique_only:
-        cells_df = cells_df[ cells_df["dup_content"] == False ]
-        print("UNIQUE CELLS:", len(cells_df))
+        cells_df = sp.cells_df.copy()
+        print("ALL CELLS:", len(cells_df))
+        if self.unique_only:
+            cells_df = cells_df[ cells_df["dup_content"] == False ] # keep only rows whose dup_content flag is False
+            print("UNIQUE CELLS:", len(cells_df))
 
-    submission_filenames = cells_df["filename"].unique()
-    print("SUBMISSIONS:", len(submission_filenames))
+        submission_filenames = cells_df["filename"].unique()
+        print("SUBMISSIONS:", len(submission_filenames))
 
-    records = []
-    submission_filenames = submission_filenames[0:docs_limit] if docs_limit else submission_filenames
-    for filename in submission_filenames:
-        print("---------------------")
-        print(filename)
-
-        rows_df = cells_df[ cells_df["filename"] == filename ]
-        dp = RowsDocumentProcessor(rows_df=rows_df, chunk_size=chunk_size, chunk_overlap=chunk_overlap) # similarity_threshold=similarity_threshold
-        #text_retriever = dp.text_compression_retriever
-        base_retriever = dp.make_retriever(cell_type="TEXT", storage_strategy=retrieval_strategy)
-        compression_retriever = dp.make_compression_retriever(base_retriever=base_retriever, similarity_threshold=similarity_threshold)
-
-        records += get_relevant_records(retriever=compression_retriever, query=STUDENT_QUERY, query_id="STUDENT INFO", filename=filename, file_id=dp.file_id)
-
-        for query_id, query in HOMEWORK_QUESTIONS:
-            records += get_relevant_records(compression_retriever, query=query, query_id=query_id, filename=filename, file_id=dp.file_id)
-
-    queries_df = DataFrame(records)
-    #queries_df["cell_id"] = queries_df["cell_id"].astype(int)
-    #queries_df["chunk_id"] = queries_df["chunk_id"].astype(int) # these were showing up as floats, because there are NaNs?
-    queries_df.to_csv(queries_csv_filepath, index=False)
+        records = []
+        submission_filenames = submission_filenames[0:self.docs_limit] if self.docs_limit else submission_filenames # falsy docs_limit: process all submissions
+        for filename in submission_filenames:
+            print("---------------------")
+            print(filename)
+            rows_df = cells_df[ cells_df["filename"] == filename ]
+            dp = RowsDocumentProcessor(rows_df=rows_df, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap) # the similarity threshold is passed to the compression retriever below
+            base_retriever = dp.make_retriever(cell_type="TEXT", storage_strategy=self.retrieval_strategy)
+            compression_retriever = dp.make_compression_retriever(base_retriever=base_retriever, similarity_threshold=self.similarity_threshold)
+
+            records += get_relevant_records(retriever=compression_retriever, query=STUDENT_QUERY, query_id="STUDENT INFO", filename=filename, file_id=dp.file_id)
+            # todo: designate which query gets which response model, so we can treat the student query the same as the homework queries
+            for query_id, query in self.homework_questions:
+                records += get_relevant_records(compression_retriever, query=query, query_id=query_id, filename=filename, file_id=dp.file_id)
+
+        self.queries_df = DataFrame(records)
+        #queries_df["cell_id"] = queries_df["cell_id"].astype(int)
+        #queries_df["chunk_id"] = queries_df["chunk_id"].astype(int) # these were showing up as floats, because there are NaNs?
+        self.queries_df.to_csv(self.queries_csv_filepath, index=False)
+
+
+
+
+if __name__ == "__main__":
+
+    # run the retrieval with the default settings (UNIQUE_ONLY and DOCS_LIMIT are read from the environment above)
+
+    sr = SubmissionsRetriever()
+    sr.perform()