-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Finds the text chunks from each submission document that are most relevant in answering each homework question. Saves the relevant text chunks and similarity scores to a CSV file, so we can inspect them (good for explainability). This helps us ensure the document chunking and retrieval strategies are producing reasonable results, before feeding them to the model during RAG.
- Loading branch information
Showing
13 changed files
with
432 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
|
||
import os | ||
|
||
import openai | ||
from langchain.llms import OpenAI | ||
from dotenv import load_dotenv | ||
|
||
#from app import seek_confirmation | ||
|
||
|
||
load_dotenv()

# Model configuration, overridable via environment variables.
MODEL_NAME = os.getenv("MODEL_NAME", default="text-davinci-003")
TEMP = float(os.getenv("TEMP", default="0.0"))  # @param {type:"slider", min:0, max:1, step:0.1}


def create_llm(model_name=MODEL_NAME, temp=TEMP):
    """Construct a LangChain OpenAI LLM.

    Params:
        model_name (str): OpenAI model identifier
            (defaults to the MODEL_NAME env var, or "text-davinci-003").
        temp (float): sampling temperature, 0.0 to 1.0
            (defaults to the TEMP env var, or 0.0).

    Returns: a configured langchain.llms.OpenAI instance.
    """
    # BUG FIX: the original ignored its arguments and always used the
    # module-level MODEL_NAME / TEMP constants, so callers could never
    # override the model or temperature.
    return OpenAI(model_name=model_name, temperature=temp)
|
||
|
||
if __name__ == "__main__":

    from random import choice

    llm = create_llm()
    print(llm)
    #print(llm.model_name)
    #print(llm.temperature)

    #seek_confirmation("Continue to prompt? (Y/N): ")

    # Example queries used as a fallback when the user provides no input.
    # (typo fix: "knowlege" -> "knowledge")
    general_knowledge_queries = [
        "What year was America founded?",
        "Tell us about the first humans who landed on the moon"
    ]

    query = input("Please provide a Query (enter for default, 'Q' to quit): ")
    if query.upper() == "Q":
        exit()

    # Blank input falls back to a randomly chosen example query.
    query = query or choice(general_knowledge_queries)
    print(query)
    response = llm.predict(query).strip()
    print(response)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,4 @@ | ||
|
||
|
||
|
||
|
||
|
||
# Retrieval/extraction prompt for identifying the student from a submission document.
STUDENT_QUERY = "What is the student's name? What is their GW ID?"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
|
||
|
||
# todo: mechanism for allowing the user to specify their own homework questions
# hard coding our homework questions here for now :-)

# Required questions, worth standard grade points.
# NOTE: each question string starts with a "+ <id>:" header line; the
# id-extraction logic at the bottom of this module depends on that format.
QUESTIONS = [
    """
    + Question 3.1 (LeNet Performance):
    What was the overall accuracy achieved by LeNet?
    Describe which classes generate the most errors based on the confusion matrix.
    What was the execution runtime?
    How many parameters does the model have?
    """,
    """
    + Question 4.1 (Enhanced Classifier Performance):
    What was the overall accuracy achieved by the Enhanced Classifier?
    Describe which classes generate the most errors based on the confusion matrix.
    What was the execution runtime?
    How many parameters does the model have?
    """,
    """
    + Question 6.1 (GoogLeNet Performance):
    What was the overall accuracy achieved by your implementation of GoogLeNet?
    Describe which classes generate the most errors based on the confusion matrix.
    What was the execution runtime?
    How many parameters does your model have?
    """,
    """
    + Question 5.1 (Activations):
    For a given layer, describe why some activation channels (rows) tend to have stronger responses than other channels.
    (The strength of a response is visualized by the brightness of the image).
    Can you describe the textures that a channel is sensitive to?
    Do any channels have no activation response (all black images)? If so, why?
    Describe some differences in the layers. Why are the images getting smaller as the layers increase?
    What is the relationship between the layer and a receptive field?
    Why do the images have larger pixels at the higher levels making the image look more "pixelated"?
    Compare the activations between channels at a layer.
    Do the channels detect the same textures or do they detect different textures?
    What enables channels at the same layer to detect different textures?
    """,
    """
    + Question 7.1 (Results Summary):
    Create a table of the methods you tried (LeNet, Enhanced Classifier, and GoogLeNet) with the overall accuracies.
    Which method provided the highest accuracy?
    Describe which architectural changes (number layers, channel sizes, learning rates, activation functions, etc.) had the strongest influence on accuracy?
    Append the number of parameters and execution time for each model.
    Is the size and execution time correlated with accuracy?
    """,
]

# [OPTIONAL grade points]
BONUS_QUESTIONS = [
    """
    + Bonus Question 4.1 A (Early Stopping):
    Create a keras callback function with early stopping instead of specifying the number of epochs, allowing the model to train until the patience parameter runs out.
    How many epochs did your model execute before early stopping quit?
    """,
    """
    + Bonus Question 4.1 B (Learning Rate Scheduler):
    Add in a decaying learning rate scheduler to the optimizer, replacing the fixed-rate schedule.
    Example learning rate schedulers include: ExponentialDecay, PiecewiseConstantDecay, PolynomialDecay, InverseTimeDecay, CosineDecay, CosineDecayRestarts.
    What learning rate decay did you apply? What were the parameters? Were there any performance improvements?
    """,
    """
    + Bonus Question 6.1 (Adapting GoogleNet Architecture):
    Make a change to the original GoogLeNet architecture, such as the number of layers or types of layers that yields a consistent improvement in accuracy.
    (Not looking for a robust statistical test, just consistent improvement over two or more executions).
    To gain credit, you must describe your design change and demonstrate improved accuracy over two or more executions.
    """
]

ALL_QUESTIONS = QUESTIONS + BONUS_QUESTIONS


def _question_id(question):
    """Extract a question's identifier from its header line.

    Takes everything before the first ")" and strips the leading "+" marker
    and surrounding whitespace, e.g.
    'Bonus Question 6.1 (Adapting GoogleNet Architecture)'.
    """
    header = question.split(")")[0]
    return header.split("+")[-1].strip() + ")"


# (question_id, question_text) pairs for all questions, required and bonus.
HOMEWORK_QUESTIONS = [(_question_id(q), q) for q in ALL_QUESTIONS]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
|
||
|
||
# https://docs.pydantic.dev/latest/ | ||
|
||
from typing import List | ||
from pydantic import BaseModel, Field | ||
|
||
# Shared pydantic Field descriptions, reused across the scoring models below.
COMMENTS = "The comment to accompany the score. Provides justification for the score. Cites specific content present or absent from the response."
CONFIDENCE_SCORE = "Confidence level in the score. Values range between 0 (low confidence) and 1 (high confidence)"

# Two scoring scales: a 1-5 scale (DocumentScoring) and a 0-1 scale (QuestionScoring).
ONE_TO_FIVE_SCORE = "The score. Values range from 1 (low) to 5 (high), in increments of 0.25 (where 1 is poor, 3 is decent, 4 is good, 4.5 is great, and 5 is perfect). Indicates the degree to which the response completely, thoroughly, and accurately addresses all the questions."
ZERO_TO_ONE_SCORE = "The score. Values range from 0 (low) to 1 (high), in increments of 0.05 (where 0 is unattempted or blank, 0.5 is incorrect or incomplete, 0.75 is good, 0.9 is great, and 1.0 is perfect). Indicates the degree to which the response completely, thoroughly, and accurately addresses the question."
|
||
class Student(BaseModel):
    """A student's identity (name and id)."""
    name: str = Field(description="The student's full name.")
    net_id: str = Field(description="The student's identifier.")
|
||
|
||
class DocumentScoring(BaseModel):
    """A score for an entire document, on the 1-5 scale."""
    score: float = Field(description=ONE_TO_FIVE_SCORE)
    comments: str = Field(description=COMMENTS)
    confidence: float = Field(description=CONFIDENCE_SCORE)
|
||
|
||
class QuestionScoring(BaseModel):
    """A score for a single question, on the 0-1 scale."""
    question_id: str = Field(description="The question identifier.")
    score: float = Field(description=ZERO_TO_ONE_SCORE)
    comments: str = Field(description=COMMENTS)
    confidence: float = Field(description=CONFIDENCE_SCORE)
|
||
|
||
class QuestionScorings(BaseModel):
    """A List of question scorings, for handling multiple questions in the same prompt."""
    scorings: List[QuestionScoring]
|
||
|
||
|
||
if __name__ == "__main__":
    # Smoke-test each model by constructing and printing an example instance.

    example_student = Student(name="Sally Student", net_id="G123456")
    print(example_student)

    example_doc_scoring = DocumentScoring(score=3.5, comments="Great work!", confidence=1.0)
    print(example_doc_scoring)

    example_question_scoring = QuestionScoring(
        question_id="5.1", score=0.75, comments="Great work!", confidence=1.0
    )
    example_scorings = QuestionScorings(scorings=[example_question_scoring])
    print(example_scorings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
|
||
from functools import cached_property | ||
|
||
from langchain.document_loaders import DataFrameLoader | ||
|
||
from app.cell import Cell | ||
from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD | ||
|
||
|
||
# hacky class for allowing us to process documents from a number of rows | ||
# ... instead of reading from a given filepath | ||
# todo: refactor and use mixins maybe | ||
# hacky class for allowing us to process documents from a number of rows
# ... instead of reading from a given filepath
# todo: refactor and use mixins maybe
class RowsDocumentProcessor(DocumentProcessor):
    """Processes a collection of row documents.

    Adapts DocumentProcessor to take its content from a DataFrame of rows
    (one row per chunk of page content) instead of reading a file from disk.
    """

    def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD):
        """
        Params:
            rows_df: DataFrame with "filename", "file_id", and "page_content"
                columns. All rows are expected to describe the same source file.
            chunk_overlap / chunk_size: chunking parameters (coerced to int).
            verbose: when True, prints a summary of the loaded rows.
            similarity_threshold: retrieval cutoff (coerced to float).
        """
        # NOTE(review): deliberately does NOT call super().__init__() — the
        # parent's initializer reads from a filepath, which this class
        # replaces with rows_df. Confirm the parent has no other required
        # initialization side effects.
        self.rows_df = rows_df.copy()
        # all rows come from the same document, so take the first value
        self.filename = rows_df["filename"].unique()[0]
        self.file_id = rows_df["file_id"].unique()[0]

        self.chunk_overlap = int(chunk_overlap)
        self.chunk_size = int(chunk_size)

        self.embeddings_model_name = "text-embedding-ada-002"
        self.similarity_threshold = float(similarity_threshold)

        self.verbose = bool(verbose)
        if self.verbose:
            print("---------------------")
            print("FILENAME:", self.filename)
            print("ROWS:", len(self.rows_df))

    # OVERRIDE PARENT MEMBERS WE DON'T NEED (no file-backed document here):

    @cached_property
    def docs(self):
        """No file-backed documents in this subclass."""
        return []

    @cached_property
    def doc(self):
        """No single source document in this subclass."""
        return None

    # OVERRIDE PARENT METHOD TO GET CELLS STRAIGHT FROM THE ROWS DATAFRAME:

    @cached_property
    def cells(self):
        """Load each DataFrame row as a document, wrapped in Cell to stay
        consistent with the parent method's return type."""
        loader = DataFrameLoader(self.rows_df, page_content_column="page_content")
        docs = loader.load()
        return [Cell(page_content=doc.page_content, metadata=doc.metadata) for doc in docs]
Oops, something went wrong.