Document Querying (#2)
Finds the text chunks from each submission document that are most relevant to answering each homework question.

Saves the relevant text chunks and similarity scores to a CSV file, so we can inspect them (good for explainability). 

This helps us ensure the document chunking and retrieval strategies are producing reasonable results, before feeding them to the model during RAG.
s2t2 authored Dec 15, 2023
1 parent 9dda5d5 commit d71372c
Showing 13 changed files with 432 additions and 5 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -1,11 +1,18 @@


.DS_Store


# ignore artifacts saved to results dir:
*/*.csv
results/*.csv
results/*.png
results/*.html

# ignore artifacts from testing (we just saved a temporary copy of results there for example purposes)
test/results/*
!test/results/cells_example.csv
!test/results/notebooks_example.csv



28 changes: 26 additions & 2 deletions README.md
@@ -68,6 +68,8 @@ Demonstrate ability to access submission files:
python -m app.submissions_manager
```

### Cell-based Document Splitting

Process the starter file:

```sh
@@ -80,14 +82,36 @@ python -m app.starter_doc_processor
# FIG_SHOW=false CHUNK_SIZE=600 CHUNK_OVERLAP=0 SIMILARITY_THRESHOLD=0.75 python -m app.starter_doc_processor
```

Process all submission files (provides metadata about the file contents, and compares them against the starter):

```sh
python -m app.submissions_processor

# FIG_SHOW=false python -m app.submissions_processor
```

### Document Retrieval

Designate the homework questions (hard-coded in "app/prompts" dir for now).

Find the most relevant content from the submission files for answering each of the homework questions (currently uses OpenAI's lower-cost "text-embedding-ada-002" embeddings model to find relevant documents):

```sh
DOCS_LIMIT=5 python -m app.submissions_retriever

# DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.submissions_retriever
```
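
For reference, here is a minimal sketch of what this retrieval step does under the hood, assuming a LangChain FAISS index over the chunk texts (the actual `submissions_retriever` implementation may differ, and the example chunks below are made up):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# embed some example chunks (hypothetical stand-ins for real submission cells):
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
chunks = [
    "LeNet achieved an overall accuracy of 98% ...",
    "Training took around 45 seconds ...",
]
db = FAISS.from_texts(chunks, embeddings)

# for a given homework question, fetch the most similar chunks, with scores:
question = "What was the overall accuracy achieved by LeNet?"
for doc, score in db.similarity_search_with_score(question, k=2):
    print(round(score, 3), doc.page_content)  # lower distance means more similar
```

The retrieved chunks and their similarity scores are saved to a CSV file in the results dir, so we can inspect them (good for explainability).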


### Retrieval Augmented Generation (RAG)

Chat with the LLM:

```sh
TEMP=0.6 python -m app.openai_llm
```
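
During RAG, the two previous steps come together: the chunks retrieved for each question get stuffed into the prompt sent to the LLM. A minimal sketch of that flow (the `answer_with_context` helper is hypothetical, not part of this commit):

```python
from app.openai_llm import create_llm

llm = create_llm()

def answer_with_context(question, relevant_chunks):
    """Asks the LLM to answer a homework question using only the retrieved excerpts."""
    context = "\n\n".join(relevant_chunks)
    prompt = ("Use the following excerpts from a student's homework submission "
              f"to answer the question.\n\nEXCERPTS:\n{context}\n\nQUESTION: {question}")
    return llm.predict(prompt).strip()

print(answer_with_context(
    "What was the overall accuracy achieved by LeNet?",
    ["LeNet achieved an overall accuracy of 98% ..."],  # hypothetical retrieved chunk
))
```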


## Testing

Run tests:
7 changes: 7 additions & 0 deletions app/__init__.py
@@ -4,3 +4,10 @@
import os

RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results")



#def seek_confirmation(message="CONTINUE? (Y/N): "):
#    if input(message).upper() != "Y":
#        print("EXITING...")
#        exit()
46 changes: 46 additions & 0 deletions app/openai_llm.py
@@ -0,0 +1,46 @@

import os

import openai
from langchain.llms import OpenAI
from dotenv import load_dotenv

#from app import seek_confirmation


load_dotenv()

MODEL_NAME = os.getenv("MODEL_NAME", default="text-davinci-003")
TEMP = float(os.getenv("TEMP", default="0.0")) # @param {type:"slider", min:0, max:1, step:0.1}


def create_llm(model_name=MODEL_NAME, temp=TEMP):
    # defaults come from the MODEL_NAME and TEMP env vars above
    return OpenAI(model_name=model_name, temperature=temp)


if __name__ == "__main__":

    from random import choice

    llm = create_llm()
    print(llm)
    #print(llm.model_name)
    #print(llm.temperature)

    #seek_confirmation("Continue to prompt? (Y/N): ")

    general_knowledge_queries = [
        "What year was America founded?",
        "Tell us about the first humans who landed on the moon",
    ]

    query = input("Please provide a Query (enter for default, 'Q' to quit): ")
    if query.upper() == "Q":
        exit()

    query = query or choice(general_knowledge_queries)
    print(query)
    response = llm.predict(query).strip()
    print(response)
2 changes: 0 additions & 2 deletions app/prompts.py → app/prompts/__init__.py
@@ -1,6 +1,4 @@





STUDENT_QUERY = "What is the student's name? What is their GW ID?"
83 changes: 83 additions & 0 deletions app/prompts/homework_4.py
@@ -0,0 +1,83 @@


# todo: mechanism for allowing the user to specify their own homework questions
# hard coding our homework questions here for now :-)

#HOMEWORK_QUESTIONS = {
# "Question 3.1 (LeNet Performance)":
#}

QUESTIONS = [
"""
+ Question 3.1 (LeNet Performance):
What was the overall accuracy achieved by LeNet?
Describe which classes generate the most errors based on the confusion matrix.
What was the execution runtime?
How many parameters does the model have?
""",
"""
+ Question 4.1 (Enhanced Classifier Performance):
What was the overall accuracy achieved by the Enhanced Classifier?
Describe which classes generate the most errors based on the confusion matrix.
What was the execution runtime?
How many parameters does the model have?
""",
"""
+ Question 6.1 (GoogLeNet Performance):
What was the overall accuracy achieved by your implementation of GoogLeNet?
Describe which classes generate the most errors based on the confusion matrix.
What was the execution runtime?
How many parameters does your model have?
""",
"""
+ Question 5.1 (Activations):
For a given layer, describe why some activation channels (rows) tend to have stronger responses than other channels.
(The strength of a response is visualized by the brightness of the image).
Can you describe the textures that a channel is sensitive to?
Do any channels have no activation response (all black images)? If so, why?
Describe some differences in the layers. Why are the images getting smaller as the layers increase?
What is the relationship between the layer and a receptive field?
Why do the images have larger pixels at the higher levels making the image look more "pixelated"?
Compare the activations between channels at a layer.
Do the channels detect the same textures or do they detect different textures?
What enables channels at the same layer to detect different textures?
""",
"""
+ Question 7.1 (Results Summary):
Create a table of the methods you tried (LeNet, Enhanced Classifier, and GoogLeNet) with the overall accuracies.
Which method provided the highest accuracy?
Describe which architectural changes (number of layers, channel sizes, learning rates, activation functions, etc.) had the strongest influence on accuracy?
Append the number of parameters and execution time for each model.
Is the size and execution time correlated with accuracy?
""",
]

# [OPTIONAL grade points]
BONUS_QUESTIONS = [
"""
+ Bonus Question 4.1 A (Early Stopping):
Create a keras callback function with early stopping instead of specifying the number of epochs, allowing the model to train until the patience parameter runs out.
How many epochs did your model execute before early stopping quit?
""",
"""
+ Bonus Question 4.1 B (Learning Rate Scheduler):
Add in a decaying learning rate scheduler to the optimizer, replacing the fixed-rate schedule.
Example learning rate schedulers include: ExponentialDecay, PiecewiseConstantDecay, PolynomialDecay, InverseTimeDecay, CosineDecay, CosineDecayRestarts.
What learning rate decay did you apply? What were the parameters? Were there any performance improvements?
""",
"""
+ Bonus Question 6.1 (Adapting GoogLeNet Architecture):
Make a change to the original GoogLeNet architecture, such as the number of layers or types of layers, that yields a consistent improvement in accuracy.
(Not looking for a robust statistical test, just consistent improvement over two or more executions).
To gain credit, you must describe your design change and demonstrate improved accuracy over two or more executions.
"""
]

ALL_QUESTIONS = QUESTIONS + BONUS_QUESTIONS

HOMEWORK_QUESTIONS = []
for question in ALL_QUESTIONS:
    question_id = question.split(")")[0]
    question_id = question_id.split("+")[-1].strip() + ")" #> "Bonus Question 6.1 (Adapting GoogLeNet Architecture)"
    HOMEWORK_QUESTIONS.append((question_id, question))
51 changes: 51 additions & 0 deletions app/response_formatters.py
@@ -0,0 +1,51 @@


# https://docs.pydantic.dev/latest/

from typing import List
from pydantic import BaseModel, Field

COMMENTS = "The comment to accompany the score. Provides justification for the score. Cites specific content present or absent from the response."
CONFIDENCE_SCORE = "Confidence level in the score. Values range between 0 (low confidence) and 1 (high confidence)."

ONE_TO_FIVE_SCORE = "The score. Values range from 1 (low) to 5 (high), in increments of 0.25 (where 1 is poor, 3 is decent, 4 is good, 4.5 is great, and 5 is perfect). Indicates the degree to which the response completely, thoroughly, and accurately addresses all the questions."
ZERO_TO_ONE_SCORE = "The score. Values range from 0 (low) to 1 (high), in increments of 0.05 (where 0 is unattempted or blank, 0.5 is incorrect or incomplete, 0.75 is good, 0.9 is great, and 1.0 is perfect). Indicates the degree to which the response completely, thoroughly, and accurately addresses the question."

class Student(BaseModel):
    """A student."""
    name: str = Field(description="The student's full name.")
    net_id: str = Field(description="The student's identifier.")


class DocumentScoring(BaseModel):
    """A document scoring."""
    score: float = Field(description=ONE_TO_FIVE_SCORE)
    comments: str = Field(description=COMMENTS)
    confidence: float = Field(description=CONFIDENCE_SCORE)


class QuestionScoring(BaseModel):
    """A question scoring."""
    question_id: str = Field(description="The question identifier.")
    score: float = Field(description=ZERO_TO_ONE_SCORE)
    comments: str = Field(description=COMMENTS)
    confidence: float = Field(description=CONFIDENCE_SCORE)


class QuestionScorings(BaseModel):
    """A list of question scorings, for handling multiple questions in the same prompt."""
    scorings: List[QuestionScoring]



if __name__ == "__main__":

    student = Student(name="Sally Student", net_id="G123456")
    print(student)

    scoring = DocumentScoring(score=3.5, comments="Great work!", confidence=1.0)
    print(scoring)

    qs = QuestionScoring(question_id="5.1", score=0.75, comments="Great work!", confidence=1.0)
    scorings = QuestionScorings(scorings=[qs])
    print(scorings)
59 changes: 59 additions & 0 deletions app/rows_processor.py
@@ -0,0 +1,59 @@

from functools import cached_property

from langchain.document_loaders import DataFrameLoader

from app.cell import Cell
from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD


# hacky class for allowing us to process documents from a number of rows
# ... instead of reading from a given filepath
# todo: refactor and use mixins maybe
class RowsDocumentProcessor(DocumentProcessor):
    """Processes a collection of row documents."""

    #def __init__(self, rows_df, filepath, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD, file_id=None):
    #    super().__init__(filepath=filepath, chunk_overlap=chunk_overlap, chunk_size=chunk_size, verbose=verbose, similarity_threshold=similarity_threshold, file_id=file_id)
    #    self.rows_df = rows_df.copy()
    #    print("ROWS:", len(self.rows_df))

    def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD):

        self.rows_df = rows_df.copy()
        self.filename = rows_df["filename"].unique()[0] # take the first, they should all be the same
        self.file_id = rows_df["file_id"].unique()[0] # take the first, they should all be the same

        self.chunk_overlap = int(chunk_overlap)
        self.chunk_size = int(chunk_size)

        self.embeddings_model_name = "text-embedding-ada-002"
        #self.faiss_index = self.filepath.upper().replace(".IPYNB", "") + "_FAISS_INDEX"
        self.similarity_threshold = float(similarity_threshold)

        self.verbose = bool(verbose)
        if self.verbose:
            print("---------------------")
            print("FILENAME:", self.filename)
            print("ROWS:", len(self.rows_df))


    # OVERRIDE PARENT METHODS WE DON'T NEED

    @cached_property
    def docs(self):
        return []

    @cached_property
    def doc(self):
        return None

    # OVERRIDE PARENT METHOD TO GET CELLS STRAIGHT FROM THE ROWS DATAFRAME:

    @cached_property
    def cells(self):
        loader = DataFrameLoader(self.rows_df, page_content_column="page_content")
        docs = loader.load()
        # wrap docs in cell class, to stay consistent with parent method
        docs = [Cell(page_content=doc.page_content, metadata=doc.metadata) for doc in docs]
        return docs # cell_docs
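
To make the intent of this class concrete, here is a hypothetical usage sketch (not part of this commit; it assumes a pandas DataFrame with the "filename", "file_id", and "page_content" columns the constructor expects):

```python
import pandas as pd
from app.rows_processor import RowsDocumentProcessor

# hypothetical rows, shaped like the notebook cell records this repo works with:
rows_df = pd.DataFrame({
    "filename": ["hw4_sally.ipynb", "hw4_sally.ipynb"],
    "file_id": ["sally", "sally"],
    "page_content": ["LeNet achieved an overall accuracy of 98% ...", "model.summary()"],
})

processor = RowsDocumentProcessor(rows_df, chunk_size=1000, chunk_overlap=0)
print(len(processor.cells)) #> 2 (one Cell document per dataframe row)
```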