Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
s2t2 committed Dec 15, 2023
1 parent 75933fc commit 04fc072
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 101 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,12 @@ python -m app.submissions_processor

### Document Retrieval

Find relevant content in submission files (uses text embeddings model to find relevant documents):
Designate the homework questions (hard-coded in the "app/prompts" directory for now).

Find the most relevant content from the submission files for answering each of the homework questions (currently uses the lower-cost text embeddings model "text-embedding-ada-002" from OpenAI to find relevant documents):

```sh
python -m app.submissions_retriever
DOCS_LIMIT=5 python -m app.submissions_retriever

# DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.submissions_retriever
```
Expand Down
59 changes: 59 additions & 0 deletions app/rows_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@

from functools import cached_property

from langchain.document_loaders import DataFrameLoader

from app.cell import Cell
from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD


# hacky class for allowing us to process documents from a number of rows
# ... instead of reading from a given filepath
# todo: refactor and use mixins maybe
class RowsDocumentProcessor(DocumentProcessor):
    """Document processor that is fed a dataframe of rows rather than a filepath.

    The parent initializer is not invoked here; the attributes are assigned
    directly from the rows and the keyword arguments instead.
    """

    def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD):
        """Store a copy of the rows and the chunking / similarity settings.

        Args:
            rows_df: dataframe of rows; the "filename" and "file_id" columns
                are read here, and "page_content" is read by `cells`.
            chunk_overlap: coerced with int().
            chunk_size: coerced with int().
            verbose: coerced with bool(); when true, a short summary is printed.
            similarity_threshold: coerced with float().
        """
        # keep a private copy of the rows
        self.rows_df = rows_df.copy()

        # all rows are expected to share one filename / file id, so take the first
        unique_filenames = rows_df["filename"].unique()
        self.filename = unique_filenames[0]
        unique_file_ids = rows_df["file_id"].unique()
        self.file_id = unique_file_ids[0]

        self.chunk_overlap = int(chunk_overlap)
        self.chunk_size = int(chunk_size)

        self.embeddings_model_name = "text-embedding-ada-002"
        self.similarity_threshold = float(similarity_threshold)

        self.verbose = bool(verbose)
        if not self.verbose:
            return

        print("---------------------")
        print("FILENAME:", self.filename)
        print("ROWS:", len(self.rows_df))

    # PARENT PROPERTIES THAT ARE NOT NEEDED WHEN WORKING FROM ROWS

    @cached_property
    def docs(self):
        """Always an empty list."""
        return []

    @cached_property
    def doc(self):
        """Always None."""
        return None

    # CELLS COME STRAIGHT FROM THE ROWS DATAFRAME

    @cached_property
    def cells(self):
        """Load the rows as documents and wrap each one in a `Cell`.

        Wrapping keeps the return type consistent with the parent method.
        """
        frame_loader = DataFrameLoader(self.rows_df, page_content_column="page_content")
        return [
            Cell(page_content=loaded.page_content, metadata=loaded.metadata)
            for loaded in frame_loader.load()
        ]
164 changes: 65 additions & 99 deletions app/submissions_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,65 +2,21 @@
import os
from functools import cached_property

from langchain.document_loaders import DataFrameLoader
from dotenv import load_dotenv
from pandas import DataFrame

from app import RESULTS_DIRPATH
from app.cell import Cell
from app.submissions_processor import SubmissionsProcessor
from app.document_processor import DocumentProcessor, CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD
from app.document_processor import CHUNK_OVERLAP, CHUNK_SIZE, SIMILARITY_THRESHOLD
from app.rows_processor import RowsDocumentProcessor
from app.prompts import STUDENT_QUERY
from app.prompts.homework_4 import HOMEWORK_QUESTIONS


load_dotenv()


class RowsDocumentProcessor(DocumentProcessor):
    """Document processor that is fed a dataframe of rows rather than a filepath.

    The parent initializer is not invoked here; the attributes are assigned
    directly from the rows and the keyword arguments instead.
    """

    def __init__(self, rows_df, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD):
        """Store a copy of the rows and the chunking / similarity settings.

        Args:
            rows_df: dataframe of rows; the "filename" and "file_id" columns
                are read here, and "page_content" is read by `cells`.
            chunk_overlap: coerced with int().
            chunk_size: coerced with int().
            verbose: coerced with bool(); when true, a short summary is printed.
            similarity_threshold: coerced with float().
        """
        # keep a private copy of the rows
        self.rows_df = rows_df.copy()

        # all rows are expected to share one filename / file id, so take the first
        unique_filenames = rows_df["filename"].unique()
        self.filename = unique_filenames[0]
        unique_file_ids = rows_df["file_id"].unique()
        self.file_id = unique_file_ids[0]

        self.chunk_overlap = int(chunk_overlap)
        self.chunk_size = int(chunk_size)

        self.embeddings_model_name = "text-embedding-ada-002"
        self.similarity_threshold = float(similarity_threshold)

        self.verbose = bool(verbose)
        if not self.verbose:
            return

        print("---------------------")
        print("FILENAME:", self.filename)
        print("ROWS:", len(self.rows_df))

    # PARENT PROPERTIES THAT ARE NOT NEEDED WHEN WORKING FROM ROWS

    @cached_property
    def docs(self):
        """Always an empty list."""
        return []

    @cached_property
    def doc(self):
        """Always None."""
        return None

    # CELLS COME STRAIGHT FROM THE ROWS DATAFRAME

    @cached_property
    def cells(self):
        """Load the rows as documents and wrap each one in a `Cell`.

        Wrapping keeps the return type consistent with the parent method.
        """
        frame_loader = DataFrameLoader(self.rows_df, page_content_column="page_content")
        return [
            Cell(page_content=loaded.page_content, metadata=loaded.metadata)
            for loaded in frame_loader.load()
        ]
# Settings read from the environment at import time.
# UNIQUE_ONLY is true only when the variable (default "true") equals "true", case-insensitively.
UNIQUE_ONLY = os.getenv("UNIQUE_ONLY", default="true").lower() == "true"
# DOCS_LIMIT stays a raw string (or None when unset); it is converted to int where it is used.
DOCS_LIMIT = os.getenv("DOCS_LIMIT", default=None)


def get_relevant_records(retriever, query, query_id, filename=None, file_id=None, verbose=True):
Expand Down Expand Up @@ -104,60 +60,70 @@ def get_relevant_records(retriever, query, query_id, filename=None, file_id=None
return records


if __name__ == "__main__":
#import pandas as pd
#pd.set_option('display.max_colwidth', 0)
class SubmissionsRetriever:
    """Retrieve, per submission, the cells most relevant to each query.

    For every submission filename found in the processed cells, a retriever is
    built over that submission's TEXT cells and queried with the student query
    and each homework question. All resulting records are collected into one
    dataframe and written to a CSV file under RESULTS_DIRPATH.
    """

    def __init__(self, unique_only=UNIQUE_ONLY, similarity_threshold=SIMILARITY_THRESHOLD, docs_limit=DOCS_LIMIT,
                 #retrieval_strategy="chunks",
                 chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP,
                 homework_questions=HOMEWORK_QUESTIONS,
                 ):
        """Store the retrieval settings and derive the results CSV filepath.

        Args:
            unique_only: when truthy, rows flagged as duplicate content are dropped.
            similarity_threshold: coerced with float(); passed to the compression retriever.
            docs_limit: optional cap on the number of submissions to process;
                a truthy value is coerced with int(), a falsy one means no cap.
            chunk_size: passed to the per-submission document processor.
            chunk_overlap: passed to the per-submission document processor.
            homework_questions: iterable of (query_id, query) pairs.
        """
        self.unique_only = unique_only
        self.similarity_threshold = float(similarity_threshold)
        self.retrieval_strategy = "chunks"

        # honor the caller's arguments (previously the module-level constants
        # were assigned here, so explicit values were silently ignored)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self.homework_questions = homework_questions

        self.docs_limit = docs_limit
        if self.docs_limit:
            self.docs_limit = int(self.docs_limit)

        self.queries_csv_filepath = os.path.join(RESULTS_DIRPATH, f"queries_similarity_{self.similarity_threshold}_chunks_{self.chunk_size}_{self.chunk_overlap}.csv")
        self.queries_df = None  # populated by perform()

    def perform(self):
        """Run all queries against each submission and write the results to CSV.

        Sets `self.queries_df` to a dataframe of the collected records and
        writes it (without the index) to `self.queries_csv_filepath`.
        """
        sp = SubmissionsProcessor()
        sp.perform()

        cells_df = sp.cells_df.copy()
        print("ALL CELLS:", len(cells_df))
        if self.unique_only:
            # keep only the rows whose "dup_content" flag is False
            cells_df = cells_df[ cells_df["dup_content"] == False ]
            print("UNIQUE CELLS:", len(cells_df))

        submission_filenames = cells_df["filename"].unique()
        print("SUBMISSIONS:", len(submission_filenames))

        records = []
        # optionally process only the first `docs_limit` submissions
        submission_filenames = submission_filenames[0:self.docs_limit] if self.docs_limit else submission_filenames
        for filename in submission_filenames:
            print("---------------------")
            print(filename)
            rows_df = cells_df[ cells_df["filename"] == filename ]
            dp = RowsDocumentProcessor(rows_df=rows_df, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
            base_retriever = dp.make_retriever(cell_type="TEXT", storage_strategy=self.retrieval_strategy)
            compression_retriever = dp.make_compression_retriever(base_retriever=base_retriever, similarity_threshold=self.similarity_threshold)

            records += get_relevant_records(retriever=compression_retriever, query=STUDENT_QUERY, query_id="STUDENT INFO", filename=filename, file_id=dp.file_id)
            # todo: designate which query gets which response model, so we can treat the student query the same as the homework queries
            for query_id, query in self.homework_questions:
                records += get_relevant_records(compression_retriever, query=query, query_id=query_id, filename=filename, file_id=dp.file_id)

        self.queries_df = DataFrame(records)
        # NOTE: "cell_id" / "chunk_id" were observed showing up as floats
        # (possibly because of NaNs); they are intentionally left unconverted.
        self.queries_df.to_csv(self.queries_csv_filepath, index=False)




if __name__ == "__main__":
    # Script entry point: run the retrieval with the default settings.
    SubmissionsRetriever().perform()

0 comments on commit 04fc072

Please sign in to comment.