Skip to content

Commit

Permalink
Updated Dockerfile to preload embeddings model into container. Remove…
Browse files Browse the repository at this point in the history
…d cassette file from repo since it contained the model binary. It was too large. Now this won't be necessary thanks to the cached model in the image.
  • Loading branch information
JSv4 committed Jun 16, 2024
1 parent be28ab1 commit bb3f607
Show file tree
Hide file tree
Showing 15 changed files with 144 additions and 47 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
frontend/public/** linguist-vendored
frontend/src/assets/** linguist-vendored
*.yaml filter=lfs diff=lfs merge=lfs -text
42 changes: 3 additions & 39 deletions .pre-commit-config.yaml
Git LFS file not shown
9 changes: 8 additions & 1 deletion compose/local/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ COPY ./requirements .
RUN pip wheel --wheel-dir /usr/src/app/wheels \
-r ${BUILD_ENVIRONMENT}.txt


# Python 'run' stage
FROM python as python-run-stage

Expand Down Expand Up @@ -99,6 +98,10 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
# copy python dependency wheels from python-build-stage
COPY --from=python-build-stage /usr/src/app/wheels /wheels/

# Install CPU-only torch wheels (no CUDA) to keep the image small
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN pip install sentence-transformers

# use wheels to install python dependencies
RUN pip install --no-cache-dir --no-index --find-links=/wheels/ /wheels/* \
&& rm -rf /wheels/
Expand All @@ -108,6 +111,10 @@ RUN echo "RUN STAGE GITHUB_ACTIONS: $GITHUB_ACTIONS"
COPY ./setup_codecov.sh .
RUN if [ "$GITHUB_ACTIONS" ] ; then echo "GITHUB ACTION MODE" && chmod u+x setup_codecov.sh && ./setup_codecov.sh ; else echo "NOT GITHUB ACTION. DO NOT INSTALL CODECOV" ; fi

# Download sentence transformer binaries
COPY download_embeddings_model.py .
RUN mkdir -p /models
RUN python download_embeddings_model.py

COPY ./compose/production/django/entrypoint /entrypoint
RUN sed -i 's/\r$//g' /entrypoint
Expand Down
16 changes: 16 additions & 0 deletions download_embeddings_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Pre-download the sentence-transformer embedding model at image build time.

Saves the model to the exact directory the Django tasks load it from
(``/models/multi-qa-MiniLM-L6-cos-v1``), so the container needs no network
access (and no Hugging Face hub round-trip) at runtime.
"""
import os

from sentence_transformers import SentenceTransformer

# Model consumed by the embedding tasks (extract_tasks.py / query_tasks.py).
MODEL_NAME = "multi-qa-MiniLM-L6-cos-v1"

# The tasks load via HuggingFaceEmbedding(model_name="/models/<MODEL_NAME>"),
# so the model must live at exactly this absolute path.
TARGET_DIR = os.path.join("/models", MODEL_NAME)

# Create the target directory if it doesn't exist.
os.makedirs(TARGET_DIR, exist_ok=True)

# NOTE: SentenceTransformer(..., cache_folder=...) stores weights under a
# hub-mangled directory name (not a plain "/models/<MODEL_NAME>"), which would
# break the fixed path the tasks expect. model.save() writes a directly
# loadable copy at the exact location instead.
model = SentenceTransformer(MODEL_NAME)
model.save(TARGET_DIR)

print(
    f"Sentence transformer model '{MODEL_NAME}' has been downloaded "
    f"and saved to '{TARGET_DIR}'."
)
2 changes: 2 additions & 0 deletions fixtures/vcr_cassettes/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.yaml

3 changes: 3 additions & 0 deletions frontend/.husky/post-checkout
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): fail the checkout with a
# helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs post-checkout "$@"
3 changes: 3 additions & 0 deletions frontend/.husky/post-commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): fail the commit hook with
# a helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs post-commit "$@"
3 changes: 3 additions & 0 deletions frontend/.husky/post-merge
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): fail the merge hook with
# a helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs post-merge "$@"
3 changes: 3 additions & 0 deletions frontend/.husky/pre-push
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): abort the push with a
# helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs pre-push "$@"
5 changes: 4 additions & 1 deletion opencontractserver/llms/vector_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
super().__init__(
corpus_id=corpus_id,
document_id=document_id,
must_have_text=must_have_text,
hybrid_search=hybrid_search,
text_search_config=text_search_config,
embed_dim=embed_dim,
Expand Down Expand Up @@ -148,7 +149,9 @@ def _db_rows_to_query_result(
node = TextNode(
doc_id=str(row.id),
text=row.raw_text,
embedding=row.embedding.tolist(),
embedding=row.embedding.tolist()
if getattr(row, "embedding", None) is not None
else [],
extra_info={
"page": row.page,
"bounding_box": row.bounding_box,
Expand Down
4 changes: 2 additions & 2 deletions opencontractserver/tasks/extract_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ def llama_index_doc_query(cell_id, similarity_top_k=3):

document = datacell.document
embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
)
model_name="/models/multi-qa-MiniLM-L6-cos-v1"
) # Use the pre-loaded cache path where the model was stored at container build time
Settings.embed_model = embed_model

llm = OpenAI(model=settings.OPENAI_MODEL, api_key=settings.OPENAI_API_KEY)
Expand Down
12 changes: 9 additions & 3 deletions opencontractserver/tasks/query_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,29 @@ def run_query(

try:
embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
)
model_name="/models/multi-qa-MiniLM-L6-cos-v1"
) # Use the pre-loaded cache path where the model was stored at container build time
Settings.embed_model = embed_model

llm = OpenAI(model=settings.OPENAI_MODEL, api_key=settings.OPENAI_API_KEY)
Settings.llm = llm

print("Setting up vector store...")
vector_store = DjangoAnnotationVectorStore.from_params(
corpus_id=query.corpus.id
)
print(f"Vector store: {vector_store}")
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
print(f"Index: {index}")

query_engine = CitationQueryEngine.from_args(
index,
similarity_top_k=3,
# here we can control how granular citation sources are, the default is 512
citation_chunk_size=512,
)
print(f"Query engine: {query_engine}")

response = query_engine.query(str(query.query))
print(f"{len(response.source_nodes)} Sources: {response.source_nodes[0].node}")

Expand All @@ -60,7 +65,8 @@ def run_query(
query.completed = timezone.now()
query.save()

except Exception:
except Exception as e:
print(f"Query failed: {e}")
query.failed = timezone.now()
query.stacktrace = traceback.format_exc()
query.save()
83 changes: 83 additions & 0 deletions opencontractserver/tests/test_corpus_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import vcr
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.db.models.signals import post_save
from django.test import TestCase
from django.test.utils import override_settings

from opencontractserver.annotations.models import Annotation
from opencontractserver.annotations.signals import process_annot_on_create_atomic
from opencontractserver.corpuses.models import Corpus, CorpusQuery
from opencontractserver.corpuses.signals import run_query_on_create
from opencontractserver.documents.models import Document
from opencontractserver.tasks.doc_tasks import nlm_ingest_pdf
from opencontractserver.tasks.embeddings_task import (
calculate_embedding_for_annotation_text,
)
from opencontractserver.tasks.query_tasks import run_query
from opencontractserver.tests.fixtures import SAMPLE_PDF_FILE_TWO_PATH

User = get_user_model()


class QueryTasksTestCase(TestCase):
    """End-to-end test of the run_query task against a small ingested corpus.

    Ingestion and embedding are driven explicitly in setUp (signals are
    disconnected) so the test controls exactly when each stage runs.
    """

    @override_settings(CELERY_TASK_ALWAYS_EAGER=True)
    def setUp(self):
        # Disconnect post_save hooks so query execution and annotation
        # processing don't fire implicitly while we build fixtures.
        post_save.disconnect(run_query_on_create, sender=CorpusQuery)
        post_save.disconnect(process_annot_on_create_atomic, sender=Annotation)

        self.user = User.objects.create_user(
            username="testuser", password="testpassword"
        )

        # Create any necessary test data
        self.corpus = Corpus.objects.create(title="Test Corpus")

        # read_bytes() opens and closes the file for us; the previous
        # open("rb").read() leaked the file handle.
        pdf_file = ContentFile(SAMPLE_PDF_FILE_TWO_PATH.read_bytes(), name="test.pdf")

        self.doc = Document.objects.create(
            creator=self.user,
            title="Test Doc",
            description="USC Title 1 - Chapter 1",
            custom_meta={},
            pdf_file=pdf_file,
            backend_lock=True,
        )

        # CELERY_TASK_ALWAYS_EAGER makes .delay() run synchronously
        # in-process, so the ingest pipeline completes before we continue.
        nlm_ingest_pdf.delay(user_id=self.user.id, doc_id=self.doc.id)

        # The annotation post_save hook is disconnected above, so compute
        # embeddings explicitly for every annotation the ingest created.
        for annot in Annotation.objects.all():
            calculate_embedding_for_annotation_text.delay(annotation_id=annot.id)

        self.corpus.documents.add(self.doc)
        self.corpus.save()

        self.query = CorpusQuery.objects.create(
            query="Test query", corpus=self.corpus, creator=self.user
        )

    @override_settings(CELERY_TASK_ALWAYS_EAGER=True)
    @vcr.use_cassette("fixtures/vcr_cassettes/run_query.yaml")
    def test_run_query(self):
        """run_query should complete and record a response with sources.

        LLM traffic is replayed from the VCR cassette, so no live API
        calls are made.
        """
        # Call the run_query task (runs eagerly/synchronously).
        run_query.delay(query_id=self.query.id)

        # Refresh the query object from the database
        self.query.refresh_from_db()

        # The query must have run to completion without failure and
        # produced both a response and at least one source annotation.
        self.assertIsNotNone(self.query.started)
        self.assertIsNotNone(self.query.completed)
        self.assertIsNone(self.query.failed)
        self.assertIsNone(self.query.stacktrace)
        self.assertIsNotNone(self.query.response)
        self.assertTrue(self.query.sources.exists())
4 changes: 3 additions & 1 deletion opencontractserver/tests/test_nlm_ingestor_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django.core.files.base import ContentFile
from django.db import transaction
from django.test import TestCase
from django.test.utils import override_settings

from opencontractserver.annotations.models import Annotation, AnnotationLabel
from opencontractserver.documents.models import Document
Expand Down Expand Up @@ -43,6 +44,7 @@ def setUp(self):
backend_lock=True,
)

@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
@responses.activate
def test_load_nlm_ingested_doc(self):

Expand All @@ -56,7 +58,7 @@ def test_load_nlm_ingested_doc(self):

# Run ingest pipeline SYNCHRONOUS and, with @responses.activate decorator, no API call ought to go out to
# nlm-ingestor host
nlm_ingest_pdf.s(user_id=self.user.id, doc_id=self.doc.id).apply().get()
nlm_ingest_pdf.delay(user_id=self.user.id, doc_id=self.doc.id)

# Let's make sure we have right # of annotations + labels in database
assert Annotation.objects.all().count() == 27
Expand Down
1 change: 1 addition & 0 deletions requirements/local.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pytest-cov==4.0.0 # https://github.com/pytest-dev/pytest-cov
pytest-sugar==0.9.5 # https://github.com/Frozenball/pytest-sugar
djangorestframework-stubs==1.8.0 # https://github.com/typeddjango/djangorestframework-stubs
responses==0.22.0 # https://github.com/getsentry/responses
vcrpy==6.0.1 # https://vcrpy.readthedocs.io/en/latest/

# Profiling
# ------------------------------------------------------------------------------
Expand Down

0 comments on commit bb3f607

Please sign in to comment.