Skip to content

Commit

Permalink
Updated Dockerfile to preload embeddings model into container. Remove…
Browse files Browse the repository at this point in the history
…d cassette file from repo since it contained the model binary. It was too large. Now this won't be necessary thanks to the cached model in the image.
  • Loading branch information
JSv4 committed Jun 16, 2024
1 parent be28ab1 commit bb3f607
Show file tree
Hide file tree
Showing 15 changed files with 144 additions and 47 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
frontend/public/** linguist-vendored
frontend/src/assets/** linguist-vendored
*.yaml filter=lfs diff=lfs merge=lfs -text
42 changes: 3 additions & 39 deletions .pre-commit-config.yaml
Git LFS file not shown
9 changes: 8 additions & 1 deletion compose/local/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ COPY ./requirements .
RUN pip wheel --wheel-dir /usr/src/app/wheels \
-r ${BUILD_ENVIRONMENT}.txt


# Python 'run' stage
FROM python as python-run-stage

Expand Down Expand Up @@ -99,6 +98,10 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
# copy python dependency wheels from python-build-stage
COPY --from=python-build-stage /usr/src/app/wheels /wheels/

# Install CPU-only torch wheels (no CUDA) to keep the image small
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN pip install sentence-transformers

# use wheels to install python dependencies
RUN pip install --no-cache-dir --no-index --find-links=/wheels/ /wheels/* \
&& rm -rf /wheels/
Expand All @@ -108,6 +111,10 @@ RUN echo "RUN STAGE GITHUB_ACTIONS: $GITHUB_ACTIONS"
COPY ./setup_codecov.sh .
RUN if [ "$GITHUB_ACTIONS" ] ; then echo "GITHUB ACTION MODE" && chmod u+x setup_codecov.sh && ./setup_codecov.sh ; else echo "NOT GITHUB ACTION. DO NOT INSTALL CODECOV" ; fi

# Download sentence transformer binaries
COPY download_embeddings_model.py .
RUN mkdir -p /models
RUN python download_embeddings_model.py

COPY ./compose/production/django/entrypoint /entrypoint
RUN sed -i 's/\r$//g' /entrypoint
Expand Down
16 changes: 16 additions & 0 deletions download_embeddings_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Pre-download the sentence-transformer embedding model at image build time.

Saves the model to the exact directory the Django tasks load it from
(``/models/multi-qa-MiniLM-L6-cos-v1``), so the container needs no network
access (and no Hugging Face hub round-trip) at runtime.
"""
import os

from sentence_transformers import SentenceTransformer

# Model consumed by the embedding tasks (extract_tasks.py / query_tasks.py).
MODEL_NAME = "multi-qa-MiniLM-L6-cos-v1"

# The tasks load via HuggingFaceEmbedding(model_name="/models/<MODEL_NAME>"),
# so the model must live at exactly this absolute path.
TARGET_DIR = os.path.join("/models", MODEL_NAME)

# Create the target directory if it doesn't exist.
os.makedirs(TARGET_DIR, exist_ok=True)

# NOTE: SentenceTransformer(..., cache_folder=...) stores weights under a
# hub-mangled directory name (not a plain "/models/<MODEL_NAME>"), which would
# break the fixed path the tasks expect. model.save() writes a directly
# loadable copy at the exact location instead.
model = SentenceTransformer(MODEL_NAME)
model.save(TARGET_DIR)

print(
    f"Sentence transformer model '{MODEL_NAME}' has been downloaded "
    f"and saved to '{TARGET_DIR}'."
)
2 changes: 2 additions & 0 deletions fixtures/vcr_cassettes/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.yaml

3 changes: 3 additions & 0 deletions frontend/.husky/post-checkout
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): fail the checkout with a
# helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs post-checkout "$@"
3 changes: 3 additions & 0 deletions frontend/.husky/post-commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): fail the commit hook with
# a helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs post-commit "$@"
3 changes: 3 additions & 0 deletions frontend/.husky/post-merge
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): fail the merge hook with
# a helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs post-merge "$@"
3 changes: 3 additions & 0 deletions frontend/.husky/pre-push
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS hook (auto-installed by `git lfs install`): abort the push with a
# helpful message if git-lfs is missing, otherwise delegate to the LFS hook.
command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
git lfs pre-push "$@"
5 changes: 4 additions & 1 deletion opencontractserver/llms/vector_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
super().__init__(
corpus_id=corpus_id,
document_id=document_id,
must_have_text=must_have_text,
hybrid_search=hybrid_search,
text_search_config=text_search_config,
embed_dim=embed_dim,
Expand Down Expand Up @@ -148,7 +149,9 @@ def _db_rows_to_query_result(
node = TextNode(
doc_id=str(row.id),
text=row.raw_text,
embedding=row.embedding.tolist(),
embedding=row.embedding.tolist()
if getattr(row, "embedding", None) is not None
else [],
extra_info={
"page": row.page,
"bounding_box": row.bounding_box,
Expand Down
4 changes: 2 additions & 2 deletions opencontractserver/tasks/extract_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ def llama_index_doc_query(cell_id, similarity_top_k=3):

document = datacell.document
embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
)
model_name="/models/multi-qa-MiniLM-L6-cos-v1"
) # Use the pre-loaded cache path where the model was stored at container build time
Settings.embed_model = embed_model

llm = OpenAI(model=settings.OPENAI_MODEL, api_key=settings.OPENAI_API_KEY)
Expand Down
12 changes: 9 additions & 3 deletions opencontractserver/tasks/query_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,29 @@ def run_query(

try:
embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
)
model_name="/models/multi-qa-MiniLM-L6-cos-v1"
) # Use the pre-loaded cache path where the model was stored at container build time
Settings.embed_model = embed_model

llm = OpenAI(model=settings.OPENAI_MODEL, api_key=settings.OPENAI_API_KEY)
Settings.llm = llm

print("Setting up vector store...")
vector_store = DjangoAnnotationVectorStore.from_params(
corpus_id=query.corpus.id
)
print(f"Vector store: {vector_store}")
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
print(f"Index: {index}")

query_engine = CitationQueryEngine.from_args(
index,
similarity_top_k=3,
# here we can control how granular citation sources are, the default is 512
citation_chunk_size=512,
)
print(f"Query engine: {query_engine}")

response = query_engine.query(str(query.query))
print(f"{len(response.source_nodes)} Sources: {response.source_nodes[0].node}")

Expand All @@ -60,7 +65,8 @@ def run_query(
query.completed = timezone.now()
query.save()

except Exception:
except Exception as e:
print(f"Query failed: {e}")
query.failed = timezone.now()
query.stacktrace = traceback.format_exc()
query.save()
83 changes: 83 additions & 0 deletions opencontractserver/tests/test_corpus_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import vcr
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.db.models.signals import post_save
from django.test import TestCase
from django.test.utils import override_settings

from opencontractserver.annotations.models import Annotation
from opencontractserver.annotations.signals import process_annot_on_create_atomic
from opencontractserver.corpuses.models import Corpus, CorpusQuery
from opencontractserver.corpuses.signals import run_query_on_create
from opencontractserver.documents.models import Document
from opencontractserver.tasks.doc_tasks import nlm_ingest_pdf
from opencontractserver.tasks.embeddings_task import (
calculate_embedding_for_annotation_text,
)
from opencontractserver.tasks.query_tasks import run_query
from opencontractserver.tests.fixtures import SAMPLE_PDF_FILE_TWO_PATH

User = get_user_model()


class QueryTasksTestCase(TestCase):
    """End-to-end test of the run_query task against a small ingested corpus.

    Ingestion and embedding are driven explicitly in setUp (signals are
    disconnected) so the test controls exactly when each stage runs.
    """

    @override_settings(CELERY_TASK_ALWAYS_EAGER=True)
    def setUp(self):
        # Disconnect post_save hooks so query execution and annotation
        # processing don't fire implicitly while we build fixtures.
        post_save.disconnect(run_query_on_create, sender=CorpusQuery)
        post_save.disconnect(process_annot_on_create_atomic, sender=Annotation)

        self.user = User.objects.create_user(
            username="testuser", password="testpassword"
        )

        # Create any necessary test data
        self.corpus = Corpus.objects.create(title="Test Corpus")

        # read_bytes() opens and closes the file for us; the previous
        # open("rb").read() leaked the file handle.
        pdf_file = ContentFile(SAMPLE_PDF_FILE_TWO_PATH.read_bytes(), name="test.pdf")

        self.doc = Document.objects.create(
            creator=self.user,
            title="Test Doc",
            description="USC Title 1 - Chapter 1",
            custom_meta={},
            pdf_file=pdf_file,
            backend_lock=True,
        )

        # CELERY_TASK_ALWAYS_EAGER makes .delay() run synchronously
        # in-process, so the ingest pipeline completes before we continue.
        nlm_ingest_pdf.delay(user_id=self.user.id, doc_id=self.doc.id)

        # The annotation post_save hook is disconnected above, so compute
        # embeddings explicitly for every annotation the ingest created.
        for annot in Annotation.objects.all():
            calculate_embedding_for_annotation_text.delay(annotation_id=annot.id)

        self.corpus.documents.add(self.doc)
        self.corpus.save()

        self.query = CorpusQuery.objects.create(
            query="Test query", corpus=self.corpus, creator=self.user
        )

    @override_settings(CELERY_TASK_ALWAYS_EAGER=True)
    @vcr.use_cassette("fixtures/vcr_cassettes/run_query.yaml")
    def test_run_query(self):
        """run_query should complete and record a response with sources.

        LLM traffic is replayed from the VCR cassette, so no live API
        calls are made.
        """
        # Call the run_query task (runs eagerly/synchronously).
        run_query.delay(query_id=self.query.id)

        # Refresh the query object from the database
        self.query.refresh_from_db()

        # The query must have run to completion without failure and
        # produced both a response and at least one source annotation.
        self.assertIsNotNone(self.query.started)
        self.assertIsNotNone(self.query.completed)
        self.assertIsNone(self.query.failed)
        self.assertIsNone(self.query.stacktrace)
        self.assertIsNotNone(self.query.response)
        self.assertTrue(self.query.sources.exists())
4 changes: 3 additions & 1 deletion opencontractserver/tests/test_nlm_ingestor_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django.core.files.base import ContentFile
from django.db import transaction
from django.test import TestCase
from django.test.utils import override_settings

from opencontractserver.annotations.models import Annotation, AnnotationLabel
from opencontractserver.documents.models import Document
Expand Down Expand Up @@ -43,6 +44,7 @@ def setUp(self):
backend_lock=True,
)

@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
@responses.activate
def test_load_nlm_ingested_doc(self):

Expand All @@ -56,7 +58,7 @@ def test_load_nlm_ingested_doc(self):

# Run ingest pipeline SYNCHRONOUS and, with @responses.activate decorator, no API call ought to go out to
# nlm-ingestor host
nlm_ingest_pdf.s(user_id=self.user.id, doc_id=self.doc.id).apply().get()
nlm_ingest_pdf.delay(user_id=self.user.id, doc_id=self.doc.id)

# Let's make sure we have right # of annotations + labels in database
assert Annotation.objects.all().count() == 27
Expand Down
1 change: 1 addition & 0 deletions requirements/local.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pytest-cov==4.0.0 # https://github.com/pytest-dev/pytest-cov
pytest-sugar==0.9.5 # https://github.com/Frozenball/pytest-sugar
djangorestframework-stubs==1.8.0 # https://github.com/typeddjango/djangorestframework-stubs
responses==0.22.0 # https://github.com/getsentry/responses
vcrpy==6.0.1 # https://vcrpy.readthedocs.io/en/latest/

# Profiling
# ------------------------------------------------------------------------------
Expand Down

0 comments on commit bb3f607

Please sign in to comment.