From 195565981e2fd670a6d947c464baa8f7b11f75ca Mon Sep 17 00:00:00 2001
From: JSv4 <scrudato@umich.edu>
Date: Sun, 14 Jul 2024 21:20:50 -0700
Subject: [PATCH] Fixed missing lines in prod Dockerfile and handled extract
 properly when search_text is None.

---
 compose/local/django/Dockerfile               | 24 -------------------
 compose/production/django/Dockerfile          |  9 +++++++
 .../tasks/data_extract_tasks.py               |  2 +-
 3 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile
index a27f1fb4..e6d04a3f 100644
--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@@ -33,30 +33,6 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
   # psycopg2 dependencies
   libpq-dev
 
-# RUN wget https://poppler.freedesktop.org/poppler-data-0.4.10.tar.gz \
-#     && tar -xf poppler-data-0.4.10.tar.gz \
-#     && cd poppler-data-0.4.10 \
-#     && make install \
-#     && cd .. \
-#     && wget https://poppler.freedesktop.org/poppler-21.03.0.tar.xz \
-#     && tar -xf poppler-21.03.0.tar.xz \
-#     && cd poppler-21.03.0 \
-#     && mkdir build \
-#     && cd build \
-#     && cmake .. \
-#     && make \
-#     && make install \
-#     && cd ../.. \
-#     && ldconfig \
-#     && rm poppler-data-0.4.10.tar.gz \
-#     && rm -rf poppler-data-0.4.10 \
-#     && rm poppler-21.03.0.tar.xz \
-#     && rm -rf poppler-21.03.0
-# CMD tail -f /dev/null
-
-# Get tesseract tessdata file
-# RUN wget https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata -P /usr/share/tessdata
-
 # Requirements are installed here to ensure they will be cached.
 COPY ./requirements .
 
diff --git a/compose/production/django/Dockerfile b/compose/production/django/Dockerfile
index c9c51842..0f7f62ab 100644
--- a/compose/production/django/Dockerfile
+++ b/compose/production/django/Dockerfile
@@ -71,10 +71,19 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
 # copy python dependency wheels from python-build-stage
 COPY --from=python-build-stage /usr/src/app/wheels  /wheels/
 
+# Install CPU-only (no GPU/CUDA) PyTorch builds, then sentence-transformers
+RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+RUN pip install sentence-transformers
+
 # use wheels to install python dependencies
 RUN pip install --no-cache-dir --no-index --find-links=/wheels/ /wheels/* \
   && rm -rf /wheels/
 
+# Download the sentence transformer embeddings model at image build time.
+# Ensure the /models directory exists before running the download script.
+COPY download_embeddings_model.py .
+RUN mkdir -p /models
+RUN python download_embeddings_model.py
 
 COPY --chown=django:django ./compose/production/django/entrypoint /entrypoint
 RUN sed -i 's/\r$//g' /entrypoint
diff --git a/opencontractserver/tasks/data_extract_tasks.py b/opencontractserver/tasks/data_extract_tasks.py
index 1fc58160..f58d4052 100644
--- a/opencontractserver/tasks/data_extract_tasks.py
+++ b/opencontractserver/tasks/data_extract_tasks.py
@@ -64,7 +64,7 @@ def oc_llama_index_doc_query(cell_id, similarity_top_k=15, max_token_length: int
         query = datacell.column.query
 
         # Special character
-        if "|||" in search_text:
+        if isinstance(search_text, str) and "|||" in search_text:
 
             logger.info(
                 "Detected special break character in examples `|||` - splitting and averaging embeddings."