From 195565981e2fd670a6d947c464baa8f7b11f75ca Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 14 Jul 2024 21:20:50 -0700 Subject: [PATCH] Fixed missing lines in prod Dockerfile and handle extract properly where search_text is none. --- compose/local/django/Dockerfile | 24 ------------------- compose/production/django/Dockerfile | 9 +++++++ .../tasks/data_extract_tasks.py | 2 +- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile index a27f1fb4..e6d04a3f 100644 --- a/compose/local/django/Dockerfile +++ b/compose/local/django/Dockerfile @@ -33,30 +33,6 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ # psycopg2 dependencies libpq-dev -# RUN wget https://poppler.freedesktop.org/poppler-data-0.4.10.tar.gz \ -# && tar -xf poppler-data-0.4.10.tar.gz \ -# && cd poppler-data-0.4.10 \ -# && make install \ -# && cd .. \ -# && wget https://poppler.freedesktop.org/poppler-21.03.0.tar.xz \ -# && tar -xf poppler-21.03.0.tar.xz \ -# && cd poppler-21.03.0 \ -# && mkdir build \ -# && cd build \ -# && cmake .. \ -# && make \ -# && make install \ -# && cd ../.. \ -# && ldconfig \ -# && rm poppler-data-0.4.10.tar.gz \ -# && rm -rf poppler-data-0.4.10 \ -# && rm poppler-21.03.0.tar.xz \ -# && rm -rf poppler-21.03.0 -# CMD tail -f /dev/null - -# Get tesseract tessdata file -# RUN wget https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata -P /usr/share/tessdata - # Requirements are installed here to ensure they will be cached. COPY ./requirements . 
diff --git a/compose/production/django/Dockerfile b/compose/production/django/Dockerfile index c9c51842..0f7f62ab 100644 --- a/compose/production/django/Dockerfile +++ b/compose/production/django/Dockerfile @@ -71,10 +71,19 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ # copy python dependency wheels from python-build-stage COPY --from=python-build-stage /usr/src/app/wheels /wheels/ +# Install CPU-only (GPU-less) requirements +RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +RUN pip install sentence-transformers + # use wheels to install python dependencies RUN pip install --no-cache-dir --no-index --find-links=/wheels/ /wheels/* \ && rm -rf /wheels/ +# Download sentence transformer binaries +# ensure /models directory exists +COPY download_embeddings_model.py . +RUN mkdir -p /models +RUN python download_embeddings_model.py COPY --chown=django:django ./compose/production/django/entrypoint /entrypoint RUN sed -i 's/\r$//g' /entrypoint diff --git a/opencontractserver/tasks/data_extract_tasks.py b/opencontractserver/tasks/data_extract_tasks.py index 1fc58160..f58d4052 100644 --- a/opencontractserver/tasks/data_extract_tasks.py +++ b/opencontractserver/tasks/data_extract_tasks.py @@ -64,7 +64,7 @@ def oc_llama_index_doc_query(cell_id, similarity_top_k=15, max_token_length: int query = datacell.column.query # Special character - if "|||" in search_text: + if isinstance(search_text, str) and "|||" in search_text: logger.info( "Detected special break character in examples `|||` - splitting and averaging embeddings."