# Dockerfile.nvidia

# Dockerfile
# Uses multi-stage builds (Docker 17.05 or higher) and RUN --mount cache mounts (requires BuildKit)
# See https://docs.docker.com/develop/develop-images/multistage-build/

# ------------ python-base -------------------------------------------- #

# TODO: check if we can somehow get rid of the apt-get dependencies (opencv, ghostscript etc..)
# they take up a lot of space ...

# Global build arguments. Because they are declared before the first FROM,
# every stage that wants to use one has to re-declare it with a bare `ARG`.
ARG PROJECT_DIR="/project"
ARG PROJECT_NAME="pydoxtools"

# Common parent of the builder and production stages:
# CUDA 11.7.1 + cuDNN 8 runtime on Ubuntu 22.04.
# NOTE(review): an earlier note here said "compatible with pytorch1.10.1cu113",
# which does not match the CUDA 11.7 tag - confirm which torch build is intended.
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04 AS python-base

ARG PROJECT_DIR

# --- python interpreter behaviour ---
# unbuffered stdout/stderr, and prevent python from creating .pyc files
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

# --- pip defaults ---
# NOTE(review): with current pip releases "off" is read as "--no-cache-dir is
# off", i.e. the cache stays enabled - confirm this is the intent.
ENV PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100

# --- paths ---
# VENV_DIR is the in-project virtualenv; its bin/ directory is prepended to
# PATH so executables installed into the venv take precedence.
ENV VENV_DIR="${PROJECT_DIR}/.venv/" \
    PATH="${PROJECT_DIR}/.venv/bin:$PATH"
# disabled alternative that installed libraries into ${PROJECT_DIR}/LIBS:
#   PATH="${PROJECT_DIR}/LIBS/bin:$PATH"
#   PYTHONPATH="${PROJECT_DIR}/LIBS:$PYTHONPATH"

# ------------------------- building the app ----------------------------    
FROM python-base AS builder-base
# Build-time toolchain for the dependency-install steps of this stage.
#   build-essential / g++ : needed for hnswlib compilation
#   git                   : needed for yfinance
#   python3-dev, pip      : headers and installer for building Python packages
#
# Cleanup note: RUN uses /bin/sh, which does not perform brace expansion, so a
# pattern such as /var/lib/{apt,dpkg,cache,log}/ matches nothing (and would
# wipe the dpkg database if it did expand). Only the apt package lists are
# removed here, in the same layer that created them.
RUN apt-get update && \
    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
        build-essential \
        curl \
        g++ \
        git \
        iputils-ping \
        python3-dev \
        python3-minimal \
        python3-pip \
        wget \
    && pip install -U pip \
    && apt-get autoremove --yes \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
    
# Poetry is installed from PyPI with pip.
# NOTE(review): the version is not pinned, so rebuilds may pick up a newer
# Poetry release - consider pinning it once a known-good version is chosen.
# (The former get-poetry.py bootstrap with POETRY_VERSION is no longer used.)
RUN pip install poetry

ARG PROJECT_DIR
WORKDIR ${PROJECT_DIR}/

# Only the dependency manifests are copied at this point, so the layers that
# install dependencies stay cached until pyproject.toml or poetry.lock change.
COPY ./pyproject.toml ./poetry.lock ./

# export a requirements.txt for pip in order to build the package wheels in a later stage (currently disabled)
#RUN poetry config virtualenvs.create false \
#    poetry install
# export dependencies including dev deps
#RUN poetry export -E docker-training -E etl --dev \
#     --without-hashes --no-interaction --no-ansi -f requirements.txt -o requirements.txt \
#    && ls -l
ARG PROJECT_NAME

# Install the locked dependencies into an in-project virtualenv
# (${PROJECT_DIR}/.venv, selected via POETRY_VIRTUALENVS_IN_PROJECT).
# An empty placeholder package is created first - presumably so that
# `poetry install` can install the root project before the real sources are
# copied; verify against pyproject.toml.
# (disabled alternative: poetry config virtualenvs.create false)
#
# Every install step mounts /root/.cache/ as a BuildKit cache so downloads are
# reused between builds and never written into an image layer.
RUN --mount=type=cache,target=/root/.cache/ \
    mkdir -p "${PROJECT_DIR}/${PROJECT_NAME}" \
    && touch "${PROJECT_DIR}/${PROJECT_NAME}/__init__.py" \
    && POETRY_VIRTUALENVS_IN_PROJECT=true poetry install -E pytorch-docker -E etl -vvv

# Extra packages installed with pip inside the Poetry-managed environment.
RUN --mount=type=cache,target=/root/.cache/ \
    poetry run pip install torch pytorch-lightning torchmetrics==0.9.3

# Same cache mount as the steps above (it was missing on this step).
RUN --mount=type=cache,target=/root/.cache/ \
    poetry run pip install transformers

# The RUN --mount=type=cache instructions in this file are only supported by BuildKit,
# so the image needs to be built as:
#  >> COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose build doxcavator-serverless
# (the commented-out lines below are earlier variants of the cached pip install)
# RUN --mount=type=cache,target=/root/.cache/pip pip install -I -t LIBS -r requirements.txt
# RUN --mount=type=cache,target=/root/.cache/pip pip install -t LIBS -r requirements.txt
# RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt


# -------------------------- production build --------------------------------------
# `production` image used for runtime
FROM python-base AS production
# OS packages needed at runtime: the Python interpreter and pip, curl and
# unzip (curl is used by the rclone install step; unzip presumably as well -
# verify), plus htop/byobu for interactive sessions.
#
# Cleanup note: RUN uses /bin/sh, which does not perform brace expansion, so a
# pattern such as /var/lib/{apt,dpkg,cache,log}/ matches nothing (and would
# wipe the dpkg database if it did expand). Only the apt package lists are
# removed here, in the same layer that created them.
RUN apt-get update && \
    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
        byobu \
        curl \
        htop \
        python3-minimal \
        python3-pip \
        unzip \
    && apt-get autoremove --yes \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# install rclone in order to synchronize training files
# The installer is downloaded to a file first instead of being piped into
# bash: with a pipe under /bin/sh (no pipefail) a failed download would hand
# bash empty input and the step would succeed without installing anything.
# -f makes curl fail on HTTP errors, -L follows redirects.
# NOTE(review): the script is fetched unpinned and without a checksum -
# consider pinning an rclone release.
RUN curl -fsSL https://rclone.org/install.sh -o /tmp/rclone-install.sh \
    && bash /tmp/rclone-install.sh \
    && rm -f /tmp/rclone-install.sh

ARG PROJECT_DIR
ARG PROJECT_NAME

# Layers are ordered from least to most frequently changing: the virtualenv
# comes first so that edits to data or source do not invalidate it.

# Virtualenv built in the builder stage (VENV_DIR is set in python-base).
COPY --from=builder-base $VENV_DIR $VENV_DIR
# disabled alternatives from the LIBS-based layout:
# COPY --from=builder-base "$PROJECT_DIR/LIBS" "$PROJECT_DIR/LIBS"
# COPY --from=builder-base "/usr/lib" "/usr/lib"
# ENV PATH="$PROJECT_DIR/LIBS:$PATH"

# TODO: move the following dependency into pyproject.toml
# --no-cache-dir keeps pip's download cache out of the final image layer
# (the PIP_NO_CACHE_DIR=off default from python-base leaves the cache on).
RUN pip install --no-cache-dir pymysql

# TODO: copy training datasets
# TODO: copy training datasets from online repository (S3)?
COPY ./training_data "${PROJECT_DIR}/training_data"

# copy only necessary files
# COPY ./src directory here
COPY ./analysis      ${PROJECT_DIR}/analysis/
COPY ./$PROJECT_NAME ${PROJECT_DIR}/$PROJECT_NAME
# initialize training cache
# this is not needed right now, as the calculation was sped up significantly
# due to multiprocessing
# RUN python -c 'from pydoxtools import classifier; classifier.load_labeled_text_blocks()'

# Runtime defaults for the container.
# TOKENIZERS_PARALLELISM is exported as "true" for processes in the container.
ENV TOKENIZERS_PARALLELISM=true

# Start in the project directory.
WORKDIR "${PROJECT_DIR}"

# Launch JupyterLab in exec form: bound to all interfaces (--ip 0.0.0.0),
# without opening a browser. --allow-root is passed because no USER
# instruction appears in this file.
ENTRYPOINT ["jupyter", "lab", "--allow-root", "--ip", "0.0.0.0", "--no-browser"]