# Dockerfile.nvidia
# Dockerfile
# Uses multi-stage builds requiring Docker 17.05 or higher
# See https://docs.docker.com/develop/develop-images/multistage-build/
# ------------ python-base -------------------------------------------- #
# TODO: check if we can somehow get rid of the apt-get dependencies (opencv, ghostscript etc..)
# they take up a lot of space ...
# Global build arguments. Declared before the first FROM, they are only
# visible to FROM lines until a stage re-declares them with a bare `ARG`.
ARG PROJECT_DIR="/project"
ARG PROJECT_NAME="pydoxtools"

# Common CUDA 11.7.1 / cuDNN 8 runtime base (Ubuntu 22.04) that the later
# stages extend.
# NOTE(review): the original remark here read "compatible with
# pytorch1.10.1cu113"; it may predate this CUDA tag - confirm.
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04 AS python-base

# Re-declare the global argument so it can be expanded inside this stage.
ARG PROJECT_DIR

# Python runtime configuration: unbuffered output, and do not write .pyc files.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

# pip configuration.
# NOTE(review): pip appears to treat every valid boolean value of
# PIP_NO_CACHE_DIR (including "off") as "disable the cache". Confirm that is
# intended, because later RUN steps mount /root/.cache/ as a build cache.
ENV PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100

# Paths: location of the in-project virtualenv, and its bin/ directory
# prepended to PATH so the tools installed there are found first.
ENV VENV_DIR="${PROJECT_DIR}/.venv/" \
    PATH="${PROJECT_DIR}/.venv/bin:$PATH"
# Disabled alternative that used a plain LIBS directory instead of a venv:
#PATH="${PROJECT_DIR}/LIBS/bin:$PATH"\
#PYTHONPATH="${PROJECT_DIR}/LIBS:$PYTHONPATH"
# ------------------------- building the app ----------------------------
FROM python-base AS builder-base

# Install build dependencies (only needed in this stage):
#   g++ / build-essential - needed for hnswlib compilation
#   git                   - needed for yfinance
# Every package sits on its own line with whitespace before the trailing
# backslash, so joined continuation lines can never fuse two tokens together.
#
# Cleanup: the default RUN shell is /bin/sh, which does not perform brace
# expansion, so a pattern such as /var/lib/{apt,dpkg,cache,log}/ matches
# nothing (and, if it were expanded, would delete the dpkg database). Only the
# apt package lists are removed here; a plain glob works in /bin/sh.
RUN apt-get update \
    && DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
        build-essential \
        curl \
        g++ \
        git \
        iputils-ping \
        python3-dev \
        python3-minimal \
        python3-pip \
        wget \
    && pip install -U pip \
    && apt-get autoremove --yes \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Poetry is installed through pip without a version pin, i.e. whatever release
# pip resolves at build time. The earlier installer-script approach (which
# respected $POETRY_VERSION & $POETRY_HOME) is kept for reference:
# ENV POETRY_VERSION=1.0.5
# RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python3
RUN pip install poetry

# Re-declare the global build argument so this stage can expand it.
ARG PROJECT_DIR

# Only the dependency manifests are copied at this point, so the layers that
# install dependencies stay cached until pyproject.toml or poetry.lock change.
WORKDIR "${PROJECT_DIR}/"
COPY pyproject.toml poetry.lock ./
# export requirements text for pip in order to create packages pywheels as a later stage
#RUN poetry config virtualenvs.create false \
# poetry install
# export dependencies including dev deps
#RUN poetry export -E docker-training -E etl --dev \
# --without-hashes --no-interaction --no-ansi -f requirements.txt -o requirements.txt \
# && ls -l
# Re-declare the global build argument so this stage can expand it.
ARG PROJECT_NAME

# Create an empty placeholder package (the real sources are not copied into
# this stage) and install the locked dependencies with the "pytorch-docker"
# and "etl" extras. POETRY_VIRTUALENVS_IN_PROJECT=true makes Poetry create the
# virtualenv inside the project directory.
# NOTE(review): the placeholder presumably exists so `poetry install` can
# install the project itself - confirm.
# Disabled alternative: poetry config virtualenvs.create false
#
# /root/.cache/ is a BuildKit cache mount: it persists between builds on the
# build host and is not written into the image. There must be whitespace
# between the mount target and the trailing backslash, otherwise the target
# path fuses with the first word of the continued command.
RUN --mount=type=cache,target=/root/.cache/ \
    mkdir -p "${PROJECT_DIR}/${PROJECT_NAME}" \
    && touch "${PROJECT_DIR}/${PROJECT_NAME}/__init__.py" \
    && POETRY_VIRTUALENVS_IN_PROJECT=true poetry install -E pytorch-docker -E etl -vvv

# Extra packages installed with pip inside the Poetry-managed environment.
RUN --mount=type=cache,target=/root/.cache/ \
    poetry run pip install torch pytorch-lightning torchmetrics==0.9.3

RUN poetry run pip install transformers
# The `RUN --mount=type=cache` instructions in this file need BuildKit, so the image has to be built with:
# >> COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose build doxcavator-serverless
# The --mount option, which is used to cache the package downloads between builds, is only
# supported by docker when the above flags are set.
# RUN --mount=type=cache,target=/root/.cache/pip pip install -I -t LIBS -r requirements.txt
# RUN --mount=type=cache,target=/root/.cache/pip pip install -t LIBS -r requirements.txt
# RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
# -------------------------- production build --------------------------------------
# `production` image used for runtime
FROM python-base AS production

# apt packages needed in the runtime image. Every package sits on its own
# line with whitespace before the trailing backslash so joined continuation
# lines cannot fuse tokens.
#
# Cleanup: the default RUN shell is /bin/sh, which does not perform brace
# expansion, so a pattern such as /var/lib/{apt,dpkg,cache,log}/ matches
# nothing (and, if it were expanded, would delete the dpkg database). Only the
# apt package lists are removed here.
RUN apt-get update \
    && DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
        byobu \
        curl \
        htop \
        python3-minimal \
        python3-pip \
        unzip \
    && apt-get autoremove --yes \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install rclone in order to synchronize training files.
# The pipeline runs under bash with `pipefail`, and curl is given -f, so a
# failed or HTTP-error download fails the build instead of piping an empty or
# error page into bash. (-sS: quiet but still print errors, -L: follow redirects.)
# NOTE(review): the installer script is fetched unpinned and unverified at
# build time; consider pinning a release and checking its checksum.
RUN ["/bin/bash", "-o", "pipefail", "-c", "curl -fsSL https://rclone.org/install.sh | bash"]
# Re-declare the global build arguments so this stage can expand them.
ARG PROJECT_DIR
ARG PROJECT_NAME

# Layers are ordered from least to most frequently changing: the virtualenv
# and the extra pip dependency come first, so edits to training data or
# source code do not invalidate them in the build cache.

# Virtualenv built in the builder-base stage (VENV_DIR is set in python-base).
COPY --from=builder-base $VENV_DIR $VENV_DIR
# Disabled alternatives that used a LIBS directory instead of the venv:
# COPY --from=builder-base "$PROJECT_DIR/LIBS" "$PROJECT_DIR/LIBS"
# COPY --from=builder-base "/usr/lib" "/usr/lib"
# ENV PATH="$PROJECT_DIR/LIBS:$PATH"

# TODO: move the following dependency into pyproject.toml
# PATH lists ${PROJECT_DIR}/.venv/bin first, so this presumably resolves to the
# pip of the copied virtualenv - confirm.
RUN pip install pymysql

# TODO: copy training datasets
# TODO: copy training datasets from online repository (S3)?
COPY ./training_data "${PROJECT_DIR}/training_data"

# COPY ./src directory here
# copy only necessary files
COPY ./analysis ${PROJECT_DIR}/analysis/
COPY ./$PROJECT_NAME ${PROJECT_DIR}/$PROJECT_NAME
# Warming the training cache at build time is not needed right now, as the
# calculation was sped up significantly by multiprocessing. Kept for reference:
# RUN python -c 'from pydoxtools import classifier; classifier.load_labeled_text_blocks()'

# Runtime environment for the container process.
ENV TOKENIZERS_PARALLELISM=true

# Start in the project directory.
WORKDIR "${PROJECT_DIR}"

# Launch JupyterLab on all interfaces without opening a browser; --allow-root
# is passed because no USER instruction switches away from root in this stage.
ENTRYPOINT ["jupyter", "lab", "--allow-root", "--ip", "0.0.0.0", "--no-browser"]