-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathdocker-compose.yaml
180 lines (167 loc) · 9.65 KB
/
docker-compose.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# Requires Docker Compose V2.
# See https://docs.docker.com/compose/compose-file/compose-file-v3
# and https://github.com/compose-spec/compose-spec/blob/master/spec.md
# for details concerning the `docker-compose.yaml` file syntax.
# Using `docker-compose.yaml` has many advantages over writing custom shell scripts.
# The settings are much easier to see and maintain than scattered shell scripts.
# Also, Compose is a native Docker component, simplifying project maintenance.
# Run `make env` to create a basic `.env` file with the UID and GID variables.
# Variables are in ${VARIABLE:-DEFAULT_VALUE} format to specify default values.
# Using the `.env` file to set variables to non-default values is strongly recommended.
# Note that the host shell has higher priority than `.env` for Docker Compose.
# https://docs.docker.com/compose/environment-variables/envvars-precedence
# Variables specified via Docker Compose have higher priority than those
# specified in the Dockerfile, which only function as default values.
# All default values in the Dockerfile are overridden by default values in Compose.
# See https://pytorch.org/docs/stable/cpp_extension.html for an
# explanation of how to specify the `TORCH_CUDA_ARCH_LIST` variable.
# The variable `CCC` is used to specify `TORCH_CUDA_ARCH_LIST`.
# Compute Capability must be specified via the `CCC` variable.
# Set the host environment variable `BUILDKIT_PROGRESS=plain` to see the full build log.
# https://github.com/docker/cli/blob/master/docs/reference/commandline/cli.md#environment-variables
# Use the host network instead of creating a separate network.
# This reduces load and conflicts with the host network.
# This may cause security issues in production, however.
networks:
  default:
    name: host
    external: true
services:
  # Base service containing configurations common to all services.
  # The other services inherit from it via `extends`.
  base:
    tty: true  # Equivalent to `-t` flag in `docker run`.
    init: true  # Equivalent to `--init` flag in `docker run`.
    stdin_open: true  # Equivalent to `-i` flag in `docker run`.
    working_dir: ${PROJECT_ROOT:-/opt/project}
    user: ${UID:-1000}:${GID:-1000}  # Specify USR/GRP at runtime.
    # Use different image names for different users and projects.
    # Otherwise, images will be repeatedly removed and recreated.
    # The removed images will remain cached, however.
    image: ${IMAGE_NAME}
    # Use the same network as the host, may cause security issues.
    network_mode: host
    # `ipc: host` removes the shared memory cap but is a known security vulnerability.
    # Equivalent to `--ipc=host` in `docker run`. **Disable this on WSL.**
    ipc: host
    # shm_size: 1GB  # Explicit shared memory limit. No security issues this way.
    # hostname: ${SERVICE}  # Makes `pure` terminals easier to tell apart.
    # extra_hosts:  # Prevents "unknown host" issue when using `sudo`.
    #   - "${SERVICE}:127.0.0.1"
    # Common environment variables for the container runtime. No effect on build.
    # Numeric-looking values are quoted so that they are always passed as strings.
    environment:  # Equivalent to `--env`
      CUDA_DEVICE_ORDER: PCI_BUS_ID
      HISTSIZE: "50000"  # Hard-coded large command history size.
      TZ: ${TZ:-UTC}  # Timezone settings used during runtime.
      # PyTorch cuDNN v8 info: https://github.com/pytorch/pytorch/pull/60755
      # cuDNN v8 uses cuDNN bfloat16 kernels directly.
      TORCH_CUDNN_V8_API_ENABLED: "1"
      # Enable to verify whether the cuDNN v8 frontend is being used.
      TORCH_CUDNN_V8_API_DEBUG: "0"
      # Lazy loading: https://docs.nvidia.com/cuda/cuda-c-programming-guide/lazy-loading.html
      CUDA_MODULE_LOADING: LAZY  # Effective only on CUDA 11.7+.
    # tmpfs:  # Create directory in RAM for fast data IO.
    #   - /opt/data
    # Default volume pairings of ${HOST_PATH}:${CONTAINER_PATH}.
    # Allows the container to access `HOST_PATH` as `CONTAINER_PATH`.
    # See https://docs.docker.com/storage/volumes for details.
    # Always use the ${HOME} variable to specify the host home directory.
    # See https://github.com/docker/compose/issues/6506 for details.
    volumes:  # Equivalent to `-v` flag in `docker run`.
      # Current working directory `.` is connected to `PROJECT_ROOT`.
      # Mount `.` if the docker-compose.yaml file is at the project root.
      # Mount `..` if Cresset is a subdirectory in a different project, etc.
      - ${HOST_ROOT:-.}:${PROJECT_ROOT:-/opt/project}
      # Preserve VSCode extensions between containers.
      # Assumes default VSCode server directory.
      # May cause VSCode issues if multiple Cresset-based projects are on the
      # same machine writing to the `${HOME}/.vscode-server` directory.
      # If so, specify a different host directory for each project.
      - ${HOME}/.vscode-server:/home/${USR:-user}/.vscode-server
    build:
      context: .  # Nearly all files are ignored due to `.dockerignore` settings.
      # Specify the `train.Dockerfile` target build stage.
      target: ${TARGET_STAGE:-train}
      args:  # Common build-time environment variables.
        # Even if these variables are unnecessary during the build,
        # they can be ignored simply by not defining them in that stage.
        # Whether to create a new sudo user in the image.
        ADD_USER: ${ADD_USER:-include}
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        GID: ${GID:-1000}
        UID: ${UID:-1000}
        GRP: ${GRP:-user}
        USR: ${USR:-user}
        TZ: ${TZ:-UTC}
        TMUX_HIST_LIMIT: "50000"  # Size of `tmux` scrolling history.
        # Change the `CONDA_URL` for different hardware architectures.
        # URLs from https://github.com/conda-forge/miniforge are recommended over
        # Miniconda URLs from https://docs.conda.io/en/latest/miniconda.html.
        # The `CONDA_MANAGER` may be either `mamba` (the default) or `conda`.
        # However, `mamba` may be unable to resolve conflicts that `conda` can.
        # In such cases, set `CONDA_MANAGER=conda` for conda-based installation.
        # Installing `mamba` via Miniforge is strongly recommended.
        CONDA_URL: ${CONDA_URL:-https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh}
        CONDA_MANAGER: ${CONDA_MANAGER:-mamba}
        # URLs for faster `apt` and `pip` installs.
        # Use URLs optimized for location and security requirements.
        # DEB_OLD: ${DEB_OLD:-http://archive.ubuntu.com}
        # DEB_NEW: ${DEB_NEW:-http://kr.archive.ubuntu.com}
        INDEX_URL: ${INDEX_URL:-https://pypi.org/simple}
        # Read from its own `EXTRA_INDEX_URL` variable so that overriding
        # `INDEX_URL` does not also replace the extra package index.
        EXTRA_INDEX_URL: ${EXTRA_INDEX_URL:-https://pypi.ngc.nvidia.com}
        TRUSTED_HOST: ${TRUSTED_HOST:-pypi.ngc.nvidia.com}
    runtime: nvidia  # The official `deploy` option is not working for some reason.
    # API dependent on compose version. Remove if this is causing problems.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
              # device_ids: [ "0" ]  # Use only GPU 0.

  # Service built from `dockerfiles/train.Dockerfile` on top of `base`.
  train:
    extends:
      service: base
    # Options for building. Used when `--build` is called in `docker compose`.
    build:
      # Set `TARGET_STAGE` to `train-builds` to get just the wheels in `/tmp/dist`.
      dockerfile: dockerfiles/train.Dockerfile
      args:  # Equivalent to `--build-arg`.
        BUILD_MODE: ${BUILD_MODE:-exclude}
        # BUILD_TEST: 0
        # USE_NNPACK: 0  # Disabling NNPack and QNNPack by default as they are
        # USE_QNNPACK: 0  # legacy libraries and most users do not need them.
        # BUILD_CAFFE2: 0  # Most users do not need Caffe2.
        # BUILD_CAFFE2_OPS: 0
        # USE_PRECOMPILED_HEADERS: 1
        LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu}
        DISTRO_VERSION: ${DISTRO_VERSION:-22.04}
        CUDA_VERSION: ${CUDA_VERSION:-12.4.1}
        CUDNN_VERSION: ${CUDNN_VERSION}  # Leave empty for CUDA 12.4+.
        IMAGE_FLAVOR: ${IMAGE_FLAVOR:-devel}
        PYTHON_VERSION: ${PYTHON_VERSION:-3.10}
        MKL_MODE: ${MKL_MODE:-include}  # MKL_MODE can be `include` or `exclude`.
        # Fails if `BUILD_MODE=include` but `CCC` is not set explicitly.
        # Ignore the missing CCC warning otherwise.
        TORCH_CUDA_ARCH_LIST: ${CCC}
        # Variables for building PyTorch. Must be valid git tags or commits.
        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.4.1}
        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.19.1}
        # Variables for downloading PyTorch instead of building.
        PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu124}
        # Set `PYTORCH_FETCH_NIGHTLY` to any value to fetch the nightly binaries.
        # Also remember to change the index url to the nightly version.
        PYTORCH_FETCH_NIGHTLY: ${PYTORCH_FETCH_NIGHTLY:+--pre}
        PYTORCH_VERSION: ${PYTORCH_VERSION:-2.4.1}
        TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.19.1}

  # Skeleton service for development and debugging.
  # This service may be useful for PyTorch CUDA/C++ contributors.
  devel:
    extends:
      service: base
    build:
      # All builds begin at `build-base`.
      target: ${TARGET_STAGE:-build-base}
      dockerfile: dockerfiles/train.Dockerfile

  # Service based on images from the NGC PyTorch image catalog.
  # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html
  # NGC images are very unstable, with many differences between versions.
  # This service may break for different `NGC_YEAR` and `NGC_MONTH` values.
  ngc:
    extends:
      service: base
    build:
      dockerfile: dockerfiles/ngc.Dockerfile
      args:
        NGC_YEAR: ${NGC_YEAR:-24}
        NGC_MONTH: ${NGC_MONTH:-08}

  # Service installed purely from official/verified Docker images and `conda`.
  simple:
    extends:
      service: base
    build:
      dockerfile: dockerfiles/simple.Dockerfile
      args:
        BASE_IMAGE: ${LINUX_DISTRO:-ubuntu}:${DISTRO_VERSION:-22.04}
        LOCK_MODE: ${LOCK_MODE:-exclude}