# docker-compose.yaml

# Requires Docker Compose V2.
# See https://docs.docker.com/compose/compose-file/compose-file-v3
# and https://github.com/compose-spec/compose-spec/blob/master/spec.md
# for details concerning the `docker-compose.yaml` file syntax.

# Using `docker-compose.yaml` has many advantages over writing custom shell scripts.
# The settings are much easier to see and maintain than scattered shell scripts.
# Also, Compose is a native Docker component, simplifying project maintenance.

# Run `make env` to create a basic `.env` file with the UID and GID variables.
# Variables are in ${VARIABLE:-DEFAULT_VALUE} format to specify default values.
# Using the `.env` file to set variables to non-default values is strongly recommended.
# Note that the host shell has higher priority than `.env` for Docker Compose.
# https://docs.docker.com/compose/environment-variables/envvars-precedence

# Variables specified via Docker Compose have higher priority than those
# specified in the Dockerfile, which only function as default values.
# All default values in the Dockerfile are overridden by default values in Compose.

# See https://pytorch.org/docs/stable/cpp_extension.html for an
# explanation of how to specify the `TORCH_CUDA_ARCH_LIST` variable.
# The variable `CCC` is used to specify `TORCH_CUDA_ARCH_LIST`.
# Compute Capability must be specified via the `CCC` variable.

# Set the host environment variable `BUILDKIT_PROGRESS=plain` to see the full build log.
# https://github.com/docker/cli/blob/master/docs/reference/commandline/cli.md#environment-variables

# Use the host network instead of creating a separate network.
# This reduces load and conflicts with the host network.
# This may cause security issues in production, however.
networks:
  default:
    external: true
    name: host

services:
  # Base service containing configurations common to all services.
  # The other services in this file inherit from it via `extends`.
  base:
    tty: true # Equivalent to `-t` flag in `docker run`.
    init: true # Equivalent to `--init` flag in `docker run`.
    stdin_open: true # Equivalent to `-i` flag in `docker run`.
    working_dir: ${PROJECT_ROOT:-/opt/project}
    user: ${UID:-1000}:${GID:-1000} # Specify USR/GRP at runtime.
    # Use different image names for different users and projects.
    # Otherwise, images will be repeatedly removed and recreated.
    # The removed images will remain cached, however.
    image: ${IMAGE_NAME}
    network_mode: host # Use the same network as the host, may cause security issues.
    # `ipc: host` removes the shared memory cap but is a known security vulnerability.
    ipc: host # Equivalent to `--ipc=host` in `docker run`. **Disable this on WSL.**
    # shm_size: 1GB  # Explicit shared memory limit. No security issues this way.
    # hostname: ${SERVICE} # Makes `pure` terminals easier to tell apart.
    # extra_hosts: # Prevents "unknown host" issue when using `sudo`.
    #   - "${SERVICE}:127.0.0.1"

    # Common environment variables for the container runtime. No effect on build.
    # Numeric values are quoted so that they are always passed as strings.
    environment: # Equivalent to `--env`
      CUDA_DEVICE_ORDER: PCI_BUS_ID
      HISTSIZE: "50000" # Hard-coded large command history size.
      TZ: ${TZ:-UTC} # Timezone settings used during runtime.
      # PyTorch cuDNN v8 info: https://github.com/pytorch/pytorch/pull/60755
      TORCH_CUDNN_V8_API_ENABLED: "1" # cuDNN v8 uses cuDNN bfloat16 kernels directly.
      TORCH_CUDNN_V8_API_DEBUG: "0" # Enable to verify whether the cuDNN v8 frontend is being used.
      # Lazy loading: https://docs.nvidia.com/cuda/cuda-c-programming-guide/lazy-loading.html
      CUDA_MODULE_LOADING: LAZY # Effective only on CUDA 11.7+.
    # tmpfs:  # Create directory in RAM for fast data IO.
    #   - /opt/data
    # Default volume pairings of ${HOST_PATH}:${CONTAINER_PATH}.
    # Allows the container to access `HOST_PATH` as `CONTAINER_PATH`.
    # See https://docs.docker.com/storage/volumes for details.
    # Always use the ${HOME} variable to specify the host home directory.
    # See https://github.com/docker/compose/issues/6506 for details.
    volumes: # Equivalent to `-v` flag in `docker run`.
      # Current working directory `.` is connected to `PROJECT_ROOT`.
      # Mount `.` if the docker-compose.yaml file is at the project root.
      # Mount `..` if Cresset is a subdirectory in a different project, etc.
      - ${HOST_ROOT:-.}:${PROJECT_ROOT:-/opt/project}
      # Preserve VSCode extensions between containers.
      # Assumes default VSCode server directory.
      # May cause VSCode issues if multiple Cresset-based projects are on the
      # same machine writing to the `${HOME}/.vscode-server` directory.
      # If so, specify a different host directory for each project.
      - ${HOME}/.vscode-server:/home/${USR:-user}/.vscode-server

    build:
      context: . # Nearly all files are ignored due to `.dockerignore` settings.
      target: ${TARGET_STAGE:-train} # Specify the `train.Dockerfile` target build stage.
      args: # Common build-time environment variables.
        # Even if these variables are unnecessary during the build,
        # they can be ignored simply by not defining them in that stage.
        ADD_USER: ${ADD_USER:-include} # Whether to create a new sudo user in the image.
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        GID: ${GID:-1000}
        UID: ${UID:-1000}
        GRP: ${GRP:-user}
        USR: ${USR:-user}
        TZ: ${TZ:-UTC}
        TMUX_HIST_LIMIT: "50000" # Size of `tmux` scrolling history.
        # Change the `CONDA_URL` for different hardware architectures.
        # URLs from https://github.com/conda-forge/miniforge are recommended over
        # Miniconda URLs from https://docs.conda.io/en/latest/miniconda.html.
        # The `CONDA_MANAGER` may be either `mamba` (the default) or `conda`.
        # However, `mamba` may be unable to resolve conflicts that `conda` can.
        # In such cases, set `CONDA_MANAGER=conda` for conda-based installation.
        # Installing `mamba` via mini-forge is strongly recommended.
        CONDA_URL: ${CONDA_URL:-https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh}
        CONDA_MANAGER: ${CONDA_MANAGER:-mamba}
        # URLs for faster `apt` and `pip` installs.
        # Use URLs optimized for location and security requirements.
        # DEB_OLD: ${DEB_OLD:-http://archive.ubuntu.com}
        # DEB_NEW: ${DEB_NEW:-http://kr.archive.ubuntu.com}
        INDEX_URL: ${INDEX_URL:-https://pypi.org/simple}
        # The extra index is controlled by its own `EXTRA_INDEX_URL` variable,
        # so that overriding `INDEX_URL` does not also replace the extra index.
        EXTRA_INDEX_URL: ${EXTRA_INDEX_URL:-https://pypi.ngc.nvidia.com}
        TRUSTED_HOST: ${TRUSTED_HOST:-pypi.ngc.nvidia.com}
    runtime: nvidia # The official `deploy` option is not working for some reason.
    deploy: # API dependent on compose version. Remove if this is causing problems.
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
              # device_ids: ["0"] # Use only GPU 0.

  # Training service: inherits everything in `base` and adds the build
  # settings for `dockerfiles/train.Dockerfile`.
  train:
    extends:
      service: base
    build: # Options for building. Used when `--build` is called in `docker compose`.
      # Set `TARGET_STAGE` to `train-builds` to get just the wheels in `/tmp/dist`.
      dockerfile: dockerfiles/train.Dockerfile
      args: # Equivalent to `--build-arg`.
        BUILD_MODE: ${BUILD_MODE:-exclude}
        # BUILD_TEST: 0
        # USE_NNPACK: 0 # Disabling NNPack and QNNPack by default as they are
        # USE_QNNPACK: 0 # legacy libraries and most users do not need them.
        # BUILD_CAFFE2: 0 # Most users do not need Caffe2.
        # BUILD_CAFFE2_OPS: 0
        # USE_PRECOMPILED_HEADERS: 1
        LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu}
        DISTRO_VERSION: ${DISTRO_VERSION:-22.04}
        CUDA_VERSION: ${CUDA_VERSION:-12.4.1}
        # Leave empty for CUDA 12.4+. The explicit empty default (`:-`) yields the
        # same blank value as an unset variable but without Compose emitting an
        # "unset variable" warning on every default run.
        CUDNN_VERSION: ${CUDNN_VERSION:-}
        IMAGE_FLAVOR: ${IMAGE_FLAVOR:-devel}
        PYTHON_VERSION: ${PYTHON_VERSION:-3.10}
        MKL_MODE: ${MKL_MODE:-include} # MKL_MODE can be `include` or `exclude`.
        # Fails if `BUILD_MODE=include` but `CCC` is not set explicitly.
        TORCH_CUDA_ARCH_LIST: ${CCC} # Ignore the missing CCC warning otherwise.
        # Variables for building PyTorch. Must be valid git tags or commits.
        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.4.1}
        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.19.1}
        # Variables for downloading PyTorch instead of building.
        PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu124}
        # Set `PYTORCH_FETCH_NIGHTLY` to any value to fetch the nightly binaries.
        # `:+` expands to `--pre` only when the variable is set and non-empty.
        # Also remember to change the index url to the nightly version.
        PYTORCH_FETCH_NIGHTLY: ${PYTORCH_FETCH_NIGHTLY:+--pre}
        PYTORCH_VERSION: ${PYTORCH_VERSION:-2.4.1}
        TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.19.1}

  # Skeleton service for development and debugging.
  # This service may be useful for PyTorch CUDA/C++ contributors.
  devel:
    extends:
      service: base
    build:
      dockerfile: dockerfiles/train.Dockerfile
      # All builds begin at `build-base`, which is the default target here.
      target: ${TARGET_STAGE:-build-base}

  # Service based on images from the NGC PyTorch image catalog.
  # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html
  # NGC images are very unstable, with many differences between versions.
  # This service may break for different `NGC_YEAR` and `NGC_MONTH` values.
  ngc:
    extends:
      service: base
    build:
      dockerfile: dockerfiles/ngc.Dockerfile
      args:
        # Year and month build arguments passed to `ngc.Dockerfile`.
        NGC_MONTH: ${NGC_MONTH:-08}
        NGC_YEAR: ${NGC_YEAR:-24}

  # Service installed purely from official/verified Docker images and `conda`.
  simple:
    extends:
      service: base
    build:
      dockerfile: dockerfiles/simple.Dockerfile
      args:
        LOCK_MODE: ${LOCK_MODE:-exclude}
        # Base image reference assembled as `<distro>:<version>`.
        BASE_IMAGE: ${LINUX_DISTRO:-ubuntu}:${DISTRO_VERSION:-22.04}