Commit 282c120: Update Docker build (#75)
Authored by epwalsh on Oct 31, 2024. Parent: 55d261e.
Showing 5 changed files with 83 additions and 53 deletions.
30 changes: 16 additions & 14 deletions .github/workflows/docker.yml
@@ -5,20 +5,22 @@ concurrency:
   cancel-in-progress: true
 
 on:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'Makefile'
-      - 'pyproject.toml'
-      - 'src/olmo_core/version.py'
-      - 'src/Dockerfile'
-      - '.github/workflows/docker.yml'
-  push:
-    branches:
-      - main
-    tags:
-      - 'v*.*.*'
+  workflow_dispatch:
+  # TODO: disabled for now because it takes too long in CI
+  # pull_request:
+  #   branches:
+  #     - main
+  #   paths:
+  #     - 'Makefile'
+  #     - 'pyproject.toml'
+  #     - 'src/olmo_core/version.py'
+  #     - 'src/Dockerfile'
+  #     - '.github/workflows/docker.yml'
+  # push:
+  #   branches:
+  #     - main
+  #   tags:
+  #     - 'v*.*.*'
 
 jobs:
   beaker:
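With the pull_request and push triggers commented out, the Docker build workflow now only runs when started by hand. As a minimal sketch (assuming the GitHub CLI is installed and authenticated against this repo), a manual run looks like:

    gh workflow run docker.yml --ref main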
14 changes: 7 additions & 7 deletions .github/workflows/main.yml
@@ -103,7 +103,7 @@ jobs:
   gpu_checks:
     name: ${{ matrix.task.name }}
     runs-on: ubuntu-latest
-    timeout-minutes: 8
+    timeout-minutes: 15
     strategy:
       fail-fast: false
       matrix:
@@ -118,14 +118,14 @@ jobs:
             src/test/
       - name: Test checkpoint (GPU)
-        image: olmo-core-nightly
+        image: olmo-core
         gpus: 2
         run: |
          pytest -v --color=yes --durations=3 -m gpu \
            src/test/distributed/checkpoint*
       - name: Test MoE (GPU)
-        image: olmo-core-nightly
+        image: olmo-core
         gpus: 1
         run: |
          pytest -v --color=yes --durations=3 -m gpu \
@@ -174,17 +174,17 @@ jobs:
         image:
           beaker: ${{ env.BEAKER_IMAGE }}
         context:
-          priority: low
+          priority: normal
           preemptible: true
         resources:
           gpuCount: ${{ matrix.task.gpus }}
         constraints:
           cluster:
-            - ai2/jupiter-cirrascale-2
-            - ai2/pluto-cirrascale
             # - ai2/allennlp-cirrascale
             # - ai2/allennlp-elanding-a100-40g
+            - ai2/pluto-cirrascale
+            - ai2/jupiter-cirrascale-2
-            # - ai2/saturn-cirrascale
+            - ai2/saturn-cirrascale
         envVars:
           - name: CUBLAS_WORKSPACE_CONFIG
             value: ":16:8"
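For reference, the GPU checks in this matrix reduce to pytest commands like the one below, copied from the task entries; running it yourself assumes a machine with the required GPUs:

    pytest -v --color=yes --durations=3 -m gpu src/test/distributed/checkpoint*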
4 changes: 0 additions & 4 deletions CHANGELOG.md
@@ -11,10 +11,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 - Added `DownstreamEvaluatorCallbackConfig` class for running in-loop downstream eval via [OLMo-in-loop-evals](https://github.com/allenai/OLMo-in-loop-evals).
 
-### Removed
-
-- Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.
-
 ### Fixed
 
 - Made GCS client more robust by automatically retrying timeout errors for most operations.
22 changes: 13 additions & 9 deletions Makefile
@@ -1,13 +1,15 @@
 # NOTE: make sure CUDA versions match across these variables
-BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
-CUDA_TOOLKIT_VERSION = 12.1.0
-TORCH_CUDA_VERSION = 121
+CUDA_VERSION = 12.1
+TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .)
+BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel
+BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime
 
 # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
 # in 'pyproject.toml' to include that nightly version.
-NIGHTLY_VERSION = "2.6.0.dev20241009+cu121"
-TORCHAO_VERSION = "torchao==0.5.0"
+NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)"
+TORCHAO_VERSION = "0.5.0"
 MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps"
+FLASH_ATTN_VERSION = "2.6.3"
 
 VERSION = $(shell python src/olmo_core/version.py)
 VERSION_SHORT = $(shell python src/olmo_core/version.py short)
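The new TORCH_CUDA_VERSION is derived by stripping the dot from CUDA_VERSION, so the two can no longer drift apart. A quick shell check of what the $(shell ...) call evaluates to, assuming CUDA_VERSION=12.1:

    echo 12.1 | tr -d .    # prints 121; NIGHTLY_VERSION then expands to 2.6.0.dev20241009+cu121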
@@ -49,9 +51,10 @@ build :
 stable-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--target stable \
@@ -63,9 +66,10 @@ stable-image :
 nightly-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--build-arg NIGHTLY_VERSION=$(NIGHTLY_VERSION) \
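Given these targets, building both images locally is a sketch like the following (assuming Docker with BuildKit; any tagging or push steps hidden by the truncated diff are omitted):

    make stable-image     # builds the 'stable' target of src/Dockerfile
    make nightly-image    # builds the 'nightly' target, which adds the pinned torch nightly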
66 changes: 47 additions & 19 deletions src/Dockerfile
@@ -1,30 +1,36 @@
-# Base image comes with PyTorch, numpy, flash-attn
-ARG BASE
+ARG BASE_BUILD
+ARG BASE_RUNTIME
 
 #########################################################################
 # Build image
 #########################################################################
 
-FROM ${BASE} as build
+FROM ${BASE_BUILD} as build
 
 WORKDIR /app/build
 
-# Install CUDA toolkit.
-ARG CUDA_TOOLKIT_VERSION
-RUN conda install -y -c nvidia cuda-toolkit==${CUDA_TOOLKIT_VERSION}
+# Install system dependencies.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    wget \
+    libxml2-dev \
+    git && \
+    rm -rf /var/lib/apt/lists/*
 
-ARG TORCH_CUDA_VERSION
+# Install/upgrade Python build dependencies.
+RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja
+
+# Build flash-attn.
+ARG FLASH_ATTN_VERSION
+RUN pip wheel --no-build-isolation --no-cache-dir flash-attn==${FLASH_ATTN_VERSION}
 
-# Build megablocks and grouped-gemm.
+# Build megablocks, grouped-gemm, stanford-stk
 ENV TORCH_CUDA_ARCH_LIST="8.0 9.0"
 ENV GROUPED_GEMM_CUTLASS=1
 ARG MEGABLOCKS_VERSION
-RUN pip wheel --no-build-isolation --no-cache-dir \
-    --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
-    "${MEGABLOCKS_VERSION}"
-
-# Flash-attn from pre-built wheel (can't get this to work at the moment)
-#RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
+RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}"
 
 # Only keep the target wheels and dependencies with CUDA extensions.
 RUN echo "Built wheels:" \
@@ -37,15 +43,38 @@ RUN echo "Built wheels:" \
 #########################################################################
 # Stable image
 #########################################################################
 
-FROM ${BASE} as stable
-
-ARG TORCH_CUDA_VERSION
+FROM ${BASE_RUNTIME} as stable
+
+# Install system dependencies.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    wget \
+    libxml2-dev \
+    git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install MLNX OFED user-space drivers
+# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
+ENV MOFED_VER 24.01-0.3.3.1
+ENV OS_VER ubuntu22.04
+ENV PLATFORM x86_64
+RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
+    tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
+    MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
+    rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
+    rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz
 
 # Install/upgrade Python build dependencies.
 RUN pip install --upgrade --no-cache-dir pip wheel packaging
 
 # Install torchao.
+ARG TORCH_CUDA_VERSION
 ARG TORCHAO_VERSION
 RUN pip install --no-cache-dir \
     --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
-    ${TORCHAO_VERSION}
+    torchao==${TORCHAO_VERSION}
 
 # Copy and install wheels from build image.
 COPY --from=build /app/build /app/build
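After build-arg substitution (CUDA_VERSION=12.1 gives TORCH_CUDA_VERSION=121, and TORCHAO_VERSION=0.5.0), the torchao step above resolves to roughly:

    pip install --no-cache-dir \
        --extra-index-url https://download.pytorch.org/whl/cu121 \
        torchao==0.5.0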
@@ -68,7 +97,6 @@ WORKDIR /app/olmo-core
 FROM stable as nightly
 
 ARG TORCH_CUDA_VERSION
-
 ARG NIGHTLY_VERSION
 RUN pip install --no-cache-dir --pre \
     --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \

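As a quick smoke test of the resulting image, one might check that the wheels built in the first stage import cleanly; this is a sketch that assumes the image is tagged olmo-core and a GPU host is available:

    docker run --rm --gpus all olmo-core \
        python -c "import torch, flash_attn, torchao; print(torch.__version__)"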