Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Docker build #75

Merged
merged 18 commits into from
Oct 31, 2024
30 changes: 16 additions & 14 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,22 @@ concurrency:
cancel-in-progress: true

on:
pull_request:
branches:
- main
paths:
- 'Makefile'
- 'pyproject.toml'
- 'src/olmo_core/version.py'
- 'src/Dockerfile'
- '.github/workflows/docker.yml'
push:
branches:
- main
tags:
- 'v*.*.*'
workflow_dispatch:
# TODO: disabled for now because it takes too long in CI
# pull_request:
# branches:
# - main
# paths:
# - 'Makefile'
# - 'pyproject.toml'
# - 'src/olmo_core/version.py'
# - 'src/Dockerfile'
# - '.github/workflows/docker.yml'
# push:
# branches:
# - main
# tags:
# - 'v*.*.*'

jobs:
beaker:
Expand Down
14 changes: 7 additions & 7 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
gpu_checks:
name: ${{ matrix.task.name }}
runs-on: ubuntu-latest
timeout-minutes: 8
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
Expand All @@ -118,14 +118,14 @@ jobs:
src/test/
- name: Test checkpoint (GPU)
image: olmo-core-nightly
image: olmo-core
gpus: 2
run: |
pytest -v --color=yes --durations=3 -m gpu \
src/test/distributed/checkpoint*
- name: Test MoE (GPU)
image: olmo-core-nightly
image: olmo-core
gpus: 1
run: |
pytest -v --color=yes --durations=3 -m gpu \
Expand Down Expand Up @@ -174,17 +174,17 @@ jobs:
image:
beaker: ${{ env.BEAKER_IMAGE }}
context:
priority: low
priority: normal
preemptible: true
resources:
gpuCount: ${{ matrix.task.gpus }}
constraints:
cluster:
- ai2/jupiter-cirrascale-2
- ai2/pluto-cirrascale
# - ai2/allennlp-cirrascale
# - ai2/allennlp-elanding-a100-40g
- ai2/pluto-cirrascale
- ai2/jupiter-cirrascale-2
# - ai2/saturn-cirrascale
- ai2/saturn-cirrascale
envVars:
- name: CUBLAS_WORKSPACE_CONFIG
value: ":16:8"
Expand Down
4 changes: 0 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added `DownstreamEvaluatorCallbackConfig` class for running in-loop downstream eval via [OLMo-in-loop-evals](https://github.com/allenai/OLMo-in-loop-evals).

### Removed

- Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.

### Fixed

- Made GCS client more robust by automatically retrying timeout errors for most operations.
Expand Down
22 changes: 13 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# NOTE: make sure CUDA versions match across these variables
BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
CUDA_TOOLKIT_VERSION = 12.1.0
TORCH_CUDA_VERSION = 121
CUDA_VERSION = 12.1
TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .)
BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel
BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime

# NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
# in 'pyproject.toml' to include that nightly version.
NIGHTLY_VERSION = "2.6.0.dev20241009+cu121"
TORCHAO_VERSION = "torchao==0.5.0"
NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)"
TORCHAO_VERSION = "0.5.0"
MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps"
FLASH_ATTN_VERSION = "2.6.3"

VERSION = $(shell python src/olmo_core/version.py)
VERSION_SHORT = $(shell python src/olmo_core/version.py short)
Expand Down Expand Up @@ -49,9 +51,10 @@ build :
stable-image :
docker build -f src/Dockerfile \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg BASE=$(BASE_IMAGE) \
--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
--target stable \
Expand All @@ -63,9 +66,10 @@ stable-image :
nightly-image :
docker build -f src/Dockerfile \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg BASE=$(BASE_IMAGE) \
--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
--build-arg NIGHTLY_VERSION=$(NIGHTLY_VERSION) \
Expand Down
66 changes: 47 additions & 19 deletions src/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,30 +1,36 @@
# Base image comes with PyTorch, numpy, flash-attn
ARG BASE
ARG BASE_BUILD
ARG BASE_RUNTIME

#########################################################################
# Build image
#########################################################################

FROM ${BASE} as build
FROM ${BASE_BUILD} as build

WORKDIR /app/build

# Install CUDA toolkit.
ARG CUDA_TOOLKIT_VERSION
RUN conda install -y -c nvidia cuda-toolkit==${CUDA_TOOLKIT_VERSION}
# Install system dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
curl \
wget \
libxml2-dev \
git && \
rm -rf /var/lib/apt/lists/*

ARG TORCH_CUDA_VERSION
# Install/upgrade Python build dependencies.
RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja

# Build flash-attn.
ARG FLASH_ATTN_VERSION
RUN pip wheel --no-build-isolation --no-cache-dir flash-attn==${FLASH_ATTN_VERSION}

# Build megablocks and grouped-gemm.
# Build megablocks, grouped-gemm, stanford-stk
ENV TORCH_CUDA_ARCH_LIST="8.0 9.0"
ENV GROUPED_GEMM_CUTLASS=1
ARG MEGABLOCKS_VERSION
RUN pip wheel --no-build-isolation --no-cache-dir \
--extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
"${MEGABLOCKS_VERSION}"

# Flash-attn from pre-built wheel (can't get this to work at the moment)
#RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}"

# Only keep the target wheels and dependencies with CUDA extensions.
RUN echo "Built wheels:" \
Expand All @@ -37,15 +43,38 @@ RUN echo "Built wheels:" \
# Stable image
#########################################################################

FROM ${BASE} as stable

ARG TORCH_CUDA_VERSION
FROM ${BASE_RUNTIME} as stable

# Install system dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
curl \
wget \
libxml2-dev \
git && \
rm -rf /var/lib/apt/lists/*

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 24.01-0.3.3.1
ENV OS_VER ubuntu22.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# Install/upgrade Python build dependencies.
RUN pip install --upgrade --no-cache-dir pip wheel packaging

# Install torchao.
ARG TORCH_CUDA_VERSION
ARG TORCHAO_VERSION
RUN pip install --no-cache-dir \
--extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
${TORCHAO_VERSION}
torchao==${TORCHAO_VERSION}

# Copy and install wheels from build image.
COPY --from=build /app/build /app/build
Expand All @@ -68,7 +97,6 @@ WORKDIR /app/olmo-core
FROM stable as nightly

ARG TORCH_CUDA_VERSION

ARG NIGHTLY_VERSION
RUN pip install --no-cache-dir --pre \
--index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \
Expand Down
Loading