Skip to content

Add option to throttle checkpoint uploads to one rank from each node at a time #1025

Add option to throttle checkpoint uploads to one rank from each node at a time

Add option to throttle checkpoint uploads to one rank from each node at a time #1025

Workflow file for this run

# Main CI workflow: triggered by pull requests against main, pushes to main,
# and pushes of tags matching v*.*.*.
name: Main

# One concurrency group per workflow + ref; a newly queued run cancels the
# run that is still in progress in the same group.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

on:
  push:
    branches: [main]
    tags: ['v*.*.*']
  pull_request:
    branches: [main]

# Workflow-level environment, visible to every job and step below.
env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v0
  PYTHONPATH: ./src/
  # Credentials are taken from repository secrets.
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
jobs:
checks:
  # CPU-side checks. Every entry of the task matrix becomes its own job,
  # displayed under the task's name.
  name: ${{ matrix.task.name }}
  runs-on: ubuntu-latest
  timeout-minutes: 8
  strategy:
    # Let the remaining matrix jobs finish when one of them fails.
    fail-fast: false
    matrix:
      python:
        - '3.10'
      # Each task carries a display name and the shell command to execute.
      task:
        - name: Lint
          run: make lint-check

        - name: Test
          run: |
            pytest -v --color=yes --durations=3 src/test/ \
              --ignore-glob='src/test/distributed/checkpoint*'

        # The checkpoint tests excluded from "Test" above run as their own job.
        - name: Test checkpoint
          run: |
            pytest -v --color=yes --durations=3 src/test/distributed/checkpoint*

        - name: Type check
          run: make type-check

        - name: Build
          run: make build

        - name: Style
          run: make style-check

        - name: Docs
          run: |
            cd docs && make html SPHINXOPTS="-W --keep-going"
      include:
        # One extra job: the lint check again, on Python 3.9.
        - python: '3.9'
          task:
            name: Lint (min Python)
            run: make lint-check
  steps:
    - uses: actions/checkout@v4

    # Repository-local composite action.
    # NOTE(review): presumably this is what creates the .venv activated by later steps - confirm.
    - name: Setup Python environment
      uses: ./.github/actions/setup-venv
      with:
        cache-prefix: ${{ env.CACHE_PREFIX }}
        python-version: ${{ matrix.python }}

    # Only the type-check job uses the mypy cache. The primary key is unique
    # per commit; the restore keys fall back to the same ref, then to any
    # entry sharing the prefix/OS/Python/pyproject.toml hash.
    - name: Restore mypy cache
      if: matrix.task.name == 'Type check'
      uses: actions/cache@v4
      with:
        path: .mypy_cache
        key: mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}-${{ github.sha }}
        restore-keys: |
          mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}
          mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}

    # Execute the matrix entry's command with the virtualenv activated.
    - name: ${{ matrix.task.name }}
      run: |
        . .venv/bin/activate
        ${{ matrix.task.run }}

    # Only the Build job publishes dist/ as the "package" artifact.
    - name: Upload package distribution files
      if: matrix.task.name == 'Build'
      uses: actions/upload-artifact@v4
      with:
        name: package
        path: dist

    # always(): runs regardless of the outcome of the earlier steps.
    - name: Clean up
      if: always()
      run: |
        . .venv/bin/activate
        pip uninstall -y ai2-olmo-core
gpu_checks:
  # GPU test suites. The GitHub runner only submits each suite to Beaker
  # (see the "GPU Tests" step). Every Beaker-related step is skipped when
  # BEAKER_TOKEN is empty.
  name: ${{ matrix.task.name }}
  runs-on: ubuntu-latest
  timeout-minutes: 15
  strategy:
    fail-fast: false
    matrix:
      # Per task: display name, Beaker image name, GPU count to request,
      # and the command appended to the Beaker task's bash command line.
      task:
        - name: Test (GPU)
          image: olmo-core-tch251cu124
          gpus: 2
          run: |
            pytest -v --color=yes --durations=3 -m gpu \
              --ignore-glob='src/test/distributed/checkpoint*' \
              --ignore-glob='src/test/nn/moe*' \
              src/test/

        - name: Test checkpoint (GPU)
          image: olmo-core-tch251cu124
          gpus: 2
          run: |
            pytest -v --color=yes --durations=3 -m gpu \
              src/test/distributed/checkpoint*

        - name: Test MoE (GPU)
          image: olmo-core-tch251cu124
          gpus: 1
          run: |
            pytest -v --color=yes --durations=3 -m gpu \
              src/test/nn/moe*
  steps:
    - uses: actions/checkout@v4

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Export the workspace name printed by the Makefile target for later steps.
    - name: Set env vars
      run: |
        echo "BEAKER_WORKSPACE=$(make get-beaker-workspace)" >> $GITHUB_ENV

    # COMMIT_SHA is what the Beaker task checks out: the PR head commit on
    # pull_request events, GITHUB_SHA on every other event.
    - name: Determine current commit SHA (pull request)
      if: github.event_name == 'pull_request'
      run: |
        echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV

    - name: Determine current commit SHA (push)
      if: github.event_name != 'pull_request'
      run: |
        echo "COMMIT_SHA=$GITHUB_SHA" >> $GITHUB_ENV

    - uses: allenai/setup-beaker@v2
      if: env.BEAKER_TOKEN != ''
      with:
        token: ${{ env.BEAKER_TOKEN }}
        workspace: ${{ env.BEAKER_WORKSPACE }}

    # Resolve the task's short image name through the Makefile target.
    - name: Get full image name
      if: env.BEAKER_TOKEN != ''
      run: >-
        echo "BEAKER_IMAGE=$(make get-full-beaker-image-name IMAGE_NAME=${{ matrix.task.image }})" >> $GITHUB_ENV

    # NOTE(review): the action reference on this step was garbled in the
    # reviewed copy (it read "allenai/[email protected]", an e-mail
    # obfuscation artifact). Restored as beaker-run-action@v1.2 - confirm
    # the pinned version against the repository.
    - name: GPU Tests
      uses: allenai/beaker-run-action@v1.2
      if: env.BEAKER_TOKEN != ''
      with:
        spec: |
          version: v2
          description: OLMo-core ${{ matrix.task.name }}
          budget: ai2/oe-training
          tasks:
            - name: tests
              image:
                beaker: ${{ env.BEAKER_IMAGE }}
              context:
                priority: normal
                preemptible: true
              resources:
                gpuCount: ${{ matrix.task.gpus }}
              constraints:
                cluster:
                  # H100 clusters
                  - ai2/jupiter-cirrascale-2
                  - ai2/augusta-google-1
                  - ai2/allennlp-elara-cirrascale
                  - ai2/ganymede-cirrascale
                  - ai2/ceres-cirrascale
                  # A100 clusters
                  - ai2/saturn-cirrascale
                  # - ai2/allennlp-elanding-a100-40g
              envVars:
                - name: CUBLAS_WORKSPACE_CONFIG
                  value: ":16:8"
                - name: TOKENIZERS_PARALLELISM
                  value: "false"
                - name: AWS_ACCESS_KEY_ID
                  secret: AWS_ACCESS_KEY_ID
                - name: AWS_SECRET_ACCESS_KEY
                  secret: AWS_SECRET_ACCESS_KEY
              command:
                - "bash"
                - "-c"
                - "git clone https://github.com/allenai/OLMo-core.git && cd OLMo-core && git checkout ${{ env.COMMIT_SHA }} && pip install -e .[all] && ${{ matrix.task.run }}"
              result:
                path: /unused
        token: ${{ env.BEAKER_TOKEN }}
        workspace: ${{ env.BEAKER_WORKSPACE }}
release:
  # Publishes a release for tag refs: uploads the built distribution to
  # PyPI, creates the GitHub release, then comments on PRs.
  # Waits only on the `checks` job; `gpu_checks` is not a prerequisite.
  name: Release
  runs-on: ubuntu-latest
  needs: [checks]
  if: startsWith(github.ref, 'refs/tags/')
  steps:
    # fetch-depth 0 requests the full history instead of a shallow clone.
    # NOTE(review): presumably required by the release-notes script - confirm.
    - uses: actions/checkout@v4
      with:
        fetch-depth: 0

    - name: Setup Python environment
      uses: ./.github/actions/setup-venv
      with:
        python-version: '3.10'
        cache-prefix: ${{ env.CACHE_PREFIX }}

    # TAG is the tag name (e.g. v1.2.3); RELEASE_VERSION is the same value
    # with the leading "v" stripped.
    - name: Prepare environment
      run: |
        echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_ENV"
        echo "TAG=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"

    # Fetches the "package" artifact uploaded by the Build task of `checks`.
    - name: Download package distribution files
      uses: actions/download-artifact@v4
      with:
        name: package
        path: dist

    # The notes file is a sibling of the workspace directory (workspace path
    # plus a suffix), not a file inside the checkout.
    - name: Generate release notes
      run: |
        . .venv/bin/activate
        python src/scripts/release/release_notes.py > "${{ github.workspace }}-RELEASE_NOTES.md"

    # Credentials go through the TWINE_USERNAME / TWINE_PASSWORD environment
    # variables that twine reads, so the token is never interpolated into the
    # shell script or passed as a command-line argument.
    - name: Publish package to PyPI
      env:
        TWINE_USERNAME: __token__
        TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
      run: |
        . .venv/bin/activate
        twine upload dist/*

    # Tags containing "rc" are published as pre-releases.
    - name: Publish GitHub release
      uses: softprops/action-gh-release@v2
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
        body_path: ${{ github.workspace }}-RELEASE_NOTES.md
        prerelease: ${{ contains(env.TAG, 'rc') }}
        files: |
          dist/*

    - name: Add PR comments on release
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        ./src/scripts/release/add_pr_comments_on_release.sh