diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml
index 82687692..d2239e87 100644
--- a/.github/workflows/test_api_rocm.yaml
+++ b/.github/workflows/test_api_rocm.yaml
@@ -25,43 +25,48 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
 
-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
-  build_image_and_run_api_rocm_tests:
-    runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
-
+  setup_rocm_docker_devices:
+    # scripts/rocm_docker_devices.sh builds the ROCM_DOCKER_DEVICES variable, a string of the form
+    # --device /dev/kfd --device /dev/dri/renderD128 --device /dev/dri/renderD129 ...
+    # it is exposed as a job output and used by the next job to mount the devices in the docker container
+    runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
+    outputs:
+      rocm_docker_devices: ${{ steps.rocm_docker_devices.outputs.rocm_docker_devices }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
-      - name: Set target devices
+      - name: Set up ROCM_DOCKER_DEVICES env var
         run: |
-          echo "DEVICE:$DEVICE"
-          echo "DEVICE=$DEVICE" >> $GITHUB_ENV
+          source scripts/rocm_docker_devices.sh
+          echo "ROCM_DOCKER_DEVICES=$ROCM_DOCKER_DEVICES" >> "$GITHUB_ENV"
+        shell: bash
+
+      - name: Set outputs
+        id: rocm_docker_devices
+        run: echo "rocm_docker_devices=$ROCM_DOCKER_DEVICES" >> "$GITHUB_OUTPUT"
+
+  run_api_rocm_tests:
+    needs: setup_rocm_docker_devices
+
+    runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
 
-      - name: Unroot docker image
+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ ${{ needs.setup_rocm_docker_devices.outputs.rocm_docker_devices }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,timm,diffusers,codecarbon]
 
       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE: ${{ env.DEVICE }}
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/rocm
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --env HF_TOKEN
-            --env PUSH_REPO_ID
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,timm,diffusers,codecarbon]
-            pytest -s -x -k "api and cuda"
+        run: |
+          pytest -s -x -k "api and cuda"
diff --git a/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml b/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml
index 6c332a8b..7f8949f2 100644
--- a/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml
+++ b/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml
@@ -25,43 +25,25 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
 
-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
   run_cli_rocm_pytorch_multi_gpu_tests:
     runs-on: [self-hosted, amd-gpu, multi-gpu, mi250]
 
+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Set target devices
+      - name: Install dependencies
         run: |
-          echo "DEVICE0:$DEVICE0"
-          echo "DEVICE1:$DEVICE1"
-          echo "DEVICE0=$DEVICE0" >> $GITHUB_ENV
-          echo "DEVICE1=$DEVICE1" >> $GITHUB_ENV
-
-      - name: Unroot docker image
-        run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
 
       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE0: ${{ env.DEVICE0 }}
-          DEVICE1: ${{ env.DEVICE1 }}
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE0 }}
-            --device /dev/dri/${{ env.DEVICE1 }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
-            pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          PUSH_REPO_ID: optimum-benchmark/rocm
+        run: |
+          pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
diff --git a/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml b/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml
index 524832d5..75ed3ac7 100644
--- a/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml
+++ b/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml
@@ -25,39 +25,25 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
 
-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
   run_cli_rocm_pytorch_single_gpu_tests:
     runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
 
+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Set target devices
+      - name: Install dependencies
         run: |
-          echo "DEVICE:$DEVICE"
-          echo "DEVICE=$DEVICE" >> $GITHUB_ENV
-
-      - name: Unroot docker image
-        run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,diffusers,timm,peft,autoawq,auto-gptq]
 
       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE: ${{ env.DEVICE }}
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,diffusers,timm,peft,autoawq,auto-gptq]
-            pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          PUSH_REPO_ID: optimum-benchmark/rocm
+        run: |
+          pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
diff --git a/scripts/rocm_docker_devices.sh b/scripts/rocm_docker_devices.sh
new file mode 100644
index 00000000..85040220
--- /dev/null
+++ b/scripts/rocm_docker_devices.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Check if the variable is set
+if [ -z "$ROCR_VISIBLE_DEVICES" ]; then
+    echo "Environment variable ROCR_VISIBLE_DEVICES is not set"
+    exit 1
+fi
+
+# Get the list of renderDxxx devices in /dev/dri
+ROCM_RENDER_DEVICES=($(ls /dev/dri/renderD*))
+
+# Split the ROCR_VISIBLE_DEVICES variable by commas to get individual device indices
+IFS=',' read -r -a DEVICE_INDICES <<<"$ROCR_VISIBLE_DEVICES"
+
+# Construct the --device options for Docker
+ROCM_DOCKER_DEVICES="--device /dev/kfd"
+for INDEX in "${DEVICE_INDICES[@]}"; do
+    if [ "$INDEX" -lt "${#ROCM_RENDER_DEVICES[@]}" ]; then
+        ROCM_DOCKER_DEVICES+=" --device ${ROCM_RENDER_DEVICES[$INDEX]}"
+    else
+        echo "Index $INDEX is out of range for available render devices"
+        exit 1
+    fi
+done
+
+# export the ROCM_DOCKER_DEVICES variable
+export ROCM_DOCKER_DEVICES
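
For reference, a minimal sketch of how the helper script is meant to be consumed outside CI, assuming a ROCm host where the runner exports ROCR_VISIBLE_DEVICES (the device indices and renderD* names below are illustrative and depend on the host):

    # select GPUs the same way the runner would
    export ROCR_VISIBLE_DEVICES=0,1
    # source the script so the exported ROCM_DOCKER_DEVICES lands in the current shell
    source scripts/rocm_docker_devices.sh
    echo "$ROCM_DOCKER_DEVICES"
    # --device /dev/kfd --device /dev/dri/renderD128 --device /dev/dri/renderD129
    # roughly what the workflows splice into container.options; the trailing command is only a smoke test
    docker run --rm $ROCM_DOCKER_DEVICES --shm-size "16gb" --ipc host \
        ghcr.io/huggingface/optimum-benchmark:latest-rocm ls /dev/kfd /dev/dri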