Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: adding HPU agents #100

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .azure/ci-testig-parameterized.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ schedules:
include: ["main"]

jobs:
- template: testing-template.yml
- template: cuda-template.yml
parameters:
configs:
- "Lightning-AI/metrics_pl-develop.yaml"
Expand All @@ -24,3 +24,9 @@ jobs:
- "microsoft/deepspeed-release.yaml"
- "neptune-ai/lightning_integration.yaml"
- "manujosephv/pytorch-tabular_lit-release.yaml"

- template: habana-template.yml
parameters:
configs:
- "Lightning-AI/metrics_pl-develop.yaml"
- "Lightning-AI/metrics_pl-release.yaml"
13 changes: 7 additions & 6 deletions .azure/testing-template.yml → .azure/cuda-template.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
jobs:

- job: check_diff
- job: check_cuda_diff
pool:
vmImage: 'Ubuntu-20.04'
steps:
Expand All @@ -24,10 +24,10 @@ jobs:
- ${{ each config in parameters.configs }}:
- job:
displayName: ${{config}}
dependsOn: check_diff
dependsOn: check_cuda_diff
variables:
# map the output variable of the diff-check job into this job
configs: $[ dependencies.check_diff.outputs['files.diff'] ]
configs: $[ dependencies.check_cuda_diff.outputs['files.diff'] ]
config: "${{ config }}"
DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0"; print(gpus)' )

Expand All @@ -36,8 +36,6 @@ jobs:
timeoutInMinutes: 75
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: 2
workspace:
clean: all

pool: 'lit-rtx-3090'
# this needs Docker to be installed in the base image...
Expand All @@ -47,6 +45,9 @@ jobs:
# image: "nvcr.io/nvidia/pytorch:21.11-py3"
image: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime"
options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro"
workspace:
clean: all

steps:

- bash: |
Expand All @@ -70,7 +71,7 @@ jobs:

- bash: |
sudo apt-get update -q --fix-missing
sudo apt-get install -q -y build-essential gcc g++ cmake git unzip tree --no-install-recommends
sudo apt-get install -q -y --no-install-recommends build-essential gcc g++ cmake git unzip tree
# Python's dependencies
pip --version
pip install -r requirements.txt
Expand Down
107 changes: 107 additions & 0 deletions .azure/habana-template.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Azure Pipelines job template.
# Expects a `configs` parameter (list of config file paths) and creates one
# test job per entry on the 'intel-hpus' agent pool, gated by a preliminary
# job that determines which configs the pull request changed.
jobs:

  # Publishes the changed configs as the output variable `files.diff`.
  - job: check_habana_diff
    pool:
      vmImage: 'Ubuntu-20.04'
    steps:
      - bash: |
          pip --version
          pip install -q -r requirements.txt
          pip list
        displayName: 'Install dependencies'

      - script: |
          echo $PR_NUMBER
          CONFIGS=$(python _actions/assistant.py changed_configs $PR_NUMBER --as_list=False 2>&1)
          # pass the value as an argument so '%' or '\' in it is printed verbatim
          printf "Changed configs: %s\n" "$CONFIGS"
          echo "##vso[task.setvariable variable=diff;isOutput=true]$CONFIGS"
        name: files
        env:
          PR_NUMBER: "$(System.PullRequest.PullRequestNumber)"
        displayName: 'Config diff'


  # One job per config, expanded when the template is compiled.
  - ${{ each config in parameters.configs }}:
      - job:
        displayName: ${{config}}
        dependsOn: check_habana_diff
        variables:
          # map the `files.diff` output of check_habana_diff into this job
          configs: $[ dependencies.check_habana_diff.outputs['files.diff'] ]
          config: "${{ config }}"

        # run on the main branch, or when this config is among the changed ones
        condition: or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), contains(variables['configs'], variables['config']))
        # how long to run the job before automatically cancelling
        timeoutInMinutes: 75
        # how much time to give 'run always even if cancelled tasks' before stopping them
        cancelTimeoutInMinutes: 2

        pool: 'intel-hpus'
        # this needs Docker to be installed in the base image...
        container:
          image: "vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest"
          # the host docker binary is mounted read-only at /tmp/docker; the
          # first step uses it to exec into this container as root
          options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g -v /usr/bin/docker:/tmp/docker:ro"
        workspace:
          clean: all

        steps:

          - script: |
              # container id = third '/'-separated field of the first cgroup line
              container_id=$(head -1 /proc/self/cgroup|cut -d/ -f3)
              # exec as uid 0 via the mounted docker binary to install sudo
              /tmp/docker exec -t -u 0 $container_id \
                sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::=--force-confold -y install sudo"
              echo "##vso[task.setvariable variable=CONTAINER_ID]$container_id"
            displayName: 'Install Sudo in container (thanks Microsoft!)'

          - bash: |
              whoami && id
              sudo apt-get install -q -y hwinfo
              hwinfo --short
              hl-smi -L
              lsmod | grep habanalabs
              python --version
              pip --version
              pip list
            displayName: 'Image info & HW status'

          - bash: |
              sudo apt-get update -q --fix-missing
              sudo apt-get install -q -y --no-install-recommends build-essential gcc g++ cmake git unzip tree
              # Python's dependencies
              pip --version
              pip install -r requirements.txt
              pip list
            displayName: 'Install dependencies'

          # - bash: |
          #     echo $CONTAINER_ID
          #   displayName: 'Sanity check'

          - bash: |
              python _actions/assistant.py prepare_env --config_file=${{config}} > prepare_env.sh
              cat prepare_env.sh
            displayName: 'Create scripts'

          - bash: |
              bash prepare_env.sh
              # pip list
              tree .
            displayName: 'Prepare env.'

          # Publishes `testing.envs` and `testing.args` for the test step below.
          - script: |
              ENVS=$(python _actions/assistant.py list_env --config_file=${{config}} --export 2>&1)
              printf "PyTest env. variables: %s\n" "$ENVS"
              echo "##vso[task.setvariable variable=envs;isOutput=true]$ENVS"
              ARGS=$(python _actions/assistant.py specify_tests --config_file=${{config}} 2>&1)
              printf "PyTest arguments: %s\n" "$ARGS"
              echo "##vso[task.setvariable variable=args;isOutput=true]$ARGS"
            name: testing
            displayName: 'testing specs'

          - bash: |
              $(testing.envs)
              python -m pytest $(testing.args) -v
            workingDirectory: _integrations
            displayName: 'Integration tests'

          # ToDo: add Slack notification
4 changes: 2 additions & 2 deletions configs/Lightning-AI/metrics_pl-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ dependencies:
# install_extras: all

runtimes:
- {os: "ubuntu-20.04", python: "3.8"}
- {os: "macOS-11", python: "3.8"}
- {os: "ubuntu-22.04", python: "3.10"}
- {os: "macOS-12", python: "3.9"}
- {os: "windows-2022", python: "3.8"}