Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a test docker image #6

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
5 changes: 5 additions & 0 deletions intel-gaudi/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Test image: Habana (Intel Gaudi) PyTorch installer base image plus the
# hccl_demo directory copied to /hccl_demo.
FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:1.15.1-15

WORKDIR /

# COPY sources are resolved inside the build context and must not point above
# it ("../" is rejected or silently re-rooted depending on the builder).
# hccl_demo sits next to the intel-gaudi/ directory, so build with the
# repository root as the context, e.g.:
#   docker build -f intel-gaudi/Dockerfile .
COPY hccl_demo /hccl_demo
6 changes: 3 additions & 3 deletions intel-gaudi/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ habana-configure:
fi
#
# Intel GAUDI external ports
wget https://raw.githubusercontent.com/HabanaAI/Setup_and_Install/r1.14.0/utils/manage_network_ifs.sh && \
wget https://raw.githubusercontent.com/HabanaAI/Setup_and_Install/main/utils/manage_network_ifs.sh && \
chmod +x ./manage_network_ifs.sh && \
./manage_network_ifs.sh --up

Expand Down Expand Up @@ -191,15 +191,15 @@ test-16-cards:
# Ensure that habana ENV is already configured on the second node
ssh $(SECOND_NODE) "hl-smi -Q name --format=csv"
ssh $(SECOND_NODE) grep habana /etc/docker/daemon.json
MANAGE_NETWORK_IFS=$$(ssh $(SECOND_NODE) "curl -s https://raw.githubusercontent.com/HabanaAI/Setup_and_Install/r1.14.0/utils/manage_network_ifs.sh | bash -s -- --status"); \
MANAGE_NETWORK_IFS=$$(ssh $(SECOND_NODE) "curl -s https://raw.githubusercontent.com/HabanaAI/Setup_and_Install/main/utils/manage_network_ifs.sh | bash -s -- --status"); \
if [[ "$${MANAGE_NETWORK_IFS}" =~ down ]]; then \
exit 1; \
fi
#
# Install slurm the way it was installed on the first node as a second single node cluster
ssh $(SECOND_NODE) "mkdir -p $(REPO_DIR)"; \
scp -rp $(REPO_DIR)/* $(SECOND_NODE):$(REPO_DIR); \
INSTALLATION_TYPE=$$(test -d $(REPO_DIR)/debian-packages -o -d fedora-packages && echo package || echo source)); \
INSTALLATION_TYPE=$$(test -d $(REPO_DIR)/debian-packages && echo debian-package || echo source); \
ssh $(SECOND_NODE) "cd $(MAKEFILE_DIR); make install-$$INSTALLATION_TYPE $(filter-out --,$(MAKEFLAGS)) IS_CONTROLLER='false'"
#
# Configure slurm as a single two node cluster
Expand Down
1 change: 1 addition & 0 deletions intel-gaudi/slurmd.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Node definition template. The two *PlaceHolder values are presumably
# substituted with the real node name and NUMA node count during
# installation -- TODO confirm against the Makefile step that consumes this file.
NodeName=NodeNamePlaceHolder NumOfNumaNodes=NumaNodesPlaceHolder