-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 71b085f
Showing
14 changed files
with
506 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
name: "CI Tests" | ||
on: | ||
push: | ||
branches: | ||
- master | ||
pull_request: | ||
branches: | ||
- master | ||
concurrency: | ||
group: build-${{ github.event.pull_request.number || github.ref }} | ||
cancel-in-progress: true | ||
jobs: | ||
slurm: | ||
name: "Slurm Docker images tests" | ||
runs-on: ubuntu-22.04 | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- uses: docker/setup-qemu-action@v3 | ||
- uses: docker/setup-buildx-action@v3 | ||
- name: "Get current date" | ||
# UTC timestamp with an explicit "Z" offset, since RFC 3339 requires a time offset. | ||
run: echo "NOW=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> "$GITHUB_ENV" | ||
- name: "Build slurmctld image" | ||
uses: docker/build-push-action@v5 | ||
with: | ||
# Both build args use the ${{ }} expression syntax; without the leading "$" the text is passed literally. | ||
build-args: | | ||
COMMIT=${{ github.sha }} | ||
CREATION_DATE=${{ env.NOW }} | ||
context: slurm/slurmctld | ||
load: true | ||
tags: alphaunito/slurmctld:21.08.5 | ||
- name: "Build slurmd image" | ||
uses: docker/build-push-action@v5 | ||
with: | ||
# Both build args use the ${{ }} expression syntax; without the leading "$" the text is passed literally. | ||
build-args: | | ||
COMMIT=${{ github.sha }} | ||
CREATION_DATE=${{ env.NOW }} | ||
context: slurm/slurmd | ||
load: true | ||
tags: alphaunito/slurmd:21.08.5 | ||
- name: "Start Docker Compose" | ||
id: start-compose | ||
# Bring the cluster up and wait, for at most 60 seconds, until the services are ready. | ||
# The last argument carries no trailing backslash so nothing appended later is swallowed as an argument. | ||
run: | | ||
docker compose \ | ||
--file slurm/docker-compose.yml \ | ||
--project-name slurm \ | ||
up \ | ||
--wait \ | ||
--wait-timeout 60 | ||
- name: "Run test with Docker Compose" | ||
id: run-tests | ||
run: | | ||
docker compose \ | ||
--file slurm/docker-compose.yml \ | ||
--project-name slurm \ | ||
exec \ | ||
--user hpcuser \ | ||
slurmctld srun hostname | ||
- name: "Show slurmctld logs on failure" | ||
if: ${{ always() && (steps.start-compose.outcome == 'failure' || steps.run-tests.outcome == 'failure') }} | ||
run: | | ||
docker compose \ | ||
--file slurm/docker-compose.yml \ | ||
--project-name slurm \ | ||
logs \ | ||
slurmctld | ||
- name: "Show slurmd logs on failure" | ||
if: ${{ always() && (steps.start-compose.outcome == 'failure' || steps.run-tests.outcome == 'failure') }} | ||
run: | | ||
docker compose \ | ||
--file slurm/docker-compose.yml \ | ||
--project-name slurm \ | ||
logs \ | ||
slurmd | ||
- name: "Stop Docker Compose" | ||
if: ${{ always() }} | ||
run: | | ||
docker compose \ | ||
--file slurm/docker-compose.yml \ | ||
--project-name slurm \ | ||
down \ | ||
--volumes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
name: "Release Docker images" | ||
on: | ||
workflow_run: | ||
workflows: | ||
- "CI Tests" | ||
branches: | ||
- master | ||
types: | ||
- completed | ||
jobs: | ||
slurm: | ||
name: "Slurm Docker images" | ||
runs-on: ubuntu-22.04 | ||
if: ${{ github.event.workflow_run.conclusion == 'success' }} | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- uses: docker/setup-qemu-action@v3 | ||
- uses: docker/setup-buildx-action@v3 | ||
- name: "Get current date" | ||
# UTC timestamp with an explicit "Z" offset, since RFC 3339 requires a time offset. | ||
run: echo "NOW=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> "$GITHUB_ENV" | ||
- uses: docker/login-action@v3 | ||
with: | ||
username: ${{ secrets.DOCKERHUB_USERNAME }} | ||
password: ${{ secrets.DOCKERHUB_TOKEN }} | ||
- name: "Build slurmctld image" | ||
uses: docker/build-push-action@v5 | ||
with: | ||
# Both build args use the ${{ }} expression syntax; without the leading "$" the text is passed literally. | ||
build-args: | | ||
COMMIT=${{ github.sha }} | ||
CREATION_DATE=${{ env.NOW }} | ||
context: slurm/slurmctld | ||
push: true | ||
tags: alphaunito/slurmctld:21.08.5 | ||
- name: "Build slurmd image" | ||
uses: docker/build-push-action@v5 | ||
with: | ||
# Both build args use the ${{ }} expression syntax; without the leading "$" the text is passed literally. | ||
build-args: | | ||
COMMIT=${{ github.sha }} | ||
CREATION_DATE=${{ env.NOW }} | ||
context: slurm/slurmd | ||
push: true | ||
tags: alphaunito/slurmd:21.08.5 | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Docker for HPC | ||
|
||
This repository collects Dockerized versions of the most widespread HPC stacks. In particular, each folder contains: | ||
|
||
- A set of `Dockerfiles` that can be used standalone or manually assembled in a microservices application; | ||
- A `docker-compose.yml` file that simulates an HPC cluster orchestrated by the selected stack. | ||
|
||
This repository's main purpose is to provide the HPC community with Docker-based simulated environments of HPC stacks, which will be used mainly for small-scale experiments, debugging, and CI/CD pipelines. | ||
|
||
All Docker containers are published on DockerHub under the `alphaunito` organization. At the moment, this repository contains Dockerized versions of: | ||
|
||
- [Slurm](./slurm/README.md) | ||
|
||
All images support the explicit definition of opencontainers [annotations](https://github.com/opencontainers/image-spec/blob/main/annotations.md) through two build args: | ||
|
||
- The `COMMIT` build arg should contain the digest of the commit the image is being created from. It populates the `org.opencontainers.image.revision` label | ||
- The `CREATION_DATE` build arg should contain the creation time of the image in `RFC 3339` format. It populates the `org.opencontainers.image.created` label |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Slurm on Docker | ||
|
||
This folder contains a fully Dockerized version of the [Slurm](https://slurm.schedmd.com/) queue manager. The version of SLURM shipped is the one downloaded from the apt repository, which is currently `21.08.5` | ||
|
||
This repository contains the source code of different container images: | ||
|
||
- `alphaunito/slurmctld:21.08.5`, which runs the Slurm control plane | ||
- `alphaunito/slurmd:21.08.5`, which runs a Slurm compute node | ||
|
||
Plus, it also contains a [docker-compose.yml](./docker-compose.yml) file that can deploy an entire Slurm cluster with a single controller and a set of compute nodes. All these components are detailed below | ||
|
||
## Slurmctld | ||
|
||
The `slurmctld` process is the central management daemon of Slurm. It constitutes the control plane of the Slurm queue manager. The `slurmctld` Docker image can be built and published on DockerHub using the following commands | ||
|
||
```bash | ||
docker build -t alphaunito/slurmctld:21.08.5 slurmctld | ||
docker push alphaunito/slurmctld:21.08.5 | ||
``` | ||
|
||
To correctly populate the `slurm.conf` file, a `slurmctld` container needs 3 environment variables: | ||
|
||
- The `SLURMCTLD_HOSTNAME` variable should contain the hostname of the container. It is populated by default using the `hostname` command | ||
- The `SLURMD_NODES` variable must contain the number of compute nodes that Slurm should manage. If this variable is not set, the container displays an error message and terminates | ||
- The `SLURMD_HOSTNAME_PREFIX` variable should contain the prefix of the hostname used to identify compute nodes. If this variable is not set, the container displays an error message and terminates | ||
|
||
Note that all the compute nodes in the simulated HPC cluster should have a reachable hostname equal to `"${SLURMD_HOSTNAME_PREFIX}-${X}"`, where `X` is an integer in the range `[1, ${SLURMD_NODES}]` | ||
|
||
## Compute | ||
|
||
The `slurmd` process is the compute node daemon for Slurm. It monitors all tasks running on the compute node, accepts work (tasks), launches tasks, and kills running tasks upon request. The `slurmd` Docker image can be built and published using the following commands | ||
|
||
```bash | ||
docker build -t alphaunito/slurmd:21.08.5 slurmd | ||
docker push alphaunito/slurmd:21.08.5 | ||
``` | ||
|
||
To correctly connect to a `slurmctld` node, a `slurmd` container needs a `SLURMCTLD_HOSTNAME` variable that should contain the hostname of the target `slurmctld` container. If this variable is not set, the container displays an error message and terminates | ||
|
||
## Cluster | ||
|
||
The `slurmctld` and `slurmd` images described above can be used to set up a Docker-based Slurm cluster with a single controller and many compute nodes. This task can be achieved either manually or starting from the [docker-compose.yml](./docker-compose.yml) file contained in this repository | ||
|
||
Note that the `slurmctld` node should have an identifiable hostname, as compute nodes must register with the control plane to be addressable. In Docker Compose, an explicit hostname can be set for a given service using the `hostname` keyword. | ||
|
||
Slurm relies on [MUNGE](https://dun.github.io/munge/) for authentication. To set up a munge cluster, all the involved nodes should share a common key located in the `/etc/munge` folder. Given that, the `/etc/munge` folder should be mounted as a shared volume among all the containers in the Slurm cluster. | ||
|
||
To allow for unprivileged workloads, an `hpcuser` has been configured inside the images. Commands can be executed by explicitly impersonating this user, through the `--user hpcuser` flag. For example | ||
|
||
```bash | ||
docker exec -it --user hpcuser slurmctld bash | ||
``` | ||
|
||
In order to simulate an HPC facility, where the home folder is commonly mounted on a shared parallel file system, users may want to mount the `/home/hpcuser` folder as a shared volume among all the containers in the Slurm cluster. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Compose definition of a simulated Slurm cluster: one slurmctld controller and two slurmd replicas. | ||
version: "3.8" | ||
services: | ||
slurmctld: | ||
image: alphaunito/slurmctld:21.08.5 | ||
environment: | ||
# Presumably matches the names Compose gives the slurmd replicas (<project>-slurmd-<N>) — confirm. | ||
SLURMD_HOSTNAME_PREFIX: ${COMPOSE_PROJECT_NAME}-slurmd | ||
# Keep in sync with deploy.replicas of the slurmd service below. | ||
SLURMD_NODES: 2 | ||
# Fixed hostname; the slurmd service sets SLURMCTLD_HOSTNAME to this value. | ||
hostname: slurmctld | ||
networks: | ||
- slurmnet | ||
volumes: | ||
- home:/home/hpcuser | ||
- munge:/etc/munge | ||
# NOTE(review): no MySQL service is defined in this file — confirm this volume is needed. | ||
- mysql:/var/lib/mysql | ||
slurmd: | ||
image: alphaunito/slurmd:21.08.5 | ||
deploy: | ||
mode: replicated | ||
replicas: 2 | ||
environment: | ||
SLURMCTLD_HOSTNAME: slurmctld | ||
networks: | ||
- slurmnet | ||
volumes: | ||
- home:/home/hpcuser | ||
- munge:/etc/munge | ||
networks: | ||
slurmnet: | ||
# The home and munge volumes are mounted by both services; mysql only by slurmctld. | ||
volumes: | ||
home: | ||
munge: | ||
mysql: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
FROM ubuntu:jammy | ||
|
||
ARG COMMIT="none" | ||
ARG CREATION_DATE="none" | ||
|
||
LABEL org.opencontainers.image.authors="Iacopo Colonnelli <[email protected]>" | ||
LABEL org.opencontainers.image.base.name="docker.io/ubuntu:jammy" | ||
LABEL org.opencontainers.image.created="${CREATION_DATE}" | ||
LABEL org.opencontainers.image.licenses="OpenSSL" | ||
LABEL org.opencontainers.image.revision="${COMMIT}" | ||
LABEL org.opencontainers.image.ref.name="alphaunito/slurmctld" | ||
LABEL org.opencontainers.image.source="https://github.com/alpha-unito/docker-for-hpc" | ||
LABEL org.opencontainers.image.title="Slurm management daemon" | ||
LABEL org.opencontainers.image.version="21.08.5" | ||
|
||
# Install the Slurm controller stack and helper tools, fetch gosu, create the runtime | ||
# directories with their owners, and add the unprivileged hpcuser account. | ||
# apt-get replaces apt because apt does not offer a stable command-line interface for scripts. | ||
# The gosu binary is chosen from the image architecture instead of being fixed to amd64. | ||
RUN apt-get update \ | ||
&& apt-get install -y --no-install-recommends \ | ||
ca-certificates \ | ||
hwloc \ | ||
mailutils \ | ||
munge \ | ||
openmpi-bin \ | ||
openssh-server \ | ||
slurm-client \ | ||
slurmctld \ | ||
supervisor \ | ||
wget \ | ||
&& rm -rf /var/lib/apt/lists/* \ | ||
&& wget -O /bin/gosu "https://github.com/tianon/gosu/releases/download/1.17/gosu-$(dpkg --print-architecture)" \ | ||
&& chmod +x /bin/gosu \ | ||
&& gosu nobody true \ | ||
&& mkdir -p \ | ||
/run/munge \ | ||
/run/sshd \ | ||
/var/log/munge \ | ||
/var/log/slurm \ | ||
/var/run/slurm \ | ||
/var/spool/slurmctld \ | ||
&& chown munge:munge \ | ||
/run/munge \ | ||
/var/log/munge \ | ||
&& chown slurm:slurm \ | ||
/var/log/slurm \ | ||
/var/run/slurm \ | ||
/var/spool/slurmctld \ | ||
&& adduser \ | ||
--disabled-password \ | ||
--gecos "" \ | ||
hpcuser | ||
|
||
# Copy the cgroup and Slurm configuration files into /etc/slurm/. | ||
# NOTE(review): Slurm's cgroup configuration file is normally named cgroup.conf (singular) — confirm cgroups.conf is actually read. | ||
COPY config/cgroups.conf \ | ||
config/slurm.conf \ | ||
/etc/slurm/ | ||
|
||
COPY config/supervisord.conf \ | ||
/etc/supervisor/conf.d/ | ||
|
||
COPY scripts/run-slurmctld \ | ||
/bin/ | ||
|
||
# Healthy only when the status column of every supervisor-managed program collapses to the single value RUNNING. | ||
# POSIX "=" is used because the shell-form CMD runs under /bin/sh, whose test builtin may reject "==". | ||
HEALTHCHECK --start-period=15s CMD \ | ||
test "$(supervisorctl status | awk '{print $2}' | sort | uniq)" = "RUNNING" | ||
|
||
EXPOSE 22 6817 | ||
|
||
WORKDIR /home/hpcuser | ||
|
||
# Exec form runs supervisord directly, so it receives stop signals itself rather than through a /bin/sh -c wrapper. | ||
ENTRYPOINT ["supervisord"] | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
CgroupAutomount=yes | ||
CgroupReleaseAgentDir="/etc/slurm/cgroup" | ||
ConstrainCores=yes | ||
ConstrainDevices=no | ||
ConstrainRAMSpace=yes | ||
ConstrainSwapSpace=yes | ||
IgnoreSystemd=yes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
AccountingStorageType=accounting_storage/none | ||
ClusterName=docker-slurm | ||
InactiveLimit=0 | ||
JobAcctGatherFrequency=30 | ||
JobAcctGatherType=jobacct_gather/none | ||
JobCompType=jobcomp/none | ||
KillWait=30 | ||
MinJobAge=300 | ||
MpiDefault=none | ||
NodeName=__SLURMD_HOSTNAME_PREFIX__-[1-__SLURMD_NODES__] CPUs=__SLURM_CPUS_ON_NODE__ Sockets=__SLURM_SOCKETS_ON_NODE__ CoresPerSocket=__SLURM_CORES_PER_SOCKET_ON_NODE__ ThreadsPerCore=__SLURM_THREADS_PER_CORE__ State=UNKNOWN | ||
PartitionName=docker Nodes=ALL Default=YES MaxTime=INFINITE State=UP | ||
ProctrackType=proctrack/linuxproc | ||
ReturnToService=1 | ||
SchedulerType=sched/backfill | ||
SelectType=select/cons_res | ||
SelectTypeParameters=CR_Core | ||
SlurmctldDebug=info | ||
SlurmctldHost=__SLURMCTLD_HOST__ | ||
SlurmctldLogFile=/var/log/slurm/slurmctld.log | ||
SlurmctldParameters=enable_configless | ||
SlurmctldPidFile=/var/run/slurm/slurmctld.pid | ||
SlurmctldPort=6817 | ||
SlurmctldTimeout=120 | ||
SlurmdDebug=info | ||
SlurmdLogFile=/var/log/slurm/slurmd.log | ||
SlurmdPidFile=/var/run/slurm/slurmd.pid | ||
SlurmdPort=6818 | ||
SlurmdSpoolDir=/var/spool/slurmd | ||
SlurmdTimeout=300 | ||
SlurmdUser=root | ||
SlurmUser=slurm | ||
StateSaveLocation=/var/spool/slurmctld | ||
SwitchType=switch/none | ||
TreeWidth=65533 | ||
Waittime=0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
[supervisord] | ||
nodaemon=true | ||
logfile=/dev/null | ||
logfile_maxbytes=0 | ||
|
||
[program:munge] | ||
command=gosu munge /usr/sbin/munged --foreground | ||
autostart=true | ||
stdout_logfile=/dev/stdout | ||
stdout_logfile_maxbytes=0 | ||
redirect_stderr=true | ||
|
||
[program:slurmctld] | ||
command=run-slurmctld | ||
autostart=true | ||
stdout_logfile=/dev/stdout | ||
stdout_logfile_maxbytes=0 | ||
redirect_stderr=true | ||
|
||
[program:sshd] | ||
command=/usr/sbin/sshd -D | ||
autostart=true | ||
stdout_logfile=/dev/stdout | ||
stdout_logfile_maxbytes=0 | ||
redirect_stderr=true |
Oops, something went wrong.