Skip to content

Commit

Permalink
Merge branch 'main' into feat/no-ohpc
Browse files Browse the repository at this point in the history
  • Loading branch information
sjpb committed Nov 10, 2023
2 parents 29c8018 + 6f31af4 commit f4b02ce
Show file tree
Hide file tree
Showing 59 changed files with 856 additions and 74 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
. environments/.stackhpc/activate
cd packer/
packer init .
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=ask -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
- name: Get created image name from manifest
id: manifest
Expand Down
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,5 @@ roles/*
!roles/proxy/**
!roles/resolv_conf/
!roles/resolv_conf/**
!roles/cve-2023-41914
!roles/cve-2023-41914/**
11 changes: 11 additions & 0 deletions ansible/adhoc/backup-keytabs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Backup/restore of FreeIPA client keytabs via the freeipa role.
#
# Use ONE of the following tags on this playbook:
# - retrieve: copies keytabs out of the state volume to the environment
# - deploy: copies keytabs from the environment to the state volume

- hosts: freeipa_client
  become: true        # keytab files are root-owned
  gather_facts: false
  tasks:
    - import_role:
        name: freeipa
        tasks_from: backup-keytabs.yml
6 changes: 6 additions & 0 deletions ansible/adhoc/cve-2023-41914.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Ad-hoc remediation of Slurm CVE-2023-41914 on a running cluster.
# Runs the full fix (validate, stop services + DB backup, rpm replacement,
# restart) from the cve-2023-41914 role; see that role's README for details.

- hosts: openhpc
  gather_facts: false  # role gathers the subsets it needs itself
  become: true
  tasks:
    - import_role:
        name: cve-2023-41914
25 changes: 15 additions & 10 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,21 @@
policy: "{{ selinux_policy }}"
register: sestatus

- hosts: freeipa_server
# Done here as it might be providing DNS
tags:
- freeipa
- freeipa_server
gather_facts: yes
become: yes
tasks:
- name: Install FreeIPA server
import_role:
name: freeipa
tasks_from: server.yml

# --- tasks after here require access to package repos ---

- hosts: firewalld
gather_facts: false
become: yes
Expand Down Expand Up @@ -112,16 +127,6 @@
tasks_from: config.yml
tags: config

- name: Setup EESSI
hosts: eessi
tags: eessi
become: true
gather_facts: false
tasks:
- name: Install and configure EESSI
import_role:
name: eessi

- hosts: update
gather_facts: false
become: yes
Expand Down
3 changes: 2 additions & 1 deletion ansible/ci/retrieve_inventory.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
gather_facts: no
vars:
cluster_prefix: "{{ undef(hint='cluster_prefix must be defined') }}" # e.g. ci4005969475
cluster_network: WCDC-iLab-60
ci_vars_file: "{{ appliances_environment_root + '/terraform/' + lookup('env', 'CI_CLOUD') }}.tfvars"
cluster_network: "{{ lookup('ansible.builtin.ini', 'cluster_net', file=ci_vars_file, type='properties') | trim('\"') }}"
tasks:
- name: Get control host IP
set_fact:
Expand Down
23 changes: 22 additions & 1 deletion ansible/extras.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,25 @@
- hosts: cuda
- hosts: basic_users
become: yes
tags:
- basic_users
- users
gather_facts: yes
tasks:
- import_role:
name: basic_users

- name: Setup EESSI
hosts: eessi
tags: eessi
become: true
gather_facts: false
tasks:
- name: Install and configure EESSI
import_role:
name: eessi

- name: Setup CUDA
hosts: cuda
become: yes
gather_facts: no
tags: cuda
Expand Down
18 changes: 15 additions & 3 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Builder version of site.yml just installing binaries

- hosts: builder
become: no
gather_facts: no
tasks:
- name: Report hostname (= final image name)
command: hostname

- name: Run pre.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Expand Down Expand Up @@ -27,6 +34,13 @@
state: stopped
enabled: false

# - import_playbook: iam.yml
- name: Install FreeIPA client
import_role:
name: freeipa
tasks_from: client-install.yml
when: "'freeipa_client' in group_names"

# - import_playbook: filesystems.yml
- name: nfs
dnf:
Expand All @@ -44,7 +58,7 @@
tasks_from: "install-{{ openhpc_install_type }}.yml"

- name: Include distribution variables for osc.ood
include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky.yml"
include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/8.yml"
# FUTURE: install-apps.yml - this is git clones

# - import_playbook: portal.yml
Expand Down Expand Up @@ -141,8 +155,6 @@
name: cloudalchemy.grafana
tasks_from: install.yml

# - import_playbook: iam.yml - nothing to do

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Expand Down
42 changes: 38 additions & 4 deletions ansible/iam.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,42 @@
- hosts: basic_users
- hosts: freeipa_client
tags:
- freeipa
- freeipa_server # as this is only relevant if using freeipa_server
- freeipa_host
gather_facts: no
become: yes
tasks:
- name: Ensure FreeIPA client hosts are added to the FreeIPA server
import_role:
name: freeipa
tasks_from: addhost.yml
when: groups['freeipa_server'] | length > 0

- hosts: freeipa_client
tags:
- basic_users
- freeipa
- freeipa_client
gather_facts: yes
become: yes
tasks:
- name: Install FreeIPA client
import_role:
name: freeipa
tasks_from: client-install.yml
- name: Enrol FreeIPA client
import_role:
name: freeipa
tasks_from: enrol.yml

- hosts: freeipa_server
tags:
- freeipa
- freeipa_server
- users
gather_facts: yes
become: yes
tasks:
- import_role:
name: basic_users
- name: Add FreeIPA users
import_role:
name: freeipa
tasks_from: users.yml
1 change: 1 addition & 0 deletions ansible/roles/cuda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ Requires OFED to be installed to provide required kernel-* packages.

- `cuda_distro`: Optional. Default `rhel8`.
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.
5 changes: 3 additions & 2 deletions ansible/roles/cuda/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
cuda_distro: rhel8
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
cuda_driver_stream: default
cuda_packages:
- cuda
- nvidia-gds
# cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
cuda_version_short: "{{ cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
cuda_samples_programs:
Expand Down
34 changes: 26 additions & 8 deletions ansible/roles/cuda/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,40 @@
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
url: "{{ cuda_repo }}"

- name: Check if nvidia driver module is enabled
shell:
cmd: dnf module list --enabled nvidia-driver
changed_when: false
failed_when: false
register: _cuda_driver_module_enabled

- name: List nvidia driver dnf module stream versions
shell:
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
# Output of interest from command is something like (some whitespace removed):
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
changed_when: false
register: _cuda_driver_module_streams
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

- name: Enable nvidia driver module
ansible.builtin.command: dnf module enable -y nvidia-driver:latest-dkms
register: nvidiadriver_enable
changed_when: "'Nothing to do' not in nvidiadriver_enable.stdout"
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
register: _cuda_driver_module_enable
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"

- name: Install nvidia driver module
ansible.builtin.command: dnf module install -y nvidia-driver:latest-dkms
register: nvidiadriver_install
changed_when: "'Nothing to do' not in nvidiadriver_install.stdout"
- name: Install nvidia drivers # TODO: make removal possible?
ansible.builtin.command: dnf module install -y nvidia-driver
register: _cuda_driver_install
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"

- name: Install cuda packages
ansible.builtin.dnf:
name: "{{ cuda_packages }}"
register: cuda_package_install

- name: Add latest cuda binaries to path
- name: Add cuda binaries to path
lineinfile:
path: /etc/profile.d/sh.local
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/cuda/tasks/samples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

- name: Set fact for discovered cuda version
set_fact:
cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'
_cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'

- name: Ensure cuda_samples_path exists
file:
Expand Down
32 changes: 32 additions & 0 deletions ansible/roles/cve-2023-41914/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# cve-2023-41914

This role fixes [Slurm CVE-2023-41914](https://lists.schedmd.com/pipermail/slurm-announce/2023/000100.html):

> A number of race conditions have been identified within the slurmd/slurmstepd processes that can lead to the user taking ownership of an arbitrary file on the system. A related issue can lead to the user overwriting an arbitrary file on the compute node (although with data that is not directly under their control). A related issue can also lead to the user deleting all files and sub-directories of an arbitrary target directory on the compute node.

**NB:** It is only suitable for use on systems installed from OpenHPC v2.6.1 (Slurm v22.05).

At the time of writing, new OpenHPC packages have been built but are not yet available from the repositories, hence `dnf update ...` is not available.

This role can be run in two ways:

1. To remediate an existing system, run `tasks/main.yml`, e.g. using the playbook `ansible/adhoc/cve-2023-41914.yml`. This will:
- Stop all Slurm services
- Backup the slurmdbd mysql database to the volume-backed directory `/var/lib/state/mysql-backups/` on the control node (by default).
- Uninstall the affected packages and install updated rpms from the OpenHPC build system.
- Restart Slurm services.

**NB**: This playbook will ALWAYS stop and restart Slurm, even if no updates are actually required.

2. To remediate images during build (i.e no Slurm services are running, no slurm database exists), run `tasks/install-rpms.yml`, e.g. using the following in an environment pre-hook:

```yaml
- hosts: builder
gather_facts: no
become: yes
tasks:
- name: Apply fixes for cve-2023-41914
import_role:
name: cve-2023-41914
tasks_from: install-rpms.yml
```
24 changes: 24 additions & 0 deletions ansible/roles/cve-2023-41914/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

# Defaults for the cve-2023-41914 role, which replaces vulnerable OpenHPC
# Slurm packages with fixed rpms from the OpenHPC build system.
#
# NOTE(review): several internal variable names below spell the CVE as
# "41814" rather than "41914"; the tasks files use the same spelling, so any
# rename must be coordinated across the whole role at once.

# _cve_2023_41814_installed_slurm: []

# Destination for the slurmdbd mysql dump taken before packages are changed;
# timestamped so repeated runs do not overwrite earlier backups.
cve_2023_41914_mysql_backup_path: "{{ mysql_datadir }}-backups/{{ lookup('pipe', 'date --iso-8601=seconds') }}.sql"

# OpenHPC build-system repository holding the fixed rpms (no dnf repo
# available yet — see the role README).
cve_2023_41914_rpm_url: http://obs.openhpc.community:82/OpenHPC:/2.6.2:/Factory/EL_8/x86_64
# Candidate packages to update, if installed.
cve_2023_41914_rpms: # see cve_2023_41914_rpm_url
- slurm-ohpc # has to be first as dependency
- slurm-contribs-ohpc
- slurm-devel-ohpc
- slurm-example-configs-ohpc
- slurm-libpmi-ohpc
- slurm-ohpc-slurmrestd
- slurm-openlava-ohpc
- slurm-pam_slurm-ohpc
- slurm-perlapi-ohpc
- slurm-slurmctld-ohpc
- slurm-slurmd-ohpc
- slurm-slurmdbd-ohpc
- slurm-sview-ohpc
- slurm-torque-ohpc
# Version/release of the fixed rpm builds; installed packages older than the
# fix version are replaced.
cve_2023_41914_rpm_fix_ver: '22.05.10'
cve_2023_41914_rpm_fix_release: '2.1.ohpc.2.6.2'
# Accumulator for packages found to need updating (populated in install-rpms.yml).
_cve_2023_41814_updates: []
# Per-host record of packages being modified, kept in the environment so a
# run which failed mid-removal/install can be resumed.
cve_2023_41914_pkglist_path: "{{ appliances_environment_root }}/{{ inventory_hostname }}-cve_2023_41814_updates"
42 changes: 42 additions & 0 deletions ansible/roles/cve-2023-41914/tasks/install-rpms.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Replace vulnerable Slurm packages with fixed rpms from the OpenHPC build
# system. Usable standalone during image build, or after pre-upgrade.yml on a
# live system (see main.yml and the role README).

- name: Validate suitability
  include_tasks: validate.yml
  when: _cve_2023_41814_installed_pkgs is undefined

- name: Identify packages to update
  # Only packages which are actually installed AND older than the fix version.
  set_fact:
    _cve_2023_41814_updates: "{{ _cve_2023_41814_updates + [item] }}"
  loop: "{{ cve_2023_41914_rpms }}"
  when:
    - item in ansible_facts.packages
    - cve_2023_41914_rpm_fix_ver is version(ansible_facts.packages[item][0].version, '>')

- name: Write packages to be modified to a file
  # allows recovery from failures in subsequent package deletion/rpm install
  copy:
    dest: "{{ cve_2023_41914_pkglist_path }}"
    content: "{{ _cve_2023_41814_updates | to_nice_yaml }}"
  when: _cve_2023_41814_updates | length > 0
  delegate_to: localhost

- name: Read packages to modify
  # Re-read the recovery file so a re-run after a partial failure (packages
  # already removed, so no longer discoverable) still knows what to install.
  # The file does not exist when no updates were ever required, so fall back
  # to an empty list instead of failing the lookup.
  set_fact:
    _cve_2023_41814_updates: "{{ (lookup('file', cve_2023_41914_pkglist_path, errors='ignore') or '[]') | from_yaml }}"

- name: Identify architecture
  # Needed for ansible_architecture in the rpm URLs below; play may have
  # gather_facts disabled.
  setup:
    gather_subset: architecture

- name: Remove installed packages
  # No-op when the update list is empty.
  dnf:
    name: "{{ _cve_2023_41814_updates }}"
    state: absent

- name: Install rpms
  dnf:
    name: "{{ cve_2023_41914_rpm_url }}/{{ item }}-{{ cve_2023_41914_rpm_fix_ver }}-{{ cve_2023_41914_rpm_fix_release }}.{{ ansible_architecture }}.rpm"
  loop: "{{ _cve_2023_41814_updates }}"
  register: _cve_2023_41814_rpm_installs

- name: Reload systemd units
  # The replacement rpms may carry changed unit files; make systemd re-read them.
  command: systemctl daemon-reload
  when: _cve_2023_41814_rpm_installs.changed
4 changes: 4 additions & 0 deletions ansible/roles/cve-2023-41914/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Full remediation for a running system, in strict order: check the system is
# suitable, stop Slurm services and back up the slurmdbd database, replace the
# affected rpms, then restart services. For image builds (no services running,
# no database) run install-rpms.yml directly instead — see the role README.
- include_tasks: validate.yml
- include_tasks: pre-upgrade.yml
- include_tasks: install-rpms.yml
- include_tasks: post-upgrade.yml
Loading

0 comments on commit f4b02ce

Please sign in to comment.