diff --git a/ansible/.gitignore b/ansible/.gitignore
index 8ad2ac3ab..6883c6ae5 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -42,3 +42,5 @@ roles/*
 !roles/proxy/**
 !roles/resolv_conf/
 !roles/resolv_conf/**
+!roles/cve-2023-41914
+!roles/cve-2023-41914/**
diff --git a/ansible/adhoc/cve-2023-41914.yml b/ansible/adhoc/cve-2023-41914.yml
new file mode 100644
index 000000000..e4b907d44
--- /dev/null
+++ b/ansible/adhoc/cve-2023-41914.yml
@@ -0,0 +1,7 @@
+# Ad-hoc playbook: run the full cve-2023-41914 role (tasks/main.yml) on all hosts in the openhpc group.
+- hosts: openhpc
+  gather_facts: false
+  become: true
+  tasks:
+    - import_role:
+        name: cve-2023-41914
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index 7f06923d5..511bc3b82 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -1,5 +1,13 @@
 # Builder version of site.yml just installing binaries
 
+- hosts: builder
+  become: false
+  gather_facts: false
+  tasks:
+    - name: Report hostname (= final image name)
+      command: hostname
+      changed_when: false  # read-only command, must not report the host as changed
+
 - name: Run pre.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
diff --git a/ansible/roles/cve-2023-41914/README.md b/ansible/roles/cve-2023-41914/README.md
new file mode 100644
index 000000000..02c650857
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/README.md
@@ -0,0 +1,32 @@
+# cve-2023-41914
+
+This role fixes [Slurm CVE-2023-41914](https://lists.schedmd.com/pipermail/slurm-announce/2023/000100.html):
+
+> A number of race conditions have been identified within the slurmd/slurmstepd processes that can lead to the user taking ownership of an arbitrary file on the system. A related issue can lead to the user overwriting an arbitrary file on the compute node (although with data that is not directly under their control). A related issue can also lead to the user deleting all files and sub-directories of an arbitrary target directory on the compute node.
+
+**NB:** It is only suitable for use on systems installed from OpenHPC v2.6.1 (Slurm v22.05).
+
+At the time of writing, new OpenHPC packages have been built but are not available from the repositories (reference), hence `dnf update ...` is not available.
+
+This role can be run in two ways:
+
+1. To remediate an existing system, run `tasks/main.yml`, e.g. using the playbook `ansible/adhoc/cve-2023-41914.yml`. This will:
+- Stop all Slurm services
+- Back up the slurmdbd mysql database to the volume-backed directory `/var/lib/state/mysql-backups/` on the control node (by default).
+- Uninstall the affected packages and install updated rpms from the OpenHPC build system.
+- Restart Slurm services.
+
+    **NB**: This playbook will ALWAYS stop and restart Slurm, even if no updates are actually required.
+
+2. To remediate images during build (i.e. no Slurm services are running, no slurm database exists), run `tasks/install-rpms.yml`, e.g. using the following in an environment post-hook:
+
+```yaml
+- hosts: builder
+  gather_facts: no
+  become: yes
+  tasks:
+    - name: Apply fixes for cve-2023-41914
+      import_role:
+        name: cve-2023-41914
+        tasks_from: install-rpms.yml
+```
diff --git a/ansible/roles/cve-2023-41914/defaults/main.yml b/ansible/roles/cve-2023-41914/defaults/main.yml
new file mode 100644
index 000000000..685c6619c
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/defaults/main.yml
@@ -0,0 +1,24 @@
+
+# _cve_2023_41814_installed_slurm: []
+cve_2023_41914_mysql_backup_path: "{{ mysql_datadir }}-backups/{{ lookup('pipe', 'date --iso-8601=seconds') }}.sql"
+
+cve_2023_41914_rpm_url: http://obs.openhpc.community:82/OpenHPC:/2.6.2:/Factory/EL_8/x86_64
+cve_2023_41914_rpms: # see cve_2023_41914_rpm_url
+  - slurm-ohpc # has to be first as dependency
+  - slurm-contribs-ohpc
+  - slurm-devel-ohpc
+  - slurm-example-configs-ohpc
+  - slurm-libpmi-ohpc
+  - slurm-ohpc-slurmrestd
+  - slurm-openlava-ohpc
+  - slurm-pam_slurm-ohpc
+  - slurm-perlapi-ohpc
+  - slurm-slurmctld-ohpc
+  - slurm-slurmd-ohpc
+  - slurm-slurmdbd-ohpc
+  - slurm-sview-ohpc
+  - slurm-torque-ohpc
+cve_2023_41914_rpm_fix_ver: '22.05.10'
+cve_2023_41914_rpm_fix_release: '2.1.ohpc.2.6.2'
+_cve_2023_41914_updates: []  # internal: packages needing an update, filled in by tasks/install-rpms.yml
+cve_2023_41914_pkglist_path: "{{ appliances_environment_root }}/{{ inventory_hostname }}-cve_2023_41914_updates"  # per-host recovery file, written on localhost
diff --git a/ansible/roles/cve-2023-41914/tasks/install-rpms.yml b/ansible/roles/cve-2023-41914/tasks/install-rpms.yml
new file mode 100644
index 000000000..42168fd9b
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/tasks/install-rpms.yml
@@ -0,0 +1,56 @@
+# Replace installed Slurm packages older than cve_2023_41914_rpm_fix_ver with rpms from cve_2023_41914_rpm_url.
+# Included from main.yml, or run on its own via `tasks_from: install-rpms.yml`.
+
+- name: Validate suitability
+  include_tasks: validate.yml
+  when: _cve_2023_41914_installed_pkgs is undefined
+
+- name: Identify packages to update
+  set_fact:
+    _cve_2023_41914_updates: "{{ _cve_2023_41914_updates + [item] }}"
+  loop: "{{ cve_2023_41914_rpms }}"
+  when:
+    - item in ansible_facts.packages
+    - cve_2023_41914_rpm_fix_ver is version(ansible_facts.packages[item][0].version, '>')
+
+- name: Write packages to be modified to a file
+  # allows recovery from failures in subsequent package deletion/rpm install
+  copy:
+    dest: "{{ cve_2023_41914_pkglist_path }}"
+    content: "{{ _cve_2023_41914_updates | to_nice_yaml }}"
+  when: _cve_2023_41914_updates | length > 0
+  delegate_to: localhost
+
+- name: Read packages to modify
+  # The list file only exists if this run, or an earlier run which failed part-way, found
+  # packages to update. A missing (or empty) file therefore means there is nothing to do.
+  set_fact:
+    _cve_2023_41914_updates: "{{ lookup('file', cve_2023_41914_pkglist_path, errors='ignore') | default('[]', true) | from_yaml }}"
+
+- name: Identify architecture
+  setup:
+    gather_subset: architecture
+
+- name: Remove installed packages
+  dnf:
+    name: "{{ _cve_2023_41914_updates }}"
+    state: absent
+  when: _cve_2023_41914_updates | length > 0
+
+- name: Install rpms
+  dnf:
+    name: "{{ cve_2023_41914_rpm_url }}/{{ item }}-{{ cve_2023_41914_rpm_fix_ver }}-{{ cve_2023_41914_rpm_fix_release }}.{{ ansible_architecture }}.rpm"
+  loop: "{{ _cve_2023_41914_updates }}"
+  register: _cve_2023_41914_rpm_installs
+
+- name: Remove package list now the update has completed
+  # only reached if every install succeeded; otherwise a later run would remove and reinstall the fixed packages
+  file:
+    path: "{{ cve_2023_41914_pkglist_path }}"
+    state: absent
+  delegate_to: localhost
+
+- name: Reload systemd units
+  systemd:
+    daemon_reload: true
+  when: _cve_2023_41914_rpm_installs.changed
diff --git a/ansible/roles/cve-2023-41914/tasks/main.yml b/ansible/roles/cve-2023-41914/tasks/main.yml
new file mode 100644
index 000000000..83053baab
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/tasks/main.yml
@@ -0,0 +1,5 @@
+# Remediate a running system: validate, stop services and back up, update rpms, start services.
+- include_tasks: validate.yml
+- include_tasks: pre-upgrade.yml
+- include_tasks: install-rpms.yml
+- include_tasks: post-upgrade.yml
diff --git a/ansible/roles/cve-2023-41914/tasks/post-upgrade.yml b/ansible/roles/cve-2023-41914/tasks/post-upgrade.yml
new file mode 100644
index 000000000..d9540faa0
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/tasks/post-upgrade.yml
@@ -0,0 +1,21 @@
+# Start Slurm services after the rpm update, in the order slurmdbd, slurmctld, slurmd.
+
+- name: Start slurmdbd
+  systemd:
+    name: slurmdbd
+    state: started
+    # NB: this approach is only suitable for minor version upgrades
+    # major ones may timeout on service start due to db upgrades
+  when: openhpc_enable.database | default(false) | bool
+
+- name: Start slurmctld
+  systemd:
+    name: slurmctld
+    state: started
+  when: openhpc_enable.control | default(false) | bool
+
+- name: Start slurmd
+  systemd:
+    name: slurmd
+    state: started
+  when: openhpc_enable.batch | default(false) | bool or 'login' in group_names
diff --git a/ansible/roles/cve-2023-41914/tasks/pre-upgrade.yml b/ansible/roles/cve-2023-41914/tasks/pre-upgrade.yml
new file mode 100644
index 000000000..59629a482
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/tasks/pre-upgrade.yml
@@ -0,0 +1,44 @@
+# Stop Slurm services (slurmd, slurmctld, slurmdbd), then dump the slurm_acct_db database before the rpm update.
+
+- name: Stop slurmd
+  systemd:
+    name: slurmd
+    state: stopped
+  when: openhpc_enable.batch | default(false) | bool or 'login' in group_names
+
+- name: Stop slurmctld
+  systemd:
+    name: slurmctld
+    state: stopped
+  when: openhpc_enable.control | default(false) | bool
+
+- name: Stop slurmdbd
+  systemd:
+    name: slurmdbd
+    state: stopped
+  when: openhpc_enable.database | default(false) | bool
+
+# NB: the backup tasks below are conditional on openhpc_enable.control, not openhpc_enable.database
+- name: Ensure backup directory exists
+  file:
+    path: "{{ cve_2023_41914_mysql_backup_path | dirname }}"
+    state: directory
+    owner: root
+    group: root
+  when: openhpc_enable.control | default(false) | bool
+
+- name: Ensure mysqldump tool installed
+  dnf:
+    name: mysql
+    state: present
+  when: openhpc_enable.control | default(false) | bool
+
+- name: Backup database
+  community.mysql.mysql_db:
+    name: slurm_acct_db
+    state: dump
+    target: "{{ cve_2023_41914_mysql_backup_path }}"
+    login_user: root
+    login_password: "{{ mysql_root_password }}"
+    login_host: "{{ mysql_host }}"
+  when: openhpc_enable.control | default(false) | bool
diff --git a/ansible/roles/cve-2023-41914/tasks/validate.yml b/ansible/roles/cve-2023-41914/tasks/validate.yml
new file mode 100644
index 000000000..5da1afdc2
--- /dev/null
+++ b/ansible/roles/cve-2023-41914/tasks/validate.yml
@@ -0,0 +1,27 @@
+# Check that the installed slurm-* packages are single-versioned and from Slurm 22.05.
+
+- name: Get package facts
+  package_facts:
+
+- name: Set fact for installed Slurm packages
+  # this is a subset (same format) as ansible_facts.packages
+  set_fact:
+    _cve_2023_41914_installed_pkgs: "{{ ansible_facts.packages | dict2items | selectattr('key', 'match', 'slurm-') | items2dict }}"
+
+- name: Ensure only a single version of all slurm-* packages is installed
+  assert:
+    that: item.value | length == 1
+    fail_msg: "{{ item.key }} has {{ item.value | length }} versions installed, expecting exactly 1"
+  loop: "{{ _cve_2023_41914_installed_pkgs | dict2items }}"
+  loop_control:
+    label: "{{ item.key }}"
+
+- name: Ensure major version of installed Slurm matches upgrade
+  assert:
+    that: _slurm_installed_major_ver == ['22', '05']
+    fail_msg: "{{ item.key }} has major version {{ _slurm_installed_major_ver | join('.') }}, expecting 22.05"
+  loop: "{{ _cve_2023_41914_installed_pkgs | dict2items }}"
+  loop_control:
+    label: "{{ item.key }}"
+  vars:
+    _slurm_installed_major_ver: "{{ item.value[0].version.split('.')[0:2] }}"
diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
new file mode 100644
index 000000000..2c479e087
--- /dev/null
+++ b/environments/.stackhpc/hooks/post.yml
@@ -0,0 +1,8 @@
+- hosts: builder
+  gather_facts: false
+  become: true
+  tasks:
+    - name: Apply fixes for cve-2023-41914
+      import_role:
+        name: cve-2023-41914
+        tasks_from: install-rpms.yml
diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf
index d6806baa3..ef8c1281e 100644
--- a/environments/.stackhpc/terraform/main.tf
+++ b/environments/.stackhpc/terraform/main.tf
@@ -13,7 +13,7 @@ variable "cluster_name" {
 variable "cluster_image" {
     description = "single image for all cluster nodes - a convenience for CI"
     type = string
-    default = "openhpc-230926-1343-e3d3e307" # https://github.com/stackhpc/ansible-slurm-appliance/pull/314
+    default = "openhpc-231020-1357-b5d8b056" # https://github.com/stackhpc/ansible-slurm-appliance/pull/320
     # default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2"
     # default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
 }