diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 3b7bcfdcb..06072a3c8 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -48,6 +48,8 @@ jobs: cd packer/ packer init . PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + env: + TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Get created image name from manifest id: manifest diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 000000000..09c5b9fb9 --- /dev/null +++ b/ansible.cfg @@ -0,0 +1,19 @@ +# Only used for Azimuth running the caas environment +[defaults] +any_errors_fatal = True +gathering = smart +forks = 30 +host_key_checking = False +remote_tmp = /tmp +collections_path = ansible/collections +roles_path = ansible/roles +filter_plugins = ansible/filter_plugins +callbacks_enabled = ansible.posix.profile_tasks + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True +# This is important because we are using one of the hosts in the play as a jump host +# This ensures that if the proxy connection is interrupted, rendering the other hosts +# unreachable, the connection is retried instead of failing the entire play +retries = 10 diff --git a/ansible/.gitignore b/ansible/.gitignore index 6883c6ae5..ff35312d3 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -28,8 +28,6 @@ roles/* !roles/firewalld/** !roles/etc_hosts/ !roles/etc_hosts/** -!roles/cloud_init/ -!roles/cloud_init/** !roles/mysql/ !roles/mysql/** !roles/systemd/ @@ -42,5 +40,16 @@ roles/* !roles/proxy/** !roles/resolv_conf/ !roles/resolv_conf/** -!roles/cve-2023-41914 -!roles/cve-2023-41914/** +!roles/cluster_infra/ +!roles/cluster_infra/** +!roles/image_build_infra/ +!roles/image_build_infra/** +!roles/persist_openhpc_secrets/ +!roles/persist_openhpc_secrets/** +!roles/zenith_proxy/ +!roles/zenith_proxy/** +!roles/image_build/ +!roles/image_build/** +!roles/persist_hostkeys/ +!roles/persist_hostkeys/** +!roles/requirements.yml diff --git a/ansible/adhoc/cve-2023-41914.yml b/ansible/adhoc/cve-2023-41914.yml deleted file mode 100644 index e4b907d44..000000000 --- a/ansible/adhoc/cve-2023-41914.yml +++ /dev/null @@ -1,6 +0,0 @@ -- hosts: openhpc - gather_facts: no - become: yes - tasks: - - import_role: - name: cve-2023-41914 diff --git a/ansible/adhoc/template-cloud-init.yml b/ansible/adhoc/template-cloud-init.yml deleted file mode 100644 index 92bb14a5d..000000000 --- a/ansible/adhoc/template-cloud-init.yml +++ /dev/null @@ -1,9 +0,0 @@ -- hosts: cloud_init - become: no - gather_facts: no - tasks: - - name: Template out cloud-init userdata - import_role: - name: cloud_init - tasks_from: template.yml - delegate_to: localhost diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e3baf1e51..9b6fda0de 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -114,6 +114,7 @@ name: fail2ban - name: Setup podman + gather_facts: false hosts: podman tags: podman tasks: diff --git a/ansible/extras.yml b/ansible/extras.yml index 0a27d1806..15d0fbffa 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -1,4 +1,4 @@ -- hosts: basic_users +- hosts: basic_users:!builder become: yes tags: - basic_users diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index e62694fea..46da8da16 100644 --- a/ansible/fatimage.yml +++ 
b/ansible/fatimage.yml @@ -52,8 +52,13 @@ become: yes gather_facts: no tasks: - # - import_playbook: slurm.yml - - include_role: + # - import_playbook: slurm.yml: + - name: Setup DB + include_role: + name: mysql + tasks_from: install.yml + - name: OpenHPC + import_role: name: stackhpc.openhpc tasks_from: "install-{{ openhpc_install_type }}.yml" @@ -76,10 +81,10 @@ name: opensearch tasks_from: install.yml become: true - - # opensearch - containerised, nothing to do # slurm_stats - nothing to do - # filebeat - containerised - nothing to do + - import_role: + name: filebeat + tasks_from: install.yml - import_role: # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index b8d5fc0a5..84f319688 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -26,19 +26,12 @@ # Collection currently requires root for all tasks. become: true -- name: Setup filebeat +- name: Deploy filebeat hosts: filebeat tags: filebeat tasks: - import_role: name: filebeat - tasks_from: config.yml - tags: config - - - import_role: - name: filebeat - tasks_from: deploy.yml - tags: deploy - name: Deploy node_exporter hosts: node_exporter diff --git a/ansible/noop.yml b/ansible/noop.yml index 49317736a..adad24813 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -6,4 +6,4 @@ - hosts: localhost gather_facts: false - tasks: [] \ No newline at end of file + tasks: [] diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml new file mode 100644 index 000000000..ef8ea609b --- /dev/null +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -0,0 +1,7 @@ +cluster_deploy_ssh_keys_extra: [] + +# List of hw_scsi_models that result in block devices presenting as /dev/sdX +# rather than /dev/vdX +scsi_models: + # Ceph [https://docs.ceph.com/en/quincy/rbd/rbd-openstack/#image-properties] + - virtio-scsi diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml new file mode 100644 index 000000000..a535802f5 --- /dev/null +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -0,0 +1,81 @@ +- debug: + msg: | + terraform_backend_type: {{ terraform_backend_type }} + terraform_state: {{ terraform_state }} + cluster_upgrade_system_packages: {{ cluster_upgrade_system_packages | default('undefined') }} + +# We need to convert the floating IP id to an address for Terraform +# if we have cluster_floating_ip, otherwise assume that we're +# assigning the FIP in Terraform and that it will be available in +# outputs.cluster_gateway_ip. 
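The comment above describes an id-to-address conversion which the block that follows performs via the `stackhpc.terraform.infra` role's `lookup_floating_ip` tasks. Purely as a standalone illustration (not the role used by the appliance), an equivalent lookup can be sketched with the OpenStack CLI, assuming `clouds.yaml`/environment authentication is already configured:

```yaml
# Hedged sketch: resolve an OpenStack floating IP ID to its address with the
# `openstack` CLI instead of the stackhpc.terraform.infra role used below.
- name: Look up floating IP address from its ID
  command: >-
    openstack floating ip show {{ cluster_floating_ip }}
    -f value -c floating_ip_address
  register: _fip_show
  changed_when: false
  delegate_to: localhost
  when: cluster_floating_ip is defined

- name: Set floating IP address fact
  set_fact:
    cluster_floating_ip_address: "{{ _fip_show.stdout | trim }}"
  when: cluster_floating_ip is defined
```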
+- block: + - name: Look up floating IP + include_role: + name: stackhpc.terraform.infra + tasks_from: lookup_floating_ip + vars: + os_floating_ip_id: "{{ cluster_floating_ip }}" + + - name: Set floating IP address fact + set_fact: + cluster_floating_ip_address: "{{ os_floating_ip_info.floating_ip_address }}" + when: cluster_floating_ip is defined + +- name: Install Terraform binary + include_role: + name: stackhpc.terraform.install + +- name: Make Terraform project directory + file: + path: "{{ terraform_project_path }}" + state: directory + +- name: Write backend configuration + copy: + content: | + terraform { + backend "{{ terraform_backend_type }}" { } + } + dest: "{{ terraform_project_path }}/backend.tf" + +# Patching in this appliance is implemented as a switch to a new base image +# So unless explicitly patching, we want to use the same image as last time +# To do this, we query the previous Terraform state before updating +- block: + - name: Get previous Terraform state + stackhpc.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + project_path: "{{ terraform_project_path }}" + backend_config: "{{ terraform_backend_config }}" + register: cluster_infra_terraform_output + + - name: Extract image from Terraform state + set_fact: + cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}" + when: '"cluster_image" in cluster_infra_terraform_output.outputs' + when: + - terraform_state == "present" + - cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages + +- name: Template Terraform files into project directory + template: + src: >- + {{ + "{}{}.j2".format( + ( + cluster_terraform_template_dir ~ "/" + if cluster_terraform_template_dir is defined + else "" + ), + item + ) + }} + dest: "{{ terraform_project_path }}/{{ item }}" + loop: + - outputs.tf + - providers.tf + - resources.tf + +- name: Provision infrastructure + include_role: + name: stackhpc.terraform.infra diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 new file mode 100644 index 000000000..70b57d119 --- /dev/null +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -0,0 +1,53 @@ +output "cluster_gateway_ip" { + description = "The IP address of the gateway used to contact the cluster nodes" + value = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip +} + +{% if cluster_ssh_private_key_file is not defined %} +output "cluster_ssh_private_key" { + description = "The private component of the keypair generated on cluster provision" + value = openstack_compute_keypair_v2.cluster_keypair.private_key + sensitive = true +} +{% endif %} + +output "cluster_nodes" { + description = "A list of the nodes in the cluster from which an Ansible inventory will be populated" + value = concat( + [ + { + name = openstack_compute_instance_v2.login.name + ip = openstack_compute_instance_v2.login.network[0].fixed_ip_v4 + groups = ["login", "{{ cluster_name }}_login"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + }, + { + name = openstack_compute_instance_v2.control.name + ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + groups = ["control", "{{ cluster_name }}_control"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + } + ], + {% for partition in openhpc_slurm_partitions %} + [ + for compute in openstack_compute_instance_v2.{{ partition.name 
}}: { + name = compute.name + ip = compute.network[0].fixed_ip_v4 + groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + } + ]{{ ',' if not loop.last }} + {% endfor %} + ) +} + +output "cluster_image" { + description = "The id of the image used to build the cluster nodes" + value = "{{ cluster_previous_image | default(cluster_image) }}" +} diff --git a/ansible/roles/cluster_infra/templates/providers.tf.j2 b/ansible/roles/cluster_infra/templates/providers.tf.j2 new file mode 100644 index 000000000..32a16f27b --- /dev/null +++ b/ansible/roles/cluster_infra/templates/providers.tf.j2 @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 0.14" + + # We need the OpenStack provider + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 new file mode 100644 index 000000000..eca467ea0 --- /dev/null +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -0,0 +1,447 @@ +#jinja2: trim_blocks:False +##### +##### The identity scope we are operating in +##### Used to output the OpenStack project ID as a fact for provisioned hosts +##### +data "openstack_identity_auth_scope_v3" "scope" { + name = "{{ cluster_name }}" +} + +##### +##### Security groups for the cluster +##### + +# Security group to hold common rules for the cluster +resource "openstack_networking_secgroup_v2" "secgroup_slurm_cluster" { + name = "{{ cluster_name }}-secgroup-slurm-cluster" + description = "Rules for the slurm cluster nodes" + delete_default_rules = true # Fully manage with terraform +} + +# Security group to hold specific rules for the login node +resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { + name = "{{ cluster_name }}-secgroup-slurm-login" + description = "Specific rules for the slurm login node" + delete_default_rules = true # Fully manage with terraform +} + +## Allow all egress for all cluster nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_cluster_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" +} + +## Allow all ingress between nodes in the cluster +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_cluster_rule_ingress_internal_v4" { + direction = "ingress" + ethertype = "IPv4" + remote_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" +} + +## Allow ingress on port 22 (SSH) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_ssh_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 22 + port_range_max = 22 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +## Allow ingress on port 443 (HTTPS) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_https_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 443 + port_range_max = 443 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +## Allow ingress on port 80 (HTTP) from anywhere for the login nodes +resource 
"openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_http_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 80 + port_range_max = 80 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +##### +##### Volumes +##### +resource "openstack_blockstorage_volume_v3" "state" { + name = "{{ cluster_name }}-state" + description = "State for control node" + size = "{{ state_volume_size }}" +} + +resource "openstack_blockstorage_volume_v3" "home" { + name = "{{ cluster_name }}-home" + description = "Home for control node" + size = "{{ home_volume_size }}" + {% if use_home_volume_type_fast is defined and use_home_volume_type_fast %} + {% if home_volume_type_fast is defined %} + volume_type = "{{ home_volume_type_fast }}" + {% endif %} + {% endif %} +} + +###### +###### Cluster network +###### + +# Always get cluster_external_network network and subnet data +data "openstack_networking_network_v2" "cluster_external_network" { + name = "{{ cluster_external_network }}" +} + +data "openstack_networking_subnet_ids_v2" "cluster_external_subnets" { + network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" +} + +{% if cluster_network is not defined %} +# Create a new network +resource "openstack_networking_network_v2" "cluster_network" { + name = "{{ cluster_name }}-net" + admin_state_up = "true" +} + +resource "openstack_networking_subnet_v2" "cluster_subnet" { + name = "{{ cluster_name }}-subnet" + network_id = "${openstack_networking_network_v2.cluster_network.id}" + cidr = "{{ cluster_cidr | default('192.168.44.0/24') }}" + {% if cluster_nameservers is defined %} + dns_nameservers = [ + {% for nameserver in cluster_nameservers %} + "{{ nameserver }}"{{ ',' if not loop.last }} + {% endfor %} + ] + {% endif %} + ip_version = 4 +} + +resource "openstack_networking_router_v2" "cluster_router" { + name = "{{ cluster_name }}-router" + admin_state_up = true + external_network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" +} + +resource "openstack_networking_router_interface_v2" "cluster_router_interface" { + router_id = "${openstack_networking_router_v2.cluster_router.id}" + subnet_id = "${openstack_networking_subnet_v2.cluster_subnet.id}" +} +{% endif %} + +# Get existing network resource data by name, from either the created +# network or the network name if supplied +data "openstack_networking_network_v2" "cluster_network" { + {% if cluster_network is not defined %} + network_id = "${openstack_networking_network_v2.cluster_network.id}" + {% else %} + name = "{{ cluster_network }}" + {% endif %} +} + +data "openstack_networking_subnet_v2" "cluster_subnet" { + # Get subnet data from the subnet we create, or if it exists already + # get it from the cluster network data above + {% if cluster_network is not defined %} + subnet_id = "${openstack_networking_subnet_v2.cluster_subnet.id}" + {% else %} + network_id = "${data.openstack_networking_network_v2.cluster_network.id}" + {% endif %} +} + +##### +##### Cluster ports +##### + +resource "openstack_networking_port_v2" "login" { + name = "{{ cluster_name }}-login-0" + network_id = "${data.openstack_networking_network_v2.cluster_network.id}" + admin_state_up = "true" + + fixed_ip { + subnet_id = "${data.openstack_networking_subnet_v2.cluster_subnet.id}" + } + + security_group_ids = [ + "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}", + "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" + ] + 
+ binding { + vnic_type = "{{ cluster_vnic_type | default('normal') }}" + {% if cluster_vnic_profile is defined %} + profile = < A number of race conditions have been identified within the slurmd/slurmstepd processes that can lead to the user taking ownership of an arbitrary file on the system. A related issue can lead to the user overwriting an arbitrary file on the compute node (although with data that is not directly under their control). A related issue can also lead to the user deleting all files and sub-directories of an arbitrary target directory on the compute node. - -**NB:** It is only suitable for use on systems installed from OpenHPC v2.6.1 (Slurm v22.05). - -At the time of writing, new OpenHPC packages have been built but are not available from the respositories (reference), hence `dnf update ...` is not available. - -This role can be run in two ways: - -1. To remediate an existing system, run `tasks/main.yml`, e.g. using the playbook `ansible/adhoc/cve-2023-41914.yml`. This will: -- Stop all Slurm services -- Backup the slurmdbd mysql database to the volume-backed directory `/var/lib/state/mysql-backups/` on the control node (by default). -- Uninstall the affected packages and install updated rpms from the OpenHPC build system. -- Restart Slurm services. - - **NB**: This playbook will ALWAYS stop and restart Slurm, even if no updates are actually required. - -2. To remediate images during build (i.e no Slurm services are running, no slurm database exists), run `tasks/install-rpms.yml`, e.g. using the following in an environment pre-hook: - -```yaml -- hosts: builder - gather_facts: no - become: yes - tasks: - - name: Apply fixes for cve-2023-41914 - import_role: - name: cve-2023-41914 - tasks_from: install-rpms.yml -``` diff --git a/ansible/roles/cve-2023-41914/defaults/main.yml b/ansible/roles/cve-2023-41914/defaults/main.yml deleted file mode 100644 index 685c6619c..000000000 --- a/ansible/roles/cve-2023-41914/defaults/main.yml +++ /dev/null @@ -1,24 +0,0 @@ - -# _cve_2023_41814_installed_slurm: [] -cve_2023_41914_mysql_backup_path: "{{ mysql_datadir }}-backups/{{ lookup('pipe', 'date --iso-8601=seconds') }}.sql" - -cve_2023_41914_rpm_url: http://obs.openhpc.community:82/OpenHPC:/2.6.2:/Factory/EL_8/x86_64 -cve_2023_41914_rpms: # see cve_2023_41914_rpm_url - - slurm-ohpc # has to be first as dependency - - slurm-contribs-ohpc - - slurm-devel-ohpc - - slurm-example-configs-ohpc - - slurm-libpmi-ohpc - - slurm-ohpc-slurmrestd - - slurm-openlava-ohpc - - slurm-pam_slurm-ohpc - - slurm-perlapi-ohpc - - slurm-slurmctld-ohpc - - slurm-slurmd-ohpc - - slurm-slurmdbd-ohpc - - slurm-sview-ohpc - - slurm-torque-ohpc -cve_2023_41914_rpm_fix_ver: '22.05.10' -cve_2023_41914_rpm_fix_release: '2.1.ohpc.2.6.2' -_cve_2023_41814_updates: [] -cve_2023_41914_pkglist_path: "{{ appliances_environment_root }}/{{ inventory_hostname }}-cve_2023_41814_updates" diff --git a/ansible/roles/cve-2023-41914/tasks/install-rpms.yml b/ansible/roles/cve-2023-41914/tasks/install-rpms.yml deleted file mode 100644 index 42168fd9b..000000000 --- a/ansible/roles/cve-2023-41914/tasks/install-rpms.yml +++ /dev/null @@ -1,42 +0,0 @@ -- name: Validate suitability - include_tasks: validate.yml - when: _cve_2023_41814_installed_pkgs is undefined - -- name: Identify packages to update - set_fact: - _cve_2023_41814_updates: "{{ _cve_2023_41814_updates + [item] }}" - loop: "{{ cve_2023_41914_rpms }}" - when: - - item in ansible_facts.packages - - cve_2023_41914_rpm_fix_ver is 
version(ansible_facts.packages[item][0].version, '>') - -- name: Write packages to be modified to a file - # allows recovery from failures in subsequent package deletion/rpm install - copy: - dest: "{{ cve_2023_41914_pkglist_path }}" - content: "{{ _cve_2023_41814_updates | to_nice_yaml }}" - when: _cve_2023_41814_updates | length > 0 - delegate_to: localhost - -- name: Read packages to modify - set_fact: - _cve_2023_41814_updates: "{{ lookup('file', cve_2023_41914_pkglist_path) | from_yaml }}" - -- name: Identify architecture - setup: - gather_subset: architecture - -- name: Remove installed packages - dnf: - name: "{{ _cve_2023_41814_updates }}" - state: absent - -- name: Install rpms - dnf: - name: "{{ cve_2023_41914_rpm_url }}/{{ item }}-{{ cve_2023_41914_rpm_fix_ver }}-{{ cve_2023_41914_rpm_fix_release }}.{{ ansible_architecture }}.rpm" - loop: "{{ _cve_2023_41814_updates }}" - register: _cve_2023_41814_rpm_installs - -- name: Reload systemd units - command: systemctl daemon-reload - when: _cve_2023_41814_rpm_installs.changed diff --git a/ansible/roles/cve-2023-41914/tasks/main.yml b/ansible/roles/cve-2023-41914/tasks/main.yml deleted file mode 100644 index 83053baab..000000000 --- a/ansible/roles/cve-2023-41914/tasks/main.yml +++ /dev/null @@ -1,4 +0,0 @@ -- include_tasks: validate.yml -- include_tasks: pre-upgrade.yml -- include_tasks: install-rpms.yml -- include_tasks: post-upgrade.yml diff --git a/ansible/roles/cve-2023-41914/tasks/post-upgrade.yml b/ansible/roles/cve-2023-41914/tasks/post-upgrade.yml deleted file mode 100644 index d9540faa0..000000000 --- a/ansible/roles/cve-2023-41914/tasks/post-upgrade.yml +++ /dev/null @@ -1,19 +0,0 @@ -- name: Start slurmdbd - systemd: - name: slurmdbd - state: started - # NB: this approach is only suitable for minor version upgrades - # major ones may timeout on service start due to db upgrades - when: openhpc_enable.database | default('false') | bool - -- name: Start slurmctld - systemd: - name: slurmctld - state: started - when: openhpc_enable.control | default('false') | bool - -- name: Start slurmd - systemd: - name: slurmd - state: started - when: openhpc_enable.batch | default('false') | bool or 'login' in group_names diff --git a/ansible/roles/cve-2023-41914/tasks/pre-upgrade.yml b/ansible/roles/cve-2023-41914/tasks/pre-upgrade.yml deleted file mode 100644 index 59629a482..000000000 --- a/ansible/roles/cve-2023-41914/tasks/pre-upgrade.yml +++ /dev/null @@ -1,40 +0,0 @@ -- name: Stop slurmd - systemd: - name: slurmd - state: stopped - when: openhpc_enable.batch | default('false') | bool or 'login' in group_names - -- name: Stop slurmctld - systemd: - name: slurmctld - state: stopped - when: openhpc_enable.control | default('false') | bool - -- name: Stop slurmdbd - systemd: - name: slurmdbd - state: stopped - when: openhpc_enable.database | default('false') | bool - -- name: Ensure backup directory exists - file: - path: "{{ cve_2023_41914_mysql_backup_path | dirname }}" - state: directory - owner: root - group: root - when: openhpc_enable.control | default(false) | bool - -- name: Ensure mysqldump tool installed - dnf: - name: mysql - when: openhpc_enable.control | default(false) | bool - -- name: Backup database - community.mysql.mysql_db: - name: slurm_acct_db - state: dump - target: "{{ cve_2023_41914_mysql_backup_path }}" - login_user: root - login_password: "{{ mysql_root_password }}" - login_host: "{{ mysql_host }}" - when: openhpc_enable.control | default(false) | bool diff --git 
a/ansible/roles/cve-2023-41914/tasks/validate.yml b/ansible/roles/cve-2023-41914/tasks/validate.yml deleted file mode 100644 index 5da1afdc2..000000000 --- a/ansible/roles/cve-2023-41914/tasks/validate.yml +++ /dev/null @@ -1,22 +0,0 @@ -- name: Get package facts - package_facts: - -- name: Set fact for installed Slurm packages - # this is a subset (same format) as ansible_facts.packages - set_fact: - _cve_2023_41814_installed_pkgs: "{{ ansible_facts.packages | dict2items | selectattr('key', 'match', 'slurm-') | items2dict }}" - -- name: Ensure only a single version of all slurm-* packages is installed - assert: - that: item.value | length == 1 - loop: "{{ _cve_2023_41814_installed_pkgs | dict2items }}" - -- name: Ensure major version of installed Slurm matches upgrade - assert: - that: _slurm_installed_major_ver == ['22', '05'] - fail_msg: "{{ item.key }} has major version {{ _slurm_installed_major_ver | join('.') }}, expecting 22.05" - loop: "{{ _cve_2023_41814_installed_pkgs | dict2items }}" - when: item.key.startswith('slurm') - vars: - _slurm_installed_major_ver: "{{ item.value[0].version.split('.')[0:2] }}" - diff --git a/ansible/roles/filebeat/tasks/deploy.yml b/ansible/roles/filebeat/tasks/deploy.yml deleted file mode 100644 index aa4f46f32..000000000 --- a/ansible/roles/filebeat/tasks/deploy.yml +++ /dev/null @@ -1,7 +0,0 @@ ---- -- name: Create systemd unit file - template: - dest: /etc/systemd/system/filebeat.service - src: filebeat.service.j2 - become: true - notify: Restart filebeat container diff --git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml new file mode 100644 index 000000000..8e64722ec --- /dev/null +++ b/ansible/roles/filebeat/tasks/install.yml @@ -0,0 +1,17 @@ +--- +- name: Create systemd unit file + template: + dest: /etc/systemd/system/filebeat.service + src: filebeat.service.j2 + become: true + register: _filebeat_unit + +- name: Pull container image + containers.podman.podman_image: + name: "docker.elastic.co/beats/filebeat-oss" + tag: "{{ filebeat_version }}" + become_user: "{{ filebeat_podman_user }}" + +- name: Reload filebeat unit file + command: systemctl daemon-reload + when: _filebeat_unit.changed diff --git a/ansible/roles/filebeat/tasks/main.yml b/ansible/roles/filebeat/tasks/main.yml new file mode 100644 index 000000000..849683c38 --- /dev/null +++ b/ansible/roles/filebeat/tasks/main.yml @@ -0,0 +1,2 @@ +- import_tasks: install.yml +- import_tasks: runtime.yml diff --git a/ansible/roles/filebeat/tasks/post.yml b/ansible/roles/filebeat/tasks/post.yml deleted file mode 100644 index 73b314ff7..000000000 --- a/ansible/roles/filebeat/tasks/post.yml +++ /dev/null @@ -1 +0,0 @@ ---- \ No newline at end of file diff --git a/ansible/roles/filebeat/tasks/config.yml b/ansible/roles/filebeat/tasks/runtime.yml similarity index 82% rename from ansible/roles/filebeat/tasks/config.yml rename to ansible/roles/filebeat/tasks/runtime.yml index 1e454347e..119745096 100644 --- a/ansible/roles/filebeat/tasks/config.yml +++ b/ansible/roles/filebeat/tasks/runtime.yml @@ -27,3 +27,13 @@ mode: 0600 notify: Restart filebeat container become: true + +- name: Flush handlers + meta: flush_handlers + +- name: Ensure filebeat service state + systemd: + name: filebeat.service + state: started + enabled: true + become: true diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index 7e19c7726..4427b7d18 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -8,3 +8,9 
@@ dest: /etc/systemd/system/mysql.service src: mysql.service.j2 register: _mysql_unitfile + +- name: Pull container image + containers.podman.podman_image: + name: "mysql" + tag: "{{ mysql_tag }}" + become_user: "{{ mysql_podman_user }}" diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index a74f0e24e..34e1ac223 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -128,3 +128,19 @@ # loop: # - /var/www/ood/public # - /usr/share/ondemand-dex/web/themes/ + +- name: Keyscan login host + command: + cmd: "ssh-keyscan {{ openondemand_clusters.slurm.v2.login.host }}" + register: _openondemand_login_key + changed_when: false + +- name: Add login hostkeys to known hosts + blockinfile: + path: /etc/ssh/ssh_known_hosts + create: true + block: "{{ _openondemand_login_key.stdout_lines | sort | join('\n') }}" + marker: "# {mark} ANSIBLE MANAGED BLOCK: openondemand login host" # allows other tasks to use blockinfile on this file + owner: root + group: root + mode: o=rw,go=r diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index ad08bb0bd..bde13c383 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -2,7 +2,7 @@ - name: Enable TurboVNC repo tags: install get_url: - url: https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo + url: https://raw.githubusercontent.com/TurboVNC/repo/main/TurboVNC.repo dest: /etc/yum.repos.d/TurboVNC.repo - name: Install EPEL diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 902c71d1f..81547e5a0 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -14,6 +14,12 @@ src: opensearch.service.j2 register: _opensearch_unit +- name: Pull container image + containers.podman.podman_image: + name: "opensearchproject/opensearch" + tag: "{{ opensearch_version }}" + become_user: "{{ opensearch_podman_user }}" + - name: Reload opensearch unit file command: systemctl daemon-reload when: _opensearch_unit.changed diff --git a/ansible/roles/opensearch/tasks/runtime.yml b/ansible/roles/opensearch/tasks/runtime.yml index b2cdeb456..7fe197abe 100644 --- a/ansible/roles/opensearch/tasks/runtime.yml +++ b/ansible/roles/opensearch/tasks/runtime.yml @@ -74,11 +74,6 @@ notify: Restart opensearch service become: true -- name: Pull container - containers.podman.podman_image: - name: "opensearchproject/opensearch:{{ opensearch_version }}" - become_user: "{{ opensearch_podman_user }}" - - name: Flush handlers meta: flush_handlers diff --git a/ansible/roles/passwords/tasks/validate.yml b/ansible/roles/passwords/tasks/validate.yml new file mode 100644 index 000000000..9279ffdbf --- /dev/null +++ b/ansible/roles/passwords/tasks/validate.yml @@ -0,0 +1,4 @@ +- name: Assert secrets created + assert: + that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_testuser_password defined in dev + fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" 
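The new `passwords` validation above asserts that more than one `vault_*` variable exists in inventory (allowing for a dev-only `vault_testuser_password`). A minimal standalone sketch of the same check is shown below; the playbook name and the `match`/`list` filters are illustrative assumptions, while the role task itself uses the `contains` test:

```yaml
# check-secrets.yml -- hypothetical standalone version of the secrets check
- hosts: localhost
  gather_facts: false
  tasks:
    - name: Assert generated vault_* secrets are present in inventory
      assert:
        that: (hostvars[inventory_hostname].keys() | select('match', 'vault_.*') | list | length) > 1
        fail_msg: "No 'vault_*' inventory variables found: has ansible/adhoc/generate-passwords.yml been run?"
```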
diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml new file mode 100644 index 000000000..47493220d --- /dev/null +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -0,0 +1,33 @@ +--- + +- name: Ensure hostkeys directory exists on persistent storage + file: + path: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}" + state: directory + owner: root + group: root + mode: 0600 + +- name: Copy hostkeys from persistent storage + # won't fail if no keys are in persistent storage + copy: + src: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" + dest: /etc/ssh/ + remote_src: true + +- name: Find hostkeys + find: + path: /etc/ssh/ + patterns: ssh_host_*_key* + register: _find_ssh_keys + +- name: Persist hostkeys + copy: + dest: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" + src: "{{ item }}" + remote_src: true + mode: preserve + loop: "{{ _find_ssh_keys.files | map(attribute='path') }}" + +- meta: reset_connection + diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml new file mode 100644 index 000000000..6ae9bcd59 --- /dev/null +++ b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -0,0 +1,35 @@ +--- + +- name: Check if OpenHPC secrets exist in persistent storage + stat: + path: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + register: openhpc_secrets_stat + +- name: Ensure Ansible facts directories exist + file: + path: "{{ item }}" + state: directory + owner: root + mode: 0600 + loop: + - "{{ appliances_state_dir }}/ansible.facts.d" + - "/etc/ansible/facts.d" + +- name: Write OpenHPC secrets + template: + src: openhpc_secrets.fact + dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + owner: root + mode: 0600 + when: "not openhpc_secrets_stat.stat.exists" + +- name: Symlink persistent facts to facts_path + file: + state: link + src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + dest: /etc/ansible/facts.d/openhpc_secrets.fact + owner: root + +- name: Read facts + ansible.builtin.setup: + filter: ansible_local diff --git a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact new file mode 100644 index 000000000..9d6de37d8 --- /dev/null +++ b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact @@ -0,0 +1,9 @@ +{ + "vault_azimuth_user_password": "{{ lookup('password', '/dev/null') }}", + "vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}", + "vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}", + "vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}", + "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", + "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", + "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}" +} diff --git a/ansible/roles/podman/tasks/config.yml b/ansible/roles/podman/tasks/config.yml index 05dc8f757..86b8716b0 100644 --- a/ansible/roles/podman/tasks/config.yml +++ b/ansible/roles/podman/tasks/config.yml @@ -34,52 +34,3 @@ with_items: "{{ podman_users }}" register: podman_user_info become: yes - -- name: Define tmp directories on tmpfs - blockinfile: - path: /etc/tmpfiles.d/podman.conf - create: yes - block: | - d {{ podman_tmp_dir_root }}/{{ item.name 
}}/libpod/tmp 0755 {{ item.name }} {{ item.name }} - Z {{ podman_tmp_dir_root }}/{{ item.name }} 0755 {{ item.name }} {{ item.name }} - become: yes - loop: "{{ podman_users }}" - register: podman_tmp_dirs - -- name: Create tmp directories - command: systemd-tmpfiles --create - become: true - when: podman_tmp_dirs.results | selectattr('changed') | list | length > 0 # when: any changed - -- name: Create podman configuration directories - file: - path: "{{ item.home }}/.config/containers/" - state: directory - owner: "{{ item.name }}" - group: "{{ item.name }}" - become: yes - loop: "{{ podman_user_info.results }}" - -- name: Set podman to use temp directories - community.general.ini_file: - path: "{{ item.home }}/.config/containers/containers.conf" - section: engine - option: tmp_dir - value: '"{{ podman_tmp_dir_root }}/{{ item.name }}/libpod/tmp"' - owner: "{{ item.name }}" - group: "{{ item.name }}" - create: yes - loop: "{{ podman_user_info.results }}" - become: yes - register: podman_tmp - -- name: Reset podman database - # otherwise old config overrides! - command: - cmd: podman system reset --force - become: yes - become_user: "{{ item.item.name }}" - when: item.changed - loop: "{{ podman_tmp.results }}" - loop_control: - label: "{{ item.item.name }}" diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml new file mode 100644 index 000000000..dbb920c58 --- /dev/null +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -0,0 +1,57 @@ +--- + +zenith_registrar_url: "{{ undef(hint = 'zenith_registrar_url is required') }}" +zenith_registrar_verify_ssl: true +zenith_sshd_host: "{{ undef(hint = 'zenith_sshd_host is required') }}" +zenith_sshd_port: 22 + +zenith_proxy_podman_user: "{{ ansible_user }}" + +zenith_proxy_service_name: "{{ undef(hint = 'zenith_proxy_service_name is required') }}" +zenith_proxy_client_service_name: "{{ zenith_proxy_service_name }}-client" +zenith_proxy_mitm_service_name: "{{ zenith_proxy_service_name }}-mitm" + +zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" +zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" +zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" + +zenith_proxy_image_tag: '0.1.0' + +zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client +zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" + +zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy +zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_image_tag }}" + +zenith_proxy_upstream_scheme: http +zenith_proxy_upstream_host: "{{ undef(hint = 'zenith_proxy_upstream_host is required') }}" +zenith_proxy_upstream_port: "{{ undef(hint = 'zenith_proxy_upstream_port is required') }}" +zenith_proxy_upstream_read_timeout: + +zenith_proxy_client_token: "{{ undef(hint = 'zenith_proxy_client_token is required') }}" +zenith_proxy_client_auth_skip: false +zenith_proxy_client_auth_params: {} + +zenith_proxy_mitm_enabled: no +zenith_proxy_mitm_listen_port: 8080 +zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 'bearer' +zenith_proxy_mitm_auth_basic_username: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_basic_username is required') + if zenith_proxy_mitm_auth_inject == "basic" + else None + }} +zenith_proxy_mitm_auth_basic_password: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_basic_password is required') + if zenith_proxy_mitm_auth_inject == "basic" + else None + }} 
+zenith_proxy_mitm_auth_bearer_header_name: Authorization +zenith_proxy_mitm_auth_bearer_header_prefix: Bearer +zenith_proxy_mitm_auth_bearer_token: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_bearer_token is required') + if zenith_proxy_mitm_auth_inject == "bearer" + else None + }} diff --git a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh new file mode 100644 index 000000000..aab232a0a --- /dev/null +++ b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +##### +# Small script that can be used to attach to the infra container of a pod +# +# Useful in a systemd service that starts a pod in order to track the execution +# +# Accepts a single argument which is the name of the pod whose infra container we should attach to +##### + +set -e + +echo "[INFO] Finding infra container for pod '$1'" +INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")" + +echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'" +exec podman container attach --no-stdin ${INFRA_CONTAINER_ID} diff --git a/ansible/roles/zenith_proxy/tasks/main.yml b/ansible/roles/zenith_proxy/tasks/main.yml new file mode 100644 index 000000000..1a42b0438 --- /dev/null +++ b/ansible/roles/zenith_proxy/tasks/main.yml @@ -0,0 +1,103 @@ +--- + +- name: Install script for attaching to pod infra containers + copy: + src: podman-pod-infra-attach.sh + dest: /usr/bin/ + mode: +x + become: true + +- name: Create systemd unit for Zenith pod + template: + src: pod.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_service_name }}.service + become: true + register: zenith_proxy_pod_systemd_unit + +- name: Ensure Zenith pod is started and enabled + service: + name: "{{ zenith_proxy_service_name }}.service" + state: "{{ 'restarted' if zenith_proxy_pod_systemd_unit is changed else 'started' }}" + enabled: yes + daemon_reload: "{{ zenith_proxy_pod_systemd_unit is changed }}" + become: true + +- block: + - name: Create systemd unit file for MITM proxy + template: + src: mitm.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_mitm_service_name }}.service + register: zenith_proxy_mitm_systemd_unit + + - name: Ensure MITM proxy is started and enabled + service: + name: "{{ zenith_proxy_mitm_service_name }}.service" + state: "{{ 'restarted' if zenith_proxy_mitm_systemd_unit is changed else 'started' }}" + enabled: yes + daemon_reload: "{{ zenith_proxy_mitm_systemd_unit is changed }}" + become: true + when: zenith_proxy_mitm_enabled + +- name: Ensure Zenith config directory exists + file: + path: /etc/zenith/{{ zenith_proxy_service_name }} + state: directory + become: true + +- name: Write Zenith client configuration + template: + src: zenith-client.yaml.j2 + dest: /etc/zenith/{{ zenith_proxy_service_name }}/client.yaml + become: true + register: zenith_proxy_client_config_file + +- name: Create directory to persist SSH key + file: + path: "{{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh" + state: directory + owner: "{{ zenith_proxy_podman_user }}" + group: "{{ zenith_proxy_podman_user }}" + become: true + +- name: Initialise Zenith client + # Use a foreground command rather than the podman_container module as I could not + # work out the combination of parameters that produced the desired behaviour :-( + command: >- + podman run + --name {{ zenith_proxy_service_name }}-init + --replace + --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro + --volume {{ 
appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh + {{ zenith_proxy_client_image }} + zenith-client init + become: true + become_user: "{{ zenith_proxy_podman_user }}" + register: zenith_proxy_client_init + changed_when: zenith_proxy_client_init.rc == 0 + failed_when: >- + zenith_proxy_client_init.rc != 0 and + "token has already been used" not in zenith_proxy_client_init.stderr + +- name: Create systemd unit file for Zenith client + template: + src: client.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_client_service_name }}.service + become: true + register: zenith_proxy_client_systemd_unit + +- name: Ensure Zenith client is started and enabled + service: + name: "{{ zenith_proxy_client_service_name }}.service" + state: >- + {{ + 'restarted' + if ( + zenith_proxy_client_config_file is changed or + zenith_proxy_client_systemd_unit is changed or + zenith_proxy_client_init is changed + ) + else 'started' + }} + enabled: yes + daemon_reload: "{{ zenith_proxy_client_systemd_unit is changed }}" + become: true diff --git a/ansible/roles/zenith_proxy/templates/client.service.j2 b/ansible/roles/zenith_proxy/templates/client.service.j2 new file mode 100644 index 000000000..809b19b87 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/client.service.j2 @@ -0,0 +1,34 @@ +[Unit] +Description=Podman {{ zenith_proxy_client_service_name }}.service +Wants=network.target +After=network-online.target +BindsTo={{ zenith_proxy_service_name }}.service +PartOf={{ zenith_proxy_service_name }}.service +After={{ zenith_proxy_service_name }}.service +{% if zenith_proxy_mitm_enabled %} +Wants={{ zenith_proxy_mitm_service_name }}.service +After={{ zenith_proxy_mitm_service_name }}.service +{% endif %} + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +RestartSec=5 +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStart=/usr/bin/podman run \ + --cgroups=no-conmon \ + --replace \ + --restart=no \ + --pod {{ zenith_proxy_pod_name }} \ + --name {{ zenith_proxy_client_container_name }} \ + --security-opt label=disable \ + --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro \ + --volume {{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh \ + {{ zenith_proxy_client_image }} +ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_client_container_name }} +ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_client_container_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/mitm.service.j2 b/ansible/roles/zenith_proxy/templates/mitm.service.j2 new file mode 100644 index 000000000..d8b3c954b --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/mitm.service.j2 @@ -0,0 +1,46 @@ + + +[Unit] +Description=Podman {{ zenith_proxy_mitm_service_name }}.service +Wants=network.target +After=network-online.target +BindsTo={{ zenith_proxy_service_name }}.service +PartOf={{ zenith_proxy_service_name }}.service +After={{ zenith_proxy_service_name }}.service + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStart=/usr/bin/podman run \ + --cgroups=no-conmon \ + --replace \ + --restart=no \ + --pod {{ zenith_proxy_pod_name }} \ + --name {{ zenith_proxy_mitm_container_name }} \ + --security-opt label=disable \ + --env ZENITH_PROXY_LISTEN_PORT={{ zenith_proxy_mitm_listen_port }} \ + --env 
ZENITH_PROXY_UPSTREAM_SCHEME={{ zenith_proxy_upstream_scheme }} \ + --env ZENITH_PROXY_UPSTREAM_HOST={{ zenith_proxy_upstream_host }} \ + --env ZENITH_PROXY_UPSTREAM_PORT={{ zenith_proxy_upstream_port }} \ +{% if zenith_proxy_upstream_read_timeout %} + --env ZENITH_PROXY_READ_TIMEOUT={{ zenith_proxy_upstream_read_timeout }} \ +{% endif %} +{% if zenith_proxy_mitm_auth_inject == "basic" %} + --env ZENITH_PROXY_AUTH_INJECT=basic \ + --env ZENITH_PROXY_AUTH_BASIC_USERNAME={{ zenith_proxy_mitm_auth_basic_username }} \ + --env {{ "ZENITH_PROXY_AUTH_BASIC_PASSWORD={}".format(zenith_proxy_mitm_auth_basic_password) | quote }} \ +{% elif zenith_proxy_mitm_auth_inject == "bearer" %} + --env ZENITH_PROXY_AUTH_INJECT=bearer \ + --env ZENITH_PROXY_AUTH_BEARER_HEADER={{ zenith_proxy_mitm_auth_bearer_header_name }} \ + --env ZENITH_PROXY_AUTH_BEARER_PREFIX={{ zenith_proxy_mitm_auth_bearer_header_prefix }} \ + --env ZENITH_PROXY_AUTH_BEARER_TOKEN={{ zenith_proxy_mitm_auth_bearer_token }} \ +{% endif %} + {{ zenith_proxy_mitm_image }} +ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_mitm_container_name }} +ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_mitm_container_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/pod.service.j2 b/ansible/roles/zenith_proxy/templates/pod.service.j2 new file mode 100644 index 000000000..d46617556 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/pod.service.j2 @@ -0,0 +1,19 @@ +[Unit] +Description=Podman {{ zenith_proxy_service_name }}.service +Wants=network.target +After=network-online.target + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} +ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} +ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} +ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} +ExecStopPost=/usr/bin/podman pod rm --ignore -f {{ zenith_proxy_pod_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 b/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 new file mode 100644 index 000000000..c037d7dc6 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 @@ -0,0 +1,27 @@ +ssh_identity_path: /home/zenith/.ssh/id_zenith + +# Init options +registrar_url: {{ zenith_registrar_url }} +token: {{ zenith_proxy_client_token }} +verify_ssl: {{ 'yes' if zenith_registrar_verify_ssl else 'no' }} + +# Connect options +server_address: {{ zenith_sshd_host }} +server_port: {{ zenith_sshd_port }} +{% if zenith_proxy_mitm_enabled %} +backend_protocol: http +forward_to_host: 127.0.0.1 +forward_to_port: {{ zenith_proxy_mitm_listen_port }} +{% else %} +backend_protocol: {{ zenith_proxy_upstream_scheme }} +forward_to_host: {{ zenith_proxy_upstream_host }} +forward_to_port: {{ zenith_proxy_upstream_port }} +{% endif %} +{% if zenith_proxy_upstream_read_timeout %} +read_timeout: {{ zenith_proxy_upstream_read_timeout }} +{% endif %} +skip_auth: {{ 'yes' if zenith_proxy_client_auth_skip else 'no' }} +{% if zenith_proxy_client_auth_params %} +auth_params: + {{ zenith_proxy_client_auth_params | to_nice_yaml | indent(2) }} +{% endif %} diff --git a/ansible/site.yml b/ansible/site.yml index fd564367f..bb379399d 100644 --- 
a/ansible/site.yml +++ b/ansible/site.yml @@ -2,17 +2,21 @@ - name: Run pre.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists - import_playbook: validate.yml + when: appliances_validate | default(true) + - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post-bootstrap.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists @@ -26,7 +30,8 @@ - name: Run post.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists diff --git a/ansible/validate.yml b/ansible/validate.yml index 02da9d285..c22873615 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -2,8 +2,17 @@ # Fail early if configuration is invalid +- name: Validate secrets created + hosts: localhost + gather_facts: false + tasks: + - import_role: + name: passwords + tasks_from: validate.yml + - name: Ensure control node is in inventory hosts: all + gather_facts: false tasks: - assert: that: groups['control'] | length @@ -11,6 +20,7 @@ - name: Validate openhpc configuration hosts: openhpc + gather_facts: false tags: openhpc tasks: - assert: @@ -22,6 +32,7 @@ - name: Validate podman configuration hosts: podman + gather_facts: false tags: podman tasks: - import_role: @@ -31,6 +42,7 @@ - name: Validate filebeat configuration hosts: filebeat + gather_facts: false tags: filebeat tasks: - import_role: diff --git a/environments/.caas/README.md b/environments/.caas/README.md new file mode 100644 index 000000000..4a08433b0 --- /dev/null +++ b/environments/.caas/README.md @@ -0,0 +1,18 @@ +# Caas cluster + +Environment for default Azimuth Slurm. This is not intended to be manually deployed. + +Non-standard things for this environment: +- There is no activate script. +- `ansible.cfg` is provided in the repo root, as expected by the caas operator. +- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the + runner project directory: + + azimuth_caas_stackhpc_slurm_appliance_template: + ... + envVars: + ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory + + Ansible then defines `ansible_inventory_sources` which contains absolute paths, and + that is used to derive the `appliances_environment_root` and + `appliances_repository_root`. 
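To make the derivation described in the README concrete, here is a worked example as a throwaway debug play; the `/runner/project` checkout path is an assumption for illustration. With `ANSIBLE_INVENTORY=environments/common/inventory,environments/.caas/inventory`, the last entry of `ansible_inventory_sources` is absolute, and the roots fall out via repeated `dirname`:

```yaml
# Illustrative only -- /runner/project is an assumed checkout location.
- hosts: localhost
  gather_facts: false
  tasks:
    - debug:
        msg:
          last_inventory: "{{ ansible_inventory_sources | last }}"                                 # /runner/project/environments/.caas/inventory
          environment_root: "{{ ansible_inventory_sources | last | dirname }}"                     # /runner/project/environments/.caas
          repository_root: "{{ ansible_inventory_sources | last | dirname | dirname | dirname }}"  # /runner/project
```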
diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg new file mode 100644 index 000000000..54a1c2a50 --- /dev/null +++ b/environments/.caas/ansible.cfg @@ -0,0 +1,15 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles +filter_plugins = ../../ansible/filter_plugins + +[ssh_connection] +ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/.caas/assets/ood-icon.png b/environments/.caas/assets/ood-icon.png new file mode 100644 index 000000000..b5f4b6ea7 Binary files /dev/null and b/environments/.caas/assets/ood-icon.png differ diff --git a/environments/.stackhpc/cloud_init/.gitkeep b/environments/.caas/hooks/.gitkeep similarity index 100% rename from environments/.stackhpc/cloud_init/.gitkeep rename to environments/.caas/hooks/.gitkeep diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml new file mode 100644 index 000000000..7aafe409f --- /dev/null +++ b/environments/.caas/hooks/post.yml @@ -0,0 +1,86 @@ +- name: Persist login hostkey across rebuilds +# Need NFS for this so can't do it before the appliance plays + hosts: login + gather_facts: no + become: yes + roles: + - persist_hostkeys + +# Configure the Zenith clients that are required +# First, ensure that podman is installed on all hosts that will run Zenith clients +- hosts: zenith,!podman + tasks: + - import_role: + name: podman + tasks_from: prereqs.yml + - import_role: + name: podman + tasks_from: config.yml + +- hosts: grafana + tasks: + - name: Deploy the Zenith client for Grafana + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-monitoring + # Use the IP address for the upstream host + zenith_proxy_upstream_host: "{{ ansible_host }}" # IP + zenith_proxy_upstream_port: "{{ grafana_port }}" + zenith_proxy_client_token: "{{ zenith_token_monitoring }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" + zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + when: zenith_subdomain_monitoring is defined + +- hosts: openondemand + tasks: + - name: Deploy the Zenith client for OOD + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-ood + # Use the IP address for the upstream host + zenith_proxy_upstream_scheme: https + zenith_proxy_upstream_host: "{{ ansible_host }}" # IP + zenith_proxy_upstream_port: 443 + zenith_proxy_client_token: "{{ zenith_token_ood }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" + when: zenith_subdomain_ood is defined + +# Run hpctests if set in UI +- hosts: hpctests[0] + become: false + gather_facts: false + tasks: + - import_role: + name: hpctests + when: cluster_run_validation | default(false) | bool + +# Write the outputs as the final task +- hosts: localhost + tasks: + - debug: var=outputs + vars: + # Ansible has a 
fit when there are two 'hostvars' evaluations in a resolution chain, + # so we have to repeat logic here unfortunately + outputs: >- + {{- + { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | + combine( + { + "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", + "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password + } + if zenith_fqdn_ood is not defined + else {} + ) + }} \ No newline at end of file diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml new file mode 100644 index 000000000..05b0255c8 --- /dev/null +++ b/environments/.caas/hooks/pre.yml @@ -0,0 +1,45 @@ +--- + +# Provision the infrastructure using Terraform +- name: Provision infrastructure + hosts: openstack + roles: + - cluster_infra + +# Ensure that the secrets are generated and persisted on the control host +- name: Generate and persist secrets + hosts: control + gather_facts: no + become: yes + roles: + - persist_openhpc_secrets + +# validate.yml asserts presence of a control group which doesn't exist when +# destroying infra, so only validate when we're not destroying +- hosts: openstack + gather_facts: no + become: no + tasks: + - set_fact: + appliances_validate: false + when: "cluster_state | default('') == 'absent'" + +# TODO: FIXME: maybe by doing the user move in cloud-init? +# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# This can disrupt the SSH connection, particularly because we use the login host as a jump host +# So we move the home directory on the login node and reset the connections first +- hosts: login + gather_facts: false + tasks: + - name: Set up Ansible user + user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: "sudo" + # Need to change working directory otherwise we try to switch back to non-existent directory. 
+ become_flags: '-i' + become: true + +- hosts: cluster + gather_facts: no + tasks: + - name: Reset persistent SSH connections + meta: reset_connection diff --git a/environments/.caas/inventory/everything b/environments/.caas/inventory/everything new file mode 120000 index 000000000..dc66b9576 --- /dev/null +++ b/environments/.caas/inventory/everything @@ -0,0 +1 @@ +../../../environments/common/layouts/everything \ No newline at end of file diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups new file mode 100644 index 000000000..a6f06b7a7 --- /dev/null +++ b/environments/.caas/inventory/extra_groups @@ -0,0 +1,9 @@ +[basic_users:children] +cluster + +[etc_hosts:children] +cluster + +[zenith:children] +grafana +openondemand diff --git a/environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep b/environments/.caas/inventory/group_vars/all/.gitkeep similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep rename to environments/.caas/inventory/group_vars/all/.gitkeep diff --git a/environments/.caas/inventory/group_vars/all/basic_users.yml b/environments/.caas/inventory/group_vars/all/basic_users.yml new file mode 100644 index 000000000..6105df821 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/basic_users.yml @@ -0,0 +1,6 @@ +basic_users_users: + - name: azimuth + # Hash the password with a salt that is different for each host + password: "{{ vault_azimuth_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" + uid: 1005 + public_key: "{{ cluster_user_ssh_public_key }}" diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml new file mode 100644 index 000000000..b9ea63586 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -0,0 +1,22 @@ +# Account for the fact we are running outside of the expected environment system: +caas_inventory: "{{ ansible_inventory_sources | last }}" # ansible_inventory_sources is absolute +appliances_environment_root: "{{ caas_inventory | dirname }}" +appliances_repository_root: "{{ appliances_environment_root | dirname | dirname }}" + +# Read the secrets from the Ansible local facts on the control host +vault_azimuth_user_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password }}" +vault_grafana_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_grafana_admin_password }}" +vault_elasticsearch_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_admin_password }}" +vault_elasticsearch_kibana_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_kibana_password }}" +vault_mysql_root_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_root_password }}" +vault_mysql_slurm_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_slurm_password }}" +vault_openhpc_mungekey: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_openhpc_mungekey }}" + +# Override this to cope with the case where the podman group just doesn't exist +appliances_local_users_podman_enable: "{{ groups.get('podman', []) | length > 0 }}" + +# The server name for Open OnDemand depends on whether Zenith is enabled or not +openondemand_servername_default: "{{ 
hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-') ~ '.sslip.io' }}" +openondemand_servername: "{{ zenith_fqdn_ood | default(openondemand_servername_default) }}" + +appliances_state_dir: /var/lib/state diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml new file mode 100644 index 000000000..10fdc926c --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -0,0 +1 @@ +grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml new file mode 100644 index 000000000..a31437be3 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -0,0 +1,6 @@ +# Skip plotting pingpong as matplotlib not in runner environment +hpctests_pingpong_plot: false + +# In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that +# this is a location that is writable by the container user +hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml new file mode 100644 index 000000000..2ea3abe57 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -0,0 +1,16 @@ +nfs_server: "{{ nfs_server_default }}" + +nfs_configurations: + - comment: Export /exports/home from Slurm control node as /home + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + nfs_export: "/exports/home" # assumes skeleton TF is being used + nfs_client_mnt_point: "/home" + - comment: Export /var/lib/state from Slurm control node to OOD + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['openondemand'] }}" + nfs_export: "{{ appliances_state_dir }}" + nfs_client_mnt_point: "{{ appliances_state_dir }}" + nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service" diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml new file mode 100644 index 000000000..624402f9f --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -0,0 +1,5 @@ +openhpc_cluster_name: "{{ cluster_name }}" + +# Provision a single "standard" compute partition using the supplied +# node count and flavor +openhpc_slurm_partitions: "{{ hostvars[groups['openstack'][0]]['openhpc_slurm_partitions'] }}" diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml new file mode 100644 index 000000000..60461bd61 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -0,0 +1,9 @@ +--- +openondemand_auth: basic_pam +openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" + +httpd_listen_addr_port: + - 80 + - 443 + diff --git a/environments/.caas/inventory/group_vars/all/prometheus.yml b/environments/.caas/inventory/group_vars/all/prometheus.yml new file mode 100644 index 000000000..eb28fda63 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/prometheus.yml @@ -0,0 +1,4 @@ +--- + +# Set Prometheus storage retention size 
+prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB" diff --git a/environments/.caas/inventory/group_vars/all/selinux.yml b/environments/.caas/inventory/group_vars/all/selinux.yml new file mode 100644 index 000000000..1f1098126 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/selinux.yml @@ -0,0 +1 @@ +selinux_state: disabled \ No newline at end of file diff --git a/environments/.caas/inventory/group_vars/all/zenith.yml b/environments/.caas/inventory/group_vars/all/zenith.yml new file mode 100644 index 000000000..56dd0ca16 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/zenith.yml @@ -0,0 +1 @@ +zenith_proxy_podman_user: podman diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml new file mode 100644 index 000000000..836078e10 --- /dev/null +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ -0,0 +1,28 @@ +# The default Terraform state key for backends that support it +terraform_state_key: "cluster/{{ cluster_id }}/tfstate" + +# Set up the terraform backend +terraform_backend_type: "{{ 'consul' if 'CONSUL_HTTP_ADDR' in ansible_env else 'local' }}" +terraform_backend_config_defaults: + consul: + path: "{{ terraform_state_key }}" + gzip: "true" + local: {} +terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}" + +terraform_binary_directory: "{{ appliances_environment_root }}/bin" +terraform_project_path: "{{ playbook_dir }}/terraform" + +terraform_state: "{{ cluster_state | default('present') }}" +cluster_ssh_user: rocky + +# Set the size of the state volume to metrics_db_maximum_size + 10 +state_volume_size: "{{ metrics_db_maximum_size + 10 }}" + +# Provision a single "standard" compute partition using the supplied +# node count and flavor +openhpc_slurm_partitions: + - name: "standard" + count: "{{ compute_count }}" + flavor: "{{ compute_flavor }}" + default: "YES" diff --git a/environments/.caas/inventory/hosts b/environments/.caas/inventory/hosts new file mode 100644 index 000000000..88ce71000 --- /dev/null +++ b/environments/.caas/inventory/hosts @@ -0,0 +1,2 @@ +[openstack] +localhost ansible_connection=local ansible_python_interpreter=/usr/bin/python3 diff --git a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml new file mode 100644 index 000000000..d210fec47 --- /dev/null +++ b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -0,0 +1,117 @@ +name: "slurm" +label: "Slurm" +description: >- + Batch cluster running the Slurm workload manager, the Open + OnDemand web interface, and custom monitoring. +logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png + +parameters: + - name: cluster_floating_ip + label: External IP + description: The external IP to use for the login node. + kind: cloud.ip + immutable: true + + - name: compute_count + label: Compute node count + description: The number of compute nodes in the cluster. + kind: integer + options: + min: 1 + default: 3 + + - name: compute_flavor + label: Compute node size + description: The size to use for the compute node. 
+ kind: "cloud.size" + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: home_volume_size + label: Home volume size (GB) + description: The size of the cloud volume to use for home directories + kind: integer + immutable: true + options: + min: 10 + default: 100 + + - name: use_home_volume_type_fast + label: Provision high-performance storage for home directories + description: | + If a high-performance storage type is available to the Slurm platform, + use it for cluster home directories. If no high-performance storage type + is available, this option has no effect and a standard cloud volume will + be provisioned for home directories. + kind: boolean + required: false + default: true + options: + checkboxLabel: Put home directories on high-performance storage? + + - name: metrics_db_maximum_size + label: Metrics database size (GB) + description: | + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + discarded to ensure that the database does not grow larger than this size. + + **A cloud volume of this size +10GB will be created to hold and persist the metrics + database and important Slurm files.** + kind: integer + immutable: true + options: + min: 10 + default: 10 + + - name: cluster_run_validation + label: Post-configuration validation + description: >- + If selected, post-configuration jobs will be executed to validate the core functionality + of the cluster when it is re-configured. + kind: boolean + required: false + default: true + options: + checkboxLabel: Run post-configuration validation? + +usage_template: |- + # Accessing the cluster using Open OnDemand + + [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical + environments such as [Jupyter Notebooks](https://jupyter.org/). + + {% if cluster.outputs.openondemand_url %} + The Open OnDemand portal for this cluster is available at + [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}). + + Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted. + {% else %} + The Open OnDemand portal for this cluster can be accessed from the services list. + {% endif %} + + # Accessing the cluster using SSH + + The cluster can be accessed over SSH via the external IP. The SSH public key of the user that + deployed the cluster is injected into the `azimuth` user: + + ``` + $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }} + [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] + ``` + + The `rocky` user can be accessed the same way and has passwordless `sudo` enabled. + + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. 
+ +services: + - name: ood + label: Open OnDemand + icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png + - name: monitoring + label: Monitoring + icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png + diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml new file mode 100644 index 000000000..250b96469 --- /dev/null +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -0,0 +1,103 @@ +name: "slurm" +label: "Slurm" +description: >- + Batch cluster running the Slurm workload manager, the Open + OnDemand web interface, and custom monitoring. +logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png + +parameters: + - name: cluster_floating_ip + label: External IP + description: The external IP to use for the login node. + kind: cloud.ip + immutable: true + + - name: compute_count + label: Compute node count + description: The number of compute nodes in the cluster. + kind: integer + options: + min: 1 + default: 3 + + - name: compute_flavor + label: Compute node size + description: The size to use for the compute node. + kind: "cloud.size" + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: home_volume_size + label: Home volume size (GB) + description: The size of the cloud volume to use for home directories + kind: integer + immutable: true + options: + min: 10 + default: 100 + + - name: metrics_db_maximum_size + label: Metrics database size (GB) + description: | + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + discarded to ensure that the database does not grow larger than this size. + + **A cloud volume of this size +10GB will be created to hold and persist the metrics + database and important Slurm files.** + kind: integer + immutable: true + options: + min: 10 + default: 10 + + - name: cluster_run_validation + label: Post-configuration validation + description: >- + If selected, post-configuration jobs will be executed to validate the core functionality + of the cluster when it is re-configured. + kind: boolean + required: false + default: true + options: + checkboxLabel: Run post-configuration validation? + +usage_template: |- + # Accessing the cluster using Open OnDemand + + [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical + environments such as [Jupyter Notebooks](https://jupyter.org/). + + {% if cluster.outputs.openondemand_url %} + The Open OnDemand portal for this cluster is available at + [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}). + + Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted. + {% else %} + The Open OnDemand portal for this cluster can be accessed from the services list. + {% endif %} + + # Accessing the cluster using SSH + + The cluster can be accessed over SSH via the external IP. 
The SSH public key of the user that + deployed the cluster is injected into the `azimuth` user: + + ``` + $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }} + [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] + ``` + + The `rocky` user can be accessed the same way and has passwordless `sudo` enabled. + + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. + +services: + - name: ood + label: Open OnDemand + icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png + - name: monitoring + label: Monitoring + icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 78f007753..2b1bbfb39 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,12 +1,11 @@ flavor = "vm.ska.cpu.general.small" use_blockstorage_volume = true -volume_size = 10 # GB +volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2" +fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] -ssh_bastion_host = "128.232.222.183" -ssh_bastion_username = "slurm-app-ci" +floating_ip_network = "CUDN-Internet" # Use FIP to avoid docker ratelimits on portal-internal outbound IP diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl index a62d79929..cd9fe589a 100644 --- a/environments/.stackhpc/SMS.pkrvars.hcl +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -1,7 +1,7 @@ flavor = "general.v1.tiny" networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # stackhpc-ipv4-geneve source_image_name = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252 -fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2" +fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] diff --git a/environments/.stackhpc/activate b/environments/.stackhpc/activate index e74031095..2a58b40e4 100644 --- a/environments/.stackhpc/activate +++ b/environments/.stackhpc/activate @@ -1,8 +1,7 @@ export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" diff --git 
a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml new file mode 100644 index 000000000..8d7ee98d2 --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -0,0 +1 @@ +#update_enable: false # Can uncomment for speed debugging non-update related build issues diff --git a/environments/.stackhpc/terraform/ARCUS.tfvars b/environments/.stackhpc/terraform/ARCUS.tfvars index b3bf7e825..8ebf93478 100644 --- a/environments/.stackhpc/terraform/ARCUS.tfvars +++ b/environments/.stackhpc/terraform/ARCUS.tfvars @@ -3,5 +3,3 @@ cluster_subnet = "portal-internal" vnic_type = "normal" control_node_flavor = "vm.ska.cpu.general.quarter" other_node_flavor = "vm.ska.cpu.general.small" -state_volume_device_path = "/dev/sdb" -home_volume_device_path = "/dev/sdc" diff --git a/environments/.stackhpc/terraform/SMS.tfvars b/environments/.stackhpc/terraform/SMS.tfvars index c1127ae0e..05a180401 100644 --- a/environments/.stackhpc/terraform/SMS.tfvars +++ b/environments/.stackhpc/terraform/SMS.tfvars @@ -3,5 +3,3 @@ cluster_subnet = "stackhpc-ipv4-geneve-subnet" vnic_type = "normal" control_node_flavor = "general.v1.medium" other_node_flavor = "general.v1.tiny" -state_volume_device_path = "/dev/vdb" -home_volume_device_path = "/dev/vdc" diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 68c6e5058..59e27f11d 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -13,7 +13,7 @@ variable "cluster_name" { variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" type = string - # default = "openhpc-231027-0916-893570de" # https://github.com/stackhpc/ansible-slurm-appliance/pull/324 + # default = "openhpc-240116-1156-aa8dba7d" # https://github.com/stackhpc/ansible-slurm-appliance/pull/351 default = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2" } @@ -31,10 +31,6 @@ variable "volume_backed_instances" { default = false } -variable "state_volume_device_path" {} - -variable "home_volume_device_path" {} - module "cluster" { source = "../../skeleton/{{cookiecutter.environment}}/terraform/" @@ -74,6 +70,4 @@ module "cluster" { state_volume_size = 10 home_volume_size = 20 - state_volume_device_path = var.state_volume_device_path - home_volume_device_path = var.home_volume_device_path } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index a45d1ac23..23448c80d 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -3,6 +3,7 @@ ansible_user: rocky appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" +appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform # Address(ip/dns) for internal communication between services. 
This is diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index c63191095..b7bdfdabc 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -13,14 +13,18 @@ # or include regex special characters. openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" -ondemand_package: ondemand-3.0.1 +ondemand_package: ondemand-3.0.3 -openondemand_dashboard_links: # TODO: should really only be deployed if grafana is deployed and proxying configured +# Add grafana to dashboard links to OOD only if grafana group is available +openondemand_dashboard_links_grafana: - name: Grafana app_name: grafana category: Monitoring description: Dashboards url: "{{ grafana_url_openondemand_proxy }}" +openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if groups['grafana'] | length > 0 else [] }}" + +openondemand_login_host: localhost openondemand_clusters: slurm: @@ -28,7 +32,7 @@ openondemand_clusters: metadata: title: "{{ openhpc_cluster_name }}" # interpolation here works as openondemand is lexically after openhpc login: - host: "{{ hostvars[groups['login'].0].api_address }}" + host: "{{ openondemand_login_host }}" default: true job: adapter: slurm @@ -52,21 +56,23 @@ openondemand_clusters: export -f xfce4-session %s set_host: host=$(hostname -s) - custom: - # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support - grafana: - host: "{{ grafana_url }}" - orgId: 1 - dashboard: - name: "node-exporter-slurm" - uid: "node-exporter-slurm" - panels: - cpu: 77 - memory: 78 - labels: - cluster: "cluster" - host: "host" - jobid: "jobid" + custom: "{{ openondemand_clusters_grafana if groups['grafana'] | length > 0 else {} }}" + +openondemand_clusters_grafana: + # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support + grafana: + host: "{{ grafana_url }}" + orgId: 1 + dashboard: + name: "node-exporter-slurm" + uid: "node-exporter-slurm" + panels: + cpu: 77 + memory: 78 + labels: + cluster: "cluster" + host: "host" + jobid: "jobid" ood_install_apps_defaults: jupyter: @@ -174,7 +180,7 @@ openondemand_scrape_configs: - targets: - "{{ openondemand_address }}:9301" labels: - environment: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_NAME') }}" + environment: "{{ appliances_environment_name }}" service: "openondemand" openondemand_dashboard: diff --git a/environments/common/inventory/group_vars/all/update.yml b/environments/common/inventory/group_vars/all/update.yml index b409ea3d6..715d418c7 100644 --- a/environments/common/inventory/group_vars/all/update.yml +++ b/environments/common/inventory/group_vars/all/update.yml @@ -9,4 +9,4 @@ update_exclude: - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245 update_disablerepo: omit # Log changes during update here on localhost: -update_log_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/logs/{{ inventory_hostname }}-updates.log" +update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index 414387ee1..a9fde767c 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++
b/environments/common/inventory/group_vars/builder/defaults.yml @@ -2,6 +2,7 @@ # NOTE: Might be better of as extra vars or in a builder specific inventory as # as dependent on alphabetical ordering of groups, so if these variables are # defined elsewhere the group that is ordered lower will determine the values. +update_enable: true openhpc_slurm_service_started: false nfs_client_mnt_state: present block_devices_partition_state: skip diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 84e6e5a72..008551c99 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -25,7 +25,7 @@ control [filebeat:children] slurm_stats -# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack +# NB: [rebuild] not defined here as this template is used in CI [update:children] cluster diff --git a/environments/skeleton/{{cookiecutter.environment}}/activate b/environments/skeleton/{{cookiecutter.environment}}/activate index e74031095..2a58b40e4 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/activate +++ b/environments/skeleton/{{cookiecutter.environment}}/activate @@ -1,8 +1,7 @@ export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index 914b5f1e1..050cd49bf 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -5,18 +5,21 @@ locals { data "openstack_images_image_v2" "control" { name = var.control_node.image + most_recent = true } data "openstack_images_image_v2" "login" { for_each = var.login_nodes name = each.value.image + most_recent = true } data "openstack_images_image_v2" "compute" { for_each = var.compute_nodes name = lookup(var.compute_images, each.key, var.compute_types[each.value].image) + most_recent = true } resource "openstack_networking_port_v2" "login" { @@ -83,7 +86,7 @@ resource "openstack_compute_instance_v2" "control" { for_each = toset(["control"]) name = "${var.cluster_name}-${each.key}" - image_name = data.openstack_images_image_v2.control.name + image_id = data.openstack_images_image_v2.control.id flavor_name = var.control_node.flavor key_pair = var.key_pair @@ -126,27 +129,16 @@ resource "openstack_compute_instance_v2" "control" { #cloud-config fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} - fs_setup: - - label: state - filesystem: ext4 - device: ${var.state_volume_device_path} - partition: auto - - label: home - filesystem: ext4 - device: ${var.home_volume_device_path} - partition: auto + bootcmd: + %{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]} + - BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV + %{endfor} mounts: - [LABEL=state, 
${var.state_dir}] - - [LABEL=home, /exports/home, auto, "x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service"] + - [LABEL=home, /exports/home] EOF - lifecycle{ - ignore_changes = [ - image_name, - ] - } - } resource "openstack_compute_instance_v2" "login" { @@ -154,7 +146,7 @@ resource "openstack_compute_instance_v2" "login" { for_each = var.login_nodes name = "${var.cluster_name}-${each.key}" - image_name = each.value.image + image_id = data.openstack_images_image_v2.login[each.key].id flavor_name = each.value.flavor key_pair = var.key_pair @@ -184,12 +176,6 @@ resource "openstack_compute_instance_v2" "login" { fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} EOF - lifecycle{ - ignore_changes = [ - image_name, - ] - } - } resource "openstack_compute_instance_v2" "compute" { @@ -197,7 +183,7 @@ resource "openstack_compute_instance_v2" "compute" { for_each = var.compute_nodes name = "${var.cluster_name}-${each.key}" - image_name = lookup(var.compute_images, each.key, var.compute_types[each.value].image) + image_id = data.openstack_images_image_v2.compute[each.key].id flavor_name = var.compute_types[each.value].flavor key_pair = var.key_pair @@ -227,10 +213,4 @@ resource "openstack_compute_instance_v2" "compute" { fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} EOF - lifecycle{ - ignore_changes = [ - image_name, - ] - } - } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0804c6f33..69f9a63da 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -55,18 +55,6 @@ variable "environment_root" { description = "Path to environment root, automatically set by activate script" } -variable "state_volume_device_path" { - type = string - description = "Path to block device for state" - default = "/dev/sdb" -} - -variable "home_volume_device_path" { - type = string - description = "Path to block device name for home directories" - default = "/dev/sdc" -} - variable "state_dir" { type = string description = "Path to state directory on control node" diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/volumes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/volumes.tf index bb96f1ab0..37c0ddc31 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/volumes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/volumes.tf @@ -1,11 +1,11 @@ resource "openstack_blockstorage_volume_v3" "state" { name = "${var.cluster_name}-state" - description = "State for control node" + description = "State for control node" # first word used to label filesystem size = var.state_volume_size } resource "openstack_blockstorage_volume_v3" "home" { name = "${var.cluster_name}-home" - description = "Home for control node" + description = "Home for control node" # first word used to label filesystem size = var.home_volume_size } diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml index 96a1b022b..66f668649 100644 --- a/packer/openhpc_extravars.yml +++ b/packer/openhpc_extravars.yml @@ -1 +1 @@ -update_enable: true +workaround_ansible_issue_61497: yes # extravars files can't be empty diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 3e4b570e0..cad500e3f 100644 --- 
a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -11,6 +11,10 @@ packer { version = ">= 1.0.0" source = "github.com/hashicorp/openstack" } + ansible = { + version = ">= 1.1.1" + source = "github.com/hashicorp/ansible" + } } } @@ -88,12 +92,12 @@ variable "image_visibility" { variable "ssh_bastion_host" { type = string - default = "" + default = null } variable "ssh_bastion_username" { type = string - default = "" + default = null } variable "ssh_bastion_private_key_file" { diff --git a/requirements.yml b/requirements.yml index 60b2905dc..59e717c67 100644 --- a/requirements.yml +++ b/requirements.yml @@ -1,12 +1,12 @@ --- roles: - src: stackhpc.nfs - version: v22.9.1 + version: v23.12.1 # Tolerate stale nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git version: feat/no-ohpc # https://github.com/stackhpc/ansible-role-openhpc/pull/162 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git - version: feature/no-install + version: stackhpc name: cloudalchemy.node_exporter - src: https://github.com/cloudalchemy/ansible-prometheus.git version: 4d2c8d742de39e50387e0aa6d5510b21c7451343 # need fix in preceding commit for rocky @@ -22,9 +22,26 @@ roles: version: v3.0.6 collections: -- name: containers.podman -- name: community.grafana -- name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools - type: git - version: v0.2.0 + - name: containers.podman + version: 1.10.2 + - name: community.grafana + version: 1.5.4 + - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools + type: git + version: v0.2.0 + - name: ansible.posix + version: 1.5.4 + - name: ansible.netcommon + version: 5.1.1 + - name: community.general + version: 7.1.0 + - name: community.crypto + version: 2.10.0 + - name: community.mysql + version: 3.7.2 + - name: openstack.cloud + version: 2.1.0 + - name: https://github.com/stackhpc/ansible-collection-terraform + type: git + version: 0.1.0 ...
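Note on requirements.yml above: the Galaxy roles and collections are now pinned to exact versions. As a minimal sketch (not itself part of this change), they would typically be installed with ansible-galaxy before running any playbooks; the -p paths below are assumptions chosen to match the roles_path/collections_path set in environments/.caas/ansible.cfg earlier in this diff, i.e. ansible/roles and ansible/collections relative to the repository root:

    # -f forces reinstall so that version bumps (e.g. stackhpc.nfs v23.12.1) take effect
    ansible-galaxy role install -fr requirements.yml -p ansible/roles
    ansible-galaxy collection install -fr requirements.yml -p ansible/collections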
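Note on the bootcmd introduced in nodes.tf above: instead of cloud-init fs_setup with fixed device paths, each volume is located via its /dev/disk/by-id entry (device serials expose only a truncated prefix of the Cinder volume ID, which is why the Terraform takes substr(volume.id, 0, 20)), and mke2fs runs only when blkid finds no existing filesystem, so state and home data survive instance rebuilds. A rough shell expansion for the state volume, where the volume ID is an illustrative placeholder rather than a value from this diff and the label "state" comes from the first word of the volume description in volumes.tf:

    # resolve the by-id symlink for the (truncated) volume ID to the real block device
    BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*b59922b9-a1e9-4c54-8d* | head -n1))
    # format as ext4 with label 'state' only if blkid reports no existing filesystem
    blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L state $BLKDEV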