Skip to content

Commit

Permalink
Merge branch 'main' into feat/no-ohpc-mergeable
Browse files Browse the repository at this point in the history
  • Loading branch information
sjpb committed Jan 24, 2024
2 parents 2253fb1 + 2d78fc9 commit b850948
Show file tree
Hide file tree
Showing 88 changed files with 1,684 additions and 379 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ jobs:
cd packer/
packer init .
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Get created image name from manifest
id: manifest
Expand Down
19 changes: 19 additions & 0 deletions ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Only used for Azimuth running the caas environment
[defaults]
# An error on any host is treated as fatal
any_errors_fatal = True
gathering = smart
forks = 30
# Host key checking is switched off (see also UserKnownHostsFile in ssh_args)
host_key_checking = False
remote_tmp = /tmp
# NOTE(review): the three paths below are relative - presumably resolved
# relative to this file's directory; confirm against how Azimuth invokes Ansible
collections_path = ansible/collections
roles_path = ansible/roles
filter_plugins = ansible/filter_plugins
# Enables the profile_tasks callback from the ansible.posix collection
callbacks_enabled = ansible.posix.profile_tasks

[ssh_connection]
# SSH options: connection multiplexing (ControlMaster=auto) persisted for 240s,
# public-key authentication only, and the known-hosts file pointed at /dev/null
ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
pipelining = True
# This is important because we are using one of the hosts in the play as a jump host
# This ensures that if the proxy connection is interrupted, rendering the other hosts
# unreachable, the connection is retried instead of failing the entire play
retries = 10
17 changes: 13 additions & 4 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ roles/*
!roles/firewalld/**
!roles/etc_hosts/
!roles/etc_hosts/**
!roles/cloud_init/
!roles/cloud_init/**
!roles/mysql/
!roles/mysql/**
!roles/systemd/
Expand All @@ -42,5 +40,16 @@ roles/*
!roles/proxy/**
!roles/resolv_conf/
!roles/resolv_conf/**
!roles/cve-2023-41914
!roles/cve-2023-41914/**
!roles/cluster_infra/
!roles/cluster_infra/**
!roles/image_build_infra/
!roles/image_build_infra/**
!roles/persist_openhpc_secrets/
!roles/persist_openhpc_secrets/**
!roles/zenith_proxy/
!roles/zenith_proxy/**
!roles/image_build/
!roles/image_build/**
!roles/persist_hostkeys/
!roles/persist_hostkeys/**
!roles/requirements.yml
6 changes: 0 additions & 6 deletions ansible/adhoc/cve-2023-41914.yml

This file was deleted.

9 changes: 0 additions & 9 deletions ansible/adhoc/template-cloud-init.yml

This file was deleted.

1 change: 1 addition & 0 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
name: fail2ban

- name: Setup podman
gather_facts: false
hosts: podman
tags: podman
tasks:
Expand Down
2 changes: 1 addition & 1 deletion ansible/extras.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- hosts: basic_users
- hosts: basic_users:!builder
become: yes
tags:
- basic_users
Expand Down
15 changes: 10 additions & 5 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,13 @@
become: yes
gather_facts: no
tasks:
# - import_playbook: slurm.yml
- include_role:
# - import_playbook: slurm.yml:
- name: Setup DB
include_role:
name: mysql
tasks_from: install.yml
- name: OpenHPC
import_role:
name: stackhpc.openhpc
tasks_from: "install-{{ openhpc_install_type }}.yml"

Expand All @@ -76,10 +81,10 @@
name: opensearch
tasks_from: install.yml
become: true

# opensearch - containerised, nothing to do
# slurm_stats - nothing to do
# filebeat - containerised - nothing to do
- import_role:
name: filebeat
tasks_from: install.yml

- import_role:
# can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start
Expand Down
9 changes: 1 addition & 8 deletions ansible/monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,12 @@
# Collection currently requires root for all tasks.
become: true

- name: Setup filebeat
- name: Deploy filebeat
hosts: filebeat
tags: filebeat
tasks:
- import_role:
name: filebeat
tasks_from: config.yml
tags: config

- import_role:
name: filebeat
tasks_from: deploy.yml
tags: deploy

- name: Deploy node_exporter
hosts: node_exporter
Expand Down
2 changes: 1 addition & 1 deletion ansible/noop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@

- hosts: localhost
gather_facts: false
tasks: []
tasks: []
7 changes: 7 additions & 0 deletions ansible/roles/cluster_infra/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Additional SSH keys for cluster deployment; empty by default.
# NOTE(review): presumably public keys consumed by the Terraform templates - confirm.
cluster_deploy_ssh_keys_extra: []

# hw_scsi_model values for which block devices show up as /dev/sdX
# instead of /dev/vdX
scsi_models:
  # Ceph [https://docs.ceph.com/en/quincy/rbd/rbd-openstack/#image-properties]
  - virtio-scsi
81 changes: 81 additions & 0 deletions ansible/roles/cluster_infra/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Print the inputs that drive this role so they are visible in the play output.
# cluster_upgrade_system_packages is optional, hence the 'undefined' fallback.
- name: Show Terraform backend, target state and upgrade flag
  debug:
    msg: |
      terraform_backend_type: {{ terraform_backend_type }}
      terraform_state: {{ terraform_state }}
      cluster_upgrade_system_packages: {{ cluster_upgrade_system_packages | default('undefined') }}
# We need to convert the floating IP id to an address for Terraform
# if we have cluster_floating_ip, otherwise assume that we're
# assigning the FIP in Terraform and that it will be available in
# outputs.cluster_gateway_ip.
- name: Resolve floating IP id to an address
  # Condition is placed before the block body so it is obvious that it
  # guards both tasks below.
  when: cluster_floating_ip is defined
  block:
    - name: Look up floating IP
      include_role:
        name: stackhpc.terraform.infra
        tasks_from: lookup_floating_ip
      vars:
        os_floating_ip_id: "{{ cluster_floating_ip }}"

    # NOTE(review): os_floating_ip_info is presumably set by the lookup
    # tasks included above - confirm against stackhpc.terraform.infra.
    - name: Set floating IP address fact
      set_fact:
        cluster_floating_ip_address: "{{ os_floating_ip_info.floating_ip_address }}"

# Delegates installation of the Terraform binary to the
# stackhpc.terraform.install role.
- name: Install Terraform binary
  include_role:
    name: 'stackhpc.terraform.install'

# Ensure the directory that will hold the generated Terraform files exists.
- name: Make Terraform project directory
  file:
    state: directory
    path: '{{ terraform_project_path }}'

# Writes backend.tf declaring an empty backend block of the configured type;
# the block body is left empty in this file.
- name: Write backend configuration
  copy:
    dest: '{{ terraform_project_path }}/backend.tf'
    content: |
      terraform {
        backend "{{ terraform_backend_type }}" { }
      }

# Patching in this appliance is implemented as a switch to a new base image
# So unless explicitly patching, we want to use the same image as last time
# To do this, we query the previous Terraform state before updating
- name: Reuse image recorded in previous Terraform state
  when:
    - terraform_state == "present"
    # Coerce to a real boolean: a string value such as "false" (e.g. from
    # key=value extra vars) is truthy in a bare Jinja test and would wrongly
    # be treated as "upgrade requested". An undefined variable means "no upgrade".
    - not (cluster_upgrade_system_packages | default(false) | bool)
  block:
    - name: Get previous Terraform state
      stackhpc.terraform.terraform_output:
        binary_path: "{{ terraform_binary_path }}"
        project_path: "{{ terraform_project_path }}"
        backend_config: "{{ terraform_backend_config }}"
      register: cluster_infra_terraform_output

    # Only set the fact when the previous state actually has a cluster_image
    # output; otherwise cluster_previous_image stays undefined.
    - name: Extract image from Terraform state
      set_fact:
        cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}"
      when: '"cluster_image" in cluster_infra_terraform_output.outputs'

# Renders each listed Terraform file from its "<name>.j2" template into the
# project directory. The template path is prefixed with
# cluster_terraform_template_dir and a slash when that variable is defined,
# and is the bare "<name>.j2" otherwise.
- name: Template Terraform files into project directory
  template:
    dest: '{{ terraform_project_path }}/{{ item }}'
    src: >-
      {{
        "{}{}.j2".format(
          (
            cluster_terraform_template_dir ~ "/"
            if cluster_terraform_template_dir is defined
            else ""
          ),
          item
        )
      }}
  loop:
    - outputs.tf
    - providers.tf
    - resources.tf

# Hands over to the stackhpc.terraform.infra role using its default task file
# (no tasks_from is given).
- name: Provision infrastructure
  include_role:
    name: 'stackhpc.terraform.infra'
53 changes: 53 additions & 0 deletions ansible/roles/cluster_infra/templates/outputs.tf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Exposes the floating IP held by the login node's floating IP association
# (login_floatingip_assoc) as the address used to reach the cluster.
output "cluster_gateway_ip" {
description = "The IP address of the gateway used to contact the cluster nodes"
value = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip
}

{% if cluster_ssh_private_key_file is not defined %}
# Only rendered when cluster_ssh_private_key_file is not defined: exposes the
# private key of the cluster_keypair resource, marked sensitive.
output "cluster_ssh_private_key" {
description = "The private component of the keypair generated on cluster provision"
value = openstack_compute_keypair_v2.cluster_keypair.private_key
sensitive = true
}
{% endif %}

# Node list assembled with concat(): first a fixed list holding the login and
# control instances, then one list per entry of openhpc_slurm_partitions, each
# built by iterating the compute instance resource named after that partition.
# Every node entry carries name, ip (fixed_ip_v4 of its first network), groups
# and an openstack_project_id fact taken from the auth scope data source.
output "cluster_nodes" {
description = "A list of the nodes in the cluster from which an Ansible inventory will be populated"
value = concat(
[
{
name = openstack_compute_instance_v2.login.name
ip = openstack_compute_instance_v2.login.network[0].fixed_ip_v4
groups = ["login", "{{ cluster_name }}_login"],
facts = {
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
}
},
{
name = openstack_compute_instance_v2.control.name
ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
groups = ["control", "{{ cluster_name }}_control"],
facts = {
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
}
}
],
# One list per partition follows; a comma is emitted after every list except the last.
{% for partition in openhpc_slurm_partitions %}
[
for compute in openstack_compute_instance_v2.{{ partition.name }}: {
name = compute.name
ip = compute.network[0].fixed_ip_v4
groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"],
facts = {
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
}
}
]{{ ',' if not loop.last }}
{% endfor %}
)
}

# Records the image for the cluster nodes as a literal string rendered at
# template time: cluster_previous_image when it is defined, otherwise cluster_image.
output "cluster_image" {
description = "The id of the image used to build the cluster nodes"
value = "{{ cluster_previous_image | default(cluster_image) }}"
}
10 changes: 10 additions & 0 deletions ansible/roles/cluster_infra/templates/providers.tf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Terraform settings for the cluster project: requires Terraform 0.14 or newer
# and declares the openstack provider by source address only (no provider
# version constraint is given here).
terraform {
required_version = ">= 0.14"

# We need the OpenStack provider
required_providers {
openstack = {
source = "terraform-provider-openstack/openstack"
}
}
}
Loading

0 comments on commit b850948

Please sign in to comment.