Skip to content

Commit

Permalink
Merge branch '2025-1' into grappe-BCM-4010-7010
Browse files Browse the repository at this point in the history
  • Loading branch information
helene authored and helene committed Jan 20, 2025
2 parents c9e1f74 + 4557499 commit 2ca91d0
Show file tree
Hide file tree
Showing 27 changed files with 594 additions and 72 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/linter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@ name: Linting
# yamllint disable-line rule:truthy
on:
push:
branches: ["2024-3"]
paths:
- '**.yaml'
- '**.yml'
pull_request:
branches: ["2024-3"]
paths:
- '**.yaml'
- '**.yml'
Expand Down
28 changes: 28 additions & 0 deletions ant3814/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Hiera data for the ant3814 course cluster.

# JupyterHub spawner form: each SbatchForm entry sets the bounds (min/max),
# the default (def) and/or whether the field is locked in the form.
# NOTE(review): units are not stated here (runtime presumably hours, memory
# presumably MB) — confirm against the spawner documentation.
jupyterhub::jupyterhub_config_hash:
  SbatchForm:
    runtime: {min: 1.0, def: 2.0, max: 5.0}
    nprocs: {min: 1, def: 1, max: 8}
    memory: {min: 1024, max: 10000, def: 6144}
    oversubscribe: {def: true, lock: true}
    ui: {def: 'lab'}
    partition: {lock: false}
  SlurmFormSpawner:
    disable_form: false

# Modules loaded by default for this cluster.
profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc', 'mii']
17 changes: 17 additions & 0 deletions ant3814/custom.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Per-cluster overrides for "ant3814".
# Values under `custom` take precedence over `local.default_pod` wherever
# common/main.tf reads them through try(local.custom.<key>, local.default_pod.<key>).
locals {
  name = "ant3814"

  custom = {
    # Number of nodes in the "cpupool" group.
    nnodes = { cpupool = 2 }

    # Volume sizes (NOTE(review): presumably GB — confirm).
    home_size    = 50
    project_size = 100
    scratch_size = 100

    # Per-user quota on each volume.
    user_quotas = {
      home    = "4g"
      project = "8g"
      scratch = "8g"
    }
  }
}
1 change: 1 addition & 0 deletions ant3814/main.tf
2 changes: 1 addition & 1 deletion bcm4010-7010/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jupyterhub::jupyterhub_config_hash:
SlurmFormSpawner:
disable_form: false

jupyterhub::kernel::venv::packages: ['numpy', 'pandas', 'matplotlib', 'random', 'seaborn', 'itertools']
jupyterhub::kernel::venv::packages: ['numpy', 'pandas', 'matplotlib', 'seaborn']

cron::job:
spawn_nodes:
Expand Down
70 changes: 70 additions & 0 deletions bif4007/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Hiera data for the bif4007 course cluster.

# JupyterHub spawner form: each SbatchForm entry sets the bounds (min/max),
# the default (def) and/or whether the field is locked in the form.
# NOTE(review): units are not stated here (runtime presumably hours, memory
# presumably MB) — confirm against the spawner documentation.
jupyterhub::jupyterhub_config_hash:
  SbatchForm:
    runtime:
      min: 1.0
      def: 2.0
      max: 5.0
    nprocs:
      min: 1
      def: 1
      max: 8
    memory:
      min: 1024
      max: 10000
      def: 6144
    oversubscribe:
      def: true
      lock: true
    ui:
      def: 'lab'
    partition:
      lock: false
  SlurmFormSpawner:
    disable_form: false

# Modules loaded by default for this cluster.
profile::software_stack::lmod_default_modules:
  - 'StdEnv/2023'
  - 'gcc'
  - 'mii'
  - 'ipython-kernel/3.11'

# Cron jobs that power up the two cpupool nodes shortly before class.
# All schedules are evaluated in America/New_York via CRON_TZ.
cron::job:
  # Weekly session: Thursdays (weekday 4) at 10:25, January through April.
  spawn_nodes:
    command: '/opt/software/slurm/bin/scontrol update node=nodecpupool[1-2] state=power_up'
    minute: '25'
    hour: '10'
    month: '1-4'
    weekday: '4'
    user: 'root'
    description: 'spawn nodes at 10:25 every Thursday (January to April)'
    environment: ['CRON_TZ=America/New_York']
  # One-off session: 29 January at 8:25.
  spawn_nodes_jan:
    command: '/opt/software/slurm/bin/scontrol update node=nodecpupool[1-2] state=power_up'
    minute: '25'
    hour: '8'
    date: '29'
    month: '1'
    weekday: '*'
    user: 'root'
    description: 'spawn nodes at 8:25 on 29 January'
    environment: ['CRON_TZ=America/New_York']
  # One-off session: 12 February at 8:25.
  spawn_nodes_feb:
    command: '/opt/software/slurm/bin/scontrol update node=nodecpupool[1-2] state=power_up'
    minute: '25'
    hour: '8'
    date: '12'
    month: '2'
    weekday: '*'
    user: 'root'
    description: 'spawn nodes at 8:25 on 12 February'
    environment: ['CRON_TZ=America/New_York']
  # Two sessions in April: the 2nd and the 16th at 8:25.
  spawn_nodes_apr:
    command: '/opt/software/slurm/bin/scontrol update node=nodecpupool[1-2] state=power_up'
    minute: '25'
    hour: '8'
    date: '2,16'
    month: '4'
    weekday: '*'
    user: 'root'
    description: 'spawn nodes at 8:25 on 2 and 16 April'
    environment: ['CRON_TZ=America/New_York']
15 changes: 15 additions & 0 deletions bif4007/custom.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Per-cluster overrides for "bif4007".
# Values under `custom` take precedence over `local.default_pod` wherever
# common/main.tf reads them through try(local.custom.<key>, local.default_pod.<key>).
locals {
  name = "bif4007"

  custom = {
    # Number of nodes in the "cpupool" group.
    nnodes = { cpupool = 2 }

    # Volume sizes (NOTE(review): presumably GB — confirm).
    home_size    = 50
    project_size = 100
    scratch_size = 800

    # Only the home quota is overridden; project and scratch are not set here.
    user_quotas = { home = "3g" }
  }
}
1 change: 1 addition & 0 deletions bif4007/main.tf
1 change: 1 addition & 0 deletions chem505/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Default Lmod modules for the chem505 cluster. The key lives under
# profile::software_stack, matching common/config.yaml and the other course
# configs; the former profile::cvmfs::client::lmod_default_modules name is not
# used anywhere else in this repository.
profile::software_stack::lmod_default_modules: ['StdEnv/2023']
16 changes: 16 additions & 0 deletions chem505/custom.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Per-cluster overrides for "chem505".
# Values under `custom` take precedence over `local.default_pod` wherever
# common/main.tf reads them through try(local.custom.<key>, local.default_pod.<key>).
locals {
  name = "chem505"

  custom = {
    # Node counts per group: no "cpu" nodes, eight "cpupool" nodes.
    nnodes = {
      cpu     = 0
      cpupool = 8
    }

    # Instance flavor override for the CPU node groups on the "beluga" cloud.
    instances_type_map = {
      beluga = {
        cpu     = "c2-7.5gb"
        cpupool = "c2-7.5gb"
      }
    }
  }
}
1 change: 1 addition & 0 deletions chem505/main.tf
1 change: 1 addition & 0 deletions common/Puppetfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Extra Puppet module, pinned to an exact version. common/main.tf passes this
# file's content to the openstack module (`puppetfile = file("../common/Puppetfile")`).
# NOTE(review): presumably the module providing the `cron` class and the
# `cron::job` hieradata used by the course configs — confirm.
mod 'puppet-cron', '2.0.0'
13 changes: 13 additions & 0 deletions common/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,16 @@ prometheus::remote_write_configs:
username: 'cqformation'
password: "%{alias('prometheus_password')}"

# Modules loaded by default on every cluster (course configs may override).
profile::software_stack::lmod_default_modules:
  - 'StdEnv/2023'

# Python interpreter and install prefix for the JupyterHub kernel virtualenv.
jupyterhub::kernel::venv::python: /cvmfs/soft.computecanada.ca/easybuild/software/2023/x86-64-v3/Compiler/gcccore/python/3.11.5/bin/python3
jupyterhub::kernel::venv::prefix: /opt/ipython-kernel-3.11

# Deep-merge pip_environment across hierarchy levels instead of replacing it.
lookup_options:
  jupyterhub::kernel::venv::pip_environment: {merge: deep}

# Environment variables set for pip when the kernel virtualenv is populated.
jupyterhub::kernel::venv::pip_environment:
  PIP_NO_INDEX: 1
  PIP_CONFIG_FILE: /cvmfs/soft.computecanada.ca/config/python/pip-x86-64-v3-gentoo2023.conf

# Instances tagged "cron" get the "cron" class.
magic_castle::site::tags:
  cron: [cron]
50 changes: 36 additions & 14 deletions common/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ data "tfe_workspace" "current" {

locals {
default_pod = {
image = "Rocky-8"
image_cpu = "snapshot-cpunode-2024-R810.5"
image_gpu = "snapshot-gpunode-2024-R810.5"
nb_users = 0
image = "AlmaLinux-9.4"
image_cpu = "snapshot-cpunode-2025-A9.4-1"
image_gpu = "snapshot-gpunode-2025-A9.4-1"
nb_users = 1

nnodes = {
cpu = 0
Expand All @@ -63,7 +63,8 @@ locals {

cluster_purpose = "cours_academiques"
config_git_url = "https://github.com/ComputeCanada/puppet-magic_castle.git"
config_version = "2972853"
# This config_version is pinned for its mkfs_options support (used by the juno NFS volumes below).
config_version = "5de4c7f"

instances_type_map = {
arbutus = {
Expand All @@ -75,7 +76,7 @@ locals {
gpupool = "g1-8gb-c4-22gb"
}
beluga = {
mgmt = "p8-15gb"
mgmt = "p4-15gb"
login = "p4-7.5gb"
cpu = "c8-60gb"
cpupool = "c8-60gb"
Expand All @@ -86,7 +87,7 @@ locals {
cpu = "c8-30gb"
cpupool = "c8-30gb"
gpu = "gpu16-240-3450gb-a100x1_cq"
gpupool = "gpu16-240-3450gb-a100x1_cq"
gpupool = "gpu12-120-850gb-a100x1_MC"
gpupool16 = "gpu16-240-3375gb-a100x1"
gpupool80 = "gpu13-240-2500gb-a100-80gx1"
gpupool12 = "gpu12-120-850gb-a100x1"
Expand All @@ -105,6 +106,16 @@ locals {
gpupool12-j = { "1g.5gb" = 7 }
}

# Default `shard` value for each GPU node group. Every group defaults to null;
# a cluster can override one through local.custom.shard.<group>, which the
# instance definitions read with try(local.custom.shard.<group>, local.default_pod.shard.<group>).
# NOTE(review): the meaning of `shard` (presumably GPU sharing between jobs) is
# not visible in this file — confirm against the Magic Castle documentation.
shard = {
gpu = null
gpupool = null
gpupool16 = null
gpupool80 = null
gpupool12 = null
gpupool16-cq = null
gpupool12-j = null
}

network_map = {
arbutus = {
subnet_id = null
Expand Down Expand Up @@ -132,7 +143,7 @@ locals {
}
login = {
type = try(local.custom.instances_type_map.arbutus.login, local.default_pod.instances_type_map.arbutus.login),
tags = ["login", "public", "proxy"],
tags = ["login", "public", "proxy", "cron"],
disk_size = 20,
count = try(local.custom.nnodes.login, local.default_pod.nnodes.login)
}
Expand All @@ -153,12 +164,14 @@ locals {
tags = ["node"],
count = try(local.custom.nnodes.gpu, local.default_pod.nnodes.gpu),
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpu, local.default_pod.shard.gpu),
}
nodegpupool = {
type = try(local.custom.instances_type_map.arbutus.gpupool, local.default_pod.instances_type_map.arbutus.gpupool),
tags = ["node", "pool"],
count = try(local.custom.nnodes.gpupool, local.default_pod.nnodes.gpupool),
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool, local.default_pod.shard.gpupool),
}
}
beluga = {
Expand All @@ -170,7 +183,7 @@ locals {
}
login = {
type = try(local.custom.instances_type_map.beluga.login, local.default_pod.instances_type_map.beluga.login),
tags = ["login", "public", "proxy"],
tags = ["login", "public", "proxy", "cron"],
disk_size = 20,
count = try(local.custom.nnodes.login, local.default_pod.nnodes.login)
}
Expand Down Expand Up @@ -198,7 +211,7 @@ locals {
}
login = {
type = try(local.custom.instances_type_map.juno.login, local.default_pod.instances_type_map.juno.login),
tags = ["login", "public", "proxy"],
tags = ["login", "public", "proxy", "cron"],
disk_size = 20,
count = try(local.custom.nnodes.login, local.default_pod.nnodes.login)
}
Expand All @@ -222,6 +235,7 @@ locals {
count = try(local.custom.nnodes.gpu, local.default_pod.nnodes.gpu),
mig = try(local.custom.mig.gpu, local.default_pod.mig.gpu)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpu, local.default_pod.shard.gpu),
disk_size = "50"
}
nodegpupool = {
Expand All @@ -230,6 +244,7 @@ locals {
count = try(local.custom.nnodes.gpupool, 0),
mig = try(local.custom.mig.gpupool, local.default_pod.mig.gpupool)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool, local.default_pod.shard.gpupool),
disk_size = "50"
}
nodegpupool16 = {
Expand All @@ -238,6 +253,7 @@ locals {
count = try(local.custom.nnodes.gpupool16, 0),
mig = try(local.custom.mig.gpupool16, local.default_pod.mig.gpupool16)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool16, local.default_pod.shard.gpupool16),
disk_size = "50"
}
nodegpupool16-cq = {
Expand All @@ -246,6 +262,7 @@ locals {
count = try(local.custom.nnodes.gpupool16-cq, 0),
mig = try(local.custom.mig.gpupool16-cq, local.default_pod.mig.gpupool16-cq)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool16-cq, local.default_pod.shard.gpupool16-cq),
disk_size = "50"
}
nodegpupool12 = {
Expand All @@ -254,6 +271,7 @@ locals {
count = try(local.custom.nnodes.gpupool12, 0),
mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool12, local.default_pod.shard.gpupool12),
disk_size = "50"
}
nodegpupool12-j = {
Expand All @@ -262,6 +280,7 @@ locals {
count = try(local.custom.nnodes.gpupool12-j, 0),
mig = try(local.custom.mig.gpupool12-j, local.default_pod.mig.gpupool12-j)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool12-j, local.default_pod.shard.gpupool12-j),
disk_size = "50"
}
nodegpupool80 = {
Expand All @@ -270,6 +289,7 @@ locals {
count = try(local.custom.nnodes.gpupool80, 0),
mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80)
image = try(local.custom.image_gpu, local.default_pod.image_gpu),
shard = try(local.custom.shard.gpupool80, local.default_pod.shard.gpupool80),
disk_size = "50"
}
}
Expand All @@ -291,9 +311,9 @@ locals {
}
juno = {
nfs = {
home = { size = try(local.custom.home_size, local.default_pod.home_size), quota = try(local.custom.user_quotas.home, local.default_pod.user_quotas.home) }
project = { size = try(local.custom.project_size, local.default_pod.project_size), quota = try(local.custom.user_quotas.project, local.default_pod.user_quotas.project) }
scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), quota = try(local.custom.user_quotas.scratch, local.default_pod.user_quotas.scratch) }
home = { size = try(local.custom.home_size, local.default_pod.home_size), quota = try(local.custom.user_quotas.home, local.default_pod.user_quotas.home), mkfs_options = "-K" }
project = { size = try(local.custom.project_size, local.default_pod.project_size), quota = try(local.custom.user_quotas.project, local.default_pod.user_quotas.project), mkfs_options = "-K" }
scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), quota = try(local.custom.user_quotas.scratch, local.default_pod.user_quotas.scratch), mkfs_options = "-K" }
}
}
}
Expand All @@ -320,7 +340,7 @@ locals {
}

module "openstack" {
source = "git::https://github.com/ComputeCanada/magic_castle.git//openstack?ref=14.0.0-beta"
source = "git::https://github.com/ComputeCanada/magic_castle.git//openstack?ref=14.1.2"
config_git_url = try(local.custom.config_git_url, local.default_pod.config_git_url)
config_version = try(local.custom.config_version, local.default_pod.config_version)

Expand Down Expand Up @@ -348,6 +368,8 @@ module "openstack" {

subnet_id = local.default_pod.network_map[var.cloud_name].subnet_id
os_ext_network = local.default_pod.network_map[var.cloud_name].os_ext_network

puppetfile = file("../common/Puppetfile")
}

output "accounts" {
Expand Down
1 change: 1 addition & 0 deletions common/sshkeys.pub
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIH4ahFXyvFnKcLojCQ8nrMS1jkXa9fn8ztjRAAG92UMI
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBr2jTiSZFWTB0RVFV9eFcLBogbhH34ZFQf46+wYOhQb [email protected]
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKtwVw/w5qNSIqQwksmYRwnNzj53tdNakM0iYqnQWUds [email protected]
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGH5ZbhAxZTonq5YYcaHRMGPHb92vSlWWOwwParu7T59 [email protected]
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDjg/qhQHotFzZrYfCdru/MFMdGfcS2/F9lcjP1khfIq [email protected]
Loading

0 comments on commit 2ca91d0

Please sign in to comment.