Skip to content

Commit

Permalink
Add per cluster's node gpu maps
Browse files Browse the repository at this point in the history
  • Loading branch information
satyaog committed Apr 3, 2024
1 parent 262d37c commit f4a0030
Show file tree
Hide file tree
Showing 9 changed files with 271 additions and 60 deletions.
93 changes: 88 additions & 5 deletions config/sarc-dev.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,37 @@
"duc_storage_command": null,
"diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
"prometheus_url": "http://monitoring.server.mila.quebec:9090/",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"gpus_per_nodes": {
"cn-a{{[001-011]}}": {
"rtx8000": "Quadro RTX 8000"
},
"cn-b{{[001-005]}}": {
"v100": "Tesla V100-SXM2-32GB"
},
"cn-c{{[001-040]}}": {
"rtx8000": "Quadro RTX 8000"
},
"cn-g{{[001-029]}}": {
"a100": "NVIDIA A100 80GB PCIe",
"[0-9]+g\\.[0-9]+gb": "__MIG_FLAG__a100"
},
"cn-i001": {
"a100": "NVIDIA A100 80GB PCIe"
},
"cn-j001": {
"a6000": "NVIDIA RTX A6000"
},
"cn-d{{[001-002]}}": {
"a100": "NVIDIA A100-SXM4-40GB"
},
"cn-d{{[003-004]}}": {
"a100": "NVIDIA A100-SXM4-80GB"
},
"cn-e{{[002-003]}}": {
"v100": "Tesla V100-SXM2-32GB"
}
}
},
"narval": {
"host": "narval.computecanada.ca",
Expand All @@ -40,7 +70,12 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"gpus_per_nodes": {
"__DEFAULTS__": {
"a100": "NVIDIA A100-SXM4-40GB"
}
}
},
"beluga": {
"host": "beluga.computecanada.ca",
Expand All @@ -52,7 +87,12 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"gpus_per_nodes": {
"__DEFAULTS__": {
"v100": "Tesla V100-SXM2-16GB"
}
}
},
"graham": {
"host": "graham.computecanada.ca",
Expand All @@ -65,7 +105,30 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"gpus_per_nodes": {
"gra{{[828-987]}}": {
"p100": "NVIDIA P100-12G PCIe"
},
"gra{{[1147-1153]}}": {
"v100": "NVIDIA V100-16G PCIe"
},
"gra{{[1154-1189]}}": {
"t4": "NVIDIA T4-16G PCIe"
},
"gra{{[1337-1338]}}": {
"v100": "NVIDIA V100-32G PCIe"
},
"gra1342": {
"a100": "NVIDIA A100 PCIe"
},
"gra{{[1361-1362]}}": {
"a100": "NVIDIA A100 PCIe"
},
"gra{{[1363-1373]}}": {
"a5000": "NVIDIA A5000-24G PCIe"
}
}
},
"cedar": {
"host": "cedar.computecanada.ca",
Expand All @@ -78,7 +141,27 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"gpus_per_nodes": {
"cdr{{[26-386]}}": {
"p100": "NVIDIA P100-12G PCIe"
},
"cdr{{[876-904]}}": {
"p100l": "NVIDIA P100-16G PCIe"
},
"cdr{{[905-906]}}": {
"p100": "NVIDIA P100-12G PCIe"
},
"cdr{{[908-911]}}": {
"p100l": "NVIDIA P100-16G PCIe"
},
"cdr{{[912-922]}}": {
"p100": "NVIDIA P100-12G PCIe"
},
"cdr{{[2468-2678]}}": {
"v100l": "NVIDIA V100-32G PCIe"
}
}
}
}
}
7 changes: 2 additions & 5 deletions sarc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ class ClusterConfig(BaseModel):
duc_storage_command: str = None
diskusage_report_command: str = None
start_date: str = "2022-04-01"
gpus: list = []
harmonize_gpu_map: dict = {}
gpus_per_nodes: dict = {}

@validator("timezone")
def _timezone(cls, value):
Expand Down Expand Up @@ -137,9 +136,7 @@ def node_to_gpu(self):
"""
from .jobs.node_gpu_mapping import NodeToGPUMapping

return NodeToGPUMapping(
self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
)
return NodeToGPUMapping(self.name, self.nodes_info_file, self.gpus_per_nodes)


class MongoConfig(BaseModel):
Expand Down
76 changes: 65 additions & 11 deletions sarc/jobs/node_gpu_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,63 @@
import regex as re
from hostlist import expand_hostlist

# Prefix marking a configured GPU name as a MIG entry: the text following the
# prefix is the GPU key to look up again on the same node. The spelling must
# match the one used in the cluster configuration (e.g. "__MIG_FLAG__a100"),
# otherwise the prefix is never recognised and the raw placeholder leaks out.
MIG_FLAG = "__MIG_FLAG__"
# Key of the `gpus_per_nodes` entry used for nodes matching no node pattern.
DEFAULTS_FLAG = "__DEFAULTS__"


def _find_pattern(string: str) -> tuple:
try:
begin = string.index("{{")
end = string.index("}}")
except ValueError:
return None, None
return string[begin : end + 2], string[begin + 2 : end]


def _expand_list(list_pattern: str) -> str:
start, stop = list_pattern[1:-1].split("-")
start, stop = int(start), int(stop)
regex = "|".join([f"0*{i}" for i in range(start, stop + 1)])
return f"({regex})"


# Dispatch table for "{{...}}" placeholders: each key is a regex tested against
# the text found between the double braces, each value is the function that
# turns that text into a regex fragment. Only bracketed "[...]" content is
# handled, by _expand_list.
EXPAND_PATTERNS = {re.compile(r"^\[.*\]$"): _expand_list}


def expand_patterns(string: str) -> re.Pattern:
    """Compile *string* into a regex after expanding its ``{{...}}`` placeholders.

    Placeholders are expanded one at a time, first occurrence first, using the
    expander registered in EXPAND_PATTERNS whose regex matches the placeholder
    content.

    Raises:
        ValueError: if a placeholder's content matches no entry of
            EXPAND_PATTERNS (this includes an empty ``{{}}`` placeholder).
    """
    full_pattern, pattern = _find_pattern(string)
    # Compare against None instead of relying on truthiness: an empty
    # placeholder "{{}}" has pattern == "" and must be rejected below rather
    # than being silently left inside the compiled regex.
    while full_pattern is not None:
        for pattern_regex, _expand in EXPAND_PATTERNS.items():
            if pattern_regex.match(pattern):
                string = string.replace(full_pattern, _expand(pattern))
                break
        else:
            raise ValueError(f"Unknown pattern {full_pattern}")

        full_pattern, pattern = _find_pattern(string)

    return re.compile(string)


class NodeToGPUMapping:
"""Helper class to generate JSON file, load it in memory, and query GPU type for a nodename."""

def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
def __init__(self, cluster_name, nodes_info_file, gpus_per_nodes: dict):
"""Initialize with cluster name and TXT file path to parse."""

# Mapping is empty by default.
self.mapping = {}
self.json_path = None
self.harmonize_gpu_map = {
**{
re.compile(regex): gpu_type
for regex, gpu_type in harmonize_gpu_map.items()
},
**{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
}
self.harmonize_gpu_map = {}
for node_pattern, node_gpus in gpus_per_nodes.items():
if node_pattern != DEFAULTS_FLAG:
node_pattern = expand_patterns(f"^{node_pattern}$")
self.harmonize_gpu_map[node_pattern] = {
re.compile(f".*{gpu}.*"): descriptive_gpu
for gpu, descriptive_gpu in node_gpus.items()
}
self.default_gpu_map = self.harmonize_gpu_map.pop(DEFAULTS_FLAG, {})

# Mapping is filled only if TXT file is available.
if nodes_info_file and os.path.exists(nodes_info_file):
Expand All @@ -54,22 +94,36 @@ def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
with open(self.json_path, encoding="utf-8") as file:
self.mapping = json.load(file)

def _harmonize_gpu(self, nodename: str, gpu_type: str):
    """Translate a raw GPU type into the descriptive name configured for a node.

    The raw value is lower-cased, its spaces are replaced with dashes and it is
    split on ":"; a leading "gpu" field is dropped and the first remaining
    field is the one looked up.

    Returns:
        The descriptive GPU name, ``"<base GPU name> : <gpu_type>"`` for
        entries flagged with MIG_FLAG, or None when nothing matches.
    """
    fields = gpu_type.lower().replace(" ", "-").split(":")
    if fields[0] == "gpu":
        fields.pop(0)
    gpu_type = fields[0]

    # The first node pattern matching the node name selects the GPU map; nodes
    # matching no pattern fall back to the defaults map.
    gpu_map = next(
        (
            node_gpus
            for node_regex, node_gpus in self.harmonize_gpu_map.items()
            if node_regex.match(nodename)
        ),
        self.default_gpu_map,
    )

    harmonized_gpu = next(
        (name for regex, name in gpu_map.items() if regex.match(gpu_type)),
        None,
    )

    if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
        # The text after the flag names the base GPU: resolve it on the same
        # node, then append the normalised raw type to the result.
        base_gpu = self._harmonize_gpu(nodename, harmonized_gpu[len(MIG_FLAG) :])
        if base_gpu is None:
            # Unresolvable base GPU: report "not found" instead of building
            # the misleading string "None : <gpu_type>".
            return None
        harmonized_gpu = f"{base_gpu} : {gpu_type}"

    return harmonized_gpu

def __getitem__(self, nodename):
"""Return GPU type for nodename, or None if not found."""
gpu_type = self.mapping.get(nodename, None)
return self._harmonize_gpu(gpu_type)
return self._harmonize_gpu(nodename, gpu_type)

@staticmethod
def _parse_nodenames(path: str, output: dict):
Expand Down
15 changes: 4 additions & 11 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import json
import os
import shutil
import sys
import tempfile
import zoneinfo
from pathlib import Path
from unittest.mock import MagicMock, mock_open
from unittest.mock import MagicMock, mock_open, patch

from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.trace import set_tracer_provider
from opentelemetry.trace import get_tracer_provider, set_tracer_provider

_tracer_provider = TracerProvider()
_exporter = InMemorySpanExporter()
Expand All @@ -24,6 +24,7 @@
from sarc.config import (
ClusterConfig,
Config,
MongoConfig,
ScraperConfig,
config,
parse_config,
Expand Down Expand Up @@ -72,14 +73,6 @@ def disabled_cache():
yield


# Start every test session from an empty cache directory.
@pytest.fixture(scope="session", autouse=True)
def clean_up_test_cache_before_run(standard_config_object):
    """Delete the configured cache directory, if it exists, before tests run."""
    cache_dir = standard_config_object.cache
    if cache_dir.exists():
        shutil.rmtree(str(cache_dir))
    # No teardown step: control is simply handed over to the test session.
    yield


@pytest.fixture
def tzlocal_is_mtl(monkeypatch):
monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/jobs/test_func_sacct.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
job = jobs[0]
print(job)
print(job.nodes)
assert job.allocated.gpu_type == "asupergpu"
assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"

file_regression.check(
f"Found {len(jobs)} job(s):\n"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Found 1 job(s):
"node": 1,
"billing": 1,
"gres_gpu": 1,
"gpu_type": "asupergpu"
"gpu_type": "Nec Plus ULTRA GPU 2000"
},
"stored_statistics": null
}
9 changes: 3 additions & 6 deletions tests/functional/test_clusterconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ def test_clusterconfig_node_to_gpu():
cluster_config = config().clusters["raisin_no_prometheus"]
mapping = cluster_config.node_to_gpu

result = mapping["cn-c018"]
assert result in cluster_config.gpus
assert (
mapping._harmonize_gpu(f"{cluster_config.gpus[0]}_suffix")
== cluster_config.gpus[0]
)
nodename = "cn-c018"
result = mapping[nodename]
assert result == cluster_config.gpus_per_nodes[nodename]["asupergpu"]
10 changes: 7 additions & 3 deletions tests/sarc-test.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,13 @@
"diskusage_report_command": null,
"prometheus_url": null,
"nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
"gpus": ["asupergpu"],
"harmonize_gpu_map": {
".*asupergpu_suffix.*": "asupergpu"
"gpus_per_nodes": {
"cn-c018": {
"asupergpu": "Nec Plus Plus ULTRA GPU 2000"
},
"cn-c{{[019-030]}}": {
"asupergpu": "Nec Plus ULTRA GPU 2000"
}
}
},
"fromage": {
Expand Down
Loading

0 comments on commit f4a0030

Please sign in to comment.