From f4a0030c0a0a798c9c80534527e8ee6fb333173c Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne
Date: Wed, 3 Apr 2024 16:00:51 -0400
Subject: [PATCH] Add per-cluster node GPU maps

---
 config/sarc-dev.json                          |  93 +++++++++++++-
 sarc/config.py                                |   7 +-
 sarc/jobs/node_gpu_mapping.py                 |  76 ++++++++++--
 tests/conftest.py                             |  15 +--
 tests/functional/jobs/test_func_sacct.py      |   2 +-
 ...ut_prometheus_json_jobs0_test_config0_.txt |   2 +-
 tests/functional/test_clusterconfig.py        |   9 +-
 tests/sarc-test.json                          |  10 +-
 .../jobs/test_node_to_gpu_mapping.py          | 117 +++++++++++++++---
 9 files changed, 271 insertions(+), 60 deletions(-)

diff --git a/config/sarc-dev.json b/config/sarc-dev.json
index 350f44c6..a7a0b4f7 100644
--- a/config/sarc-dev.json
+++ b/config/sarc-dev.json
@@ -28,7 +28,37 @@
       "duc_storage_command": null,
       "diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
       "prometheus_url": "http://monitoring.server.mila.quebec:9090/",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "cn-a{{[001-011]}}": {
+          "rtx8000": "Quadro RTX 8000"
+        },
+        "cn-b{{[001-005]}}": {
+          "v100": "Tesla V100-SXM2-32GB"
+        },
+        "cn-c{{[001-040]}}": {
+          "rtx8000": "Quadro RTX 8000"
+        },
+        "cn-g{{[001-029]}}": {
+          "a100": "NVIDIA A100 80GB PCIe",
+          "[0-9]+g\\.[0-9]+gb": "__MIG__a100"
+        },
+        "cn-i001": {
+          "a100": "NVIDIA A100 80GB PCIe"
+        },
+        "cn-j001": {
+          "a6000": "NVIDIA RTX A6000"
+        },
+        "cn-d{{[001-002]}}": {
+          "a100": "NVIDIA A100-SXM4-40GB"
+        },
+        "cn-d{{[003-004]}}": {
+          "a100": "NVIDIA A100-SXM4-80GB"
+        },
+        "cn-e{{[002-003]}}": {
+          "v100": "Tesla V100-SXM2-32GB"
+        }
+      }
     },
     "narval": {
       "host": "narval.computecanada.ca",
@@ -40,7 +70,12 @@
       "diskusage_report_command": "diskusage_report --project --all_users",
       "prometheus_url": "https://mila-thanos.calculquebec.ca",
       "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "__DEFAULTS__": {
+          "a100": "NVIDIA A100-SXM4-40GB"
+        }
+      }
     },
     "beluga": {
       "host": "beluga.computecanada.ca",
@@ -52,7 +87,12 @@
       "diskusage_report_command": "diskusage_report --project --all_users",
       "prometheus_url": "https://mila-thanos.calculquebec.ca",
       "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "__DEFAULTS__": {
+          "v100": "Tesla V100-SXM2-16GB"
+        }
+      }
     },
     "graham": {
       "host": "graham.computecanada.ca",
@@ -65,7 +105,30 @@
       "prometheus_url": null,
       "prometheus_headers_file": null,
       "start_date": "2022-04-01",
-      "nodes_info_file": "secrets/nodes_graham.txt"
+      "nodes_info_file": "secrets/nodes_graham.txt",
+      "gpus_per_nodes": {
+        "gra{{[828-987]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "gra{{[1147-1153]}}": {
+          "v100": "NVIDIA V100-16G PCIe"
+        },
+        "gra{{[1154-1189]}}": {
+          "t4": "NVIDIA T4-16G PCIe"
+        },
+        "gra{{[1337-1338]}}": {
+          "v100": "NVIDIA V100-32G PCIe"
+        },
+        "gra1342": {
+          "a100": "NVIDIA A100 PCIe"
+        },
+        "gra{{[1361-1362]}}": {
+          "a100": "NVIDIA A100 PCIe"
+        },
+        "gra{{[1363-1373]}}": {
+          "a5000": "NVIDIA A5000-24G PCIe"
+        }
+      }
     },
     "cedar": {
       "host": "cedar.computecanada.ca",
@@ -78,7 +141,27 @@
       "prometheus_url": null,
       "prometheus_headers_file": null,
       "start_date": "2022-04-01",
-      "nodes_info_file": "secrets/nodes_cedar.txt"
+      "nodes_info_file": "secrets/nodes_cedar.txt",
+      "gpus_per_nodes": {
+        "cdr{{[26-386]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[876-904]}}": {
+          "p100l": "NVIDIA P100-16G PCIe"
+        },
+        "cdr{{[905-906]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[908-911]}}": {
+          "p100l": "NVIDIA P100-16G PCIe"
+        },
+        "cdr{{[912-922]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[2468-2678]}}": {
+          "v100l": "NVIDIA V100-32G PCIe"
+        }
+      }
     }
   }
 }
diff --git a/sarc/config.py b/sarc/config.py
index e13f5b45..0220e373 100644
--- a/sarc/config.py
+++ b/sarc/config.py
@@ -83,8 +83,7 @@ class ClusterConfig(BaseModel):
     duc_storage_command: str = None
     diskusage_report_command: str = None
     start_date: str = "2022-04-01"
-    gpus: list = []
-    harmonize_gpu_map: dict = {}
+    gpus_per_nodes: dict = {}
 
     @validator("timezone")
     def _timezone(cls, value):
@@ -137,9 +136,7 @@ def node_to_gpu(self):
         """
         from .jobs.node_gpu_mapping import NodeToGPUMapping
 
-        return NodeToGPUMapping(
-            self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
-        )
+        return NodeToGPUMapping(self.name, self.nodes_info_file, self.gpus_per_nodes)
 
 
 class MongoConfig(BaseModel):
diff --git a/sarc/jobs/node_gpu_mapping.py b/sarc/jobs/node_gpu_mapping.py
index b8747ae6..90c86c74 100644
--- a/sarc/jobs/node_gpu_mapping.py
+++ b/sarc/jobs/node_gpu_mapping.py
@@ -11,23 +11,63 @@
 import regex as re
 from hostlist import expand_hostlist
 
+MIG_FLAG = "__MIG__"
+DEFAULTS_FLAG = "__DEFAULTS__"
+
+
+def _find_pattern(string: str) -> tuple:
+    try:
+        begin = string.index("{{")
+        end = string.index("}}")
+    except ValueError:
+        return None, None
+    return string[begin : end + 2], string[begin + 2 : end]
+
+
+def _expand_list(list_pattern: str) -> str:
+    start, stop = list_pattern[1:-1].split("-")
+    start, stop = int(start), int(stop)
+    regex = "|".join([f"0*{i}" for i in range(start, stop + 1)])
+    return f"({regex})"
+
+
+EXPAND_PATTERNS = {re.compile(r"^\[.*\]$"): _expand_list}
+
+
+def expand_patterns(string: str) -> re.Pattern:
+    full_pattern, pattern = _find_pattern(string)
+    while pattern:
+        for pattern_regex, _expand in EXPAND_PATTERNS.items():
+            if pattern_regex.match(pattern):
+                regex = _expand(pattern)
+                string = string.replace(full_pattern, regex)
+                break
+        else:
+            raise ValueError(f"Unknown pattern {full_pattern}")
+
+        full_pattern, pattern = _find_pattern(string)
+
+    return re.compile(string)
+
 
 class NodeToGPUMapping:
     """Helper class to generate JSON file, load it in memory, and query GPU
    type for a nodename."""
 
-    def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
+    def __init__(self, cluster_name, nodes_info_file, gpus_per_nodes: dict):
         """Initialize with cluster name and TXT file path to parse."""
         # Mapping is empty by default.
         self.mapping = {}
         self.json_path = None
-        self.harmonize_gpu_map = {
-            **{
-                re.compile(regex): gpu_type
-                for regex, gpu_type in harmonize_gpu_map.items()
-            },
-            **{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
-        }
+        self.harmonize_gpu_map = {}
+        for node_pattern, node_gpus in gpus_per_nodes.items():
+            if node_pattern != DEFAULTS_FLAG:
+                node_pattern = expand_patterns(f"^{node_pattern}$")
+            self.harmonize_gpu_map[node_pattern] = {
+                re.compile(f".*{gpu}.*"): descriptive_gpu
+                for gpu, descriptive_gpu in node_gpus.items()
+            }
+        self.default_gpu_map = self.harmonize_gpu_map.pop(DEFAULTS_FLAG, {})
 
         # Mapping is filled only if TXT file is available.
         if nodes_info_file and os.path.exists(nodes_info_file):
@@ -54,22 +94,36 @@ def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
         with open(self.json_path, encoding="utf-8") as file:
             self.mapping = json.load(file)
 
-    def _harmonize_gpu(self, gpu_type: str):
+    def _harmonize_gpu(self, nodename: str, gpu_type: str):
         gpu_type = gpu_type.lower().replace(" ", "-").split(":")
         if gpu_type[0] == "gpu":
             gpu_type.pop(0)
         gpu_type = gpu_type[0]
-        for regex, harmonized_gpu in self.harmonize_gpu_map.items():
+
+        for node_regex, gpu_map in self.harmonize_gpu_map.items():
+            if node_regex.match(nodename):
+                break
+        else:
+            gpu_map = self.default_gpu_map
+
+        for regex, harmonized_gpu in gpu_map.items():
             if regex.match(gpu_type):
                 break
         else:
             harmonized_gpu = None
+
+        if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
+            harmonized_gpu = self._harmonize_gpu(
+                nodename, harmonized_gpu[len(MIG_FLAG) :]
+            )
+            harmonized_gpu = f"{harmonized_gpu} : {gpu_type}"
+
         return harmonized_gpu
 
     def __getitem__(self, nodename):
         """Return GPU type for nodename, or None if not found."""
         gpu_type = self.mapping.get(nodename, None)
-        return self._harmonize_gpu(gpu_type)
+        return self._harmonize_gpu(nodename, gpu_type)
 
     @staticmethod
     def _parse_nodenames(path: str, output: dict):
diff --git a/tests/conftest.py b/tests/conftest.py
index 03caac83..d7bc9222 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,15 +1,15 @@
+import json
 import os
-import shutil
 import sys
 import tempfile
 import zoneinfo
 from pathlib import Path
-from unittest.mock import MagicMock, mock_open
+from unittest.mock import MagicMock, mock_open, patch
 
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
-from opentelemetry.trace import set_tracer_provider
+from opentelemetry.trace import get_tracer_provider, set_tracer_provider
 
 _tracer_provider = TracerProvider()
 _exporter = InMemorySpanExporter()
@@ -24,6 +24,7 @@
 from sarc.config import (
     ClusterConfig,
     Config,
+    MongoConfig,
     ScraperConfig,
     config,
     parse_config,
@@ -72,14 +73,6 @@ def disabled_cache():
     yield
 
 
-# Make sure the cache dir is empty before running the tests
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_test_cache_before_run(standard_config_object):
-    if standard_config_object.cache.exists():
-        shutil.rmtree(str(standard_config_object.cache))
-    yield
-
-
 @pytest.fixture
 def tzlocal_is_mtl(monkeypatch):
     monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))
diff --git a/tests/functional/jobs/test_func_sacct.py b/tests/functional/jobs/test_func_sacct.py
index 2e8bbaa3..bd8fe7a2 100644
--- a/tests/functional/jobs/test_func_sacct.py
+++ b/tests/functional/jobs/test_func_sacct.py
@@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
     job = jobs[0]
     print(job)
     print(job.nodes)
-    assert job.allocated.gpu_type == "asupergpu"
+    assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"
 
     file_regression.check(
         f"Found {len(jobs)} job(s):\n"
diff --git a/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt b/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
index 8aff3fe2..bc0767e8 100644
--- a/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
+++ b/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
@@ -42,7 +42,7 @@ Found 1 job(s):
         "node": 1,
         "billing": 1,
         "gres_gpu": 1,
-        "gpu_type": "asupergpu"
+        "gpu_type": "Nec Plus ULTRA GPU 2000"
     },
     "stored_statistics": null
 }
\ No newline at end of file
diff --git a/tests/functional/test_clusterconfig.py b/tests/functional/test_clusterconfig.py
index ef803235..5935a256 100644
--- a/tests/functional/test_clusterconfig.py
+++ b/tests/functional/test_clusterconfig.py
@@ -8,9 +8,6 @@ def test_clusterconfig_node_to_gpu():
     cluster_config = config().clusters["raisin_no_prometheus"]
     mapping = cluster_config.node_to_gpu
 
-    result = mapping["cn-c018"]
-    assert result in cluster_config.gpus
-    assert (
-        mapping._harmonize_gpu(f"{cluster_config.gpus[0]}_suffix")
-        == cluster_config.gpus[0]
-    )
+    nodename = "cn-c018"
+    result = mapping[nodename]
+    assert result == cluster_config.gpus_per_nodes[nodename]["asupergpu"]
diff --git a/tests/sarc-test.json b/tests/sarc-test.json
index a12e72cb..8e504db2 100644
--- a/tests/sarc-test.json
+++ b/tests/sarc-test.json
@@ -41,9 +41,13 @@
       "diskusage_report_command": null,
       "prometheus_url": null,
       "nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
-      "gpus": ["asupergpu"],
-      "harmonize_gpu_map": {
-        ".*asupergpu_suffix.*": "asupergpu"
+      "gpus_per_nodes": {
+        "cn-c018": {
+          "asupergpu": "Nec Plus Plus ULTRA GPU 2000"
+        },
+        "cn-c{{[019-030]}}": {
+          "asupergpu": "Nec Plus ULTRA GPU 2000"
+        }
       }
     },
     "fromage": {
diff --git a/tests/unittests/jobs/test_node_to_gpu_mapping.py b/tests/unittests/jobs/test_node_to_gpu_mapping.py
index 75a2083a..74927f20 100644
--- a/tests/unittests/jobs/test_node_to_gpu_mapping.py
+++ b/tests/unittests/jobs/test_node_to_gpu_mapping.py
@@ -1,38 +1,121 @@
+import re
+
 import pytest
 
-from sarc.jobs.node_gpu_mapping import NodeToGPUMapping
+from sarc.jobs.node_gpu_mapping import (
+    DEFAULTS_FLAG,
+    MIG_FLAG,
+    NodeToGPUMapping,
+    _expand_list,
+    _find_pattern,
+    expand_patterns,
+)
+
+GPUS_PER_NODES = {
+    "node{{[0-9]}}": {"gpu1": "DESCRIPTIVE GPU 1"},
+    "node{{[9-19]}}": {"gpu2": "DESCRIPTIVE GPU 2"},
+    "node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "[0-9]+g\.[0-9]+gb": f"{MIG_FLAG}gpu3"},
+    DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"},
+}
+
+
+@pytest.mark.parametrize(
+    "pattern,expected",
+    [
+        ["{{}}{{}}", ("{{}}", "")],
+        ["{{pattern}}", ("{{pattern}}", "pattern")],
+        ["{{pattern1}}something{{pattern2}}", ("{{pattern1}}", "pattern1")],
+    ],
+)
+def test__find_pattern(pattern, expected):
+    assert _find_pattern(pattern) == expected
+
+
+def test__expand_list():
+    start = 9
+    stop = 19
+    pattern = "{{[9-19]}}"
+    expected = f"({'|'.join([f'0*{i}' for i in range(start, stop + 1)])})"
+
+    _, pattern = _find_pattern(pattern)
+
+    assert _expand_list(pattern) == expected
+
+    for i in (start - 1, stop + 1):
+        assert re.match(expected, "0" * int(start / 2) + str(i)) is None
+        assert re.match(expected, str(i)) is None
+
+    for i in range(start, stop + 1):
+        assert re.match(expected, "0" * int(start / 2) + str(i))
+        assert re.match(expected, str(i))
+
+
+@pytest.mark.parametrize(
+    "string,expected,match",
+    [
+        [
+            "prefix {{[9-11]}}__{{[11-13]}} suffix",
+            re.compile(
+                f"prefix {_expand_list('[9-11]')}__{_expand_list('[11-13]')} suffix"
+            ),
+            "prefix 10__11 suffix",
+        ],
+        ["{{[9-11]}}{{DoesNotExist}}", None, None],
+    ],
+)
+def test_expand_patterns(string, expected, match):
+    if expected is None:
+        with pytest.raises(ValueError):
+            expand_patterns(string)
+        return
+
+    regex = expand_patterns(string)
+    assert regex.pattern == expected.pattern
+    assert regex.match(match)
 
 
 @pytest.mark.parametrize(
-    "gpu_type,expected,harmonize_gpu_map,gpus",
+    "node,gpu_type,expected,gpus_per_nodes",
     [
         [
+            "DoesNotExist",
             "DoesNotExist",
             None,
             {},
-            [],
         ],
         [
+            "node1",
             "prefix GPU1:suffix",
-            "gpu1",
-            {},
-            ["gpu1", "gpu2"],
+            "DESCRIPTIVE GPU 1",
+            GPUS_PER_NODES,
         ],
         [
-            "prefix GPU2 suffix",
-            "gpu2",
-            {},
-            ["gpu1", "gpu2"],
+            "node11",
+            "prefix GPU2:suffix",
+            "DESCRIPTIVE GPU 2",
+            GPUS_PER_NODES,
+        ],
+        [
+            "DoesNotExist",
+            "prefix GPU_DEFAULT:suffix",
+            "DESCRIPTIVE GPU DEFAULT",
+            GPUS_PER_NODES,
+        ],
+        [
+            "node1",
+            "DoesNotExist",
+            None,
+            GPUS_PER_NODES,
         ],
         [
-            "prefix GPU1_suffix",
-            "gpu1",
-            {".*gpu1_suffix.*": "gpu1"},
-            ["gpu1", "gpu2"],
+            "node_mig20",
+            "4g.40gb",
+            "DESCRIPTIVE GPU 3 : 4g.40gb",
+            GPUS_PER_NODES,
         ],
     ],
 )
-def test_node_to_gpu_mapping(gpu_type, expected, harmonize_gpu_map, gpus):
-    mapping = NodeToGPUMapping("cluster", None, harmonize_gpu_map, gpus)
+def test_node_to_gpu_mapping(node, gpu_type, expected, gpus_per_nodes):
+    mapping = NodeToGPUMapping("cluster", None, gpus_per_nodes)
 
-    assert mapping._harmonize_gpu(gpu_type) == expected
+    assert mapping._harmonize_gpu(node, gpu_type) == expected
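
A minimal usage sketch (outside the patch itself), mirroring the new unit tests: node-name
patterns such as "cn-c{{[001-040]}}" are expanded by expand_patterns() into anchored regexes,
the short GPU keys of the matching node entry are matched against the Slurm-reported GRES
string, MIG profiles (e.g. "3g.40gb") are routed back to their parent GPU through the __MIG__
flag, and __DEFAULTS__ acts as the fallback entry. The node names and GPU descriptions below
are illustrative only, and the sketch calls the private _harmonize_gpu() helper directly,
exactly as the tests do.

# Illustrative example only -- node names and GPU descriptions are made up.
from sarc.jobs.node_gpu_mapping import DEFAULTS_FLAG, MIG_FLAG, NodeToGPUMapping

gpus_per_nodes = {
    # "{{[001-040]}}" expands to a regex matching 001 through 040 (leading zeros allowed).
    "cn-c{{[001-040]}}": {"rtx8000": "Quadro RTX 8000"},
    "cn-g{{[001-029]}}": {
        "a100": "NVIDIA A100 80GB PCIe",
        # MIG profiles such as "3g.40gb" resolve to the parent GPU's description.
        r"[0-9]+g\.[0-9]+gb": f"{MIG_FLAG}a100",
    },
    # Fallback entry used when no node pattern matches.
    DEFAULTS_FLAG: {"v100": "Tesla V100-SXM2-16GB"},
}

# No nodes_info_file is given, so only the harmonization step is exercised,
# as in tests/unittests/jobs/test_node_to_gpu_mapping.py.
mapping = NodeToGPUMapping("mila", None, gpus_per_nodes)

print(mapping._harmonize_gpu("cn-c018", "gpu:rtx8000:2"))  # Quadro RTX 8000
print(mapping._harmonize_gpu("cn-g001", "3g.40gb"))        # NVIDIA A100 80GB PCIe : 3g.40gb
print(mapping._harmonize_gpu("login-node", "v100"))        # Tesla V100-SXM2-16GB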