Add per-cluster node-to-GPU maps

mila-iqia · Apr 3, 2024 · 8d26d49 · 8d26d49
1 parent 262d37c
commit 8d26d49
Show file tree

Hide file tree

Showing 9 changed files with 271 additions and 60 deletions.
diff --git a/config/sarc-dev.json b/config/sarc-dev.json
@@ -28,7 +28,37 @@
             "duc_storage_command": null,
             "diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
             "prometheus_url": "http://monitoring.server.mila.quebec:9090/",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "gpus_per_nodes": {
+                "cn-a{{[001-011]}}": {
+                    "rtx8000": "Quadro RTX 8000"
+                },
+                "cn-b{{[001-005]}}": {
+                    "v100": "Tesla V100-32GB"
+                },
+                "cn-c{{[001-040]}}": {
+                    "rtx8000": "Quadro RTX 8000"
+                },
+                "cn-g{{[001-029]}}": {
+                    "a100": "NVIDIA A100 80GB",
+                    "[0-9]+g\\.[0-9]+gb": "__MIG_FLAG__a100"
+                },
+                "cn-i001": {
+                    "a100": "NVIDIA A100 80GB"
+                },
+                "cn-j001": {
+                    "a6000": "NVIDIA RTX A6000"
+                },
+                "cn-d{{[001-002]}}": {
+                    "a100": "NVIDIA A100-SXM4-40GB"
+                },
+                "cn-d{{[003-004]}}": {
+                    "a100": "NVIDIA A100-SXM4-80GB"
+                },
+                "cn-e{{[002-003]}}": {
+                    "v100": "Tesla V100-SXM2-32GB"
+                }
+            }
         },
         "narval": {
             "host": "narval.computecanada.ca",
@@ -40,7 +70,12 @@
             "diskusage_report_command": "diskusage_report --project --all_users",
             "prometheus_url": "https://mila-thanos.calculquebec.ca",
             "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "gpus_per_nodes": {
+                "__DEFAULTS__": {
+                    "a100": "NVIDIA A100-SXM4-40GB"
+                }
+            }
         },
         "beluga": {
             "host": "beluga.computecanada.ca",
@@ -52,7 +87,12 @@
             "diskusage_report_command": "diskusage_report --project --all_users",
             "prometheus_url": "https://mila-thanos.calculquebec.ca",
             "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "gpus_per_nodes": {
+                "__DEFAULTS__": {
+                    "v100": "Tesla V100-SXM2-16GB"
+                }
+            }
         },
         "graham": {
             "host": "graham.computecanada.ca",
@@ -65,7 +105,30 @@
             "prometheus_url": null,
             "prometheus_headers_file": null,
             "start_date": "2022-04-01",
-            "nodes_info_file": "secrets/nodes_graham.txt"
+            "nodes_info_file": "secrets/nodes_graham.txt",
+            "gpus_per_nodes": {
+                "gra{{[828-987]}}": {
+                    "p100": "NVIDIA P100-12G"
+                },
+                "gra{{[1147-1153]}}": {
+                    "v100": "NVIDIA V100-16G"
+                },
+                "gra{{[1154-1189]}}": {
+                    "t4": "NVIDIA T4-16G"
+                },
+                "gra{{[1337-1338]}}": {
+                    "v100": "NVIDIA V100-32G"
+                },
+                "gra1342": {
+                    "a100": "NVIDIA A100"
+                },
+                "gra{{[1361-1362]}}": {
+                    "a100": "NVIDIA A100"
+                },
+                "gra{{[1363-1373]}}": {
+                    "a5000": "NVIDIA A5000-24G"
+                }
+            }
         },
         "cedar": {
             "host": "cedar.computecanada.ca",
@@ -78,7 +141,27 @@
             "prometheus_url": null,
             "prometheus_headers_file": null,
             "start_date": "2022-04-01",
-            "nodes_info_file": "secrets/nodes_cedar.txt"
+            "nodes_info_file": "secrets/nodes_cedar.txt",
+            "gpus_per_nodes": {
+                "cdr{{[26-386]}}": {
+                    "p100": "NVIDIA P100-12G"
+                },
+                "cdr{{[876-904]}}": {
+                    "p100l": "NVIDIA P100-16G"
+                },
+                "cdr{{[905-906]}}": {
+                    "p100": "NVIDIA P100-12G"
+                },
+                "cdr{{[908-911]}}": {
+                    "p100l": "NVIDIA P100-16G"
+                },
+                "cdr{{[912-922]}}": {
+                    "p100": "NVIDIA P100-12G"
+                },
+                "cdr{{[2468-2678]}}": {
+                    "v100l": "NVIDIA V100-32G"
+                }
+            }
         }
     }
 }
diff --git a/sarc/config.py b/sarc/config.py
@@ -83,8 +83,7 @@ class ClusterConfig(BaseModel):
     duc_storage_command: str = None
     diskusage_report_command: str = None
     start_date: str = "2022-04-01"
-    gpus: list = []
-    harmonize_gpu_map: dict = {}
+    gpus_per_nodes: dict = {}
 
     @validator("timezone")
     def _timezone(cls, value):
@@ -137,9 +136,7 @@ def node_to_gpu(self):
         """
         from .jobs.node_gpu_mapping import NodeToGPUMapping
 
-        return NodeToGPUMapping(
-            self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
-        )
+        return NodeToGPUMapping(self.name, self.nodes_info_file, self.gpus_per_nodes)
 
 
 class MongoConfig(BaseModel):

diff --git a/sarc/jobs/node_gpu_mapping.py b/sarc/jobs/node_gpu_mapping.py
@@ -11,23 +11,63 @@
 import regex as re
 from hostlist import expand_hostlist
 
+# Prefix that marks a GPU-map value as a MIG entry: the text following the
+# prefix is the GPU key to harmonize again.  It must be spelled exactly like
+# the prefix written in the cluster configuration ("__MIG_FLAG__a100").
+MIG_FLAG = "__MIG_FLAG__"
+# Key of the GPU map used for nodes that match no explicit node pattern.
+DEFAULTS_FLAG = "__DEFAULTS__"
+
+
+def _find_pattern(string: str) -> tuple:
+    """Locate the first ``{{...}}`` placeholder in *string*.
+
+    Returns a ``(full, inner)`` pair where ``full`` is the placeholder with its
+    surrounding braces and ``inner`` is the text between them, or
+    ``(None, None)`` when *string* holds no complete placeholder.
+    """
+    try:
+        begin = string.index("{{")
+        # Only look for the closing marker after the opening one: a stray "}}"
+        # earlier in the string would otherwise yield an inverted, empty slice
+        # and the real placeholder would be left unexpanded.
+        end = string.index("}}", begin + 2)
+    except ValueError:
+        return None, None
+    return string[begin : end + 2], string[begin + 2 : end]
+
+
+def _expand_list(list_pattern: str) -> str:
+    """Translate a ``[start-stop]`` range into a regex alternation group.
+
+    Every integer from ``start`` to ``stop`` (inclusive) becomes one
+    alternative prefixed with ``0*``, so any amount of zero padding matches.
+    """
+    # Strip the enclosing brackets, then split the two bounds.
+    lower_text, upper_text = list_pattern[1:-1].split("-")
+    lower = int(lower_text)
+    upper = int(upper_text)
+    alternatives = (f"0*{number}" for number in range(lower, upper + 1))
+    return "(" + "|".join(alternatives) + ")"
+
+
+# Maps a regex recognising the text found inside a ``{{...}}`` placeholder to
+# the function that turns that text into a regex fragment.  Only bracketed
+# lists are registered here.
+EXPAND_PATTERNS = {re.compile(r"^\[.*\]$"): _expand_list}
+
+
+def expand_patterns(string: str) -> re.Pattern:
+    """Compile *string* into a regex after expanding its ``{{...}}`` placeholders.
+
+    Placeholders are expanded one at a time by the first entry of
+    ``EXPAND_PATTERNS`` whose regex matches the placeholder content; a
+    placeholder that no entry recognises raises ``ValueError``.
+    """
+    while True:
+        placeholder, inner = _find_pattern(string)
+        if not inner:
+            # Nothing (more) to expand: what is left is the final regex.
+            return re.compile(string)
+
+        expander = next(
+            (
+                expand
+                for matcher, expand in EXPAND_PATTERNS.items()
+                if matcher.match(inner)
+            ),
+            None,
+        )
+        if expander is None:
+            raise ValueError(f"Unknown pattern {placeholder}")
+
+        string = string.replace(placeholder, expander(inner))
+
 
 class NodeToGPUMapping:
     """Helper class to generate JSON file, load it in memory, and query GPU type for a nodename."""
 
-    def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
+    def __init__(self, cluster_name, nodes_info_file, gpus_per_nodes: dict):
         """Initialize with cluster name and TXT file path to parse."""
 
         # Mapping is empty by default.
         self.mapping = {}
         self.json_path = None
-        self.harmonize_gpu_map = {
-            **{
-                re.compile(regex): gpu_type
-                for regex, gpu_type in harmonize_gpu_map.items()
-            },
-            **{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
-        }
+        self.harmonize_gpu_map = {}
+        for node_pattern, node_gpus in gpus_per_nodes.items():
+            if node_pattern != DEFAULTS_FLAG:
+                node_pattern = expand_patterns(f"^{node_pattern}$")
+            self.harmonize_gpu_map[node_pattern] = {
+                re.compile(f".*{gpu}.*"): descriptive_gpu
+                for gpu, descriptive_gpu in node_gpus.items()
+            }
+        self.default_gpu_map = self.harmonize_gpu_map.pop(DEFAULTS_FLAG, {})
 
         # Mapping is filled only if TXT file is available.
         if nodes_info_file and os.path.exists(nodes_info_file):
@@ -54,22 +94,36 @@ def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
                 with open(self.json_path, encoding="utf-8") as file:
                     self.mapping = json.load(file)
 
-    def _harmonize_gpu(self, gpu_type: str):
+    def _harmonize_gpu(self, nodename: str, gpu_type: str):
+        """Return the descriptive GPU name for `gpu_type` on node `nodename`.
+
+        The GPU map of the first node pattern matching `nodename` is used,
+        falling back to the default map when no pattern matches.  None is
+        returned when no GPU regex of the selected map matches `gpu_type`.
+        When the matched name starts with MIG_FLAG, the text after the flag
+        is harmonized again for the same node and the normalised `gpu_type`
+        is appended after " : ".
+        """
+        # Normalise: lower-case, spaces to dashes, then keep the first
+        # ":"-separated field that follows an optional leading "gpu" field.
         gpu_type = gpu_type.lower().replace(" ", "-").split(":")
         if gpu_type[0] == "gpu":
             gpu_type.pop(0)
         gpu_type = gpu_type[0]
-        for regex, harmonized_gpu in self.harmonize_gpu_map.items():
+
+        # Select the GPU map of the first node pattern matching the node name.
+        for node_regex, gpu_map in self.harmonize_gpu_map.items():
+            if node_regex.match(nodename):
+                break
+        else:
+            gpu_map = self.default_gpu_map
+
+        # The first GPU regex matching the normalised type wins.
+        for regex, harmonized_gpu in gpu_map.items():
             if regex.match(gpu_type):
                 break
         else:
             harmonized_gpu = None
+
+        # MIG entries point at another GPU key: resolve it recursively.
+        if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
+            harmonized_gpu = self._harmonize_gpu(
+                nodename, harmonized_gpu[len(MIG_FLAG) :]
+            )
+            harmonized_gpu = f"{harmonized_gpu} : {gpu_type}"
+
         return harmonized_gpu
 
     def __getitem__(self, nodename):
         """Return GPU type for nodename, or None if not found."""
         gpu_type = self.mapping.get(nodename, None)
-        return self._harmonize_gpu(gpu_type)
+        # A node missing from the mapping has no GPU type to harmonize;
+        # return None as documented instead of failing on `None.lower()`.
+        if gpu_type is None:
+            return None
+        return self._harmonize_gpu(nodename, gpu_type)
 
     @staticmethod
     def _parse_nodenames(path: str, output: dict):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,15 +1,15 @@
+import json
 import os
-import shutil
 import sys
 import tempfile
 import zoneinfo
 from pathlib import Path
-from unittest.mock import MagicMock, mock_open
+from unittest.mock import MagicMock, mock_open, patch
 
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
-from opentelemetry.trace import set_tracer_provider
+from opentelemetry.trace import get_tracer_provider, set_tracer_provider
 
 _tracer_provider = TracerProvider()
 _exporter = InMemorySpanExporter()
@@ -24,6 +24,7 @@
 from sarc.config import (
     ClusterConfig,
     Config,
+    MongoConfig,
     ScraperConfig,
     config,
     parse_config,
@@ -72,14 +73,6 @@ def disabled_cache():
         yield
 
 
-# Make sure the cache dir is empty before running the tests
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_test_cache_before_run(standard_config_object):
-    if standard_config_object.cache.exists():
-        shutil.rmtree(str(standard_config_object.cache))
-    yield
-
-
 @pytest.fixture
 def tzlocal_is_mtl(monkeypatch):
     monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))

diff --git a/tests/functional/jobs/test_func_sacct.py b/tests/functional/jobs/test_func_sacct.py
@@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
     job = jobs[0]
     print(job)
     print(job.nodes)
-    assert job.allocated.gpu_type == "asupergpu"
+    assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"
 
     file_regression.check(
         f"Found {len(jobs)} job(s):\n"

diff --git a/...al/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt b/...al/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
@@ -42,7 +42,7 @@ Found 1 job(s):
         "node": 1,
         "billing": 1,
         "gres_gpu": 1,
-        "gpu_type": "asupergpu"
+        "gpu_type": "Nec Plus ULTRA GPU 2000"
     },
     "stored_statistics": null
 }
diff --git a/tests/functional/test_clusterconfig.py b/tests/functional/test_clusterconfig.py
@@ -8,9 +8,6 @@ def test_clusterconfig_node_to_gpu():
     cluster_config = config().clusters["raisin_no_prometheus"]
     mapping = cluster_config.node_to_gpu
 
-    result = mapping["cn-c018"]
-    assert result in cluster_config.gpus
-    assert (
-        mapping._harmonize_gpu(f"{cluster_config.gpus[0]}_suffix")
-        == cluster_config.gpus[0]
-    )
+    nodename = "cn-c018"
+    result = mapping[nodename]
+    assert result == cluster_config.gpus_per_nodes[nodename]["asupergpu"]
diff --git a/tests/sarc-test.json b/tests/sarc-test.json
@@ -41,9 +41,13 @@
             "diskusage_report_command": null,
             "prometheus_url": null,
             "nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
-            "gpus": ["asupergpu"],
-            "harmonize_gpu_map": {
-                ".*asupergpu_suffix.*": "asupergpu"
+            "gpus_per_nodes": {
+                "cn-c018": {
+                    "asupergpu": "Nec Plus Plus ULTRA GPU 2000"
+                },
+                "cn-c{{[019-030]}}": {
+                    "asupergpu": "Nec Plus ULTRA GPU 2000"
+                }
             }
         },
         "fromage": {