From f4a0030c0a0a798c9c80534527e8ee6fb333173c Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne
Date: Wed, 3 Apr 2024 16:00:51 -0400
Subject: [PATCH] Add per-cluster node GPU maps

---
 config/sarc-dev.json                          |  93 +++++++++++++-
 sarc/config.py                                |   7 +-
 sarc/jobs/node_gpu_mapping.py                 |  76 ++++++++++--
 tests/conftest.py                             |  15 +--
 tests/functional/jobs/test_func_sacct.py      |   2 +-
 ...ut_prometheus_json_jobs0_test_config0_.txt |   2 +-
 tests/functional/test_clusterconfig.py        |   9 +-
 tests/sarc-test.json                          |  10 +-
 .../jobs/test_node_to_gpu_mapping.py          | 117 +++++++++++++++---
 9 files changed, 271 insertions(+), 60 deletions(-)

diff --git a/config/sarc-dev.json b/config/sarc-dev.json
index 350f44c6..a7a0b4f7 100644
--- a/config/sarc-dev.json
+++ b/config/sarc-dev.json
@@ -28,7 +28,37 @@
       "duc_storage_command": null,
       "diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
       "prometheus_url": "http://monitoring.server.mila.quebec:9090/",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "cn-a{{[001-011]}}": {
+          "rtx8000": "Quadro RTX 8000"
+        },
+        "cn-b{{[001-005]}}": {
+          "v100": "Tesla V100-SXM2-32GB"
+        },
+        "cn-c{{[001-040]}}": {
+          "rtx8000": "Quadro RTX 8000"
+        },
+        "cn-g{{[001-029]}}": {
+          "a100": "NVIDIA A100 80GB PCIe",
+          "[0-9]+g\\.[0-9]+gb": "__MIG__a100"
+        },
+        "cn-i001": {
+          "a100": "NVIDIA A100 80GB PCIe"
+        },
+        "cn-j001": {
+          "a6000": "NVIDIA RTX A6000"
+        },
+        "cn-d{{[001-002]}}": {
+          "a100": "NVIDIA A100-SXM4-40GB"
+        },
+        "cn-d{{[003-004]}}": {
+          "a100": "NVIDIA A100-SXM4-80GB"
+        },
+        "cn-e{{[002-003]}}": {
+          "v100": "Tesla V100-SXM2-32GB"
+        }
+      }
     },
     "narval": {
       "host": "narval.computecanada.ca",
@@ -40,7 +70,12 @@
       "diskusage_report_command": "diskusage_report --project --all_users",
       "prometheus_url": "https://mila-thanos.calculquebec.ca",
       "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "__DEFAULTS__": {
+          "a100": "NVIDIA A100-SXM4-40GB"
+        }
+      }
     },
     "beluga": {
       "host": "beluga.computecanada.ca",
@@ -52,7 +87,12 @@
       "diskusage_report_command": "diskusage_report --project --all_users",
       "prometheus_url": "https://mila-thanos.calculquebec.ca",
       "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "__DEFAULTS__": {
+          "v100": "Tesla V100-SXM2-16GB"
+        }
+      }
     },
     "graham": {
       "host": "graham.computecanada.ca",
@@ -65,7 +105,30 @@
       "prometheus_url": null,
       "prometheus_headers_file": null,
       "start_date": "2022-04-01",
-      "nodes_info_file": "secrets/nodes_graham.txt"
+      "nodes_info_file": "secrets/nodes_graham.txt",
+      "gpus_per_nodes": {
+        "gra{{[828-987]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "gra{{[1147-1153]}}": {
+          "v100": "NVIDIA V100-16G PCIe"
+        },
+        "gra{{[1154-1189]}}": {
+          "t4": "NVIDIA T4-16G PCIe"
+        },
+        "gra{{[1337-1338]}}": {
+          "v100": "NVIDIA V100-32G PCIe"
+        },
+        "gra1342": {
+          "a100": "NVIDIA A100 PCIe"
+        },
+        "gra{{[1361-1362]}}": {
+          "a100": "NVIDIA A100 PCIe"
+        },
+        "gra{{[1363-1373]}}": {
+          "a5000": "NVIDIA A5000-24G PCIe"
+        }
+      }
     },
     "cedar": {
       "host": "cedar.computecanada.ca",
@@ -78,7 +141,27 @@
       "prometheus_url": null,
       "prometheus_headers_file": null,
       "start_date": "2022-04-01",
-      "nodes_info_file": "secrets/nodes_cedar.txt"
+      "nodes_info_file": "secrets/nodes_cedar.txt",
+      "gpus_per_nodes": {
+        "cdr{{[26-386]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[876-904]}}": {
+          "p100l": "NVIDIA P100-16G PCIe"
+        },
+        "cdr{{[905-906]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[908-911]}}": {
+          "p100l": "NVIDIA P100-16G PCIe"
+        },
+        "cdr{{[912-922]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[2468-2678]}}": {
+          "v100l": "NVIDIA V100-32G PCIe"
+        }
+      }
     }
   }
 }
diff --git a/sarc/config.py b/sarc/config.py
index e13f5b45..0220e373 100644
--- a/sarc/config.py
+++ b/sarc/config.py
@@ -83,8 +83,7 @@ class ClusterConfig(BaseModel):
     duc_storage_command: str = None
     diskusage_report_command: str = None
     start_date: str = "2022-04-01"
-    gpus: list = []
-    harmonize_gpu_map: dict = {}
+    gpus_per_nodes: dict = {}
 
     @validator("timezone")
     def _timezone(cls, value):
@@ -137,9 +136,7 @@ def node_to_gpu(self):
         """
         from .jobs.node_gpu_mapping import NodeToGPUMapping
 
-        return NodeToGPUMapping(
-            self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
-        )
+        return NodeToGPUMapping(self.name, self.nodes_info_file, self.gpus_per_nodes)
 
 
 class MongoConfig(BaseModel):
diff --git a/sarc/jobs/node_gpu_mapping.py b/sarc/jobs/node_gpu_mapping.py
index b8747ae6..90c86c74 100644
--- a/sarc/jobs/node_gpu_mapping.py
+++ b/sarc/jobs/node_gpu_mapping.py
@@ -11,23 +11,63 @@
 import regex as re
 from hostlist import expand_hostlist
 
+MIG_FLAG = "__MIG__"
+DEFAULTS_FLAG = "__DEFAULTS__"
+
+
+def _find_pattern(string: str) -> tuple:
+    try:
+        begin = string.index("{{")
+        end = string.index("}}")
+    except ValueError:
+        return None, None
+    return string[begin : end + 2], string[begin + 2 : end]
+
+
+def _expand_list(list_pattern: str) -> str:
+    start, stop = list_pattern[1:-1].split("-")
+    start, stop = int(start), int(stop)
+    regex = "|".join([f"0*{i}" for i in range(start, stop + 1)])
+    return f"({regex})"
+
+
+EXPAND_PATTERNS = {re.compile(r"^\[.*\]$"): _expand_list}
+
+
+def expand_patterns(string: str) -> re.Pattern:
+    full_pattern, pattern = _find_pattern(string)
+    while pattern:
+        for pattern_regex, _expand in EXPAND_PATTERNS.items():
+            if pattern_regex.match(pattern):
+                regex = _expand(pattern)
+                string = string.replace(full_pattern, regex)
+                break
+        else:
+            raise ValueError(f"Unknown pattern {full_pattern}")
+
+        full_pattern, pattern = _find_pattern(string)
+
+    return re.compile(string)
+
 
 class NodeToGPUMapping:
     """Helper class to generate JSON file, load it in memory, and query GPU
    type for a nodename."""
 
-    def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
+    def __init__(self, cluster_name, nodes_info_file, gpus_per_nodes: dict):
         """Initialize with cluster name and TXT file path to parse."""
         # Mapping is empty by default.
         self.mapping = {}
         self.json_path = None
-        self.harmonize_gpu_map = {
-            **{
-                re.compile(regex): gpu_type
-                for regex, gpu_type in harmonize_gpu_map.items()
-            },
-            **{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
-        }
+        self.harmonize_gpu_map = {}
+        for node_pattern, node_gpus in gpus_per_nodes.items():
+            if node_pattern != DEFAULTS_FLAG:
+                node_pattern = expand_patterns(f"^{node_pattern}$")
+            self.harmonize_gpu_map[node_pattern] = {
+                re.compile(f".*{gpu}.*"): descriptive_gpu
+                for gpu, descriptive_gpu in node_gpus.items()
+            }
+        self.default_gpu_map = self.harmonize_gpu_map.pop(DEFAULTS_FLAG, {})
 
         # Mapping is filled only if TXT file is available.
         if nodes_info_file and os.path.exists(nodes_info_file):
@@ -54,22 +94,36 @@ def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
         with open(self.json_path, encoding="utf-8") as file:
             self.mapping = json.load(file)
 
-    def _harmonize_gpu(self, gpu_type: str):
+    def _harmonize_gpu(self, nodename: str, gpu_type: str):
         gpu_type = gpu_type.lower().replace(" ", "-").split(":")
         if gpu_type[0] == "gpu":
             gpu_type.pop(0)
         gpu_type = gpu_type[0]
-        for regex, harmonized_gpu in self.harmonize_gpu_map.items():
+
+        for node_regex, gpu_map in self.harmonize_gpu_map.items():
+            if node_regex.match(nodename):
+                break
+        else:
+            gpu_map = self.default_gpu_map
+
+        for regex, harmonized_gpu in gpu_map.items():
             if regex.match(gpu_type):
                 break
         else:
             harmonized_gpu = None
+
+        if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
+            harmonized_gpu = self._harmonize_gpu(
+                nodename, harmonized_gpu[len(MIG_FLAG) :]
+            )
+            harmonized_gpu = f"{harmonized_gpu} : {gpu_type}"
+
         return harmonized_gpu
 
     def __getitem__(self, nodename):
         """Return GPU type for nodename, or None if not found."""
         gpu_type = self.mapping.get(nodename, None)
-        return self._harmonize_gpu(gpu_type)
+        return self._harmonize_gpu(nodename, gpu_type)
 
     @staticmethod
     def _parse_nodenames(path: str, output: dict):
diff --git a/tests/conftest.py b/tests/conftest.py
index 03caac83..d7bc9222 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,15 +1,15 @@
+import json
 import os
-import shutil
 import sys
 import tempfile
 import zoneinfo
 from pathlib import Path
-from unittest.mock import MagicMock, mock_open
+from unittest.mock import MagicMock, mock_open, patch
 
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
-from opentelemetry.trace import set_tracer_provider
+from opentelemetry.trace import get_tracer_provider, set_tracer_provider
 
 _tracer_provider = TracerProvider()
 _exporter = InMemorySpanExporter()
@@ -24,6 +24,7 @@
 from sarc.config import (
     ClusterConfig,
     Config,
+    MongoConfig,
     ScraperConfig,
     config,
     parse_config,
@@ -72,14 +73,6 @@ def disabled_cache():
     yield
 
 
-# Make sure the cache dir is empty before running the tests
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_test_cache_before_run(standard_config_object):
-    if standard_config_object.cache.exists():
-        shutil.rmtree(str(standard_config_object.cache))
-    yield
-
-
 @pytest.fixture
 def tzlocal_is_mtl(monkeypatch):
     monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))
diff --git a/tests/functional/jobs/test_func_sacct.py b/tests/functional/jobs/test_func_sacct.py
index 2e8bbaa3..bd8fe7a2 100644
--- a/tests/functional/jobs/test_func_sacct.py
+++ b/tests/functional/jobs/test_func_sacct.py
@@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
     job = jobs[0]
     print(job)
     print(job.nodes)
-    assert job.allocated.gpu_type == "asupergpu"
+    assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"
 
     file_regression.check(
         f"Found {len(jobs)} job(s):\n"
diff --git a/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt b/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
index 8aff3fe2..bc0767e8 100644
--- a/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
+++ b/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt
@@ -42,7 +42,7 @@ Found 1 job(s):
         "node": 1,
         "billing": 1,
         "gres_gpu": 1,
-        "gpu_type": "asupergpu"
+        "gpu_type": "Nec Plus ULTRA GPU 2000"
     },
     "stored_statistics": null
 }
\ No newline at end of file
diff --git a/tests/functional/test_clusterconfig.py b/tests/functional/test_clusterconfig.py
index ef803235..5935a256 100644
--- a/tests/functional/test_clusterconfig.py
+++ b/tests/functional/test_clusterconfig.py
@@ -8,9 +8,6 @@ def test_clusterconfig_node_to_gpu():
     cluster_config = config().clusters["raisin_no_prometheus"]
     mapping = cluster_config.node_to_gpu
 
-    result = mapping["cn-c018"]
-    assert result in cluster_config.gpus
-    assert (
-        mapping._harmonize_gpu(f"{cluster_config.gpus[0]}_suffix")
-        == cluster_config.gpus[0]
-    )
+    nodename = "cn-c018"
+    result = mapping[nodename]
+    assert result == cluster_config.gpus_per_nodes[nodename]["asupergpu"]
diff --git a/tests/sarc-test.json b/tests/sarc-test.json
index a12e72cb..8e504db2 100644
--- a/tests/sarc-test.json
+++ b/tests/sarc-test.json
@@ -41,9 +41,13 @@
       "diskusage_report_command": null,
       "prometheus_url": null,
       "nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
-      "gpus": ["asupergpu"],
-      "harmonize_gpu_map": {
-        ".*asupergpu_suffix.*": "asupergpu"
+      "gpus_per_nodes": {
+        "cn-c018": {
+          "asupergpu": "Nec Plus Plus ULTRA GPU 2000"
+        },
+        "cn-c{{[019-030]}}": {
+          "asupergpu": "Nec Plus ULTRA GPU 2000"
+        }
       }
     },
     "fromage": {
diff --git a/tests/unittests/jobs/test_node_to_gpu_mapping.py b/tests/unittests/jobs/test_node_to_gpu_mapping.py
index 75a2083a..74927f20 100644
--- a/tests/unittests/jobs/test_node_to_gpu_mapping.py
+++ b/tests/unittests/jobs/test_node_to_gpu_mapping.py
@@ -1,38 +1,121 @@
+import re
+
 import pytest
 
-from sarc.jobs.node_gpu_mapping import NodeToGPUMapping
+from sarc.jobs.node_gpu_mapping import (
+    DEFAULTS_FLAG,
+    MIG_FLAG,
+    NodeToGPUMapping,
+    _expand_list,
+    _find_pattern,
+    expand_patterns,
+)
+
+GPUS_PER_NODES = {
+    "node{{[0-9]}}": {"gpu1": "DESCRIPTIVE GPU 1"},
+    "node{{[9-19]}}": {"gpu2": "DESCRIPTIVE GPU 2"},
+    "node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "[0-9]+g\.[0-9]+gb": f"{MIG_FLAG}gpu3"},
+    DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"},
+}
+
+
+@pytest.mark.parametrize(
+    "pattern,expected",
+    [
+        ["{{}}{{}}", ("{{}}", "")],
+        ["{{pattern}}", ("{{pattern}}", "pattern")],
+        ["{{pattern1}}something{{pattern2}}", ("{{pattern1}}", "pattern1")],
+    ],
+)
+def test__find_pattern(pattern, expected):
+    assert _find_pattern(pattern) == expected
+
+
+def test__expand_list():
+    start = 9
+    stop = 19
+    pattern = "{{[9-19]}}"
+    expected = f"({'|'.join([f'0*{i}' for i in range(start, stop + 1)])})"
+
+    _, pattern = _find_pattern(pattern)
+
+    assert _expand_list(pattern) == expected
+
+    for i in (start - 1, stop + 1):
+        assert re.match(expected, "0" * int(start / 2) + str(i)) is None
+        assert re.match(expected, str(i)) is None
+
+    for i in range(start, stop + 1):
+        assert re.match(expected, "0" * int(start / 2) + str(i))
+        assert re.match(expected, str(i))
+
+
+@pytest.mark.parametrize(
+    "string,expected,match",
+    [
+        [
+            "prefix {{[9-11]}}__{{[11-13]}} suffix",
+            re.compile(
+                f"prefix {_expand_list('[9-11]')}__{_expand_list('[11-13]')} suffix"
+            ),
+            "prefix 10__11 suffix",
+        ],
+        ["{{[9-11]}}{{DoesNotExist}}", None, None],
+    ],
+)
+def test_expand_patterns(string, expected, match):
+    if expected is None:
+        with pytest.raises(ValueError):
+            expand_patterns(string)
+        return
+
+    regex = expand_patterns(string)
+    assert regex.pattern == expected.pattern
+    assert regex.match(match)
 
 
 @pytest.mark.parametrize(
-    "gpu_type,expected,harmonize_gpu_map,gpus",
+    "node,gpu_type,expected,gpus_per_nodes",
     [
         [
+            "DoesNotExist",
             "DoesNotExist",
             None,
             {},
-            [],
         ],
         [
+            "node1",
             "prefix GPU1:suffix",
-            "gpu1",
-            {},
-            ["gpu1", "gpu2"],
+            "DESCRIPTIVE GPU 1",
+            GPUS_PER_NODES,
         ],
         [
-            "prefix GPU2 suffix",
-            "gpu2",
-            {},
-            ["gpu1", "gpu2"],
+            "node11",
+            "prefix GPU2:suffix",
+            "DESCRIPTIVE GPU 2",
+            GPUS_PER_NODES,
+        ],
+        [
+            "DoesNotExist",
+            "prefix GPU_DEFAULT:suffix",
+            "DESCRIPTIVE GPU DEFAULT",
+            GPUS_PER_NODES,
+        ],
+        [
+            "node1",
+            "DoesNotExist",
+            None,
+            GPUS_PER_NODES,
         ],
         [
-            "prefix GPU1_suffix",
-            "gpu1",
-            {".*gpu1_suffix.*": "gpu1"},
-            ["gpu1", "gpu2"],
+            "node_mig20",
+            "4g.40gb",
+            "DESCRIPTIVE GPU 3 : 4g.40gb",
+            GPUS_PER_NODES,
         ],
     ],
 )
-def test_node_to_gpu_mapping(gpu_type, expected, harmonize_gpu_map, gpus):
-    mapping = NodeToGPUMapping("cluster", None, harmonize_gpu_map, gpus)
+def test_node_to_gpu_mapping(node, gpu_type, expected, gpus_per_nodes):
+    mapping = NodeToGPUMapping("cluster", None, gpus_per_nodes)
 
-    assert mapping._harmonize_gpu(gpu_type) == expected
+    assert mapping._harmonize_gpu(node, gpu_type) == expected
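
A minimal usage sketch (outside the patch itself), mirroring the new unit tests: node-name
patterns such as "cn-c{{[001-040]}}" are expanded by expand_patterns() into anchored regexes,
the short GPU keys of the matching node entry are matched against the Slurm-reported GRES
string, MIG profiles (e.g. "3g.40gb") are routed back to their parent GPU through the __MIG__
flag, and __DEFAULTS__ acts as the fallback entry. The node names and GPU descriptions below
are illustrative only, and the sketch calls the private _harmonize_gpu() helper directly,
exactly as the tests do.

# Illustrative example only -- node names and GPU descriptions are made up.
from sarc.jobs.node_gpu_mapping import DEFAULTS_FLAG, MIG_FLAG, NodeToGPUMapping

gpus_per_nodes = {
    # "{{[001-040]}}" expands to a regex matching 001 through 040 (leading zeros allowed).
    "cn-c{{[001-040]}}": {"rtx8000": "Quadro RTX 8000"},
    "cn-g{{[001-029]}}": {
        "a100": "NVIDIA A100 80GB PCIe",
        # MIG profiles such as "3g.40gb" resolve to the parent GPU's description.
        r"[0-9]+g\.[0-9]+gb": f"{MIG_FLAG}a100",
    },
    # Fallback entry used when no node pattern matches.
    DEFAULTS_FLAG: {"v100": "Tesla V100-SXM2-16GB"},
}

# No nodes_info_file is given, so only the harmonization step is exercised,
# as in tests/unittests/jobs/test_node_to_gpu_mapping.py.
mapping = NodeToGPUMapping("mila", None, gpus_per_nodes)

print(mapping._harmonize_gpu("cn-c018", "gpu:rtx8000:2"))  # Quadro RTX 8000
print(mapping._harmonize_gpu("cn-g001", "3g.40gb"))        # NVIDIA A100 80GB PCIe : 3g.40gb
print(mapping._harmonize_gpu("login-node", "v100"))        # Tesla V100-SXM2-16GB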