From 03b33db8a6f20fbc5228ce9c57f906a1a0831d25 Mon Sep 17 00:00:00 2001
From: Tensor Templar <lxk@droidcraft.org>
Date: Sat, 5 Oct 2024 19:11:44 +0300
Subject: [PATCH] Add test fixtures for AMD multi-GPU XGMI and NVIDIA dual-GPU
 NVLink. Update tests to use the fixtures

---
 tests/test_utils.py | 54 +++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 12929df3c2..c740908c43 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -616,6 +616,36 @@ def nvlink_all_gpu_connected_but_other_connected_output():
     )
 
 
+@pytest.fixture
+def nvidia_smi_nvlink_output_dual_gpu_no_numa():  # mocked nvidia-smi stdout: GPU0 <-> GPU1 via NV1
+    return """
+        GPU0    GPU1
+GPU0    X      NV1
+GPU1    NV1    X
+    """
+
+
+@pytest.fixture
+def rocm_smi_xgmi_output_multi_gpu():
+    """
+    Sample `rocm-smi --showtopotype` output (ROCm 6.0.3+): 8 GPUs, every GPU pair linked via XGMI.
+    """
+    return """
+=============================== ROCm System Management Interface ============================
+=============================== Link Type between two GPUs ===============================
+       GPU0         GPU1         GPU2         GPU3         GPU4         GPU5         GPU6         GPU7
+GPU0   0            XGMI         XGMI         XGMI         XGMI         XGMI         XGMI         XGMI
+GPU1   XGMI         0            XGMI         XGMI         XGMI         XGMI         XGMI         XGMI
+GPU2   XGMI         XGMI         0            XGMI         XGMI         XGMI         XGMI         XGMI
+GPU3   XGMI         XGMI         XGMI         0            XGMI         XGMI         XGMI         XGMI
+GPU4   XGMI         XGMI         XGMI         XGMI         0            XGMI         XGMI         XGMI
+GPU5   XGMI         XGMI         XGMI         XGMI         XGMI         0            XGMI         XGMI
+GPU6   XGMI         XGMI         XGMI         XGMI         XGMI         XGMI         0            XGMI
+GPU7   XGMI         XGMI         XGMI         XGMI         XGMI         XGMI         XGMI         0
+================================== End of ROCm SMI Log ===================================
+    """
+
+
 @mock.patch("subprocess.run")
 def test_nvlink_all_gpu_connected_but_other_connected_output(
     mock_run, nvlink_all_gpu_connected_but_other_connected_output
@@ -680,39 +710,31 @@ def test_fix_and_load_json():
     assert result_missing_commas == expected_output_missing_commas
 
 
-def test_check_nvlink_connectivity__returns_fully_connected_when_nvidia_all_nvlink(monkeypatch):
+def test_check_nvlink_connectivity__returns_fully_connected_when_nvidia_all_nvlink_two_gpus(
+    monkeypatch, nvidia_smi_nvlink_output_dual_gpu_no_numa
+):  # the fixture text is served as stdout of the patched subprocess.run below
     mock_device_properties = mock.MagicMock(name="GPU Device", spec=["name"])
     mock_device_properties.name = "NVIDIA GeForce RTX 3090"
     monkeypatch.setattr(torch.cuda, "get_device_properties", lambda idx: mock_device_properties)
     monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
 
-    nvidia_smi_output = """
-    GPU0    GPU1
-    GPU0    X      NV1
-    GPU1    NV1    X
-    """
-    mock_run = mock.MagicMock(return_value=mock.Mock(stdout=nvidia_smi_output, returncode=0))
+    mock_run = mock.MagicMock(return_value=mock.Mock(stdout=nvidia_smi_nvlink_output_dual_gpu_no_numa, returncode=0))
     with mock.patch("subprocess.run", mock_run):
         with mock.patch("builtins.print") as mock_print:
             check_nvlink_connectivity()
             mock_print.assert_any_call("All GPUs are fully connected via NVLink.")
 
 
-def test_check_nvlink_connectivity_returns_fully_connected_when_amd_all_xgmi(monkeypatch):
+def test_check_nvlink_connectivity_returns_fully_connected_when_amd_all_xgmi_8_gpus(
+    monkeypatch, rocm_smi_xgmi_output_multi_gpu
+):
     # Mock the GPU device properties to simulate AMD GPUs
     mock_device_properties = mock.MagicMock(name="GPU Device", spec=["name"])
     mock_device_properties.name = "amd instinct mi250x"  # ROCM 6.0.3
     monkeypatch.setattr(torch.cuda, "get_device_properties", lambda idx: mock_device_properties)
     monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
 
-    rocm_smi_output = """
-    =============================== Link Type between two GPUs ===============================
-        GPU0         GPU1
-    GPU0   0            XGMI
-    GPU1   XGMI         0
-    ==========================================================================================
-    """
-    mock_run = mock.MagicMock(return_value=mock.Mock(stdout=rocm_smi_output, returncode=0))
+    mock_run = mock.MagicMock(return_value=mock.Mock(stdout=rocm_smi_xgmi_output_multi_gpu, returncode=0))
     with mock.patch("subprocess.run", mock_run):
         with mock.patch("builtins.print") as mock_print:
             check_nvlink_connectivity()