fix amdsmi on multiple rocm versions
IlyasMoutawwakil committed Feb 14, 2024
1 parent 4af1c06 commit 21cebb7
Showing 10 changed files with 307 additions and 241 deletions.
40 changes: 32 additions & 8 deletions Makefile
@@ -27,23 +27,23 @@ test_cli_cpu_neural_compressor:
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor,diffusers,timm] && pytest tests/ -k 'cli and cpu and neural_compressor' -x"

-test_cli_cpu_openvino:
+test_cli_cpu_onnxruntime:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x"

-test_cli_cpu_onnxruntime:
+test_cli_cpu_openvino:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers,timm] && pytest tests/ -k 'cli and cpu and openvino' -x"

test_cli_cpu_pytorch:
    docker run \
@@ -53,13 +53,25 @@ test_cli_cpu_pytorch:
    --workdir /workspace \
    opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x"

+test_cli_rocm_pytorch:
+   docker run \
+   --rm \
+   --device=/dev/kfd \
+   --device /dev/dri/renderD128 \
+   --device /dev/dri/renderD129 \
+   --group-add video \
+   --entrypoint /bin/bash \
+   --volume $(PWD):/workspace \
+   --workdir /workspace \
+   opt-bench-rocm:5.6.1 -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cuda and pytorch' -x"
+
test_api_cpu:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cpu' -x"

test_api_cuda:
    docker run \
@@ -68,12 +80,24 @@ test_api_cuda:
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x"
+   opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x"

+test_api_rocm:
+   docker run \
+   --rm \
+   --device=/dev/kfd \
+   --device /dev/dri/renderD128 \
+   --device /dev/dri/renderD129 \
+   --group-add video \
+   --entrypoint /bin/bash \
+   --volume $(PWD):/workspace \
+   --workdir /workspace \
+   opt-bench-rocm:5.6.1 -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x"
+
test_api_misc:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x"
6 changes: 5 additions & 1 deletion examples/pytorch_bert.yaml
@@ -9,8 +9,12 @@ defaults:

experiment_name: pytorch_bert

+benchmark:
+  latency: true
+  memory: true

backend:
-  device: cpu
+  device: cuda
+  device_ids: 0
  model: bert-base-uncased

2 changes: 1 addition & 1 deletion optimum_benchmark/benchmarks/inference/benchmark.py
@@ -122,7 +122,7 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
        else:
            self.run_inference_memory_tracking(backend)

-        self.report.log_max_memory()
+        self.report.log_memory()

        if self.config.latency:
            LOGGER.info("\t+ Creating inference latency tracker")
16 changes: 8 additions & 8 deletions optimum_benchmark/benchmarks/report.py
@@ -6,7 +6,7 @@

from ..trackers.latency import Latency, Throughput
from ..trackers.energy import Energy, Efficiency
-from ..trackers.memory import MaxMemory
+from ..trackers.memory import Memory

from transformers.configuration_utils import PushToHubMixin
from flatten_dict import flatten
@@ -17,22 +17,22 @@

@dataclass
class BenchmarkMeasurements:
-    max_memory: Optional[MaxMemory] = None
+    memory: Optional[Memory] = None
    latency: Optional[Latency] = None
    throughput: Optional[Throughput] = None
    energy: Optional[Energy] = None
    efficiency: Optional[Efficiency] = None

    @staticmethod
    def aggregate(measurements: List["BenchmarkMeasurements"]) -> "BenchmarkMeasurements":
-        max_memory = MaxMemory.aggregate([m.max_memory for m in measurements if m.max_memory is not None])
+        memory = Memory.aggregate([m.memory for m in measurements if m.memory is not None])
        latency = Latency.aggregate([m.latency for m in measurements if m.latency is not None])
        throughput = Throughput.aggregate([m.throughput for m in measurements if m.throughput is not None])
        energy = Energy.aggregate([m.energy for m in measurements if m.energy is not None])
        efficiency = Efficiency.aggregate([m.efficiency for m in measurements if m.efficiency is not None])

        return BenchmarkMeasurements(
-            max_memory=max_memory,
+            memory=memory,
            latency=latency,
            throughput=throughput,
            energy=energy,
@@ -101,11 +101,11 @@ def to_dataframe(self) -> pd.DataFrame:
    def to_csv(self, path: str) -> None:
        self.to_dataframe().to_csv(path, index=False)

-    def log_max_memory(self):
+    def log_memory(self):
        for target in self.to_dict().keys():
            benchmark_measurements: BenchmarkMeasurements = getattr(self, target)
-            if benchmark_measurements.max_memory is not None:
-                benchmark_measurements.max_memory.log(prefix=target)
+            if benchmark_measurements.memory is not None:
+                benchmark_measurements.memory.log(prefix=target)

    def log_latency(self):
        for target in self.to_dict().keys():
@@ -132,7 +132,7 @@ def log_efficiency(self):
                benchmark_measurements.efficiency.log(prefix=target)

    def log_all(self):
-        self.log_max_memory()
+        self.log_memory()
        self.log_latency()
        self.log_throughput()
        self.log_energy()
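The rename ripples through BenchmarkMeasurements.aggregate, which reduces each optional field only over the reports that actually recorded it. The pattern is easy to isolate; the Stat class below is a hypothetical stand-in for the real trackers (Memory, Latency, ...) in optimum_benchmark.trackers:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Stat:
    # hypothetical stand-in for Memory, Latency, etc.
    value: float

    @staticmethod
    def aggregate(stats: List["Stat"]) -> "Stat":
        # the real trackers reduce their own fields; a mean is shown here
        return Stat(value=sum(s.value for s in stats) / len(stats))


@dataclass
class Measurements:
    memory: Optional[Stat] = None
    latency: Optional[Stat] = None

    @staticmethod
    def aggregate(measurements: List["Measurements"]) -> "Measurements":
        # aggregate a field only over the reports that recorded it
        memory = [m.memory for m in measurements if m.memory is not None]
        latency = [m.latency for m in measurements if m.latency is not None]
        return Measurements(
            memory=Stat.aggregate(memory) if memory else None,
            latency=Stat.aggregate(latency) if latency else None,
        )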
16 changes: 8 additions & 8 deletions optimum_benchmark/benchmarks/training/benchmark.py
@@ -68,6 +68,14 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
            training_arguments=self.config.training_arguments,
        )

+        if self.config.memory:
+            # it's the same
+            self.report.overall.max_memory = memory_tracker.get_max_memory()
+            self.report.warmup.max_memory = memory_tracker.get_max_memory()
+            self.report.train.max_memory = memory_tracker.get_max_memory()
+
+            self.report.log_memory()
+
        if self.config.latency:
            self.report.overall.latency = latency_callback.get_latency()
            self.report.overall.throughput = Throughput.from_latency(
@@ -91,14 +99,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
        self.report.log_latency()
        self.report.log_throughput()

-        if self.config.memory:
-            # it's the same
-            self.report.overall.max_memory = memory_tracker.get_max_memory()
-            self.report.warmup.max_memory = memory_tracker.get_max_memory()
-            self.report.train.max_memory = memory_tracker.get_max_memory()
-
-            self.report.log_max_memory()
-
        if self.config.energy:
            # can only get overall energy consumption
            self.report.overall.energy = energy_tracker.get_energy()
101 changes: 63 additions & 38 deletions optimum_benchmark/env_utils.py
@@ -5,7 +5,7 @@
import importlib.util
from typing import Optional, List

-from .import_utils import is_pynvml_available, is_amdsmi_available
+from .import_utils import is_pynvml_available, is_amdsmi_available, torch_version

import psutil

@@ -26,9 +26,13 @@ def is_rocm_system():
    return False


-def bytes_to_mega_bytes(bytes: int) -> float:
-    # MB, not MiB
-    return bytes / 1e6
+if is_nvidia_system():
+    if is_pynvml_available():
+        import pynvml as pynvml
+
+if is_rocm_system():
+    if is_amdsmi_available():
+        import amdsmi as amdsmi


def get_cpu() -> Optional[str]:
@@ -52,7 +56,7 @@


def get_cpu_ram_mb():
-    return bytes_to_mega_bytes(psutil.virtual_memory().total)
+    return psutil.virtual_memory().total / 1e6


def get_gpus():
@@ -62,32 +66,37 @@ def get_gpus():
            "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
            "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
        )
-        import pynvml as nvml
-
        gpus = []
-        nvml.nvmlInit()
-        device_count = nvml.nvmlDeviceGetCount()
+        pynvml.nvmlInit()
+        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
-            handle = nvml.nvmlDeviceGetHandleByIndex(i)
-            gpus.append(nvml.nvmlDeviceGetName(handle))
-        nvml.nvmlShutdown()
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            gpus.append(pynvml.nvmlDeviceGetName(handle))
+        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
            )
-        import amdsmi as rocml
-
        gpus = []
-        rocml.amdsmi_init()
-        devices_handles = rocml.amdsmi_get_processor_handles()
-        for device_handle in devices_handles:
-            gpus.append(rocml.amdsmi_get_gpu_vendor_name(device_handle))
-        rocml.amdsmi_shut_down()
+        amdsmi.amdsmi_init()
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if rocm_version >= "5.7":
+            devices_handles = amdsmi.amdsmi_get_processor_handles()
+            for device_handle in devices_handles:
+                gpus.append(amdsmi.amdsmi_get_gpu_vendor_name(device_handle))
+        else:
+            devices_handles = amdsmi.amdsmi_get_device_handles()
+            for device_handle in devices_handles:
+                gpus.append(amdsmi.amdsmi_dev_get_vendor_name(device_handle))
+
+        amdsmi.amdsmi_shut_down()
    else:
-        gpus = []
+        raise ValueError("No NVIDIA or ROCm GPUs found.")

    return gpus

@@ -99,27 +108,37 @@ def get_gpu_vram_mb() -> List[int]:
            "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
            "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
        )
-        import pynvml as nvml
-
-        nvml.nvmlInit()
-        device_count = nvml.nvmlDeviceGetCount()
-        vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)]
-        nvml.nvmlShutdown()
+        pynvml.nvmlInit()
+        device_count = pynvml.nvmlDeviceGetCount()
+        vrams = [
+            pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)
+        ]
+        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
            )

-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        device_handles = rocml.amdsmi_get_processor_handles()
-        vrams = [rocml.amdsmi_get_gpu_memory_total(device_handle) for device_handle in device_handles]
-        rocml.amdsmi_shut_down()
+        amdsmi.amdsmi_init()
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if rocm_version >= "5.7":
+            device_handles = amdsmi.amdsmi_get_processor_handles()
+            vrams = [amdsmi.amdsmi_get_gpu_memory_total(device_handle) for device_handle in device_handles]
+        else:
+            device_handles = amdsmi.amdsmi_get_device_handles()
+            vrams = [
+                amdsmi.amdsmi_dev_get_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM)
+                for device_handle in device_handles
+            ]
+
+        amdsmi.amdsmi_shut_down()
    else:
-        vrams = []
+        raise ValueError("No NVIDIA or ROCm GPUs found.")

    return sum(vrams)

@@ -134,26 +153,32 @@ def get_cuda_device_ids() -> str:
            "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
            "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
        )
-        import pynvml as nvml
-
-        nvml.nvmlInit()
-        device_ids = list(range(nvml.nvmlDeviceGetCount()))
-        nvml.nvmlShutdown()
+        pynvml.nvmlInit()
+        device_ids = list(range(pynvml.nvmlDeviceGetCount()))
+        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
            )
-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        device_ids = len(rocml.amdsmi_get_processor_handles())
-        rocml.amdsmi_shut_down()
+        amdsmi.amdsmi_init()
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if rocm_version >= "5.7":
+            device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles())))
+        else:
+            device_ids = list(range(len(amdsmi.amdsmi_get_device_handles())))
+
+        amdsmi.amdsmi_shut_down()
    else:
        raise ValueError("No NVIDIA or ROCm GPUs found.")

-    return ",".join(str(i) for i in device_ids)
+    device_ids = ",".join(str(i) for i in device_ids)
+
+    return device_ids


def get_git_revision_hash(package_name: str) -> Optional[str]:
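The same version dispatch appears in get_gpus, get_gpu_vram_mb, and get_cuda_device_ids above: the ROCm version is parsed out of torch's build string, then routed to whichever amdsmi API exists for that version (the enumeration and query functions were renamed around ROCm 5.7). A minimal standalone sketch of the pattern, assuming a ROCm build of torch; where the diff compares version strings lexicographically, this sketch uses packaging.version, which also orders releases like 5.10 after 5.7:

from typing import List

from packaging.version import parse

import amdsmi  # https://github.com/ROCm/amdsmi
import torch


def get_rocm_gpu_vendors() -> List[str]:
    # a ROCm build of torch reports a version string like "2.2.0+rocm5.7"
    rocm_version = parse(torch.__version__.split("rocm")[-1])

    amdsmi.amdsmi_init()
    try:
        if rocm_version >= parse("5.7"):
            # ROCm >= 5.7 exposes "processor" handles and gpu_* query names
            handles = amdsmi.amdsmi_get_processor_handles()
            return [amdsmi.amdsmi_get_gpu_vendor_name(h) for h in handles]
        else:
            # older ROCm exposes "device" handles and dev_* query names
            handles = amdsmi.amdsmi_get_device_handles()
            return [amdsmi.amdsmi_dev_get_vendor_name(h) for h in handles]
    finally:
        amdsmi.amdsmi_shut_down()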
5 changes: 3 additions & 2 deletions optimum_benchmark/trackers/energy.py
@@ -9,10 +9,11 @@
from ..import_utils import is_codecarbon_available

if is_codecarbon_available():
-    from codecarbon import EmissionsTracker, OfflineEmissionsTracker
+    from codecarbon import EmissionsTracker, OfflineEmissionsTracker  # type: ignore

LOGGER = getLogger("energy")

+ENERGY_UNIT = "kWh"
Energy_Unit_Literal = Literal["kWh"]
Efficiency_Unit_Literal = Literal["samples/kWh", "tokens/kWh", "images/kWh"]

@@ -156,7 +157,7 @@ def get_elapsed_time(self) -> float:

    def get_energy(self) -> Energy:
        return Energy(
-            unit="kWh",
+            unit=ENERGY_UNIT,
            cpu=self.cpu_energy,
            gpu=self.gpu_energy,
            ram=self.ram_energy,
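The new ENERGY_UNIT constant pairs with the existing Literal alias so the producer and the type annotation point at one string. A trimmed sketch of the idea, assuming a reduced Energy class (the real one carries cpu/gpu/ram readings from codecarbon):

from dataclasses import dataclass
from typing import Literal

ENERGY_UNIT = "kWh"
Energy_Unit_Literal = Literal["kWh"]


@dataclass
class Energy:
    # trimmed; the real class also carries cpu/gpu/ram fields
    unit: Energy_Unit_Literal
    total: float


def make_energy(total: float) -> Energy:
    # using the constant instead of a bare "kWh" keeps unit changes to one edit
    return Energy(unit=ENERGY_UNIT, total=total)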