fix amdsmi on multiple rocm versions
IlyasMoutawwakil committed Feb 14, 2024
1 parent 4af1c06 commit 21cebb7
Showing 10 changed files with 307 additions and 241 deletions.
40 changes: 32 additions & 8 deletions Makefile
@@ -27,23 +27,23 @@ test_cli_cpu_neural_compressor:
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor,diffusers,timm] && pytest tests/ -k 'cli and cpu and neural_compressor' -x"

-test_cli_cpu_openvino:
+test_cli_cpu_onnxruntime:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x"

-test_cli_cpu_onnxruntime:
+test_cli_cpu_openvino:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers,timm] && pytest tests/ -k 'cli and cpu and openvino' -x"

test_cli_cpu_pytorch:
    docker run \
@@ -53,13 +53,25 @@ test_cli_cpu_pytorch:
    --workdir /workspace \
    opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x"

+test_cli_rocm_pytorch:
+   docker run \
+   --rm \
+   --device=/dev/kfd \
+   --device /dev/dri/renderD128 \
+   --device /dev/dri/renderD129 \
+   --group-add video \
+   --entrypoint /bin/bash \
+   --volume $(PWD):/workspace \
+   --workdir /workspace \
+   opt-bench-rocm:5.6.1 -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cuda and pytorch' -x"
+
test_api_cpu:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cpu' -x"

test_api_cuda:
    docker run \
@@ -68,12 +80,24 @@ test_api_cuda:
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x"
+   opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x"

+test_api_rocm:
+   docker run \
+   --rm \
+   --device=/dev/kfd \
+   --device /dev/dri/renderD128 \
+   --device /dev/dri/renderD129 \
+   --group-add video \
+   --entrypoint /bin/bash \
+   --volume $(PWD):/workspace \
+   --workdir /workspace \
+   opt-bench-rocm:5.6.1 -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x"
+
test_api_misc:
    docker run \
    --rm \
    --entrypoint /bin/bash \
    --volume $(PWD):/workspace \
    --workdir /workspace \
-   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x"
+   opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x"
6 changes: 5 additions & 1 deletion examples/pytorch_bert.yaml
@@ -9,8 +9,12 @@ defaults:

experiment_name: pytorch_bert

+benchmark:
+  latency: true
+  memory: true

backend:
-  device: cpu
+  device: cuda
+  device_ids: 0
  model: bert-base-uncased

2 changes: 1 addition & 1 deletion optimum_benchmark/benchmarks/inference/benchmark.py
@@ -122,7 +122,7 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
        else:
            self.run_inference_memory_tracking(backend)

-        self.report.log_max_memory()
+        self.report.log_memory()

        if self.config.latency:
            LOGGER.info("\t+ Creating inference latency tracker")
16 changes: 8 additions & 8 deletions optimum_benchmark/benchmarks/report.py
@@ -6,7 +6,7 @@

from ..trackers.latency import Latency, Throughput
from ..trackers.energy import Energy, Efficiency
-from ..trackers.memory import MaxMemory
+from ..trackers.memory import Memory

from transformers.configuration_utils import PushToHubMixin
from flatten_dict import flatten
@@ -17,22 +17,22 @@

@dataclass
class BenchmarkMeasurements:
-    max_memory: Optional[MaxMemory] = None
+    memory: Optional[Memory] = None
    latency: Optional[Latency] = None
    throughput: Optional[Throughput] = None
    energy: Optional[Energy] = None
    efficiency: Optional[Efficiency] = None

    @staticmethod
    def aggregate(measurements: List["BenchmarkMeasurements"]) -> "BenchmarkMeasurements":
-        max_memory = MaxMemory.aggregate([m.max_memory for m in measurements if m.max_memory is not None])
+        memory = Memory.aggregate([m.memory for m in measurements if m.memory is not None])
        latency = Latency.aggregate([m.latency for m in measurements if m.latency is not None])
        throughput = Throughput.aggregate([m.throughput for m in measurements if m.throughput is not None])
        energy = Energy.aggregate([m.energy for m in measurements if m.energy is not None])
        efficiency = Efficiency.aggregate([m.efficiency for m in measurements if m.efficiency is not None])

        return BenchmarkMeasurements(
-            max_memory=max_memory,
+            memory=memory,
            latency=latency,
            throughput=throughput,
            energy=energy,
@@ -101,11 +101,11 @@ def to_dataframe(self) -> pd.DataFrame:
    def to_csv(self, path: str) -> None:
        self.to_dataframe().to_csv(path, index=False)

-    def log_max_memory(self):
+    def log_memory(self):
        for target in self.to_dict().keys():
            benchmark_measurements: BenchmarkMeasurements = getattr(self, target)
-            if benchmark_measurements.max_memory is not None:
-                benchmark_measurements.max_memory.log(prefix=target)
+            if benchmark_measurements.memory is not None:
+                benchmark_measurements.memory.log(prefix=target)

    def log_latency(self):
        for target in self.to_dict().keys():
@@ -132,7 +132,7 @@ def log_efficiency(self):
                benchmark_measurements.efficiency.log(prefix=target)

    def log_all(self):
-        self.log_max_memory()
+        self.log_memory()
        self.log_latency()
        self.log_throughput()
        self.log_energy()
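The rename ripples through BenchmarkMeasurements.aggregate, which reduces each optional field only over the reports that actually recorded it. The pattern is easy to isolate; the Stat class below is a hypothetical stand-in for the real trackers (Memory, Latency, ...) in optimum_benchmark.trackers:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Stat:
    # hypothetical stand-in for Memory, Latency, etc.
    value: float

    @staticmethod
    def aggregate(stats: List["Stat"]) -> "Stat":
        # the real trackers reduce their own fields; a mean is shown here
        return Stat(value=sum(s.value for s in stats) / len(stats))


@dataclass
class Measurements:
    memory: Optional[Stat] = None
    latency: Optional[Stat] = None

    @staticmethod
    def aggregate(measurements: List["Measurements"]) -> "Measurements":
        # aggregate a field only over the reports that recorded it
        memory = [m.memory for m in measurements if m.memory is not None]
        latency = [m.latency for m in measurements if m.latency is not None]
        return Measurements(
            memory=Stat.aggregate(memory) if memory else None,
            latency=Stat.aggregate(latency) if latency else None,
        )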
16 changes: 8 additions & 8 deletions optimum_benchmark/benchmarks/training/benchmark.py
@@ -68,6 +68,14 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
            training_arguments=self.config.training_arguments,
        )

+        if self.config.memory:
+            # it's the same
+            self.report.overall.max_memory = memory_tracker.get_max_memory()
+            self.report.warmup.max_memory = memory_tracker.get_max_memory()
+            self.report.train.max_memory = memory_tracker.get_max_memory()
+
+            self.report.log_memory()
+
        if self.config.latency:
            self.report.overall.latency = latency_callback.get_latency()
            self.report.overall.throughput = Throughput.from_latency(
@@ -91,14 +99,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
        self.report.log_latency()
        self.report.log_throughput()

-        if self.config.memory:
-            # it's the same
-            self.report.overall.max_memory = memory_tracker.get_max_memory()
-            self.report.warmup.max_memory = memory_tracker.get_max_memory()
-            self.report.train.max_memory = memory_tracker.get_max_memory()
-
-            self.report.log_max_memory()
-
        if self.config.energy:
            # can only get overall energy consumption
            self.report.overall.energy = energy_tracker.get_energy()
101 changes: 63 additions & 38 deletions optimum_benchmark/env_utils.py
@@ -5,7 +5,7 @@
import importlib.util
from typing import Optional, List

-from .import_utils import is_pynvml_available, is_amdsmi_available
+from .import_utils import is_pynvml_available, is_amdsmi_available, torch_version

import psutil

@@ -26,9 +26,13 @@ def is_rocm_system():
    return False


-def bytes_to_mega_bytes(bytes: int) -> float:
-    # MB, not MiB
-    return bytes / 1e6
+if is_nvidia_system():
+    if is_pynvml_available():
+        import pynvml as pynvml
+
+if is_rocm_system():
+    if is_amdsmi_available():
+        import amdsmi as amdsmi


def get_cpu() -> Optional[str]:
@@ -52,7 +56,7 @@


def get_cpu_ram_mb():
-    return bytes_to_mega_bytes(psutil.virtual_memory().total)
+    return psutil.virtual_memory().total / 1e6


def get_gpus():
@@ -62,32 +66,37 @@ def get_gpus():
            "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
            "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
        )
-        import pynvml as nvml
-
        gpus = []
-        nvml.nvmlInit()
-        device_count = nvml.nvmlDeviceGetCount()
+        pynvml.nvmlInit()
+        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
-            handle = nvml.nvmlDeviceGetHandleByIndex(i)
-            gpus.append(nvml.nvmlDeviceGetName(handle))
-        nvml.nvmlShutdown()
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            gpus.append(pynvml.nvmlDeviceGetName(handle))
+        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
            )
-        import amdsmi as rocml
-
        gpus = []
-        rocml.amdsmi_init()
-        devices_handles = rocml.amdsmi_get_processor_handles()
-        for device_handle in devices_handles:
-            gpus.append(rocml.amdsmi_get_gpu_vendor_name(device_handle))
-        rocml.amdsmi_shut_down()
+        amdsmi.amdsmi_init()
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if rocm_version >= "5.7":
+            devices_handles = amdsmi.amdsmi_get_processor_handles()
+            for device_handle in devices_handles:
+                gpus.append(amdsmi.amdsmi_get_gpu_vendor_name(device_handle))
+        else:
+            devices_handles = amdsmi.amdsmi_get_device_handles()
+            for device_handle in devices_handles:
+                gpus.append(amdsmi.amdsmi_dev_get_vendor_name(device_handle))
+
+        amdsmi.amdsmi_shut_down()
    else:
-        gpus = []
+        raise ValueError("No NVIDIA or ROCm GPUs found.")

    return gpus

@@ -99,27 +108,37 @@ def get_gpu_vram_mb() -> List[int]:
            "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
            "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
        )
-        import pynvml as nvml
-
-        nvml.nvmlInit()
-        device_count = nvml.nvmlDeviceGetCount()
-        vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)]
-        nvml.nvmlShutdown()
+        pynvml.nvmlInit()
+        device_count = pynvml.nvmlDeviceGetCount()
+        vrams = [
+            pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)
+        ]
+        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
            )

-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        device_handles = rocml.amdsmi_get_processor_handles()
-        vrams = [rocml.amdsmi_get_gpu_memory_total(device_handle) for device_handle in device_handles]
-        rocml.amdsmi_shut_down()
+        amdsmi.amdsmi_init()
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if rocm_version >= "5.7":
+            device_handles = amdsmi.amdsmi_get_processor_handles()
+            vrams = [amdsmi.amdsmi_get_gpu_memory_total(device_handle) for device_handle in device_handles]
+        else:
+            device_handles = amdsmi.amdsmi_get_device_handles()
+            vrams = [
+                amdsmi.amdsmi_dev_get_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM)
+                for device_handle in device_handles
+            ]
+
+        amdsmi.amdsmi_shut_down()
    else:
-        vrams = []
+        raise ValueError("No NVIDIA or ROCm GPUs found.")

    return sum(vrams)

@@ -134,26 +153,32 @@ def get_cuda_device_ids() -> str:
            "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
            "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
        )
-        import pynvml as nvml
-
-        nvml.nvmlInit()
-        device_ids = list(range(nvml.nvmlDeviceGetCount()))
-        nvml.nvmlShutdown()
+        pynvml.nvmlInit()
+        device_ids = list(range(pynvml.nvmlDeviceGetCount()))
+        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
            )
-        import amdsmi as rocml
-
-        rocml.amdsmi_init()
-        device_ids = len(rocml.amdsmi_get_processor_handles())
-        rocml.amdsmi_shut_down()
+        amdsmi.amdsmi_init()
+        rocm_version = torch_version().split("rocm")[-1]
+
+        if rocm_version >= "5.7":
+            device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles())))
+        else:
+            device_ids = list(range(len(amdsmi.amdsmi_get_device_handles())))
+
+        amdsmi.amdsmi_shut_down()
    else:
        raise ValueError("No NVIDIA or ROCm GPUs found.")

-    return ",".join(str(i) for i in device_ids)
+    device_ids = ",".join(str(i) for i in device_ids)
+
+    return device_ids


def get_git_revision_hash(package_name: str) -> Optional[str]:
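The same version dispatch appears in get_gpus, get_gpu_vram_mb, and get_cuda_device_ids above: the ROCm version is parsed out of torch's build string, then routed to whichever amdsmi API exists for that version (the enumeration and query functions were renamed around ROCm 5.7). A minimal standalone sketch of the pattern, assuming a ROCm build of torch; where the diff compares version strings lexicographically, this sketch uses packaging.version, which also orders releases like 5.10 after 5.7:

from typing import List

from packaging.version import parse

import amdsmi  # https://github.com/ROCm/amdsmi
import torch


def get_rocm_gpu_vendors() -> List[str]:
    # a ROCm build of torch reports a version string like "2.2.0+rocm5.7"
    rocm_version = parse(torch.__version__.split("rocm")[-1])

    amdsmi.amdsmi_init()
    try:
        if rocm_version >= parse("5.7"):
            # ROCm >= 5.7 exposes "processor" handles and gpu_* query names
            handles = amdsmi.amdsmi_get_processor_handles()
            return [amdsmi.amdsmi_get_gpu_vendor_name(h) for h in handles]
        else:
            # older ROCm exposes "device" handles and dev_* query names
            handles = amdsmi.amdsmi_get_device_handles()
            return [amdsmi.amdsmi_dev_get_vendor_name(h) for h in handles]
    finally:
        amdsmi.amdsmi_shut_down()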
5 changes: 3 additions & 2 deletions optimum_benchmark/trackers/energy.py
@@ -9,10 +9,11 @@
from ..import_utils import is_codecarbon_available

if is_codecarbon_available():
-    from codecarbon import EmissionsTracker, OfflineEmissionsTracker
+    from codecarbon import EmissionsTracker, OfflineEmissionsTracker  # type: ignore

LOGGER = getLogger("energy")

+ENERGY_UNIT = "kWh"
Energy_Unit_Literal = Literal["kWh"]
Efficiency_Unit_Literal = Literal["samples/kWh", "tokens/kWh", "images/kWh"]

@@ -156,7 +157,7 @@ def get_elapsed_time(self) -> float:

    def get_energy(self) -> Energy:
        return Energy(
-            unit="kWh",
+            unit=ENERGY_UNIT,
            cpu=self.cpu_energy,
            gpu=self.gpu_energy,
            ram=self.ram_energy,
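The new ENERGY_UNIT constant pairs with the existing Literal alias so the producer and the type annotation point at one string. A trimmed sketch of the idea, assuming a reduced Energy class (the real one carries cpu/gpu/ram readings from codecarbon):

from dataclasses import dataclass
from typing import Literal

ENERGY_UNIT = "kWh"
Energy_Unit_Literal = Literal["kWh"]


@dataclass
class Energy:
    # trimmed; the real class also carries cpu/gpu/ram fields
    unit: Energy_Unit_Literal
    total: float


def make_energy(total: float) -> Energy:
    # using the constant instead of a bare "kWh" keeps unit changes to one edit
    return Energy(unit=ENERGY_UNIT, total=total)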