diff --git a/.github/workflows/check_quality.yaml b/.github/workflows/check_quality.yaml
index da468da3..36b99f99 100644
--- a/.github/workflows/check_quality.yaml
+++ b/.github/workflows/check_quality.yaml
@@ -18,10 +18,10 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install quality requirements
run: |
diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml
index 25ba8d1a..752afab7 100644
--- a/.github/workflows/test_api_cpu.yaml
+++ b/.github/workflows/test_api_cpu.yaml
@@ -18,10 +18,10 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install dependencies
run: |
diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml
index abc7aed4..df72ffb2 100644
--- a/.github/workflows/test_api_misc.yaml
+++ b/.github/workflows/test_api_misc.yaml
@@ -18,10 +18,10 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install requirements
run: |
diff --git a/.github/workflows/test_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml
similarity index 82%
rename from .github/workflows/test_cpu_neural_compressor.yaml
rename to .github/workflows/test_cli_cpu_neural_compressor.yaml
index 7e3488d4..9150a90f 100644
--- a/.github/workflows/test_cpu_neural_compressor.yaml
+++ b/.github/workflows/test_cli_cpu_neural_compressor.yaml
@@ -1,4 +1,4 @@
-name: CPU Intel Neural Compressor Tests
+name: CLI CPU Intel Neural Compressor Tests
on:
workflow_dispatch:
@@ -12,16 +12,16 @@ concurrency:
cancel-in-progress: true
jobs:
- run_cpu_neural_compressor_tests:
+ run_cli_cpu_neural_compressor_tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install Intel Neural Compressor CPU requirements
run: |
diff --git a/.github/workflows/test_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml
similarity index 82%
rename from .github/workflows/test_cpu_onnxruntime.yaml
rename to .github/workflows/test_cli_cpu_onnxruntime.yaml
index 2770b23f..e7caf218 100644
--- a/.github/workflows/test_cpu_onnxruntime.yaml
+++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml
@@ -1,4 +1,4 @@
-name: CPU OnnxRuntime Tests
+name: CLI CPU OnnxRuntime Tests
on:
workflow_dispatch:
@@ -12,16 +12,16 @@ concurrency:
cancel-in-progress: true
jobs:
- run_cpu_onnxruntime_tests:
+ run_cli_cpu_onnxruntime_tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install requirements
run: |
diff --git a/.github/workflows/test_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml
similarity index 83%
rename from .github/workflows/test_cpu_openvino.yaml
rename to .github/workflows/test_cli_cpu_openvino.yaml
index d2d93cce..00b40aef 100644
--- a/.github/workflows/test_cpu_openvino.yaml
+++ b/.github/workflows/test_cli_cpu_openvino.yaml
@@ -1,4 +1,4 @@
-name: CPU OpenVINO Tests
+name: CLI CPU OpenVINO Tests
on:
workflow_dispatch:
@@ -12,16 +12,16 @@ concurrency:
cancel-in-progress: true
jobs:
- run_cpu_openvino_tests:
+ run_cli_cpu_openvino_tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install requirements
run: |
diff --git a/.github/workflows/test_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml
similarity index 83%
rename from .github/workflows/test_cpu_pytorch.yaml
rename to .github/workflows/test_cli_cpu_pytorch.yaml
index 1c6809cc..3df5368b 100644
--- a/.github/workflows/test_cpu_pytorch.yaml
+++ b/.github/workflows/test_cli_cpu_pytorch.yaml
@@ -1,4 +1,4 @@
-name: CPU Pytorch tests
+name: CLI CPU Pytorch tests
on:
workflow_dispatch:
@@ -12,16 +12,16 @@ concurrency:
cancel-in-progress: true
jobs:
- run_cpu_pytorch_tests:
+ run_cli_cpu_pytorch_tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install requirements
run: |
diff --git a/.github/workflows/test_cuda_onnxruntime_inference.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml
similarity index 86%
rename from .github/workflows/test_cuda_onnxruntime_inference.yaml
rename to .github/workflows/test_cli_cuda_onnxruntime.yaml
index bbb81b36..0b03608e 100644
--- a/.github/workflows/test_cuda_onnxruntime_inference.yaml
+++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -1,4 +1,4 @@
-name: CUDA OnnxRuntime Inference Tests
+name: CLI CUDA OnnxRuntime Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- build_image_and_run_cuda_onnxruntime_inference_tests:
+ build_image_and_run_cli_cuda_onnxruntime_tests:
runs-on: hf-dgx-01
steps:
- name: Checkout
@@ -40,4 +40,4 @@ jobs:
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
- -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime and inference' -x"
+ -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
diff --git a/.github/workflows/test_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml
similarity index 95%
rename from .github/workflows/test_cuda_pytorch.yaml
rename to .github/workflows/test_cli_cuda_pytorch.yaml
index 49e77f8a..1b3fd99f 100644
--- a/.github/workflows/test_cuda_pytorch.yaml
+++ b/.github/workflows/test_cli_cuda_pytorch.yaml
@@ -1,4 +1,4 @@
-name: CUDA Pytorch Tests
+name: CLI CUDA Pytorch Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- build_image_and_run_cuda_pytorch_tests:
+ build_image_and_run_cli_cuda_pytorch_tests:
strategy:
fail-fast: false
matrix:
diff --git a/.github/workflows/test_cuda_torch_ort_training.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml
similarity index 91%
rename from .github/workflows/test_cuda_torch_ort_training.yaml
rename to .github/workflows/test_cli_cuda_torch_ort.yaml
index 20f87e67..71bfd33e 100644
--- a/.github/workflows/test_cuda_torch_ort_training.yaml
+++ b/.github/workflows/test_cli_cuda_torch_ort.yaml
@@ -1,4 +1,4 @@
-name: CUDA Torch-ORT Training Tests
+name: CLI CUDA Torch-ORT Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- build_image_and_run_cuda_torch_ort_training_tests:
+ build_image_and_run_cli_cuda_torch_ort_tests:
runs-on: hf-dgx-01
steps:
- name: Checkout
@@ -40,4 +40,4 @@ jobs:
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
- -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort and training' -x"
+ -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml
index c448a213..5b55c0a7 100644
--- a/.github/workflows/test_cli_misc.yaml
+++ b/.github/workflows/test_cli_misc.yaml
@@ -18,10 +18,10 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v3
with:
- python-version: 3.8
+ python-version: '3.10'
- name: Install requirements
run: |
diff --git a/.github/workflows/test_rocm_onnxruntime_inference.yaml b/.github/workflows/test_cli_rocm_onnxruntime.yaml
similarity index 90%
rename from .github/workflows/test_rocm_onnxruntime_inference.yaml
rename to .github/workflows/test_cli_rocm_onnxruntime.yaml
index 5a8cc0a3..fcd0f53d 100644
--- a/.github/workflows/test_rocm_onnxruntime_inference.yaml
+++ b/.github/workflows/test_cli_rocm_onnxruntime.yaml
@@ -1,4 +1,4 @@
-name: ROCm OnnxRuntime Inference Tests
+name: CLI ROCm OnnxRuntime Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- build_image_and_run_rocm_onnxruntime_inference_tests:
+ build_image_and_run_cli_rocm_onnxruntime_tests:
runs-on: hf-amd-mi210-dev
steps:
- name: Checkout
@@ -51,4 +51,4 @@ jobs:
--device /dev/dri/renderD129
--entrypoint /bin/bash
opt-bench-rocm-ort:5.7
- -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime and inference' -x"
+ -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x"
diff --git a/.github/workflows/test_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml
similarity index 95%
rename from .github/workflows/test_rocm_pytorch.yaml
rename to .github/workflows/test_cli_rocm_pytorch.yaml
index 3d14909d..11c9e77a 100644
--- a/.github/workflows/test_rocm_pytorch.yaml
+++ b/.github/workflows/test_cli_rocm_pytorch.yaml
@@ -1,4 +1,4 @@
-name: ROCm Pytorch Tests
+name: CLI ROCm Pytorch Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- build_image_and_run_rocm_pytorch_tests:
+ build_image_and_run_cli_rocm_pytorch_tests:
strategy:
fail-fast: false
matrix:
diff --git a/.github/workflows/test_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml
similarity index 93%
rename from .github/workflows/test_tensorrt_llm.yaml
rename to .github/workflows/test_cli_tensorrt_llm.yaml
index 06640699..0169fca5 100644
--- a/.github/workflows/test_tensorrt_llm.yaml
+++ b/.github/workflows/test_cli_tensorrt_llm.yaml
@@ -1,4 +1,4 @@
-name: TensorRT-LLM Tests
+name: CLI TensorRT-LLM Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- pull_image_and_run_tensorrt_llm_tests:
+ pull_image_and_run_cli_tensorrt_llm_tests:
runs-on: hf-dgx-01
steps:
- name: Checkout
diff --git a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
similarity index 86%
rename from .github/workflows/test_tensorrt_onnxruntime_inference.yaml
rename to .github/workflows/test_cli_tensorrt_onnxruntime.yaml
index 4d41313d..92f425e7 100644
--- a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml
+++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -1,4 +1,4 @@
-name: TensorRT OnnxRuntime Inference Tests
+name: CLI TensorRT OnnxRuntime Tests
on:
workflow_dispatch:
@@ -12,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
- build_image_and_run_tensorrt_onnxruntime_tests:
+ build_image_and_run_cli_tensorrt_onnxruntime_tests:
runs-on: hf-dgx-01
steps:
- name: Checkout
@@ -40,4 +40,4 @@ jobs:
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-tensorrt:22.12
- -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime and inference' -x"
+ -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x"
diff --git a/Makefile b/Makefile
index c993cc7c..55e44e1e 100644
--- a/Makefile
+++ b/Makefile
@@ -12,8 +12,68 @@ style:
install:
pip install -e .
-install_cpu_dev:
- pip install -e .[quality,testing,openvino,onnxruntime,neural-compressor,diffusers,timm,peft]
+build_docker_cpu:
+ docker build -f docker/cpu.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-cpu:latest .
-install_gpu_dev:
- pip install -e .[quality,testing,onnxruntime-gpu,deepspeed,diffusers,timm,peft]
+build_docker_cuda:
+ docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) --build-arg TORCH_CUDA=cu118 --build-arg CUDA_VERSION=11.8.0 -t opt-bench-cuda:11.8.0 .
+
+build_docker_rocm:
+ docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) --build-arg TORCH_ROCM=rocm5.6 --build-arg ROCM_VERSION=5.6.1 -t opt-bench-rocm:5.6.1 .
+
+test_cli_cpu_neural_compressor:
+ docker run \
+ --rm \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x"
+
+test_cli_cpu_openvino:
+ docker run \
+ --rm \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x"
+
+test_cli_cpu_onnxruntime:
+ docker run \
+ --rm \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x"
+
+test_cli_cpu_pytorch:
+ docker run \
+ --rm \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x"
+
+test_api_cpu:
+ docker run \
+ --rm \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x"
+
+test_api_cuda:
+ docker run \
+ --rm \
+ --gpus '"device=0,1"' \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x"
+
+test_api_misc:
+ docker run \
+ --rm \
+ --entrypoint /bin/bash \
+ --volume $(PWD):/workspace \
+ --workdir /workspace \
+ opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x"
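
A note on the new Makefile targets: the `build_docker_*` targets are meant to be run before the corresponding `test_*` targets, which mount the repository into the freshly built image. A minimal usage sketch, assuming Docker is available locally:

```bash
# build the CPU image defined in docker/cpu.dockerfile,
# then run the CLI CPU PyTorch tests inside it
make build_docker_cpu
make test_cli_cpu_pytorch
```
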
diff --git a/README.md b/README.md
index cc623d27..e338b888 100644
--- a/README.md
+++ b/README.md
@@ -3,14 +3,13 @@
Optimum-Benchmark 🏋️
-Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with supported optimizations & quantization schemes, for [inference](https://github.com/huggingface/optimum#accelerated-inference) & [training](https://github.com/huggingface/optimum#accelerated-training), on multiple [backends & hardwares](https://github.com/huggingface/optimum-benchmark?tab=readme-ov-file#supported-backendsdevices).
+Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-).
## Motivation 🤔
-- Hardware vendors wanting to know how their hardware performs compared to others on the same models.
-- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc.
+- HF hardware partners wanting to know how their hardware performs compared to other hardware on the same models.
+- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc., compared to other models.
- Experimenting with hardware & backend specific optimizations & quantization schemes that can be applied to models and improve their computational/memory/energy efficiency.
-- [...]
## Current status 📈
@@ -19,23 +18,20 @@ Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transform
[![CPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml)
[![CUDA](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml)
[![ROCM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml)
-[![MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml)
### CLI
+
[![CPU Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml)
[![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml)
[![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml)
[![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml)
-
[![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml)
-[![CUDA OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml)
-[![CUDA Torch-ORT Training Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml)
-
-[![TensorRT OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml)
+[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml)
+[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml)
+[![TensorRT OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_onnxruntime.yaml)
[![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml)
-
[![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml)
-[![ROCm OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml)
+[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_onnxruntime.yaml)
## Quickstart 🚀
@@ -44,7 +40,7 @@ Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transform
You can install `optimum-benchmark` using pip:
```bash
-python -m pip install git+https://github.com/huggingface/optimum-benchmark.git
+pip install optimum-benchmark
```
or by cloning the repository and installing it in editable mode:
@@ -66,33 +62,45 @@ Depending on the backends you want to use, you might need to install some extra
- Intel Neural Compressor: `pip install optimum-benchmark[neural-compressor]`
- Text Generation Inference: `pip install optimum-benchmark[text-generation-inference]`
-### Running benchmarks from python API 🧪
+### Running benchmarks from Python API 🧪
-You can run benchmarks from the python API:
+You can run benchmarks from the Python API using the `launch` function from the `optimum_benchmark.experiment` module. Here's an example of how to run a benchmark using the `pytorch` backend, the `process` launcher, and the `inference` benchmark.
```python
-import logging
-logging.basicConfig(level=logging.INFO)
-
+from optimum_benchmark.logging_utils import setup_logging
from optimum_benchmark.experiment import launch, ExperimentConfig
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig
+
if __name__ == "__main__":
- backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cuda")
- launcher_config = ProcessConfig(device_isolation=True)
- benchmark_config = InferenceConfig(memory=True)
+ setup_logging(level="INFO")
+ benchmark_config = InferenceConfig(latency=False, memory=True, energy=True)
+ launcher_config = ProcessConfig()
+ backend_config = PyTorchConfig(
+ device="cuda",
+ no_weights=True,
+ device_ids="0,1",
+ device_map="auto",
+ model="IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm",
+ )
experiment_config = ExperimentConfig(
- experiment_name="api-launch-experiment",
+ experiment_name="python-api-launch-experiment",
benchmark=benchmark_config,
launcher=launcher_config,
backend=backend_config,
)
benchmark_report = launch(experiment_config)
- print("benchmark_report:", benchmark_report)
+ benchmark_report.log_all()
+ # or
+ print(benchmark_report.to_dict())
+ # or
+ benchmark_report.push_to_hub("IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm")
```
+Yep, it's that simple! Check the supported backends, launchers and benchmarks in the [features](#features-) section.
+
### Running benchmarks from CLI 🏃‍♂️
You can run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension.
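
For example, a minimal invocation could look like the following, assuming the `optimum-benchmark` console entrypoint installed with the package and one of the configs shipped in this repository's `examples/` directory:

```bash
optimum-benchmark --config-dir examples/ --config-name pytorch_bert
```
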
@@ -161,26 +169,26 @@ Other than the [examples](examples), you can also check [tests](tests/configs/).
Everything else is optional or inferred at runtime, but can be configured to your needs.
-### Backends & Devices 📱
-
-- [x] Pytorch backend for CPU (`device=cpu`, `backend=pytorch`)
-- [x] Pytorch backend for CUDA (`device=cuda`, `backend=pytorch`)
-- [ ] Pytorch backend for Habana Gaudi Processor (`device=hpu`, `backend=pytorch`)
-- [x] OnnxRuntime backend for CPUExecutionProvider (`device=cpu`, `backend=onnxruntime`)
-- [x] OnnxRuntime backend for CUDAExecutionProvider (`device=cuda`, `backend=onnxruntime`)
-- [x] OnnxRuntime backend for ROCMExecutionProvider (`device=cuda`, `backend=onnxruntime`, `backend.provider=ROCMExecutionProvider`)
-- [x] OnnxRuntime backend for TensorrtExecutionProvider (`device=cuda`, `backend=onnxruntime`, `backend.provider=TensorrtExecutionProvider`)
-- [x] Intel Neural Compressor backend for CPU (`device=cpu`, `backend=neural-compressor`)
-- [x] TensorRT-LLM backend for CUDA (`device=cuda`, `backend=tensorrt-llm`)
-- [x] OpenVINO backend for CPU (`device=cpu`, `backend=openvino`)
-
-### Launcher features 🚀
+### Launchers 🚀
- [x] Process isolation between consecutive runs (`launcher=process`)
-- [x] Assert devices (NVIDIA & AMD GPUs) isolation (`launcher.device_isolation=true`)
-- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`, etc)
+- [x] Assert GPU devices (NVIDIA & AMD) isolation (`launcher.device_isolation=true`)
+- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`)
+
+### Backends & Devices 📱
-### Benchmark features 🏋️
+- [x] Pytorch backend for CPU (`backend=pytorch`, `backend.device=cpu`)
+- [x] Pytorch backend for CUDA (`backend=pytorch`, `backend.device=cuda`)
+- [ ] Pytorch backend for Habana Gaudi Processor (`backend=pytorch`, `backend.device=habana`)
+- [x] OnnxRuntime backend for CPUExecutionProvider (`backend=onnxruntime`, `backend.device=cpu`)
+- [x] OnnxRuntime backend for CUDAExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`)
+- [x] OnnxRuntime backend for ROCMExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`, `backend.provider=ROCMExecutionProvider`)
+- [x] OnnxRuntime backend for TensorrtExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`, `backend.provider=TensorrtExecutionProvider`)
+- [x] Intel Neural Compressor backend for CPU (`backend=neural-compressor`, `backend.device=cpu`)
+- [x] TensorRT-LLM backend for CUDA (`backend=tensorrt-llm`, `backend.device=cuda`)
+- [x] OpenVINO backend for CPU (`backend=openvino`, `backend.device=cpu`)
+
+### Benchmarking 🏋️
- [x] Memory tracking (`benchmark.memory=true`)
- [x] Latency and throughput tracking of forward pass (default)
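
The launcher, backend, and benchmark flags listed above compose through hydra overrides on the same CLI invocation; a hypothetical sketch (the exact override paths depend on the config's defaults list):

```bash
# run the pytorch_bert example config with the torchrun launcher and memory tracking enabled
optimum-benchmark --config-dir examples/ --config-name pytorch_bert \
  launcher=torchrun launcher.n_proc_per_node=2 \
  benchmark.memory=true
```
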
diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile
new file mode 100644
index 00000000..371a89c8
--- /dev/null
+++ b/docker/cpu.dockerfile
@@ -0,0 +1,43 @@
+FROM ubuntu:latest
+
+
+# Ignore interactive questions during `docker build`
+ENV DEBIAN_FRONTEND noninteractive
+
+# Run as non-root user
+ARG USER_ID
+ARG GROUP_ID
+
+RUN addgroup --gid $GROUP_ID user
+RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user
+
+# Install python
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ python3.10 \
+ python3.10-dev \
+ python3-pip \
+ git && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* && \
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
+
+# Add local bin to PATH
+ENV PATH="/home/user/.local/bin:${PATH}"
+
+# Add user to sudoers
+RUN adduser user sudo
+RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >>/etc/sudoers
+
+# Change user
+USER user
+WORKDIR /home/user
+
+# Update pip
+RUN pip install --upgrade pip
+
+# Install PyTorch (pass --build-arg TORCH_PRE_RELEASE=1 for the nightly CPU build)
+ARG TORCH_PRE_RELEASE
+RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; \
+ then pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \
+ else pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \
+ fi
diff --git a/examples/neural_compressor_ptq_bert.yaml b/examples/neural_compressor_ptq_bert.yaml
index 64691369..c8b0ee6e 100644
--- a/examples/neural_compressor_ptq_bert.yaml
+++ b/examples/neural_compressor_ptq_bert.yaml
@@ -7,25 +7,31 @@ defaults:
- override hydra/job_logging: colorlog # colorful logging
- override hydra/hydra_logging: colorlog # colorful logging
-experiment_name: openvino_static_quant_bert
+experiment_name: neural_compressor_ptq_bert
backend:
- model: bert-base-uncased
+ device: cpu
no_weights: true
+ model: bert-base-uncased
ptq_quantization: true
calibration: true
- device: cpu
benchmark:
input_shapes:
batch_size: 1
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
diff --git a/examples/onnxruntime_static_quant_vit.yaml b/examples/onnxruntime_static_quant_vit.yaml
index 0b06bc0e..d324415d 100644
--- a/examples/onnxruntime_static_quant_vit.yaml
+++ b/examples/onnxruntime_static_quant_vit.yaml
@@ -10,23 +10,28 @@ defaults:
experiment_name: onnxruntime_static_quant_vit
backend:
+ device: cpu
+ no_weights: true
model: google/vit-base-patch16-224
quantization: true
quantization_config:
is_static: true
per_channel: false
- device: cpu
calibration: true
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
- CUDA_VISIBLE_DEVICES: 0
- CUDA_DEVICE_ORDER: PCI_BUS_ID
diff --git a/examples/openvino_diffusion.yaml b/examples/openvino_diffusion.yaml
index 3591ecd7..f9f62e64 100644
--- a/examples/openvino_diffusion.yaml
+++ b/examples/openvino_diffusion.yaml
@@ -10,22 +10,28 @@ defaults:
model: stabilityai/stable-diffusion-2-1
backend:
+ device: cpu
experiment_name: openvino_diffusion
- export: true
reshape: true
+ export: true
half: true
- device: cpu
benchmark:
input_shapes:
batch_size: 1
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
diff --git a/examples/openvino_static_quant_bert.yaml b/examples/openvino_static_quant_bert.yaml
index c349f3ea..83921f4c 100644
--- a/examples/openvino_static_quant_bert.yaml
+++ b/examples/openvino_static_quant_bert.yaml
@@ -10,24 +10,30 @@ defaults:
experiment_name: openvino_static_quant_bert
backend:
+ device: cpu
+ no_weights: true
model: bert-base-uncased
export: true
- no_weights: true
quantization: true
calibration: true
reshape: true
- device: cpu
benchmark:
input_shapes:
batch_size: 1
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml
index 71a087f0..5a36147c 100644
--- a/examples/pytorch_bert.yaml
+++ b/examples/pytorch_bert.yaml
@@ -10,17 +10,22 @@ defaults:
experiment_name: pytorch_bert
backend:
- model: bert-base-uncased
device: cpu
+ device_ids: 0
+ model: bert-base-uncased
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
- CUDA_VISIBLE_DEVICES: 0
- CUDA_DEVICE_ORDER: PCI_BUS_ID
diff --git a/examples/pytorch_llama.yaml b/examples/pytorch_llama.yaml
index f6b29792..2c9e2845 100644
--- a/examples/pytorch_llama.yaml
+++ b/examples/pytorch_llama.yaml
@@ -10,8 +10,10 @@ defaults:
experiment_name: pytorch_llama
backend:
- model: TheBloke/Llama-2-70B-AWQ
device: cuda
+ device_ids: 0
+ no_weights: true
+ model: TheBloke/Llama-2-70B-AWQ
launcher:
device_isolation: true
@@ -22,14 +24,18 @@ benchmark:
sequence_length: 256
new_tokens: 1000
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
- CUDA_VISIBLE_DEVICES: 0
- CUDA_DEVICE_ORDER: PCI_BUS_ID
diff --git a/examples/pytorch_timm.yaml b/examples/pytorch_timm.yaml
index 03125599..4b2c5295 100644
--- a/examples/pytorch_timm.yaml
+++ b/examples/pytorch_timm.yaml
@@ -10,8 +10,9 @@ defaults:
experiment_name: pytorch_timm
backend:
- model: timm/mobilenetv3_large_100.ra_in1k
device: cuda
+ device_ids: 0
+ model: timm/mobilenetv3_large_100.ra_in1k
launcher:
device_isolation: true
@@ -20,14 +21,18 @@ benchmark:
input_shapes:
batch_size: 1
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
- CUDA_VISIBLE_DEVICES: 0
- CUDA_DEVICE_ORDER: PCI_BUS_ID
diff --git a/examples/tgi_llama.yaml b/examples/tgi_llama.yaml
index 9bf8b4d1..a23c5c55 100644
--- a/examples/tgi_llama.yaml
+++ b/examples/tgi_llama.yaml
@@ -10,10 +10,12 @@ defaults:
experiment_name: tgi_llama
backend:
+ device: cuda
+ device_ids: 0,1
+ device_map: true
model: TheBloke/Llama-2-7B-AWQ
quantization_scheme: awq
sharded: false
- device: cuda
benchmark:
input_shapes:
@@ -21,14 +23,18 @@ benchmark:
sequence_length: 256
new_tokens: 1000
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
- CUDA_VISIBLE_DEVICES: 0
- CUDA_DEVICE_ORDER: PCI_BUS_ID
diff --git a/examples/trt_llama.yaml b/examples/trt_llama.yaml
index e3f8844d..702bb39e 100644
--- a/examples/trt_llama.yaml
+++ b/examples/trt_llama.yaml
@@ -10,8 +10,8 @@ defaults:
experiment_name: trt_llama
backend:
- model: NousResearch/Llama-2-7b-hf
device: cuda
+ model: NousResearch/Llama-2-7b-hf
benchmark:
input_shapes:
@@ -19,14 +19,18 @@ benchmark:
sequence_length: 64
new_tokens: 128
+# hydra/cli specific settings
hydra:
run:
+ # where to store run results
dir: runs/${experiment_name}
sweep:
+ # where to store sweep results
dir: sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to force re-running benchmarks that were already run
OVERRIDE_BENCHMARKS: 1
- CUDA_VISIBLE_DEVICES: 0
- CUDA_DEVICE_ORDER: PCI_BUS_ID
diff --git a/optimum_benchmark/aggregators/__init__.py b/optimum_benchmark/aggregators/__init__.py
deleted file mode 100644
index a3015d55..00000000
--- a/optimum_benchmark/aggregators/__init__.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from pathlib import Path
-from typing import Tuple, List, Dict
-
-import pandas as pd
-from rich.table import Table
-from omegaconf import OmegaConf
-import matplotlib.pyplot as plt
-from rich.console import Console
-from flatten_dict import flatten
-from rich.terminal_theme import MONOKAI
-
-
-def gather(root_folders: List[Path]) -> pd.DataFrame:
- configs_dfs = {}
- results_dfs = {}
-
- for root_folder in root_folders:
- if not root_folder.exists():
- raise ValueError(f"{root_folder} does not exist")
-
- for f in root_folder.glob("**/hydra_config.yaml"):
- parent_folder = f.parent.absolute().as_posix()
- configs_dfs[parent_folder] = pd.DataFrame.from_dict(
- flatten(OmegaConf.load(f), reducer="dot"), orient="index"
- ).T
-
- for f in root_folder.glob("**/*_results.csv"):
- parent_folder = f.parent.absolute().as_posix()
- results_dfs[parent_folder] = pd.read_csv(f)
-
- if (len(results_dfs) == 0) or (len(configs_dfs) == 0):
- raise ValueError(f"Results are missing in {root_folders}")
-
- # Merge inference and config dataframes
- full_dfs = {}
- for parent_folder in results_dfs:
- full_df = pd.concat(
- [configs_dfs[parent_folder], results_dfs[parent_folder]],
- axis=1,
- )
- full_df["parent_folder"] = parent_folder
- full_dfs[parent_folder] = full_df
-
- # Concatenate all dataframes
- full_report = pd.concat(full_dfs.values(), ignore_index=True, axis=0)
-
- return full_report
-
-
-def format_element(element):
- if isinstance(element, float):
- if element != element:
- formated_element = ""
- elif abs(element) >= 1:
- formated_element = f"{element:.2f}"
- elif abs(element) > 1e-6:
- formated_element = f"{element:.2e}"
- else:
- formated_element = f"{element}"
- elif element is None:
- formated_element = ""
- elif isinstance(element, bool):
- if element:
- formated_element = "[green]β[/green]"
- else:
- formated_element = "[red]β[/red]"
- else:
- formated_element = str(element)
-
- return formated_element
-
-
-def display(report: pd.DataFrame) -> Table:
- table = Table(show_header=True, show_lines=True)
-
- for column in report.columns:
- table.add_column(column, justify="right", header_style="bold")
-
- for _, row in report.iterrows():
- formated_row = []
- for element in row.values:
- formated_row.append(format_element(element))
- table.add_row(*formated_row)
-
- console = Console(record=True, theme=MONOKAI)
- console.print(table, justify="center")
-
- return console, table
-
-
-def rename(report: pd.DataFrame, rename_dict: Dict[str, str]):
- summarized_report = report[list(rename_dict.keys())].rename(columns=rename_dict)
-
- return summarized_report
-
-
-def plot(report: pd.DataFrame, x_axis: str, y_axis: str, groupby: str) -> Tuple[plt.Figure, plt.Axes]:
- fig, ax = plt.subplots()
-
- for group, sweep in report.groupby(groupby):
- sorted_sweep = sweep.sort_values(by=x_axis)
- ax.plot(sorted_sweep[x_axis], sorted_sweep[y_axis], label=group, marker="o")
-
- ax.set_xlabel(x_axis)
- ax.set_ylabel(y_axis)
- ax.set_title(f"{y_axis} per {x_axis}")
- ax.legend(fancybox=True, shadow=True)
-
- return fig, ax
diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py
index 1c55a5ab..cf0f5087 100644
--- a/optimum_benchmark/backends/base.py
+++ b/optimum_benchmark/backends/base.py
@@ -1,46 +1,25 @@
import gc
-import os
import random
-import shutil
from abc import ABC
from logging import getLogger
-from typing import (
- Optional,
- ClassVar,
- Generic,
- Dict,
- Any,
-)
-
-import numpy as np
-from transformers.utils import ModelOutput
-from transformers import (
- GenerationConfig,
- PretrainedConfig,
- PreTrainedModel,
- TrainerState,
- AutoModel,
-)
+from collections import OrderedDict
+from typing import Optional, ClassVar, Generic, Dict, Any
from .config import BackendConfigT
from ..task_utils import get_automodel_class_for_task
-from .diffusers_utils import (
- extract_diffusers_shapes_from_config,
- get_diffusers_pretrained_config,
-)
+
+from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config
+from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config, get_timm_pre_processor
from .transformers_utils import (
extract_transformers_shapes_from_artifacts,
- get_transformers_pretrained_processor,
get_transformers_generation_config,
get_transformers_pretrained_config,
- get_transformers_cache_dir,
+ get_transformers_pre_processor,
PretrainedProcessor,
)
-from .timm_utils import (
- extract_timm_shapes_from_config,
- get_timm_pretrained_processor,
- get_timm_pretrained_config,
-)
+
+import numpy as np
+from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState
LOGGER = getLogger("backend")
@@ -48,43 +27,38 @@
class Backend(Generic[BackendConfigT], ABC):
NAME: ClassVar[str]
- config: BackendConfigT
- automodel_class: AutoModel
- pretrained_model: PreTrainedModel
+ model_type: str
model_shapes: Dict[str, int]
+ pretrained_model: PreTrainedModel
pretrained_config: Optional[PretrainedConfig]
- pretrained_processor: Optional[PretrainedProcessor]
- pretrained_generation_config: Optional[GenerationConfig]
+ generation_config: Optional[GenerationConfig]
+ pre_processor: Optional[PretrainedProcessor]
def __init__(self, config: BackendConfigT):
LOGGER.info(f"ΩAllocating {self.NAME} backend")
self.config = config
+ self.seed()
if self.config.library == "diffusers":
- self.pretrained_processor = None
- self.pretrained_generation_config = None
- self.pretrained_config = get_diffusers_pretrained_config(model=self.config.model, **self.config.hub_kwargs)
- self.model_shapes = extract_diffusers_shapes_from_config(model=self.config.model, **self.config.hub_kwargs)
+ self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs)
+ self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs)
self.model_type = self.config.task
+ self.generation_config = None
+ self.pre_processor = None
+
elif self.config.library == "timm":
- self.pretrained_processor = get_timm_pretrained_processor(self.config.model)
+ self.pre_processor = get_timm_pre_processor(self.config.model)
self.pretrained_config = get_timm_pretrained_config(self.config.model)
self.model_shapes = extract_timm_shapes_from_config(config=self.pretrained_config)
self.model_type = self.pretrained_config.architecture
- self.pretrained_generation_config = None
+ self.generation_config = None
+
else:
+ self.pre_processor = get_transformers_pre_processor(self.config.model, **self.config.hub_kwargs)
+ self.generation_config = get_transformers_generation_config(self.config.model, **self.config.hub_kwargs)
self.pretrained_config = get_transformers_pretrained_config(self.config.model, **self.config.hub_kwargs)
- self.pretrained_generation_config = get_transformers_generation_config(
- self.config.model, **self.config.hub_kwargs
- )
- self.pretrained_processor = get_transformers_pretrained_processor(
- self.config.model, **self.config.hub_kwargs
- )
- self.model_shapes = extract_transformers_shapes_from_artifacts(
- config=self.pretrained_config,
- processor=self.pretrained_processor,
- )
+ self.model_shapes = extract_transformers_shapes_from_artifacts(self.pretrained_config, self.pre_processor)
self.model_type = self.pretrained_config.model_type
self.automodel_class = get_automodel_class_for_task(
@@ -95,6 +69,7 @@ def __init__(self, config: BackendConfigT):
)
def seed(self) -> None:
+ LOGGER.info(f"\t+ Setting random seed to {self.config.seed}")
random.seed(self.config.seed)
np.random.seed(self.config.seed)
@@ -112,40 +87,35 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
return inputs
- def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
+ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
"""
This method is used to perform the forward pass of the model.
"""
raise NotImplementedError("Backend must implement forward method")
- def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
+ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
"""
This method is used to perform the generation pass of the model.
"""
raise NotImplementedError("Backend must implement generate method")
+ def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ """
+ This method is used to call a whole pipeline.
+ """
+ raise NotImplementedError("Backend must implement call method")
+
def train(self, **kwargs) -> TrainerState:
"""
This method is used to train the model.
"""
raise NotImplementedError("Backend must implement train method")
- def delete_hf_model_cache(self) -> None:
- LOGGER.info("\t+ Deleting model cache")
- transformers_cache_path = get_transformers_cache_dir()
- model_cache_folder = f"models/{self.config.model}".replace("/", "--")
- model_cache_path = os.path.join(transformers_cache_path, model_cache_folder)
- shutil.rmtree(model_cache_path, ignore_errors=True)
-
def delete_pretrained_model(self) -> None:
- LOGGER.info("\t+ Deleting pretrained model")
- del self.pretrained_model
- gc.collect()
+ if hasattr(self, "pretrained_model"):
+ del self.pretrained_model
def clean(self) -> None:
LOGGER.info(f"Cleaning {self.NAME} backend")
-
- if hasattr(self, "pretrained_model"):
- self.delete_pretrained_model()
-
+ self.delete_pretrained_model()
gc.collect()
diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py
index fff9bf80..a4919c15 100644
--- a/optimum_benchmark/backends/config.py
+++ b/optimum_benchmark/backends/config.py
@@ -4,13 +4,12 @@
from dataclasses import dataclass, field
from typing import Optional, TypeVar, Dict, Any
-from psutil import cpu_count
+from ..import_utils import is_psutil_available
+from ..env_utils import get_cuda_device_ids, is_nvidia_system, is_rocm_system
+from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path
-from ..env_utils import get_gpus, is_nvidia_system, is_rocm_system
-from ..task_utils import (
- infer_library_from_model_name_or_path,
- infer_task_from_model_name_or_path,
-)
+if is_psutil_available():
+ from psutil import cpu_count
LOGGER = getLogger("backend")
@@ -18,6 +17,7 @@
"revision": "main",
"force_download": False,
"local_files_only": False,
+ "trust_remote_code": False,
}
@@ -31,6 +31,10 @@ class BackendConfig(ABC):
model: Optional[str] = None
device: Optional[str] = None
+ # yes we use a string here instead of a list
+ # it's easier to pass in a yaml or from cli
+ # also it's consistent with CUDA_VISIBLE_DEVICES
+ device_ids: Optional[str] = None
task: Optional[str] = None
library: Optional[str] = None
@@ -48,41 +52,20 @@ def __post_init__(self):
self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu"
if ":" in self.device:
- raise ValueError(
- f"Device was specified as {self.device} with a target index."
- "We recommend using the main cuda device (e.g. `cuda`) and "
- "specifying the target index in `CUDA_VISIBLE_DEVICES`."
- )
+ # using device index, e.g. cuda:1
+ self.device_ids = self.device.split(":")[1]
+ self.device = self.device.split(":")[0]
+
+ if self.device == "cuda":
+ if self.device_ids is None:
+ self.device_ids = get_cuda_device_ids()
+
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids
+ # TODO: add rocm specific environment variables ?
if self.device not in ["cuda", "cpu", "mps", "xla"]:
- raise ValueError("`device` must be either `cuda`, `cpu`, `mps` or `xla`.")
-
- if self.device == "cuda" and len(get_gpus()) > 1:
- if os.environ.get("CUDA_VISIBLE_DEVICES", None) is None:
- LOGGER.warning(
- "Multiple GPUs detected but CUDA_VISIBLE_DEVICES is not set. "
- "This means that code might allocate resources from the wrong GPUs. "
- "For example, with `auto_device='auto'. `We recommend setting CUDA_VISIBLE_DEVICES "
- "to isolate the GPUs that will be used for this experiment. `CUDA_VISIBLE_DEVICES` will "
- "be set to `0` to ensure that only the first GPU is used. If you want to use multiple "
- "GPUs, please set `CUDA_VISIBLE_DEVICES` to the desired GPU indices."
- )
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
- if os.environ.get("CUDA_DEVICE_ORDER", None) != "PCI_BUS_ID":
- LOGGER.warning(
- "Multiple GPUs detected but CUDA_DEVICE_ORDER is not set to `PCI_BUS_ID`. "
- "This means that code might allocate resources from the wrong GPUs even if "
- "`CUDA_VISIBLE_DEVICES` is set. For example pytorch uses the `FASTEST_FIRST` "
- "order by default, which is not guaranteed to be the same as nvidia-smi. `CUDA_DEVICE_ORDER` "
- "will be set to `PCI_BUS_ID` to ensure that the GPUs are allocated in the same order as nvidia-smi. "
- )
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
- elif self.device == "cuda" and len(get_gpus()) == 1:
- if os.environ.get("CUDA_VISIBLE_DEVICES", None) is None:
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+ raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}")
if self.task is None:
self.task = infer_task_from_model_name_or_path(self.model)
diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py
index 49c21906..705436d3 100644
--- a/optimum_benchmark/backends/diffusers_utils.py
+++ b/optimum_benchmark/backends/diffusers_utils.py
@@ -4,31 +4,27 @@
from ..import_utils import is_diffusers_available
-
if is_diffusers_available():
import diffusers
def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]:
- assert is_diffusers_available(), "Diffusers is not available"
return diffusers.DiffusionPipeline.load_config(model, **kwargs)
def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]:
- assert is_diffusers_available(), "Diffusers is not available"
+ config = diffusers.DiffusionPipeline.load_config(model, **kwargs)
shapes = {}
- pip_config = diffusers.DiffusionPipeline.load_config(model, **kwargs)
-
- if "vae" in pip_config:
- vae_import_path = pip_config["vae"]
+ if "vae" in config:
+ vae_import_path = config["vae"]
vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}")
vae_config = vae_class.load_config(model, subfolder="vae", **kwargs)
shapes["num_channels"] = vae_config["out_channels"]
shapes["height"] = vae_config["sample_size"]
shapes["width"] = vae_config["sample_size"]
- elif "vae_encoder" in pip_config:
- vae_import_path = pip_config["vae_encoder"]
+ elif "vae_encoder" in config:
+ vae_import_path = config["vae_encoder"]
vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}")
vae_config = vae_class.load_config(model, subfolder="vae", **kwargs)
shapes["num_channels"] = vae_config["out_channels"]
diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py
index 092affff..dd2a7a82 100644
--- a/optimum_benchmark/backends/neural_compressor/backend.py
+++ b/optimum_benchmark/backends/neural_compressor/backend.py
@@ -4,22 +4,19 @@
from logging import getLogger
from tempfile import TemporaryDirectory
+from ...generators.dataset_generator import DatasetGenerator
+from ..transformers_utils import randomize_weights
+from .utils import TASKS_TO_INCMODELS
+from .config import INCConfig
+from ..base import Backend
+
import torch
from hydra.utils import get_class
from transformers.utils import ModelOutput
from transformers.modeling_utils import no_init_weights
from transformers.utils.logging import set_verbosity_error
from optimum.intel.neural_compressor.quantization import INCQuantizer
-from neural_compressor.config import (
- PostTrainingQuantConfig,
- AccuracyCriterion,
- TuningCriterion,
-)
-
-from ...generators.dataset_generator import DatasetGenerator
-from .utils import TASKS_TO_INCMODELS
-from .config import INCConfig
-from ..base import Backend
+from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion, TuningCriterion
# disable transformers logging
set_verbosity_error()
@@ -34,9 +31,7 @@ def __init__(self, config: INCConfig):
super().__init__(config)
self.validate_task()
- self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task])
- LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}")
-
+ LOGGER.info("\t+ Creating backend temporary directory")
self.tmpdir = TemporaryDirectory()
if self.config.ptq_quantization:
@@ -52,57 +47,65 @@ def __init__(self, config: INCConfig):
else:
self.load_incmodel_from_pretrained()
- self.tmpdir.cleanup()
-
def validate_task(self) -> None:
if self.config.task not in TASKS_TO_INCMODELS:
raise NotImplementedError(f"INCBackend does not support task {self.config.task}")
+ self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task])
+ LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}")
+
def load_automodel_from_pretrained(self) -> None:
LOGGER.info("\t+ Loading AutoModel from pretrained")
self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs)
- def load_automodel_with_no_weights(self) -> None:
- no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
+ def create_no_weights_model(self) -> None:
+ LOGGER.info("\t+ Creating no weights model state_dict")
+ state_dict = torch.nn.Linear(1, 1).state_dict()
- if not os.path.exists(no_weights_model):
- LOGGER.info("\t+ Creating no weights model directory")
- os.makedirs(no_weights_model)
+ LOGGER.info("\t+ Creating no weights model directory")
+ self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
+ os.makedirs(self.no_weights_model, exist_ok=True)
- LOGGER.info("\t+ Saving pretrained config")
- self.pretrained_config.save_pretrained(save_directory=no_weights_model)
+ LOGGER.info("\t+ Saving no weights model pretrained config")
+ self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
- LOGGER.info("\t+ Creating no weights model")
- state_dict = torch.nn.Linear(1, 1).state_dict()
+ LOGGER.info("\t+ Saving no weights model state_dict")
+ torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin"))
- LOGGER.info("\t+ Saving no weights model")
- torch.save(state_dict, os.path.join(no_weights_model, "pytorch_model.bin"))
+ def load_automodel_with_no_weights(self) -> None:
+ self.create_no_weights_model()
- LOGGER.info("\t+ Loading no weights model")
with no_init_weights():
original_model = self.config.model
- self.config.model = no_weights_model
+ self.config.model = self.no_weights_model
+ LOGGER.info("\t+ Loading no weights model")
self.load_automodel_from_pretrained()
self.config.model = original_model
+ LOGGER.info("\t+ Randomizing model weights")
+ randomize_weights(self.pretrained_model)
+ LOGGER.info("\t+ Tying model weights")
+ self.pretrained_model.tie_weights()
+
def load_incmodel_from_pretrained(self) -> None:
LOGGER.info("\t+ Loading INCModel from pretrained")
self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs)
def load_incmodel_with_no_weights(self) -> None:
- no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
-
- LOGGER.info("\t+ Loading AutoModel with no weights")
- self.load_automodel_with_no_weights()
- self.delete_pretrained_model()
+ self.create_no_weights_model()
- LOGGER.info("\t+ Loading INCModel with no weights")
with no_init_weights():
original_model = self.config.model
- self.config.model = no_weights_model
+ self.config.model = self.no_weights_model
+ LOGGER.info("\t+ Loading no weights model")
self.load_incmodel_from_pretrained()
self.config.model = original_model
+ LOGGER.info("\t+ Randomizing model weights")
+ randomize_weights(self.pretrained_model.model)
+ LOGGER.info("\t+ Tying model weights")
+ self.pretrained_model.model.tie_weights()
+
def quantize_automodel(self) -> None:
LOGGER.info("\t+ Attempting to quantize model")
quantized_model_path = f"{self.tmpdir.name}/quantized"
@@ -134,7 +137,7 @@ def quantize_automodel(self) -> None:
task=self.config.task,
dataset_shapes=dataset_shapes,
model_shapes=self.model_shapes,
- ).generate()
+ )()
columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns))
calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed)
else:
@@ -169,6 +172,7 @@ def clean(self) -> None:
super().clean()
if hasattr(self, "tmpdir"):
+ LOGGER.info("\t+ Cleaning backend temporary directory")
self.tmpdir.cleanup()
gc.collect()
diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py
index 0801b000..07d5d860 100644
--- a/optimum_benchmark/backends/onnxruntime/backend.py
+++ b/optimum_benchmark/backends/onnxruntime/backend.py
@@ -1,16 +1,22 @@
import gc
import os
from logging import getLogger
+from collections import OrderedDict
from tempfile import TemporaryDirectory
from typing import Any, Callable, Dict, List
+from ..base import Backend
+from .config import ORTConfig
+from ...task_utils import TEXT_GENERATION_TASKS
+from ...generators.dataset_generator import DatasetGenerator
+from .utils import format_calibration_config, format_quantization_config, TASKS_TO_ORTMODELS, TASKS_TO_ORTSD
+
import torch
from datasets import Dataset
from hydra.utils import get_class
from onnxruntime import SessionOptions
from safetensors.torch import save_file
-from transformers.utils import ModelOutput
-from transformers import TrainerCallback, TrainerState
+from transformers import TrainerCallback
from transformers.modeling_utils import no_init_weights
from transformers.utils.logging import set_verbosity_error
from optimum.onnxruntime.configuration import (
@@ -24,19 +30,10 @@
from optimum.onnxruntime import (
ONNX_DECODER_WITH_PAST_NAME,
ONNX_DECODER_NAME,
+ ORTTrainingArguments,
ORTOptimizer,
ORTQuantizer,
-)
-
-from ...generators.dataset_generator import DatasetGenerator
-from ...task_utils import TEXT_GENERATION_TASKS
-from .config import ORTConfig
-from ..base import Backend
-from .utils import (
- format_calibration_config,
- format_quantization_config,
- TASKS_TO_ORTMODELS,
- TASKS_TO_ORTSD,
+ ORTTrainer,
)
# disable transformers logging
@@ -61,15 +58,19 @@ def __init__(self, config: ORTConfig) -> None:
else:
raise NotImplementedError(f"ORTBackend does not support task {self.config.task}")
- self.set_session_options()
+ LOGGER.info("\t+ Creating backend temporary directory")
self.tmpdir = TemporaryDirectory()
+ self.session_options = SessionOptions()
+ for key, value in self.config.session_options.items():
+ setattr(self.session_options, key, value)
+
if self.config.no_weights:
self.load_ortmodel_with_no_weights()
else:
self.load_ortmodel_from_pretrained()
- if self.is_deferred_trt_loading():
+ if self.is_trt_text_generation:
return
if self.is_optimized or self.is_quantized:
@@ -99,35 +100,30 @@ def validate_provider(self) -> None:
self.pretrained_model.providers[0] == self.config.provider
), f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}"
- def is_deferred_trt_loading(self) -> bool:
- return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS
-
- def set_session_options(self) -> None:
- self.session_options = SessionOptions()
- for key, value in self.config.session_options.items():
- setattr(self.session_options, key, value)
-
- def load_ortmodel_with_no_weights(self) -> None:
+ def create_no_weights_model(self) -> None:
LOGGER.info("\t+ Creating no weights model directory")
- no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
- os.makedirs(no_weights_model, exist_ok=True)
+ self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
+ os.makedirs(self.no_weights_model, exist_ok=True)
LOGGER.info("\t+ Saving pretrained config")
- self.pretrained_config.save_pretrained(save_directory=no_weights_model)
+ self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
- LOGGER.info("\t+ Creating no weights model weights")
+ LOGGER.info("\t+ Creating no weights model state dict")
state_dict = torch.nn.Linear(1, 1).state_dict()
- LOGGER.info("\t+ Saving no weights model weights")
+ LOGGER.info("\t+ Saving no weights model state dict")
save_file(
- filename=os.path.join(no_weights_model, "model.safetensors"),
+ filename=os.path.join(self.no_weights_model, "model.safetensors"),
metadata={"format": "pt"},
tensors=state_dict,
)
+ def load_ortmodel_with_no_weights(self) -> None:
+ self.create_no_weights_model()
+
with no_init_weights():
original_model = self.config.model
- self.config.model = no_weights_model
+ self.config.model = self.no_weights_model
LOGGER.info("\t+ Loading no weights model")
self.load_ortmodel_from_pretrained()
self.config.model = original_model
@@ -144,6 +140,10 @@ def load_ortmodel_from_pretrained(self) -> None:
**self.ortmodel_kwargs,
)
+ @property
+ def is_trt_text_generation(self) -> bool:
+ return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS
+
@property
def is_optimized(self) -> bool:
return (self.config.auto_optimization is not None) or self.config.optimization
@@ -252,7 +252,7 @@ def quantize_onnx_files(self) -> None:
task=self.config.task,
dataset_shapes=dataset_shapes,
model_shapes=self.model_shapes,
- ).generate()
+ )()
columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names))
calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed)
@@ -314,7 +314,7 @@ def quantize_onnx_files(self) -> None:
self.config.model = quantized_model_path
def prepare_for_inference(self, **kwargs) -> None:
- if self.is_deferred_trt_loading():
+ if self.is_trt_text_generation:
LOGGER.info("\t+ Creating dynamic shapes for Tensorrt engine. Engine creation might take a while.")
batch_size = kwargs["batch_size"]
max_new_tokens = kwargs["max_new_tokens"]
@@ -353,21 +353,22 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
- def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
- return self.pretrained_model(**inputs, **kwargs)
+ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ return self.pretrained_model.forward(**inputs, **kwargs)
- def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
+ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(**inputs, **kwargs)
+ def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ return self.pretrained_model(**inputs, **kwargs)
+
def train(
self,
training_dataset: Dataset,
training_arguments: Dict[str, Any],
training_callbacks: List[TrainerCallback],
training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]],
- ) -> TrainerState:
- from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments
-
+ ) -> None:
LOGGER.info("\t+ Setting dataset format to `torch`")
training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys()))
LOGGER.info("\t+ Wrapping training arguments with optimum.onnxruntime.ORTTrainingArguments")
@@ -384,13 +385,11 @@ def train(
trainer.train()
LOGGER.info("\t+ Training finished successfully")
- return trainer.state
-
def clean(self) -> None:
super().clean()
if hasattr(self, "tmpdir"):
- LOGGER.info("\t+ Cleaning temporary directory")
+ LOGGER.info("\t+ Cleaning backend temporary directory")
self.tmpdir.cleanup()
gc.collect()
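
The `.generate()` → `()` change above reflects `DatasetGenerator` instances now being invoked directly. The real generator lives in `optimum_benchmark/generators/dataset_generator.py`; the sketch below only illustrates the callable-object convention with a hypothetical stand-in class and assumed shape names:

```python
from dataclasses import dataclass, field
from typing import Any, Dict

from datasets import Dataset


@dataclass
class CallableDatasetGenerator:
    """Hypothetical stand-in showing the call convention only, not the real DatasetGenerator."""

    task: str
    dataset_shapes: Dict[str, Any]
    model_shapes: Dict[str, Any] = field(default_factory=dict)

    def __call__(self) -> Dataset:
        # a calibration set only needs plausible shapes, not meaningful data
        num_samples = self.dataset_shapes.get("dataset_size", 2)
        sequence_length = self.dataset_shapes.get("sequence_length", 1)
        vocab_size = self.model_shapes.get("vocab_size", 2)
        return Dataset.from_dict(
            {"input_ids": [[i % vocab_size] * sequence_length for i in range(num_samples)]}
        )


calibration_dataset = CallableDatasetGenerator(
    task="text-classification",
    dataset_shapes={"dataset_size": 4, "sequence_length": 8},
    model_shapes={"vocab_size": 32000},
)()  # the instance itself is called, matching the `)()` pattern above
```
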
diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py
index 0f9262cc..e0191b88 100644
--- a/optimum_benchmark/backends/onnxruntime/config.py
+++ b/optimum_benchmark/backends/onnxruntime/config.py
@@ -38,6 +38,7 @@ class ORTConfig(BackendConfig):
version: Optional[str] = onnxruntime_version()
_target_: str = "optimum_benchmark.backends.onnxruntime.backend.ORTBackend"
+ # load options
no_weights: bool = False
# export options
diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py
index 4140b973..73cbd63d 100644
--- a/optimum_benchmark/backends/openvino/backend.py
+++ b/optimum_benchmark/backends/openvino/backend.py
@@ -3,26 +3,25 @@
import inspect
from typing import Any, Dict
from logging import getLogger
+from collections import OrderedDict
from tempfile import TemporaryDirectory
+from ..base import Backend
+from .config import OVConfig
+from .utils import TASKS_TO_OVMODEL
+from ...task_utils import TEXT_GENERATION_TASKS
+from ..transformers_utils import randomize_weights
+from ...generators.dataset_generator import DatasetGenerator
+
import torch
from hydra.utils import get_class
from openvino.runtime import properties
from safetensors.torch import save_file
from optimum.intel.openvino import OVQuantizer
from transformers.modeling_utils import no_init_weights
-from transformers.utils import ModelOutput
from transformers.utils.logging import set_verbosity_error
from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict
-from ..base import Backend
-from .config import OVConfig
-from .utils import TASKS_TO_OVMODEL
-from ...task_utils import TEXT_GENERATION_TASKS
-from ..transformers_utils import randomize_weights
-from ...generators.dataset_generator import DatasetGenerator
-
-
# disable transformers logging
set_verbosity_error()
@@ -149,7 +148,11 @@ def quantize_automodel(self) -> None:
"sequence_length": 1,
**self.model_shapes,
}
- calibration_dataset = DatasetGenerator(task=self.config.task, dataset_shapes=dataset_shapes).generate()
+ calibration_dataset = DatasetGenerator(
+ task=self.config.task,
+ dataset_shapes=dataset_shapes,
+ model_shapes=self.model_shapes,
+ )()
columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names))
calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed)
else:
@@ -196,12 +199,15 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
- def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
- return self.pretrained_model(**inputs, **kwargs)
+ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ return self.pretrained_model.forward(**inputs, **kwargs)
- def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
+ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
return self.pretrained_model.generate(**inputs, **kwargs)
+ def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ return self.pretrained_model(**inputs, **kwargs)
+
def clean(self) -> None:
super().clean()
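
The same three-way split appears in the ONNX Runtime, OpenVINO, and PyTorch backends: `forward` calls the model's `.forward()` directly, `generate` runs autoregressive decoding, and the new `call` goes through `__call__` for pipeline-like objects such as diffusion pipelines. A hedged sketch of the shape of that interface, with illustrative names only:

```python
from collections import OrderedDict
from typing import Any, Dict


class ExampleBackend:
    """Illustrative shape of the forward/generate/call interface; names are not from the diff."""

    def __init__(self, pretrained_model: Any) -> None:
        self.pretrained_model = pretrained_model

    def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
        # a single forward pass, bypassing any __call__-level pre/post-processing
        return self.pretrained_model.forward(**inputs, **kwargs)

    def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
        # autoregressive decoding for text-generation tasks
        return self.pretrained_model.generate(**inputs, **kwargs)

    def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
        # plain __call__, which is how pipeline-like objects (e.g. diffusion pipelines) are run
        return self.pretrained_model(**inputs, **kwargs)
```
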
diff --git a/optimum_benchmark/backends/peft_utils.py b/optimum_benchmark/backends/peft_utils.py
index 695e602c..1a367120 100644
--- a/optimum_benchmark/backends/peft_utils.py
+++ b/optimum_benchmark/backends/peft_utils.py
@@ -13,7 +13,6 @@
PromptLearningConfig,
)
-
PEFT_TASKS_TYPES = [
"SEQ_CLS",
"SEQ_2_SEQ_LM",
diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index cadfe878..268f4306 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -1,27 +1,23 @@
import gc
import os
from logging import getLogger
+from collections import OrderedDict
from tempfile import TemporaryDirectory
from typing import Any, Callable, Dict, List
+from ..base import Backend
+from .config import PyTorchConfig
+from ..peft_utils import get_peft_config_class
+from ..transformers_utils import randomize_weights
+from ...import_utils import is_deepspeed_available, is_peft_available
+
import torch
from datasets import Dataset
from safetensors.torch import save_file
-from transformers.utils import ModelOutput
import datasets.utils.logging as datasets_logging
-from transformers import TrainerCallback, TrainerState
from transformers.modeling_utils import no_init_weights
import transformers.utils.logging as transformers_logging
-
-from ..base import Backend
-from .config import PyTorchConfig
-from ..peft_utils import get_peft_config_class
-from ..transformers_utils import TransformersDataParallel, randomize_weights
-from ...import_utils import (
- is_deepspeed_available,
- is_peft_available,
-)
-
+from transformers import TrainerCallback, TrainerState, Trainer, TrainingArguments
if is_peft_available():
from peft import get_peft_model
@@ -38,21 +34,13 @@
class PyTorchBackend(Backend[PyTorchConfig]):
- NAME: str = "pytorch"
+ NAME = "pytorch"
def __init__(self, config: PyTorchConfig):
super().__init__(config)
+ self.validate_library()
- if self.config.library == "timm":
- LOGGER.info("\t+ Using method timm.create_model")
- else:
- automodel = self.automodel_class.__name__
- if self.config.library == "diffusers":
- LOGGER.info(f"\t+ Using Pipeline class {automodel}")
- else:
- LOGGER.info(f"\t+ Using AutoModel class {automodel}")
-
- # Threading options
+ # Threads
if self.config.inter_op_num_threads is not None:
LOGGER.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))")
torch.set_num_threads(self.config.inter_op_num_threads)
@@ -60,18 +48,23 @@ def __init__(self, config: PyTorchConfig):
LOGGER.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))")
torch.set_num_interop_threads(self.config.intra_op_num_threads)
- # Dtypes options
- self.amp_dtype = getattr(torch, self.config.amp_dtype) if self.config.amp_dtype is not None else None
+ # Mixed precision
+ if self.config.amp_dtype:
+ LOGGER.info(f"\t+ Setting mixed precision dtype to {self.config.amp_dtype}")
+ self.amp_dtype = getattr(torch, self.config.amp_dtype)
+ else:
+ self.amp_dtype = None
+ # Quantization
if self.is_quantized:
LOGGER.info("\t+ Processing quantization config")
self.process_quantization_config()
else:
self.quantization_config = None
+ LOGGER.info("\t+ Creating backend temporary directory")
self.tmpdir = TemporaryDirectory()
- # Load model
if self.config.no_weights and self.config.library == "diffusers":
raise ValueError("Diffusion pipelines are not supported with no_weights=True")
elif self.config.no_weights:
@@ -81,8 +74,12 @@ def __init__(self, config: PyTorchConfig):
LOGGER.info("\t+ Loading model with pretrained weights")
self.load_model_from_pretrained()
+ if self.config.cache_implementation is not None:
+ LOGGER.info(f"\t+ Setting cache implementation to {self.config.cache_implementation}")
+ self.pretrained_model.generation_config.cache_implementation = self.config.cache_implementation
+
# Eval mode
- if self.config.eval_mode and not self.config.library == "diffusers":
+ if self.config.eval_mode and self.config.library != "diffusers":
LOGGER.info("\t+ Turning on model's eval mode")
self.pretrained_model.eval()
@@ -91,7 +88,7 @@ def __init__(self, config: PyTorchConfig):
LOGGER.info("\t+ Enabling BetterTransformer")
self.pretrained_model.to_bettertransformer()
- # Compile model
+ # Torch compile
if self.config.torch_compile:
if self.config.library == "diffusers":
LOGGER.info("\t+ Using torch.compile on unet forward pass")
@@ -115,18 +112,21 @@ def __init__(self, config: PyTorchConfig):
if self.config.deepspeed_inference:
LOGGER.info("\t+ Using DeepSpeed-Inference")
-
self.pretrained_model = init_inference(
self.pretrained_model,
config=self.config.deepspeed_inference_config,
dtype=getattr(self.pretrained_model, "dtype", None),
)
- if self.config.data_parallel:
- LOGGER.info("\t+ Using TransformersDataParallel")
- self.pretrained_model = TransformersDataParallel(self.pretrained_model)
-
- self.tmpdir.cleanup()
+ def validate_library(self) -> None:
+ if self.config.library == "timm":
+ LOGGER.info(f"\t+ Using Timm method {self.automodel_class.__name__}")
+ elif self.config.library == "diffusers":
+ LOGGER.info(f"\t+ Using Pipeline class {self.automodel_class.__name__}")
+ elif self.config.library == "transformers":
+ LOGGER.info(f"\t+ Using AutoModel class {self.automodel_class.__name__}")
+ else:
+ raise ValueError(f"Library {self.config.library} not supported")
def load_model_from_pretrained(self) -> None:
if self.config.library == "timm":
@@ -138,8 +138,8 @@ def load_model_from_pretrained(self) -> None:
self.pretrained_model = self.automodel_class.from_pretrained(
pretrained_model_name_or_path=self.config.model,
device_map=self.config.device_map,
- **self.automodel_kwargs,
**self.config.hub_kwargs,
+ **self.automodel_kwargs,
)
if self.config.device_map is None:
LOGGER.info(f"\t+ Moving pipeline to device: {self.config.device}")
@@ -148,7 +148,6 @@ def load_model_from_pretrained(self) -> None:
LOGGER.info("\t+ Loading BnB quantized model")
self.pretrained_model = self.automodel_class.from_pretrained(
pretrained_model_name_or_path=self.config.model,
- low_cpu_mem_usage=self.config.low_cpu_mem_usage,
device_map=self.config.device_map,
**self.config.hub_kwargs,
**self.automodel_kwargs,
@@ -158,10 +157,8 @@ def load_model_from_pretrained(self) -> None:
self.pretrained_model = self.automodel_class.from_pretrained(
pretrained_model_name_or_path=self.config.model,
# for gptq, we need to specify the device_map to either auto
- # or a cuda adevice to avoid any modules being assigned to cpu
+ # or a cuda device to avoid any modules being assigned to cpu ¯\_(ツ)_/¯
device_map=self.config.device_map or torch.device(self.config.device),
- # this avoids unnecessary memory usage when loading quantized models
- low_cpu_mem_usage=self.config.low_cpu_mem_usage,
**self.config.hub_kwargs,
**self.automodel_kwargs,
)
@@ -175,39 +172,39 @@ def load_model_from_pretrained(self) -> None:
)
else:
# this is the fastest way to load a model on a specific device
+ # but not compatible with all quantization methods (and pipelines)
LOGGER.info(f"\t+ Loading model directly on device: {self.config.device}")
with torch.device(self.config.device):
self.pretrained_model = self.automodel_class.from_pretrained(
pretrained_model_name_or_path=self.config.model,
- **self.automodel_kwargs,
**self.config.hub_kwargs,
+ **self.automodel_kwargs,
)
def create_no_weights_model(self) -> None:
- self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
-
- LOGGER.info("\t+ Creating no weights model directory")
- os.makedirs(self.no_weights_model, exist_ok=True)
-
- if self.is_quantized:
- # tricking from_pretrained to load the model as if it was quantized
- self.pretrained_config.quantization_config = self.quantization_config.to_dict()
-
- LOGGER.info("\t+ Saving pretrained config")
- self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
-
LOGGER.info("\t+ Creating no weights model state_dict")
state_dict = torch.nn.Linear(1, 1).state_dict()
if self.is_exllamav2:
- # for exllamav2 we need to add g_idx to the state_dict
+ # for exllamav2 we need to add g_idx to the state_dict which
+ # requires some information about linear layers dimensions
with torch.device("meta"):
meta_model = self.automodel_class.from_config(self.pretrained_config)
-
for name, module in meta_model.named_modules():
if hasattr(module, "in_features"):
state_dict[name + ".g_idx"] = torch.ones((module.in_features,), dtype=torch.int32)
+ if self.is_quantized:
+ # tricking from_pretrained to load the model as if it was quantized
+ self.pretrained_config.quantization_config = self.quantization_config.to_dict()
+
+ LOGGER.info("\t+ Creating no weights model directory")
+ self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights")
+ os.makedirs(self.no_weights_model, exist_ok=True)
+
+ LOGGER.info("\t+ Saving no weights model pretrained config")
+ self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
+
LOGGER.info("\t+ Saving no weights model state_dict")
save_file(
filename=os.path.join(self.no_weights_model, "model.safetensors"),
@@ -292,10 +289,9 @@ def is_awq_quantized(self) -> bool:
@property
def is_exllamav2(self) -> bool:
return (
- self.is_quantized
- and self.is_gptq_quantized
- and "exllama_config" in self.config.quantization_config
- and self.config.quantization_config["exllama_config"]["version"] == 2
+ self.is_gptq_quantized
+ and "exllama_config" in self.quantization_config
+ and self.quantization_config["exllama_config"].get("version", None) == 2
)
@property
@@ -305,12 +301,14 @@ def automodel_kwargs(self) -> Dict[str, Any]:
if self.config.torch_dtype is not None:
kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype)
- if self.config.use_flash_attention_2:
- kwargs["use_flash_attention_2"] = True
+ if self.config.attn_implementation is not None:
+ kwargs["attn_implementation"] = self.config.attn_implementation
- if self.is_gptq_quantized or self.is_bnb_quantized:
- # awq quantization doesn't support overriding the quantization
- # config by passing quantization_config to from_pretrained
+ if self.config.low_cpu_mem_usage is not None:
+ kwargs["low_cpu_mem_usage"] = self.config.low_cpu_mem_usage
+
+ if self.is_quantized:
+ kwargs["_fast_init"] = False
kwargs["quantization_config"] = self.quantization_config
return kwargs
@@ -329,24 +327,19 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
@torch.inference_mode()
- def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
- if self.config.library == "diffusers":
- return self.pretrained_model(**inputs, **kwargs)
-
- if self.config.amp_autocast:
- with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype):
- return self.pretrained_model(**inputs, **kwargs)
- else:
- return self.pretrained_model(**inputs, **kwargs)
+ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype, enabled=self.config.amp_autocast):
+ return self.pretrained_model.forward(**inputs, **kwargs)
@torch.inference_mode()
- def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput:
- if self.config.amp_autocast:
- with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype):
- return self.pretrained_model.generate(**inputs, **kwargs)
- else:
+ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype, enabled=self.config.amp_autocast):
return self.pretrained_model.generate(**inputs, **kwargs)
+ @torch.inference_mode()
+ def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
+ return self.pretrained_model(**inputs, **kwargs)
+
def train(
self,
training_dataset: Dataset,
@@ -354,16 +347,12 @@ def train(
training_callbacks: List[TrainerCallback],
training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]],
) -> TrainerState:
- from transformers import Trainer, TrainingArguments
-
- LOGGER.info("\t+ Setting dataset format to `torch`")
- training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys()))
LOGGER.info("\t+ Wrapping training arguments with transformers.TrainingArguments")
training_arguments = TrainingArguments(**training_arguments)
LOGGER.info("\t+ Wrapping model with transformers.Trainer")
trainer = Trainer(
- model=self.pretrained_model,
args=training_arguments,
+ model=self.pretrained_model,
callbacks=training_callbacks,
train_dataset=training_dataset,
data_collator=training_data_collator,
@@ -372,8 +361,6 @@ def train(
trainer.train()
LOGGER.info("\t+ Training finished successfully")
- return trainer.state
-
def seed(self):
super().seed()
torch.manual_seed(self.config.seed)
@@ -385,7 +372,7 @@ def clean(self) -> None:
super().clean()
if hasattr(self, "tmpdir"):
- LOGGER.info("\t+ Cleaning temporary directory")
+ LOGGER.info("\t+ Cleaning backend temporary directory")
self.tmpdir.cleanup()
gc.collect()
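
In the PyTorch backend, the separate autocast/no-autocast branches collapse into a single `torch.autocast(..., enabled=...)` context, which is a no-op when `enabled=False`. A small sketch with placeholder values standing in for `config.amp_autocast` and `config.amp_dtype`:

```python
import torch

model = torch.nn.Linear(8, 8).eval()
inputs = {"input": torch.randn(2, 8)}

amp_autocast = False        # stands in for config.amp_autocast; set True to enable mixed precision
amp_dtype = torch.bfloat16  # stands in for the resolved config.amp_dtype

# enabled=False turns autocast into a no-op, so one code path covers both configurations
with torch.inference_mode():
    with torch.autocast(device_type="cpu", dtype=amp_dtype, enabled=amp_autocast):
        output = model(**inputs)
```
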
diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py
index 1cbb04ba..d8089f60 100644
--- a/optimum_benchmark/backends/pytorch/config.py
+++ b/optimum_benchmark/backends/pytorch/config.py
@@ -42,9 +42,10 @@ class PyTorchConfig(BackendConfig):
# optimization options
eval_mode: bool = True
- low_cpu_mem_usage: bool = False
to_bettertransformer: bool = False
- use_flash_attention_2: bool = False
+ low_cpu_mem_usage: Optional[bool] = None
+ attn_implementation: Optional[str] = None
+ cache_implementation: Optional[str] = None
# compilation options
torch_compile: bool = False
@@ -55,7 +56,6 @@ class PyTorchConfig(BackendConfig):
quantization_config: Dict[str, Any] = field(default_factory=dict)
# distributed inference options
- data_parallel: bool = False
deepspeed_inference: bool = False
deepspeed_inference_config: Dict[str, Any] = field(default_factory=dict)
diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py
index 43a5fd75..7c86adeb 100644
--- a/optimum_benchmark/backends/tensorrt_llm/backend.py
+++ b/optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -1,13 +1,13 @@
from logging import getLogger
from typing import Any, Dict
-from hydra.utils import get_class
-from transformers.utils import ModelOutput
-
from ..base import Backend
from .config import TRTLLMConfig
from .utils import MODEL_TYPE_TO_TRTLLMMODEL
+from hydra.utils import get_class
+from transformers.utils import ModelOutput
+
LOGGER = getLogger("tensorrt-llm")
@@ -18,15 +18,15 @@ def __init__(self, config: TRTLLMConfig):
super().__init__(config)
self.validate_model_type()
- self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type])
- LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}")
-
self.load_trtmodel_from_pretrained()
def validate_model_type(self) -> None:
if self.model_type not in MODEL_TYPE_TO_TRTLLMMODEL:
raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.model_type}")
+ self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type])
+ LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}")
+
def load_trtmodel_from_pretrained(self) -> None:
self.pretrained_model = self.trtmodel_class.from_pretrained(
self.config.model,
diff --git a/optimum_benchmark/backends/text_generation_inference/backend.py b/optimum_benchmark/backends/text_generation_inference/backend.py
index fbd3d1de..538de53c 100644
--- a/optimum_benchmark/backends/text_generation_inference/backend.py
+++ b/optimum_benchmark/backends/text_generation_inference/backend.py
@@ -6,6 +6,11 @@
from tempfile import TemporaryDirectory
from concurrent.futures import ThreadPoolExecutor
+from ..base import Backend
+from .config import TGIConfig
+from ...task_utils import TEXT_GENERATION_TASKS
+from ..transformers_utils import randomize_weights
+
import torch
import docker
import docker.types
@@ -14,10 +19,6 @@
from huggingface_hub import InferenceClient, snapshot_download
from huggingface_hub.inference._text_generation import TextGenerationResponse
-from ..base import Backend
-from .config import TGIConfig
-from ..transformers_utils import randomize_weights
-
# backend logger
LOGGER = getLogger("text-generation-inference")
@@ -29,8 +30,7 @@ def __init__(self, config: TGIConfig) -> None:
super().__init__(config)
self.validate_task()
- LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}")
-
+ LOGGER.info("\t+ Creating backend temporary directory")
self.tmp_dir = TemporaryDirectory()
if self.config.no_weights:
@@ -40,9 +40,11 @@ def __init__(self, config: TGIConfig) -> None:
self.load_model_from_pretrained()
def validate_task(self) -> None:
- if self.config.task not in ["text-generation", "text2text-generation"]:
+ if self.config.task not in TEXT_GENERATION_TASKS:
raise NotImplementedError(f"TGI does not support task {self.config.task}")
+ LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}")
+
def download_pretrained_model(self) -> None:
LOGGER.info("\t+ Downloading pretrained model")
snapshot_download(self.config.model, **self.config.hub_kwargs)
@@ -93,7 +95,7 @@ def create_no_weights_model(self) -> None:
self.pretrained_model = self.automodel_class.from_pretrained(
self.no_weights_model,
**self.config.hub_kwargs,
- device_map="auto",
+ device_map="auto", # for faster/safer loading
)
LOGGER.info("\t+ Randomizing weights")
diff --git a/optimum_benchmark/backends/text_generation_inference/config.py b/optimum_benchmark/backends/text_generation_inference/config.py
index edf37ba3..8b73617e 100644
--- a/optimum_benchmark/backends/text_generation_inference/config.py
+++ b/optimum_benchmark/backends/text_generation_inference/config.py
@@ -11,6 +11,9 @@ class TGIConfig(BackendConfig):
version: Optional[str] = "0.0.1"
_target_: str = "optimum_benchmark.backends.text_generation_inference.backend.TGIBackend"
+ # optimum benchmark specific
+ no_weights: bool = False
+
# docker options
image: str = "ghcr.io/huggingface/text-generation-inference:latest"
volume: str = f"{os.path.expanduser('~')}/.cache/huggingface/hub"
@@ -28,9 +31,6 @@ class TGIConfig(BackendConfig):
sharded: Optional[bool] = None # None, True, False
num_shard: Optional[int] = None # None, 1, 2, 4, 8, 16, 32, 64
- # optimum benchmark specific
- no_weights: bool = False # True, False
-
def __post_init__(self):
super().__post_init__()
diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py
index 3af970a3..9e2924b2 100644
--- a/optimum_benchmark/backends/timm_utils.py
+++ b/optimum_benchmark/backends/timm_utils.py
@@ -1,22 +1,18 @@
-from typing import Any, Dict
+from typing import Any, Dict, Optional
-from transformers import PretrainedConfig
+from ..import_utils import is_timm_available, is_transformers_available, is_torch_available
-from ..import_utils import is_timm_available
+if is_torch_available():
+ import torch
if is_timm_available():
import timm
-
-def get_timm_pretrained_processor(model: str) -> Any:
- try:
- pretrained_config = get_timm_pretrained_config(model)
- return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config))
- except Exception:
- return None
+if is_transformers_available():
+ from transformers import PretrainedConfig
-def get_timm_pretrained_config(model_name: str) -> PretrainedConfig:
+def get_timm_pretrained_config(model_name: str) -> "PretrainedConfig":
model_source, model_name = timm.models.parse_model_name(model_name)
if model_source == "hf-hub":
# For model names specified in the form `hf-hub:path/architecture_name@revision`,
@@ -27,13 +23,22 @@ def get_timm_pretrained_config(model_name: str) -> PretrainedConfig:
return timm.get_pretrained_cfg(model_name)
-def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
- shapes = {}
+def get_timm_pre_processor(model: str) -> Optional["torch.nn.Module"]:
+ try:
+ pretrained_config = get_timm_pretrained_config(model)
+ return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config))
+ except Exception:
+ return None
+
+
+def extract_timm_shapes_from_config(config: "PretrainedConfig") -> Dict[str, Any]:
artifacts_dict = {}
config_dict = {k: v for k, v in config.to_dict().items() if v is not None}
artifacts_dict.update(config_dict)
+ shapes = {}
+
# image input
shapes["num_channels"] = artifacts_dict.get("num_channels", None)
if shapes["num_channels"] is None:
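
`timm_utils.py` now guards its heavy imports behind availability checks and quotes type names in annotations, so the module can be imported even when `torch` or `timm` is missing. A sketch of that pattern, assuming a simple `find_spec`-based check rather than the library's own `import_utils` helpers:

```python
import importlib.util
from typing import Optional


def is_timm_available() -> bool:
    # minimal availability check; the library uses its own import_utils helpers
    return importlib.util.find_spec("timm") is not None


if is_timm_available():
    import timm


def get_timm_pre_processor(model_name: str) -> Optional["torch.nn.Module"]:
    # the quoted annotation avoids importing torch at module load time
    try:
        pretrained_cfg = timm.get_pretrained_cfg(model_name)
        return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_cfg))
    except Exception:
        return None
```
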
diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py
index aefce8ea..a7515d2f 100644
--- a/optimum_benchmark/backends/torch_ort/backend.py
+++ b/optimum_benchmark/backends/torch_ort/backend.py
@@ -4,6 +4,11 @@
from tempfile import TemporaryDirectory
from typing import Any, Callable, Dict, List
+from ..transformers_utils import randomize_weights
+from ..peft_utils import get_peft_config_class
+from .config import TorchORTConfig
+from ..base import Backend
+
import torch
from datasets import Dataset
from safetensors.torch import save_file
@@ -12,11 +17,6 @@
from transformers.utils.logging import set_verbosity_error
from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments
-from ..transformers_utils import randomize_weights
-from ..peft_utils import get_peft_config_class
-from .config import TorchORTConfig
-from ..base import Backend
-
# disable transformers logging
set_verbosity_error()
@@ -28,9 +28,9 @@ class TorchORTBackend(Backend[TorchORTConfig]):
def __init__(self, config: TorchORTConfig):
super().__init__(config)
+ self.validate_library()
- LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}")
-
+ LOGGER.info("\t+ Creating backend temporary directory")
self.tmpdir = TemporaryDirectory()
if self.config.no_weights:
@@ -46,7 +46,11 @@ def __init__(self, config: TorchORTConfig):
peft_config = peft_config_class(**self.config.peft_config)
self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config)
- self.tmpdir.cleanup()
+ def validate_library(self) -> None:
+ if self.config.library == "transformers":
+ LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}")
+ else:
+ raise NotImplementedError(f"TorchORTBackend does not support {self.config.library} library")
def create_no_weights_model(self) -> None:
LOGGER.info("\t+ Creating no weights model directory")
@@ -76,9 +80,9 @@ def load_automodel_with_no_weights(self) -> None:
self.load_automodel_from_pretrained()
self.config.model = original_model
- LOGGER.info("\t+ Randomizing weights")
+ LOGGER.info("\t+ Randomizing model weights")
randomize_weights(self.pretrained_model)
- LOGGER.info("\t+ Tying model weights after randomization")
+ LOGGER.info("\t+ Tying model weights")
self.pretrained_model.tie_weights()
def load_automodel_from_pretrained(self) -> None:
@@ -126,7 +130,7 @@ def clean(self) -> None:
super().clean()
if hasattr(self, "tmpdir"):
- LOGGER.info("\t+ Cleaning temporary directory")
+ LOGGER.info("\t+ Cleaning backend temporary directory")
self.tmpdir.cleanup()
gc.collect()

diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py
index 488adca5..1d7ad410 100644
--- a/optimum_benchmark/backends/transformers_utils.py
+++ b/optimum_benchmark/backends/transformers_utils.py
@@ -1,54 +1,49 @@
import os
-import threading
-from itertools import chain
-from typing import Any, Dict, List, Optional, Sequence, Union, cast
-
-import torch
-from torch.nn.modules import Module
-from torch.cuda.amp import autocast
-from torch._utils import ExceptionWrapper
-from torch.cuda._utils import _get_device_index
-from torch.nn.parallel.parallel_apply import get_a_var
-from transformers import (
- FeatureExtractionMixin,
- ImageProcessingMixin,
- PreTrainedTokenizer,
- GenerationConfig,
- PretrainedConfig,
- ProcessorMixin,
- AutoProcessor,
- AutoConfig,
-)
+from typing import Any, Dict, Optional, Union
+
+from ..import_utils import is_transformers_available, is_torch_available
+
+if is_torch_available():
+ import torch
+
+if is_transformers_available():
+ from transformers import (
+ FeatureExtractionMixin,
+ ImageProcessingMixin,
+ PreTrainedTokenizer,
+ GenerationConfig,
+ PretrainedConfig,
+ ProcessorMixin,
+ AutoProcessor,
+ AutoConfig,
+ )
-PretrainedProcessor = Union[
- FeatureExtractionMixin,
- ImageProcessingMixin,
- PreTrainedTokenizer,
- ProcessorMixin,
-]
+ PretrainedProcessor = Union[
+ FeatureExtractionMixin,
+ ImageProcessingMixin,
+ PreTrainedTokenizer,
+ ProcessorMixin,
+ ]
-def get_transformers_cache_dir():
+def get_transformers_cache_dir() -> str:
return os.path.expanduser("~/.cache/huggingface/hub")
-def get_transformers_generation_config(model: str, **kwargs: Dict[str, Any]):
- try:
- # sometimes contains information about the model's input shapes that are not available in the config
- return GenerationConfig.from_pretrained(model, **kwargs)
- except Exception:
- return None
+def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig":
+ # sometimes contains information about the model's input shapes that are not available in the config
+ return AutoConfig.from_pretrained(model, **kwargs)
-def get_transformers_pretrained_config(model: str, **kwargs: Dict[str, Any]):
+def get_transformers_generation_config(model: str, **kwargs) -> Optional["GenerationConfig"]:
try:
# sometimes contains information about the model's input shapes that are not available in the config
- return AutoConfig.from_pretrained(model, **kwargs)
- except ValueError:
+ return GenerationConfig.from_pretrained(model, **kwargs)
+ except Exception:
return None
-def get_transformers_pretrained_processor(model: str, **kwargs: Dict[str, Any]):
+def get_transformers_pre_processor(model: str, **kwargs) -> Optional["PretrainedProcessor"]:
try:
# sometimes contains information about the model's input shapes that are not available in the config
return AutoProcessor.from_pretrained(model, **kwargs)
@@ -57,9 +52,9 @@ def get_transformers_pretrained_processor(model: str, **kwargs: Dict[str, Any]):
def extract_transformers_shapes_from_artifacts(
- config: PretrainedConfig, processor: Optional[PretrainedProcessor] = None
+ config: "PretrainedConfig",
+ processor: Optional["PretrainedProcessor"] = None,
) -> Dict[str, Any]:
- shapes = {}
artifacts_dict = {}
config_dict = {k: v for k, v in config.to_dict().items() if v is not None}
@@ -68,6 +63,10 @@ def extract_transformers_shapes_from_artifacts(
if processor is not None and hasattr(processor, "to_dict"):
processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None}
artifacts_dict.update(processor_dict)
+ elif processor is not None:
+ processor_dict = {k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)}
+ artifacts_dict.update(processor_dict)
+
+ shapes = {}
# text input
shapes["vocab_size"] = artifacts_dict.get("vocab_size", None)
@@ -126,142 +125,20 @@ def extract_transformers_shapes_from_artifacts(
return shapes
-def randomize_weights(model):
+def randomize_weights(model: "torch.nn.Module") -> None:
for param in model.parameters():
- if param.data.dtype in (torch.float32, torch.float16, torch.bfloat16):
- if torch.cuda.is_available() and param.device.type == "cpu":
+ if param.data.is_floating_point():
+ if torch.cuda.is_available() and param.device.type != "cuda":
param.data.cuda().normal_(mean=0.0, std=0.2).cpu()
- elif torch.backends.mps.is_available() and param.device.type == "cpu":
- param.data.mps_normal_(mean=0.0, std=0.2)
+ elif torch.backends.mps.is_available() and param.device.type != "mps":
+ param.data.to("mps").normal_(mean=0.0, std=0.2).cpu()
else:
param.data.normal_(mean=0.0, std=0.2)
- elif param.data.dtype in (torch.int8, torch.int16, torch.int32, torch.int64):
- if torch.cuda.is_available() and param.device.type == "cpu":
- param.data.cuda().randint_(low=-127, high=127).cpu()
- elif torch.backends.mps.is_available() and param.device.type == "cpu":
- param.data.mps_randint_(low=-127, high=127)
- else:
- param.data.randint_(low=-127, high=127)
-
-
-# adapted from torch to use generate instead of forward
-def parallel_generate_apply(
- modules: Sequence[Module],
- inputs: Sequence[Any],
- kwargs_tup: Optional[Sequence[Dict[str, Any]]] = None,
- devices: Optional[Sequence[Optional[Union[int, torch.device]]]] = None,
-) -> List[Any]:
- assert len(modules) == len(
- inputs
- ), f"The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}"
- if kwargs_tup is not None:
- assert len(modules) == len(kwargs_tup)
- else:
- kwargs_tup = (cast(Dict[str, Any], {}),) * len(modules)
- if devices is not None:
- assert len(modules) == len(devices)
- else:
- devices = [None] * len(modules)
- devices = [_get_device_index(x, True) for x in devices]
- streams = [torch.cuda.current_stream(x) for x in devices]
- lock = threading.Lock()
- results = {}
- grad_enabled, autocast_enabled = (
- torch.is_grad_enabled(),
- torch.is_autocast_enabled(),
- )
-
- def _worker(
- i: int,
- module: Module,
- input: Any,
- kwargs: Dict[str, Any],
- device: Optional[Union[int, torch.device]] = None,
- stream: Optional[torch.cuda.Stream] = None,
- ) -> None:
- torch.set_grad_enabled(grad_enabled)
- if device is None:
- t = get_a_var(input)
- if t is None:
- with lock:
- results[i] = ExceptionWrapper(
- where=f"in replica {i}, no device was provided and no tensor input was found; "
- "device cannot be resolved"
- )
- return
- device = t.get_device()
- if stream is None:
- stream = torch.cuda.current_stream(device)
- try:
- with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
- # this also avoids accidental slicing of `input` if it is a Tensor
- if not isinstance(input, (list, tuple)):
- input = (input,)
- output = module.generate(*input, **kwargs)
- with lock:
- results[i] = output
- except Exception:
- with lock:
- results[i] = ExceptionWrapper(where=f"in replica {i} on device {device}")
-
- if len(modules) > 1:
- threads = [
- threading.Thread(target=_worker, args=(i, module, input, kwargs, device, stream))
- for i, (module, input, kwargs, device, stream) in enumerate(
- zip(modules, inputs, kwargs_tup, devices, streams)
- )
- ]
-
- for thread in threads:
- thread.start()
- for thread in threads:
- thread.join()
- else:
- _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0])
-
- outputs = []
- for i in range(len(inputs)):
- output = results[i]
- if isinstance(output, ExceptionWrapper):
- output.reraise()
- outputs.append(output)
- return outputs
-
-# adapted from torch to support generate
-class TransformersDataParallel(torch.nn.DataParallel):
- def generate(self, *inputs: Any, **kwargs: Any) -> Any:
- with torch.autograd.profiler.record_function("DataParallel.generate"):
- if not self.device_ids:
- return self.module.generate(*inputs, **kwargs)
-
- for t in chain(self.module.parameters(), self.module.buffers()):
- if t.device != self.src_device_obj:
- raise RuntimeError(
- "module must have its parameters and buffers "
- f"on device {self.src_device_obj} (device_ids[0]) but found one of "
- f"them on device: {t.device}"
- )
-
- inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
- # for forward function without any inputs, empty list and dict will be created
- # so the module can be executed on one device which is the first one in device_ids
- if not inputs and not module_kwargs:
- inputs = ((),)
- module_kwargs = ({},)
-
- if len(self.device_ids) == 1:
- return self.module.generate(*inputs[0], **module_kwargs[0])
-
- replicas = self.replicate(self.module, self.device_ids[: len(inputs)])
- outputs = self.parallel_generate_apply(replicas, inputs, module_kwargs)
- return self.gather(outputs, self.output_device)
-
- def parallel_generate_apply(self, replicas: Sequence, inputs: Sequence, kwargs: Any) -> List[Any]:
- return parallel_generate_apply(replicas, inputs, kwargs, self.device_ids[: len(replicas)])
-
- def __getattr__(self, name: str) -> Any:
- try:
- return super().__getattr__(name)
- except AttributeError:
- return getattr(self.module, name)
+ elif param.data.dtype in (torch.int32, torch.int16, torch.int8):
+ if torch.cuda.is_available() and param.device.type != "cuda":
+ param.data.copy_(torch.randint(-127, 127, param.data.shape, device="cuda"))
+ elif torch.backends.mps.is_available() and param.device.type != "mps":
+ param.data.copy_(torch.randint(-127, 127, param.data.shape, device="mps"))
+ else:
+ param.data.copy_(torch.randint(-127, 127, param.data.shape))
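
The rewritten `randomize_weights` dispatches on `is_floating_point()` and draws random values on an accelerator when one is available. Since `Tensor.cuda()` and `Tensor.to("mps")` return copies, chains like `param.data.cuda().normal_(...).cpu()` operate on a temporary tensor; a variant that must mutate the parameters in place needs an explicit `copy_` back, as in this sketch (illustrative, not the library's code):

```python
import torch


def randomize_weights_inplace(model: torch.nn.Module) -> None:
    """Sketch of weight randomization that explicitly writes results back into each parameter."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    for param in model.parameters():
        if param.data.is_floating_point():
            # draw on the chosen device, then copy back into the original storage
            param.data.copy_(torch.empty_like(param.data, device=device).normal_(mean=0.0, std=0.2))
        else:
            param.data.copy_(torch.randint(-127, 127, param.data.shape, device=device))


randomize_weights_inplace(torch.nn.Linear(4, 4))
```
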
diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py
index dbc68c3c..84495a1a 100644
--- a/optimum_benchmark/benchmarks/base.py
+++ b/optimum_benchmark/benchmarks/base.py
@@ -1,19 +1,17 @@
from abc import ABC
from logging import getLogger
-from typing import ClassVar, Generic, Dict, Any
+from typing import ClassVar, Generic
from ..backends.base import Backend
+from .report import BenchmarkReport
from .config import BenchmarkConfigT
-
LOGGER = getLogger("benchmark")
class Benchmark(Generic[BenchmarkConfigT], ABC):
NAME: ClassVar[str]
- config: BenchmarkConfigT
-
def __init__(self, config: BenchmarkConfigT) -> None:
LOGGER.info(f"Allocating {self.NAME} benchmark")
self.config = config
@@ -21,5 +19,5 @@ def __init__(self, config: BenchmarkConfigT) -> None:
def run(self, backend: Backend) -> None:
raise NotImplementedError("Benchmark must implement run method")
- def report(self) -> Dict[str, Any]:
- raise NotImplementedError("Benchmark must implement save method")
+ def get_report(self) -> BenchmarkReport:
+ raise NotImplementedError("Benchmark must implement report method")
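
`Benchmark.report()` returning a plain dict becomes `get_report()` returning a structured `BenchmarkReport`. A minimal sketch of a subclass implementing the new contract, using placeholder classes in place of the library's own report types:

```python
from abc import ABC
from typing import Any, ClassVar, Dict


class ExampleReport:
    """Placeholder standing in for the library's BenchmarkReport dataclasses."""

    def __init__(self, metrics: Dict[str, Any]) -> None:
        self.metrics = metrics


class ExampleBenchmark(ABC):
    NAME: ClassVar[str] = "example"

    def __init__(self, config: Dict[str, Any]) -> None:
        self.config = config
        self.latency_s: float = 0.0

    def run(self, backend: Any) -> None:
        # a real benchmark exercises the backend here and records measurements
        self.latency_s = 0.123

    def get_report(self) -> ExampleReport:
        # reports are structured objects now, not plain dicts
        return ExampleReport(metrics={"latency_s": self.latency_s})
```
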
diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py
index 23e479b4..9cc96ee1 100644
--- a/optimum_benchmark/benchmarks/inference/benchmark.py
+++ b/optimum_benchmark/benchmarks/inference/benchmark.py
@@ -1,24 +1,28 @@
-import os
-import statistics
from logging import getLogger
-from typing import List, Dict, Any
+from typing import List, Tuple, Dict
from ..base import Benchmark
from .config import InferenceConfig
-from ...backends.base import Backend
from ...trackers.energy import EnergyTracker
from ...trackers.memory import MemoryTracker
from ...trackers.latency import LatencyTracker
+from ...backends.base import Backend, BackendConfigT
from ...generators.input_generator import InputGenerator
-from ...task_utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS
+from ...import_utils import is_torch_distributed_available
+from ...task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS
+from .report import InferenceReport, TextGenerationReport, ImageDiffusionReport
+
+if is_torch_distributed_available():
+ import torch.distributed
LOGGER = getLogger("inference")
-DIFFUSION_KWARGS = {
+IMAGE_DIFFUSION_KWARGS = {
+ "num_inference_steps": 30,
"num_images_per_prompt": 1,
}
-GENERATE_KWARGS = {
+TEXT_GENERATION_KWARGS = {
"num_return_sequences": 1,
"max_new_tokens": 100,
"min_new_tokens": 100,
@@ -36,45 +40,13 @@ class InferenceBenchmark(Benchmark[InferenceConfig]):
def __init__(self, config: InferenceConfig) -> None:
super().__init__(config)
- self.forward_energy: float = 0
- self.forward_emissions: float = 0
- self.forward_max_memory_used: int = 0
- self.forward_max_memory_allocated: int = 0
- self.forward_max_memory_reserved: int = 0
- self.forward_latencies: List[float] = []
-
- self.generate_energy: float = 0
- self.generate_emissions: float = 0
- self.generate_max_memory_used: int = 0
- self.generate_max_memory_allocated: int = 0
- self.generate_max_memory_reserved: int = 0
- self.generate_latencies: List[float] = []
-
- def run(self, backend: Backend) -> None:
- self.can_diffuse = backend.config.task in DIFFUSION_TASKS
- self.can_generate = backend.config.task in TEXT_GENERATION_TASKS
-
- if self.can_diffuse:
- LOGGER.info("\t+ Updating forward kwargs with default values")
- self.config.forward_kwargs = {
- **DIFFUSION_KWARGS,
- **self.config.forward_kwargs,
- }
- if self.can_generate:
- LOGGER.info("\t+ Updating generate kwargs with default values")
- self.config.generate_kwargs = {
- **GENERATE_KWARGS,
- **self.config.generate_kwargs,
- }
-
- # compile with static shapes if needed
- LOGGER.info("\t+ Preparing backend for inference")
- backend.prepare_for_inference(
- **backend.model_shapes,
- **self.config.input_shapes,
- **self.config.forward_kwargs,
- **self.config.generate_kwargs,
- )
+ def run(self, backend: Backend[BackendConfigT]) -> None:
+ if is_torch_distributed_available() and torch.distributed.is_initialized():
+ if self.config.input_shapes["batch_size"] % torch.distributed.get_world_size() != 0:
+ raise ValueError(
+ "The batch size must be divisible by the number of processes in a distributed environment"
+ )
+ self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size()
LOGGER.info("\t+ Creating input generator")
self.input_generator = InputGenerator(
@@ -83,226 +55,223 @@ def run(self, backend: Backend) -> None:
input_shapes=self.config.input_shapes,
)
- # run memory tracking
- # we do this first to measure the memory on the first call to forward/generate
- if self.config.memory:
- self.run_forward_memory_tracking(backend)
- if self.can_generate:
- self.run_generate_memory_tracking(backend)
+ if backend.config.task in TEXT_GENERATION_TASKS:
+ LOGGER.info("\t+ Generating and preparing Text Generation input")
+ self.forward_inputs = self.input_generator(mode="forward")
+ self.generate_input = self.input_generator(mode="generate")
+ self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
+ self.generate_input = backend.prepare_inputs(self.generate_input)
+ LOGGER.info("\t+ Updating Text Generation kwargs with default values")
+ self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs}
+ LOGGER.info("\t+ Initializing Text Generation report")
+ self.report = TextGenerationReport(
+ batch_size=self.config.input_shapes["batch_size"],
+ sequence_length=self.config.input_shapes["sequence_length"],
+ num_new_tokens=self.config.generate_kwargs["max_new_tokens"],
+ num_return_sequences=self.config.generate_kwargs["num_return_sequences"],
+ )
+
+ elif backend.config.task in IMAGE_DIFFUSION_TASKS:
+ LOGGER.info("\t+ Generating and preparing Image Diffusion input")
+ self.diffuse_input = self.input_generator(mode="call")
+ self.diffuse_input = backend.prepare_inputs(self.diffuse_input)
+ LOGGER.info("\t+ Updating Image Diffusion kwargs with default values")
+ self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs}
+ LOGGER.info("\t+ Initializing Image Diffusion report")
+ self.report = ImageDiffusionReport(
+ batch_size=self.config.input_shapes["batch_size"],
+ num_images_per_prompts=self.config.forward_kwargs["num_images_per_prompt"],
+ )
+
+ else:
+ LOGGER.info("\t+ Generating and preparing Inference input")
+ self.forward_inputs = self.input_generator(mode="forward")
+ self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
+ LOGGER.info("\t+ Initializing Inference report")
+ self.report = InferenceReport(
+ batch_size=self.config.input_shapes["batch_size"],
+ )
+
+ LOGGER.info("\t+ Preparing backend for Inference")
+ backend.prepare_for_inference(
+ **backend.model_shapes,
+ **self.config.input_shapes,
+ **self.config.forward_kwargs,
+ **self.config.generate_kwargs,
+ )
- # run latency tracking
- self.run_forward_latency_tracking(backend)
- if self.can_generate:
- self.run_generate_latency_tracking(backend)
+ LOGGER.info("\t+ Warming up backend for Inference")
+ for _ in range(self.config.warmup_runs):
+ if backend.config.task in TEXT_GENERATION_TASKS:
+ generate_warmup_kwargs = {"max_new_tokens": 2, "min_new_tokens": 2}
+ _ = backend.generate(self.generate_input, generate_warmup_kwargs)
+ elif backend.config.task in IMAGE_DIFFUSION_TASKS:
+ diffuse_warmup_kwargs = {"num_inference_steps": 2}
+ _ = backend.call(self.diffuse_input, diffuse_warmup_kwargs)
+ else:
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
+
+ if self.config.memory:
+ LOGGER.info("\t+ Creating inference memory tracker")
+ self.memory_tracker = MemoryTracker(
+ backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids
+ )
+ if backend.config.task in TEXT_GENERATION_TASKS:
+ forward_memories_dict, generate_memories_dict = self.run_text_generation_memory_tracking(backend)
+ self.report.populate_memory(forward_memories_dict, generate_memories_dict)
+ elif backend.config.task in IMAGE_DIFFUSION_TASKS:
+ call_memories_dict = self.run_image_diffusion_memory_tracking(backend)
+ self.report.populate_memory(call_memories_dict)
+ else:
+ forward_memories_dict = self.run_inference_memory_tracking(backend)
+ self.report.populate_memory(forward_memories_dict)
+
+ self.report.log_memory()
+
+ if self.config.latency:
+ LOGGER.info("\t+ Creating inference latency tracker")
+ self.latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device)
+ if backend.config.task in TEXT_GENERATION_TASKS:
+ forward_latencies_dict, generate_latencies_dict = self.run_text_generation_latency_tracking(backend)
+ self.report.populate_latency(forward_latencies_dict, generate_latencies_dict)
+ elif backend.config.task in IMAGE_DIFFUSION_TASKS:
+ call_latencies_dict = self.run_image_diffusion_latency_tracking(backend)
+ self.report.populate_latency(call_latencies_dict)
+ else:
+ forward_latencies_dict = self.run_latency_inference_tracking(backend)
+ self.report.populate_latency(forward_latencies_dict)
+
+ self.report.log_latency()
- # run energy tracking
if self.config.energy:
- self.run_forward_energy_tracking(backend)
- if self.can_generate:
- self.run_generate_energy_tracking(backend)
+ LOGGER.info("\t+ Creating inference energy tracker")
+ self.energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids)
+ if backend.config.task in TEXT_GENERATION_TASKS:
+ forward_energies_dict, generate_energies_dict = self.run_text_generation_energy_tracking(backend)
+ self.report.populate_energy(forward_energies_dict, generate_energies_dict)
+ elif backend.config.task in IMAGE_DIFFUSION_TASKS:
+ call_energies_dict = self.run_image_diffusion_energy_tracking(backend)
+ self.report.populate_energy(call_energies_dict)
+ else:
+ forward_energies_dict = self.run_inference_energy_tracking(backend)
+ self.report.populate_energy(forward_energies_dict)
- def run_forward_latency_tracking(self, backend: "Backend") -> None:
- forward_input = self.input_generator.generate(mode="forward")
+ self.report.log_energy()
- LOGGER.info("\t+ Preparing input for the forward pass")
- forward_input = backend.prepare_inputs(forward_input)
+ ## Memory tracking
+ def run_text_generation_memory_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]:
+ LOGGER.info("\t+ Running memory tracking")
+ self.memory_tracker.reset()
+ with self.memory_tracker.track():
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
- LOGGER.info("\t+ Warming up the forward pass")
- for _ in range(self.config.warmup_runs):
- _ = backend.forward(forward_input, self.config.forward_kwargs)
-
- LOGGER.info("\t+ Tracking forward pass latency and throughput")
- latency_tracker = LatencyTracker(device=backend.config.device, backend=backend.config.name)
- while sum(self.forward_latencies) < self.config.duration:
- with latency_tracker.track():
- _ = backend.forward(forward_input, self.config.forward_kwargs)
- self.forward_latencies = latency_tracker.get_latencies()
-
- LOGGER.debug(f"\t+ Forward pass latency: {self.forward_latency:.3g} (s)")
- LOGGER.debug(f"\t+ Forward pass throughput: {self.forward_throughput:.3g} (samples/s)")
-
- def run_forward_energy_tracking(self, backend: Backend) -> None:
- forward_input = self.input_generator.generate(mode="forward")
-
- LOGGER.info("\t+ Preparing input for the forward pass")
- forward_input = backend.prepare_inputs(forward_input)
-
- LOGGER.info("\t+ Tracking forward pass energy consumption")
- num_forward_passes = 0
- energy_tracker = EnergyTracker()
- with energy_tracker.track(interval=1, file_prefix="forward"):
- while energy_tracker.get_elapsed_time() < self.config.duration:
- _ = backend.forward(forward_input, self.config.forward_kwargs)
- num_forward_passes += 1
- num_forward_samples = num_forward_passes * self.config.input_shapes["batch_size"]
- self.forward_energy = energy_tracker.get_total_energy() / num_forward_samples
- self.forward_emissions = energy_tracker.get_total_emissions() / num_forward_samples
-
- LOGGER.debug(f"\t+ Forward pass energy consumption: {self.forward_energy:.3g} (kWh/sample)")
- LOGGER.debug(f"\t+ Forward pass carbon emissions: {self.forward_emissions:.3g} (kgCO2eq/sample)")
- LOGGER.debug(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/forward_codecarbon.csv")
-
- def run_forward_memory_tracking(self, backend: "Backend") -> None:
- forward_input = self.input_generator.generate(mode="forward")
-
- LOGGER.info("\t+ Preparing input for the forward pass")
- forward_input = backend.prepare_inputs(forward_input)
-
- LOGGER.info("\t+ Tracking forward pass peak memory")
- memory_tracker = MemoryTracker(device=backend.config.device, backend=backend.config.name)
- with memory_tracker.track():
- _ = backend.forward(forward_input, self.config.forward_kwargs)
- self.forward_max_memory_used = memory_tracker.get_max_memory_used()
- self.forward_max_memory_reserved = memory_tracker.get_max_memory_reserved()
- self.forward_max_memory_allocated = memory_tracker.get_max_memory_allocated()
-
- LOGGER.debug(f"\t+ Forward pass max memory used: {self.forward_max_memory_used:.3g} (MB)")
- LOGGER.debug(f"\t+ Forward pass max memory reserved: {self.forward_max_memory_reserved:.3g} (MB)")
- LOGGER.debug(f"\t+ Forward pass max memory allocated: {self.forward_max_memory_allocated:.3g} (MB)")
-
- def run_generate_latency_tracking(self, backend: "Backend") -> None:
- generate_input = self.input_generator.generate(mode="generate")
-
- LOGGER.info("\t+ Preparing input for the generation pass")
- generate_input = backend.prepare_inputs(generate_input)
-
- LOGGER.info("\t+ Warming up the generation pass")
- _ = backend.generate(generate_input, self.config.generate_kwargs)
-
- LOGGER.info("\t+ Tracking generation latency and throughput")
- latency_tracker = LatencyTracker(device=backend.config.device, backend=backend.config.name)
- while sum(self.generate_latencies) < self.config.duration:
- with latency_tracker.track():
- _ = backend.generate(generate_input, self.config.generate_kwargs)
- self.generate_latencies = latency_tracker.get_latencies()
-
- LOGGER.debug(f"\t+ Generation pass latency: {self.generate_latency:.3g} (s)")
- LOGGER.debug(f"\t+ Generation pass throughput: {self.generate_throughput:.3g} (tokens/s)")
-
- def run_generate_energy_tracking(self, backend: Backend) -> None:
- generate_input = self.input_generator.generate(mode="generate")
-
- LOGGER.info("\t+ Preparing input for the generation pass")
- generate_input = backend.prepare_inputs(generate_input)
-
- LOGGER.info("\t+ Tracking generation pass energy consumption")
- num_generate_passes = 0
- energy_tracker = EnergyTracker()
- with energy_tracker.track(interval=1, file_prefix="generate"):
- while energy_tracker.get_elapsed_time() < self.config.duration:
- _ = backend.generate(generate_input, self.config.generate_kwargs)
- num_generate_passes += 1
- num_generated_tokens = (
- num_generate_passes
- * self.config.generate_kwargs["min_new_tokens"]
- * self.config.generate_kwargs["num_return_sequences"]
- * self.config.input_shapes["batch_size"]
- )
- self.generate_energy = energy_tracker.get_total_energy() / num_generated_tokens
- self.generate_emissions = energy_tracker.get_total_emissions() / num_generated_tokens
-
- LOGGER.debug(f"\t+ Generation pass energy consumption: {self.generate_energy:.3g} (kWh/token)")
- LOGGER.debug(f"\t+ Generation pass carbon emissions: {self.generate_emissions:.3g} (kgCO2eq/token)")
- LOGGER.debug(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/generate_codecarbon.csv")
-
- def run_generate_memory_tracking(self, backend: "Backend") -> None:
- generate_input = self.input_generator.generate(mode="generate")
-
- LOGGER.info("\t+ Preparing input for the generation pass")
- generate_input = backend.prepare_inputs(generate_input)
-
- LOGGER.info("\t+ Tracking generation pass peak memory")
- memory_tracker = MemoryTracker(device=backend.config.device, backend=backend.config.name)
- with memory_tracker.track():
- _ = backend.generate(generate_input, self.config.generate_kwargs)
- self.generate_max_memory_used = memory_tracker.get_max_memory_used()
- self.generate_max_memory_reserved = memory_tracker.get_max_memory_reserved()
- self.generate_max_memory_allocated = memory_tracker.get_max_memory_allocated()
-
- LOGGER.debug(f"\t+ Generation pass max memory used: {self.generate_max_memory_used:.3g} (MB)")
- LOGGER.debug(f"\t+ Generation pass max memory reserved: {self.generate_max_memory_reserved:.3g} (MB)")
- LOGGER.debug(f"\t+ Generation pass max memory allocated: {self.generate_max_memory_allocated:.3g} (MB)")
-
- # Metrics
- ## Forward pass metrics
- @property
- def forward_latency(self) -> float:
- return statistics.mean(self.forward_latencies)
-
- @property
- def forward_throughput(self) -> float:
- return self.config.input_shapes["batch_size"] / self.forward_latency
-
- ## Generation pass metrics
- @property
- def generate_latency(self) -> float:
- return statistics.mean(self.generate_latencies)
-
- @property
- def generate_throughput(self) -> float:
- return (
- self.config.generate_kwargs["min_new_tokens"]
- * self.config.generate_kwargs["num_return_sequences"]
- * self.config.input_shapes["batch_size"]
- / self.generate_latency
- )
-
- @property
- def decode_latency(self) -> float:
- return self.generate_latency - self.forward_latency
-
- @property
- def decode_throughput(self) -> float:
- return (
- (self.config.generate_kwargs["min_new_tokens"] - 1)
- * self.config.generate_kwargs["num_return_sequences"]
- * self.config.input_shapes["batch_size"]
- / self.decode_latency
- )
-
- ## Diffusion pass metrics
- @property
- def diffusion_throughput(self) -> float:
- return (
- self.config.input_shapes["batch_size"]
- * self.config.forward_kwargs["num_images_per_prompt"]
- / self.forward_latency
- )
-
- def report(self) -> Dict[str, Any]:
- report_dict = {}
-
- report_dict["forward.latency(s)"] = self.forward_latency
- report_dict["forward.throughput(samples/s)"] = self.forward_throughput
-
- if self.can_diffuse:
- report_dict["diffusion.throughput(images/s)"] = self.diffusion_throughput
-
- if self.config.memory:
- report_dict["forward.peak_memory(MB)"] = self.forward_max_memory_used
- report_dict["forward.max_memory_used(MB)"] = self.forward_max_memory_used
- report_dict["forward.max_memory_allocated(MB)"] = self.forward_max_memory_allocated
- report_dict["forward.max_memory_reserved(MB)"] = self.forward_max_memory_reserved
-
- if self.config.energy:
- report_dict["forward.energy_consumption(kWh/sample)"] = self.forward_energy
- report_dict["forward.carbon_emissions(kgCO2eq/sample)"] = self.forward_emissions
-
- if self.can_generate:
- report_dict["generate.latency(s)"] = self.generate_latency
- report_dict["generate.throughput(tokens/s)"] = self.generate_throughput
- report_dict["decode.latency(s)"] = self.decode_latency
- report_dict["decode.throughput(tokens/s)"] = self.decode_throughput
-
- if self.config.memory:
- report_dict["generate.peak_memory(MB)"] = self.generate_max_memory_used
- report_dict["generate.max_memory_used(MB)"] = self.generate_max_memory_used
- report_dict["generate.max_memory_allocated(MB)"] = self.generate_max_memory_allocated
- report_dict["generate.max_memory_reserved(MB)"] = self.generate_max_memory_reserved
-
- if self.config.energy:
- report_dict["generate.energy_consumption(kWh/token)"] = self.generate_energy
- report_dict["generate.carbon_emissions(kgCO2eq/token)"] = self.generate_emissions
-
- return report_dict
+ forward_memories_dict = self.memory_tracker.get_memories_dict()
+
+ self.memory_tracker.reset()
+ with self.memory_tracker.track():
+ _ = backend.generate(self.generate_input, self.config.generate_kwargs)
+
+ generate_memories_dict = self.memory_tracker.get_memories_dict()
+
+ return forward_memories_dict, generate_memories_dict
+
+ def run_image_diffusion_memory_tracking(self, backend: Backend) -> Dict[str, float]:
+ LOGGER.info("\t+ Running memory tracking")
+ self.memory_tracker.reset()
+ with self.memory_tracker.track():
+ _ = backend.call(self.diffuse_input, self.config.forward_kwargs)
+
+ call_memories_dict = self.memory_tracker.get_memories_dict()
+
+ return call_memories_dict
+
+ def run_inference_memory_tracking(self, backend: Backend) -> Dict[str, float]:
+ LOGGER.info("\t+ Running memory tracking")
+ self.memory_tracker.reset()
+ with self.memory_tracker.track():
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
+
+ forward_memories_dict = self.memory_tracker.get_memories_dict()
+
+ return forward_memories_dict
+
+ ## Latency tracking
+ def run_text_generation_latency_tracking(self, backend: Backend) -> Tuple[List[float], List[float]]:
+ LOGGER.info("\t+ Running latency tracking")
+ self.latency_tracker.reset()
+ while self.latency_tracker.get_total_latency() < self.config.duration:
+ with self.latency_tracker.track():
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
+
+ forward_latencies_list = self.latency_tracker.get_latencies_list()
+
+ self.latency_tracker.reset()
+ while self.latency_tracker.get_total_latency() < self.config.duration:
+ with self.latency_tracker.track():
+ _ = backend.generate(self.generate_input, self.config.generate_kwargs)
+
+ generate_latencies_list = self.latency_tracker.get_latencies_list()
+
+ return forward_latencies_list, generate_latencies_list
+
+ def run_image_diffusion_latency_tracking(self, backend: Backend) -> List[float]:
+ LOGGER.info("\t+ Running latency tracking")
+ self.latency_tracker.reset()
+ while self.latency_tracker.get_total_latency() < self.config.duration:
+ with self.latency_tracker.track():
+ _ = backend.call(self.diffuse_input, self.config.forward_kwargs)
+
+ call_latencies_list = self.latency_tracker.get_latencies_list()
+
+ return call_latencies_list
+
+ def run_latency_inference_tracking(self, backend: Backend) -> List[float]:
+ LOGGER.info("\t+ Running latency tracking")
+ self.latency_tracker.reset()
+ while self.latency_tracker.get_total_latency() < self.config.duration:
+ with self.latency_tracker.track():
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
+
+ forward_latencies_list = self.latency_tracker.get_latencies_list()
+
+ return forward_latencies_list
+
+ ## Energy tracking
+ def run_text_generation_energy_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]:
+ LOGGER.info("\t+ Running energy tracking")
+ self.energy_tracker.reset()
+ with self.energy_tracker.track():
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
+
+ forward_energies_dict = self.energy_tracker.get_energies_dict()
+
+ self.energy_tracker.reset()
+ with self.energy_tracker.track():
+ _ = backend.generate(self.generate_input, self.config.generate_kwargs)
+
+ generate_energies_dict = self.energy_tracker.get_energies_dict()
+
+ return forward_energies_dict, generate_energies_dict
+
+ def run_image_diffusion_energy_tracking(self, backend: Backend) -> Dict[str, float]:
+ LOGGER.info("\t+ Running energy tracking")
+ self.energy_tracker.reset()
+ with self.energy_tracker.track():
+ _ = backend.call(self.diffuse_input, self.config.forward_kwargs)
+
+ call_energies_dict = self.energy_tracker.get_energies_dict()
+
+ return call_energies_dict
+
+ def run_inference_energy_tracking(self, backend: Backend) -> Dict[str, float]:
+ LOGGER.info("\t+ Running energy tracking")
+ self.energy_tracker.reset()
+ with self.energy_tracker.track():
+ _ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
+
+ forward_energies_dict = self.energy_tracker.get_energies_dict()
+
+ return forward_energies_dict
+
+ def get_report(self) -> InferenceReport:
+ return self.report
diff --git a/optimum_benchmark/benchmarks/inference/callback.py b/optimum_benchmark/benchmarks/inference/callback.py
new file mode 100644
index 00000000..4871691d
--- /dev/null
+++ b/optimum_benchmark/benchmarks/inference/callback.py
@@ -0,0 +1,25 @@
+import time
+
+from ...import_utils import is_torch_available
+
+from transformers import LogitsProcessor
+
+if is_torch_available():
+ import torch
+
+
+# TODO: use this class for more fine-grained latency measurements in text generation
+class MeasurementProcessor(LogitsProcessor):
+ def __init__(self, device: str, backend: str):
+ self.device = device
+ self.backend = backend
+
+ self.latencies = []
+
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+ """
+ Callback to track the time it takes to generate one batch of tokens.
+ """
+ self.latencies.append(time.perf_counter_ns())
+
+ return scores
diff --git a/optimum_benchmark/benchmarks/inference/config.py b/optimum_benchmark/benchmarks/inference/config.py
index 1299ca85..d5c4a0bb 100644
--- a/optimum_benchmark/benchmarks/inference/config.py
+++ b/optimum_benchmark/benchmarks/inference/config.py
@@ -2,33 +2,15 @@
from typing import Any, Dict, Optional
from dataclasses import dataclass, field
-from ..config import BenchmarkConfig
from ...env_utils import is_rocm_system
+from ..config import BenchmarkConfig
LOGGER = getLogger("inference")
INPUT_SHAPES = {
- # used with all tasks
"batch_size": 2,
- # used with text input tasks
"sequence_length": 16,
- # used with multiple choice tasks where input
- # is of shape (batch_size, num_choices, sequence_length)
- "num_choices": 1,
- # used with audio input tasks
- "feature_size": 80,
- "nb_max_frames": 3000,
-}
-
-GENERATE_CONFIG = {
- "num_return_sequences": 1,
- "max_new_tokens": 100,
- "min_new_tokens": 100,
- "do_sample": False,
- "use_cache": True,
- "pad_token_id": 0,
- "temperature": 1.0,
- "num_beams": 1,
+ "num_choices": 2,
}
@@ -38,37 +20,73 @@ class InferenceConfig(BenchmarkConfig):
_target_: str = "optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark"
# benchmark options
- duration: int = 10
- warmup_runs: int = 10
+ duration: int = field(default=10, metadata={"help": "Minimum duration of the benchmark in seconds"})
+ warmup_runs: int = field(default=10, metadata={"help": "Number of warmup runs to perform before benchmarking"})
- # additional/optional metrics
- memory: bool = False
- energy: bool = False
+ # input/output shapes
+ input_shapes: Dict[str, Any] = field(
+ default_factory=dict,
+ metadata={"help": "Input shapes for the model. Missing keys will be filled with default values."},
+ )
+ new_tokens: Optional[int] = field(
+ default=None,
+ metadata={"help": "Deprecated. If set, `max_new_tokens` and `min_new_tokens` will be set to this value."},
+ )
- # input options
- input_shapes: Dict = field(default_factory=dict)
- # output options
- new_tokens: Optional[int] = None
+ # tracking options
+ energy: bool = field(default=False, metadata={"help": "Measure energy usage"})
+ memory: bool = field(default=False, metadata={"help": "Measure max memory usage"})
+ latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"})
- # forward options
- forward_kwargs: Dict[str, Any] = field(default_factory=dict)
- # generation options
- generate_kwargs: Dict[str, Any] = field(default_factory=dict)
+ # methods kwargs
+ forward_kwargs: Dict[str, Any] = field(
+ default_factory=dict,
+ metadata={"help": "Keyword arguments to pass to the forward method of the model."},
+ )
+ generate_kwargs: Dict[str, Any] = field(
+ default_factory=dict,
+ metadata={"help": "Keyword arguments to pass to the generate method of the model."},
+ )
+ call_kwargs: Dict[str, Any] = field(
+ default_factory=dict,
+ metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."},
+ )
def __post_init__(self):
super().__post_init__()
self.input_shapes = {**INPUT_SHAPES, **self.input_shapes}
- self.generate_kwargs = {**GENERATE_CONFIG, **self.generate_kwargs}
-
- if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]:
- raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.")
if self.new_tokens is not None:
+ LOGGER.warning(
+ "`new_tokens` is deprecated. Use `max_new_tokens` and `min_new_tokens` instead. "
+ "Setting `max_new_tokens` and `min_new_tokens` to `new_tokens`."
+ )
self.generate_kwargs["max_new_tokens"] = self.new_tokens
self.generate_kwargs["min_new_tokens"] = self.new_tokens
- else:
- self.new_tokens = self.generate_kwargs["min_new_tokens"]
+
+ if (
+ "max_new_tokens" in self.generate_kwargs
+ and "min_new_tokens" in self.generate_kwargs
+ and self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]
+ ):
+ raise ValueError(
+ "Setting `min_new_tokens` and `max_new_tokens` to different values results in non-deterministic behavior."
+ )
+
+ elif "max_new_tokens" in self.generate_kwargs and "min_new_tokens" not in self.generate_kwargs:
+ LOGGER.warning(
+ "Setting `max_new_tokens` without `min_new_tokens` results in non-deterministic behavior. "
+ "Setting `min_new_tokens` to `max_new_tokens`."
+ )
+ self.generate_kwargs["min_new_tokens"] = self.generate_kwargs["max_new_tokens"]
+
+ elif "min_new_tokens" in self.generate_kwargs and "max_new_tokens" not in self.generate_kwargs:
+ LOGGER.warning(
+ "Setting `min_new_tokens` without `max_new_tokens` results in non-deterministic behavior. "
+ "Setting `max_new_tokens` to `min_new_tokens`."
+ )
+ self.generate_kwargs["max_new_tokens"] = self.generate_kwargs["min_new_tokens"]
if self.energy and is_rocm_system():
raise ValueError("Energy measurement through codecarbon is not yet available on ROCm-powered devices.")
diff --git a/optimum_benchmark/benchmarks/inference/report.py b/optimum_benchmark/benchmarks/inference/report.py
new file mode 100644
index 00000000..9cd43cfc
--- /dev/null
+++ b/optimum_benchmark/benchmarks/inference/report.py
@@ -0,0 +1,353 @@
+from dataclasses import dataclass, field
+from statistics import mean, stdev
+from typing import Any, Dict, List
+from logging import getLogger
+
+from ..report import BenchmarkReport
+
+LOGGER = getLogger("report")
+
+
+@dataclass
+class InferenceReport(BenchmarkReport):
+ # Config
+ batch_size: int
+ # Metrics
+ forward: Dict[str, Any] = field(default_factory=dict)
+
+ # POPULATING
+ def populate_latency(self, forward_latencies_list: List[float]):
+ ## Latency
+ self.forward["latency"] = {
+ "list[s]": forward_latencies_list,
+ "mean(s)": compute_mean(forward_latencies_list),
+ "stdev(s)": compute_stdev(forward_latencies_list),
+ }
+ ## Throughput
+ forward_throughputs_list = [self.batch_size / latency for latency in forward_latencies_list]
+ self.forward["throughput"] = {
+ "list[samples/s]": forward_throughputs_list,
+ "mean(samples/s)": compute_mean(forward_throughputs_list),
+ "stdev(samples/s)": compute_stdev(forward_throughputs_list),
+ }
+
+ def populate_memory(self, forward_memories_dict: Dict[str, Any]):
+ self.forward["memory"] = forward_memories_dict
+
+ def populate_energy(self, forward_energies_dict: Dict[str, Any]):
+ self.forward["energy"] = forward_energies_dict
+
+ # LOGGING
+ def log_latency(self):
+ for key, value in self.forward["latency"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ forward.latency.{key}: {value:f} (s)")
+ for key, value in self.forward["throughput"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ forward.throughput.{key}: {value:f} (samples/s)")
+
+ def log_memory(self):
+ for key, value in self.forward["memory"].items():
+ LOGGER.info(f"\t+ forward.memory.{key}: {value:f} (MB)")
+
+ def log_energy(self):
+ for key, value in self.forward["energy"].items():
+ LOGGER.info(f"\t+ forward.energy.{key}: {value:f} (kWh)")
+
+ def log_all(self) -> None:
+ if "latency" in self.forward:
+ self.log_latency()
+ if "memory" in self.forward:
+ self.log_memory()
+ if "energy" in self.forward:
+ self.log_energy()
+
+ # add operator to aggregate multiple reports
+ def __add__(self, other: "InferenceReport") -> "InferenceReport":
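+ # aggregation across per-process reports: batch sizes add up, while per-step latencies are averaged element-wise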
+ agg_report = InferenceReport(batch_size=self.batch_size + other.batch_size)
+ if "latency" in self.forward and "latency" in other.forward:
+ agg_forward_latencies_list = [
+ (lat_1 + lat_2) / 2
+ for lat_1, lat_2 in zip(self.forward["latency"]["list[s]"], other.forward["latency"]["list[s]"])
+ ]
+ agg_report.populate_latency(agg_forward_latencies_list)
+
+ if "memory" in self.forward and "memory" in other.forward:
+ agg_forward_memories_dict = {}
+ for key in self.forward["memory"]:
+ if "vram" in key:
+ # our vram measures are not process-specific
+ agg_forward_memories_dict[key] = max(self.forward["memory"][key], other.forward["memory"][key])
+ else:
+ # ram and pytorch measures are process-specific
+ agg_forward_memories_dict[key] = self.forward["memory"][key] + other.forward["memory"][key]
+
+ agg_report.populate_memory(agg_forward_memories_dict)
+
+ if "energy" in self.forward and "energy" in other.forward:
+ agg_forward_energies_dict = {}
+ for key in self.forward["energy"]:
+ # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code)
+ agg_forward_energies_dict[key] = self.forward["energy"][key] + other.forward["energy"][key]
+
+ agg_report.populate_energy(agg_forward_energies_dict)
+
+ return agg_report
+
+
+@dataclass
+class ImageDiffusionReport(BenchmarkReport):
+ # Config
+ batch_size: int
+ num_images_per_prompts: int
+ # Metrics
+ call: Dict[str, Any] = field(default_factory=dict)
+
+ # POPULATING
+ def populate_latency(self, call_latencies_list: List[float]):
+ ## Latency
+ self.call["latency"] = {
+ "list[s]": call_latencies_list,
+ "mean(s)": compute_mean(call_latencies_list),
+ "stdev(s)": compute_stdev(call_latencies_list),
+ }
+ ## Throughput
+ call_throughputs_list = [
+ self.batch_size * self.num_images_per_prompts / latency for latency in call_latencies_list
+ ]
+ self.call["throughput"] = {
+ "list[images/s]": call_throughputs_list,
+ "mean(images/s)": compute_mean(call_throughputs_list),
+ "stdev(images/s)": compute_stdev(call_throughputs_list),
+ }
+
+ def populate_memory(self, call_memories_dict: Dict[str, Any]):
+ self.call["memory"] = call_memories_dict
+
+ def populate_energy(self, call_energies_dict: Dict[str, Any]):
+ self.call["energy"] = call_energies_dict
+
+ # LOGGING
+ def log_latency(self):
+ for key, value in self.call["latency"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ call.latency.{key}: {value:f} (s)")
+ for key, value in self.call["throughput"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ call.throughput.{key}: {value:f} (images/s)")
+
+ def log_memory(self):
+ for key, value in self.call["memory"].items():
+ LOGGER.info(f"\t+ call.memory.{key}: {value:f} (MB)")
+
+ def log_energy(self):
+ for key, value in self.call["energy"].items():
+ LOGGER.info(f"\t+ call.energy.{key}: {value:f} (kWh)")
+
+ def log_all(self) -> None:
+ if "latency" in self.call:
+ self.log_latency()
+ if "memory" in self.call:
+ self.log_memory()
+ if "energy" in self.call:
+ self.log_energy()
+
+ # add operator to aggregate multiple reports
+ def __add__(self, other: "ImageDiffusionReport") -> "ImageDiffusionReport":
+ assert self.num_images_per_prompts == other.num_images_per_prompts, "num_images_per_prompts must be the same"
+
+ agg_report = ImageDiffusionReport(
+ batch_size=self.batch_size + other.batch_size,
+ num_images_per_prompts=self.num_images_per_prompts,
+ )
+ if "latency" in self.call and "latency" in other.call:
+ agg_call_latencies_list = [
+ (lat_1 + lat_2) / 2
+ for lat_1, lat_2 in zip(self.call["latency"]["list[s]"], other.call["latency"]["list[s]"])
+ ]
+ agg_report.populate_latency(agg_call_latencies_list)
+
+ if "memory" in self.call and "memory" in other.call:
+ agg_call_memories_dict = {}
+ for key in self.call["memory"]:
+ if "vram" in key:
+ # our vram measures are not process-specific
+ agg_call_memories_dict[key] = max(self.call["memory"][key], other.call["memory"][key])
+ else:
+ # ram and pytorch measures are process-specific
+ agg_call_memories_dict[key] = self.call["memory"][key] + other.call["memory"][key]
+
+ agg_report.populate_memory(agg_call_memories_dict)
+
+ if "energy" in self.call and "energy" in other.call:
+ agg_call_energies_dict = {}
+ for key in self.call["energy"]:
+ # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code)
+ agg_call_energies_dict[key] = self.call["energy"][key] + other.call["energy"][key]
+
+ agg_report.populate_energy(agg_call_energies_dict)
+
+ return agg_report
+
+
+@dataclass
+class TextGenerationReport(BenchmarkReport):
+ # Config
+ batch_size: int
+ sequence_length: int
+ num_new_tokens: int
+ num_return_sequences: int
+ # Prefill Metrics
+ prefill: Dict[str, Any] = field(default_factory=dict)
+ # Decode Metrics
+ decode: Dict[str, Any] = field(default_factory=dict)
+
+ def populate_latency(self, forward_latencies_list: List[float], generate_latencies_list: List[float]):
+ ## Latency
+ self.prefill["latency"] = {
+ "list[s]": forward_latencies_list,
+ "mean(s)": compute_mean(forward_latencies_list),
+ "stdev(s)": compute_stdev(forward_latencies_list),
+ }
+ ## Throughput
+ prefill_throughputs_list = [
+ self.batch_size * self.sequence_length / latency for latency in forward_latencies_list
+ ]
+ self.prefill["throughput"] = {
+ "list[tokens/s]": prefill_throughputs_list,
+ "mean(tokens/s)": compute_mean(prefill_throughputs_list),
+ "stdev(tokens/s)": compute_stdev(prefill_throughputs_list),
+ }
+ ## Latency
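+ # decode latency is estimated as the full generate latency minus the mean prefill (forward) latency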
+ decode_latencies_list = [
+ generate_latency - self.prefill["latency"]["mean(s)"] for generate_latency in generate_latencies_list
+ ]
+ self.decode["latency"] = {
+ "list[s]": decode_latencies_list,
+ "mean(s)": compute_mean(decode_latencies_list),
+ "stdev(s)": compute_stdev(decode_latencies_list),
+ }
+ ## Throughput
+ decode_throughputs_list = [
+ self.batch_size * self.num_new_tokens * self.num_return_sequences / latency
+ for latency in decode_latencies_list
+ ]
+ self.decode["throughput"] = {
+ "list[tokens/s]": decode_throughputs_list,
+ "mean(tokens/s)": compute_mean(decode_throughputs_list),
+ "stdev(tokens/s)": compute_stdev(decode_throughputs_list),
+ }
+
+ def populate_memory(self, forward_memories_dict: Dict[str, Any], generate_memories_dict: Dict[str, Any]):
+ self.prefill["memory"] = forward_memories_dict
+ self.decode["memory"] = generate_memories_dict
+
+ def populate_energy(self, forward_energies_dict: Dict[str, Any], generate_energies_dict: Dict[str, Any]):
+ self.prefill["energy"] = forward_energies_dict
+ self.decode["energy"] = generate_energies_dict
+
+ # LOGGING
+ def log_latency(self):
+ for key, value in self.prefill["latency"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ prefill.latency.{key}: {value:f} (s)")
+ for key, value in self.prefill["throughput"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ prefill.throughput.{key}: {value:f} (tokens/s)")
+ for key, value in self.decode["latency"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ decode.latency.{key}: {value:f} (s)")
+ for key, value in self.decode["throughput"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ decode.throughput.{key}: {value:f} (tokens/s)")
+
+ def log_memory(self):
+ for key, value in self.prefill["memory"].items():
+ LOGGER.info(f"\t+ prefill.memory.{key}: {value:f} (MB)")
+ for key, value in self.decode["memory"].items():
+ LOGGER.info(f"\t+ decode.memory.{key}: {value:f} (MB)")
+
+ def log_energy(self):
+ for key, value in self.prefill["energy"].items():
+ LOGGER.info(f"\t+ prefill.energy.{key}: {value:f} (kWh)")
+ for key, value in self.decode["energy"].items():
+ LOGGER.info(f"\t+ decode.energy.{key}: {value:f} (kWh)")
+
+ def log_all(self) -> None:
+ if "latency" in self.prefill:
+ self.log_latency()
+ if "memory" in self.prefill:
+ self.log_memory()
+ if "energy" in self.prefill:
+ self.log_energy()
+
+ # add operator to aggregate multiple reports
+ def __add__(self, other: "TextGenerationReport") -> "TextGenerationReport":
+ agg_report = TextGenerationReport(
+ batch_size=self.batch_size + other.batch_size,
+ sequence_length=self.sequence_length,
+ num_new_tokens=self.num_new_tokens,
+ num_return_sequences=self.num_return_sequences,
+ )
+ if "latency" in self.prefill and "latency" in other.prefill:
+ agg_forward_latencies_list = [
+ (lat_1 + lat_2) / 2
+ for lat_1, lat_2 in zip(self.prefill["latency"]["list[s]"], other.prefill["latency"]["list[s]"])
+ ]
+ agg_generate_latencies_list = [
+ (lat_1 + lat_2) / 2
+ for lat_1, lat_2 in zip(self.decode["latency"]["list[s]"], other.decode["latency"]["list[s]"])
+ ]
+ agg_report.populate_latency(agg_forward_latencies_list, agg_generate_latencies_list)
+
+ if "memory" in self.prefill and "memory" in other.prefill:
+ agg_forward_memories_dict = {}
+ for key in self.prefill["memory"]:
+ if "vram" in key:
+ # our vram measures are not process-specific
+ agg_forward_memories_dict[key] = max(self.prefill["memory"][key], other.prefill["memory"][key])
+ else:
+ # ram and pytorch measures are process-specific
+ agg_forward_memories_dict[key] = self.prefill["memory"][key] + other.prefill["memory"][key]
+
+ agg_generate_memories_dict = {}
+ for key in self.decode["memory"]:
+ if "vram" in key:
+ # our vram measures are not process-specific
+ agg_generate_memories_dict[key] = max(self.decode["memory"][key], other.decode["memory"][key])
+ else:
+ # ram and pytorch measures are process-specific
+ agg_generate_memories_dict[key] = self.decode["memory"][key] + other.decode["memory"][key]
+
+ agg_report.populate_memory(agg_forward_memories_dict, agg_generate_memories_dict)
+
+ if "energy" in self.prefill and "energy" in other.prefill:
+ agg_forward_energies_dict = {}
+ for key in self.prefill["energy"]:
+ # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code)
+ agg_forward_energies_dict[key] = self.prefill["energy"][key] + other.prefill["energy"][key]
+
+ agg_generate_energies_dict = {}
+ for key in self.decode["energy"]:
+ # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code)
+ agg_generate_energies_dict[key] = self.decode["energy"][key] + other.decode["energy"][key]
+
+ agg_report.populate_energy(agg_forward_energies_dict, agg_generate_energies_dict)
+
+ return agg_report
+
+
+def compute_mean(values: List[float]) -> float:
+ return mean(values) if len(values) > 0 else 0.0
+
+
+def compute_stdev(values: List[float]) -> float:
+ return stdev(values) if len(values) > 1 else 0.0
diff --git a/optimum_benchmark/benchmarks/report.py b/optimum_benchmark/benchmarks/report.py
new file mode 100644
index 00000000..69491d65
--- /dev/null
+++ b/optimum_benchmark/benchmarks/report.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass, asdict
+from typing import Union, Optional
+from json import dump
+import os
+
+from transformers.configuration_utils import PushToHubMixin
+from flatten_dict import flatten
+import pandas as pd
+
+
+@dataclass
+class BenchmarkReport(PushToHubMixin):
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ config_file_name: Optional[Union[str, os.PathLike]] = None,
+ push_to_hub: bool = False,
+ **kwargs,
+ ):
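+ # writes the report as JSON (default: benchmark_report.json) in save_directory and optionally pushes it to the Hub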
+ use_auth_token = kwargs.pop("use_auth_token", None)
+
+ if use_auth_token is not None:
+ kwargs["token"] = use_auth_token
+
+ config_file_name = config_file_name if config_file_name is not None else "benchmark_report.json"
+
+ if os.path.isfile(save_directory):
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ if push_to_hub:
+ commit_message = kwargs.pop("commit_message", None)
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+ repo_id = self._create_repo(repo_id, **kwargs)
+ files_timestamps = self._get_files_timestamps(save_directory)
+
+ output_config_file = os.path.join(save_directory, config_file_name)
+ self.to_json(output_config_file)
+
+ if push_to_hub:
+ self._upload_modified_files(
+ save_directory,
+ repo_id,
+ files_timestamps,
+ commit_message=commit_message,
+ token=kwargs.get("token"),
+ )
+
+ def to_dict(self) -> dict:
+ return asdict(self)
+
+ def to_flat_dict(self) -> dict:
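+ # flattens the nested metrics into dot-separated keys, e.g. "forward.latency.mean(s)"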
+ report_dict = self.to_dict()
+ return flatten(report_dict, reducer="dot")
+
+ def to_json(self, path: str, flat: bool = False) -> None:
+ if flat:
+ with open(path, "w") as f:
+ dump(self.to_flat_dict(), f, indent=4)
+ else:
+ with open(path, "w") as f:
+ dump(self.to_dict(), f, indent=4)
+
+ def to_dataframe(self) -> pd.DataFrame:
+ flat_report_dict = self.to_flat_dict()
+ return pd.DataFrame(flat_report_dict, index=[0])
+
+ def to_csv(self, path: str) -> None:
+ self.to_dataframe().to_csv(path, index=False)
+
+ def log_all(self) -> None:
+ raise NotImplementedError("`log_all` method must be implemented in the child class")
diff --git a/optimum_benchmark/benchmarks/training/benchmark.py b/optimum_benchmark/benchmarks/training/benchmark.py
index e5eaa65f..90c231d0 100644
--- a/optimum_benchmark/benchmarks/training/benchmark.py
+++ b/optimum_benchmark/benchmarks/training/benchmark.py
@@ -1,19 +1,16 @@
-import time
-from typing import Any, Dict
from logging import getLogger
+from contextlib import ExitStack
-from transformers import (
- default_data_collator,
- TrainingArguments,
- TrainerCallback,
- TrainerControl,
- TrainerState,
-)
-
-from ...generators.dataset_generator import DatasetGenerator
-from ...backends.base import Backend
-from .config import TrainingConfig
from ..base import Benchmark
+from .config import TrainingConfig
+from .report import TrainingReport
+from ...trackers.memory import MemoryTracker
+from ...trackers.energy import EnergyTracker
+from .callback import LatencyTrainerCallback
+from ...backends.base import Backend, BackendConfigT
+from ...generators.dataset_generator import DatasetGenerator
+
+from transformers import default_data_collator
LOGGER = getLogger("training")
@@ -24,9 +21,7 @@ class TrainingBenchmark(Benchmark[TrainingConfig]):
def __init__(self, config: TrainingConfig) -> None:
super().__init__(config)
- def run(self, backend: Backend) -> None:
- LOGGER.info("Running training benchmark")
-
+ def run(self, backend: Backend[BackendConfigT]) -> None:
LOGGER.info("\t+ Creating dataset generator")
dataset_generator = DatasetGenerator(
task=backend.config.task,
@@ -35,105 +30,57 @@ def run(self, backend: Backend) -> None:
)
LOGGER.info("\t+ Generating training dataset")
- training_dataset = dataset_generator.generate()
-
- LOGGER.info("\t+ Creating training callbacks")
- training_callbacks = [MeasurementCallback(warmup_steps=self.config.warmup_steps)]
-
- self.trainer_state = backend.train(
- training_dataset=training_dataset,
- training_callbacks=training_callbacks,
- training_data_collator=default_data_collator,
- training_arguments=self.config.training_arguments,
+ training_dataset = dataset_generator()
+
+ LOGGER.info("\t+ Initializing training report")
+ self.report = TrainingReport(
+ max_steps=self.config.max_steps,
+ warmup_steps=self.config.warmup_steps,
+ per_process_batch_size=self.config.training_arguments["per_device_train_batch_size"],
+ gradient_accumulation_steps=self.config.training_arguments["gradient_accumulation_steps"],
)
- LOGGER.debug(f"Training runtime: {self.trainer_state.training_runtime:.3g} (s)")
- LOGGER.debug(f"Training throughput: {self.trainer_state.training_throughput:.3g} (samples/s)")
-
- return self.report()
-
- def report(self) -> Dict[str, Any]:
- return {
- # warmup metrics
- "warmup.runtime(s)": self.trainer_state.warmup_runtime,
- "warmup.throughput(samples/s)": self.trainer_state.warmup_throughput,
- # training metrics
- "training.runtime(s)": self.trainer_state.training_runtime,
- "training.throughput(samples/s)": self.trainer_state.training_throughput,
- # overall metrics
- "overall.runtime(s)": self.trainer_state.overall_runtime,
- "overall.throughput(samples/s)": (self.trainer_state.overall_throughput),
- }
-
-
-class MeasurementCallback(TrainerCallback):
- def __init__(self, warmup_steps: int):
- self.warmup_steps = warmup_steps
-
- def on_train_begin(
- self,
- args: TrainingArguments,
- state: TrainerState,
- control: TrainerControl,
- **kwargs,
- ):
- state.warmup_start = time.perf_counter_ns() * 1e-9
- state.overall_start = time.perf_counter_ns() * 1e-9
-
- def on_step_begin(
- self,
- args: TrainingArguments,
- state: TrainerState,
- control: TrainerControl,
- **kwargs,
- ):
- if state.global_step == self.warmup_steps:
- state.warmup_end = time.perf_counter_ns() * 1e-9
- state.training_start = time.perf_counter_ns() * 1e-9
-
- def on_train_end(
- self,
- args: TrainingArguments,
- state: TrainerState,
- control: TrainerControl,
- **kwargs,
- ):
- state.training_end = time.perf_counter_ns() * 1e-9
- state.overall_end = time.perf_counter_ns() * 1e-9
-
- state.total_training_batch_size = args.train_batch_size * args.gradient_accumulation_steps
-
- # warmup metrics
- state.warmup_runtime = state.warmup_end - state.warmup_start
- state.num_warmup_samples = self.warmup_steps * state.total_training_batch_size
- state.warmup_throughput = state.num_warmup_samples / state.warmup_runtime
- state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime
-
- # training metrics
- state.training_runtime = state.training_end - state.training_start
- state.num_training_steps = state.max_steps - self.warmup_steps
- state.num_training_samples = state.num_training_steps * state.total_training_batch_size
- state.training_throughput = state.num_training_samples / state.training_runtime
- state.training_steps_per_second = state.num_training_steps / state.training_runtime
-
- # overall training metrics
- state.overall_runtime = state.training_end - state.warmup_start
- state.num_overall_samples = state.num_warmup_samples + state.num_training_samples
- state.overall_throughput = state.num_overall_samples / state.overall_runtime
- state.overall_steps_per_second = state.num_overall_samples / state.overall_runtime
-
-
-# def get_data_collator(task: str):
-# if task == "object-detection":
-# return object_detection_data_collator
-# else:
-# return default_data_collator
-
-
-# def object_detection_data_collator(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
-# pixel_values = torch.stack([example["pixel_values"] for example in batch])
-# labels = [example["labels"] for example in batch]
-# return {
-# "pixel_values": pixel_values,
-# "labels": labels,
-# }
+ training_callbacks = []
+ if self.config.latency:
+ LOGGER.info("\t+ Adding latency measuring callback")
+ latency_callback = LatencyTrainerCallback(device=backend.config.device, backend=backend.config.name)
+ training_callbacks.append(latency_callback)
+
+ training_trackers = []
+ if self.config.memory:
+ LOGGER.info("\t+ Adding memory tracking context manager")
+ memory_tracker = MemoryTracker(
+ device=backend.config.device, backend=backend.config.name, device_ids=backend.config.device_ids
+ )
+ training_trackers.append(memory_tracker.track())
+
+ if self.config.energy:
+ LOGGER.info("\t+ Adding energy tracking context manager")
+ energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids)
+ training_trackers.append(energy_tracker.track())
+
+ with ExitStack() as stack:
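+ # enter all requested trackers so that memory/energy measurements cover the whole training run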
+ for tracker in training_trackers:
+ stack.enter_context(tracker)
+
+ backend.train(
+ training_dataset=training_dataset,
+ training_callbacks=training_callbacks,
+ training_data_collator=default_data_collator,
+ training_arguments=self.config.training_arguments,
+ )
+
+ if self.config.latency:
+ self.report.populate_latency(overall_latencies_list=latency_callback.get_latencies_list())
+ self.report.log_latency()
+
+ if self.config.memory:
+ self.report.populate_memory(overall_memories_dict=memory_tracker.get_memories_dict())
+ self.report.log_memory()
+
+ if self.config.energy:
+ self.report.populate_energy(overall_energies_dict=energy_tracker.get_energies_dict())
+ self.report.log_energy()
+
+ def get_report(self) -> TrainingReport:
+ return self.report
diff --git a/optimum_benchmark/benchmarks/training/callback.py b/optimum_benchmark/benchmarks/training/callback.py
new file mode 100644
index 00000000..88026d79
--- /dev/null
+++ b/optimum_benchmark/benchmarks/training/callback.py
@@ -0,0 +1,43 @@
+import time
+from typing import List
+
+import torch
+from transformers import TrainerCallback
+
+
+class LatencyTrainerCallback(TrainerCallback):
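+ # records a timestamp (CUDA event on pytorch+cuda, perf_counter_ns otherwise) at every step begin and once at
+ # train end, then derives per-step latencies in seconds from consecutive records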
+ def __init__(self, device: str, backend: str) -> None:
+ self.device = device
+ self.backend = backend
+ self.all_latencies_list = []
+
+ def on_step_begin(self, *args, **kwargs):
+ # one record per step
+ if self.device == "cuda" and self.backend == "pytorch":
+ self.all_latencies_list.append(torch.cuda.Event(enable_timing=True))
+ self.all_latencies_list[-1].record()
+ else:
+ self.all_latencies_list.append(time.perf_counter_ns())
+
+ def on_train_end(self, *args, **kwargs):
+ # one last record to measure the time of the last step
+ if self.device == "cuda" and self.backend == "pytorch":
+ self.all_latencies_list.append(torch.cuda.Event(enable_timing=True))
+ self.all_latencies_list[-1].record()
+ else:
+ self.all_latencies_list.append(time.perf_counter_ns())
+
+ def get_latencies_list(self) -> List[float]:
+ if self.device == "cuda" and self.backend == "pytorch":
+ torch.cuda.synchronize() # synchronize the device to make sure all events have been recorded
+ latencies_list = [
+ self.all_latencies_list[i - 1].elapsed_time(self.all_latencies_list[i]) * 1e-3
+ for i in range(1, len(self.all_latencies_list))
+ ]
+ else:
+ latencies_list = [
+ (self.all_latencies_list[i] - self.all_latencies_list[i - 1]) * 1e-9
+ for i in range(1, len(self.all_latencies_list))
+ ]
+
+ return latencies_list
diff --git a/optimum_benchmark/benchmarks/training/config.py b/optimum_benchmark/benchmarks/training/config.py
index 3a872684..e5d19581 100644
--- a/optimum_benchmark/benchmarks/training/config.py
+++ b/optimum_benchmark/benchmarks/training/config.py
@@ -8,6 +8,7 @@
TRAINING_ARGUMENT = {
"per_device_train_batch_size": 2,
+ "gradient_accumulation_steps": 1,
"output_dir": "./trainer_output",
"do_train": True,
"use_cpu": False,
@@ -25,16 +26,9 @@
}
DATASET_SHAPES = {
- # used with all tasks
"dataset_size": 500,
- # used with text input tasks
"sequence_length": 16,
- # used with multiple choice tasks where input
- # is of shape (batch_size, num_choices, sequence_length)
"num_choices": 1,
- # used with audio input tasks
- "feature_size": 80,
- "nb_max_frames": 3000,
}
@@ -49,10 +43,14 @@ class TrainingConfig(BenchmarkConfig):
# dataset options
dataset_shapes: Dict[str, Any] = field(default_factory=dict)
-
# training options
training_arguments: Dict[str, Any] = field(default_factory=dict)
+ # tracking options
+ latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"})
+ memory: bool = field(default=False, metadata={"help": "Measure max memory usage"})
+ energy: bool = field(default=False, metadata={"help": "Measure energy usage"})
+
def __post_init__(self):
super().__post_init__()
diff --git a/optimum_benchmark/benchmarks/training/report.py b/optimum_benchmark/benchmarks/training/report.py
new file mode 100644
index 00000000..9eeba211
--- /dev/null
+++ b/optimum_benchmark/benchmarks/training/report.py
@@ -0,0 +1,169 @@
+from dataclasses import dataclass, field
+from statistics import mean, stdev
+from typing import Any, Dict, List
+from logging import getLogger
+
+from ..report import BenchmarkReport
+
+LOGGER = getLogger("report")
+
+
+@dataclass
+class TrainingReport(BenchmarkReport):
+ max_steps: int
+ warmup_steps: int
+ per_process_batch_size: int
+ gradient_accumulation_steps: int
+
+ overall: Dict[str, Any] = field(default_factory=dict)
+ training: Dict[str, Any] = field(default_factory=dict)
+ warmup: Dict[str, Any] = field(default_factory=dict)
+
+ world_size: int = 1
+
+ # POPULATING
+ def populate_latency(self, overall_latencies_list: List[float]) -> None:
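+ # per-step latencies are reported overall and also split into warmup ([:warmup_steps]) and training ([warmup_steps:]) windows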
+ assert (
+ len(overall_latencies_list) == self.max_steps
+ ), f"Expected {self.max_steps} latencies, but got {len(overall_latencies_list)} latencies"
+ # Overall
+ ## Latency
+ self.overall["latency"] = {
+ "list[s/step]": overall_latencies_list,
+ "mean(s/step)": compute_mean(overall_latencies_list),
+ "stdev(s/step)": compute_stdev(overall_latencies_list),
+ }
+ ## Throughput
+ overall_throughputs_list = [
+ self.world_size * self.per_process_batch_size * self.gradient_accumulation_steps / latency
+ for latency in overall_latencies_list
+ ]
+ self.overall["throughput"] = {
+ "list[samples/s]": overall_throughputs_list,
+ "mean(samples/s)": compute_mean(overall_throughputs_list),
+ "stdev(samples/s)": compute_stdev(overall_throughputs_list),
+ }
+ # Training
+ ## Latency
+ training_latencies_list = overall_latencies_list[self.warmup_steps :]
+ self.training["latency"] = {
+ "list[s/step]": training_latencies_list,
+ "mean(s/step)": compute_mean(training_latencies_list),
+ "stdev(s/step)": compute_stdev(training_latencies_list),
+ }
+ ## Throughput
+ training_throughputs_list = overall_throughputs_list[self.warmup_steps :]
+ self.training["throughput"] = {
+ "list[samples/s]": training_throughputs_list,
+ "mean(samples/s)": compute_mean(training_throughputs_list),
+ "stdev(samples/s)": compute_stdev(training_throughputs_list),
+ }
+ # Warmup
+ ## Latency
+ warmup_latencies_list = overall_latencies_list[: self.warmup_steps]
+ self.warmup["latency"] = {
+ "list[s/step]": warmup_latencies_list,
+ "mean(s/step)": compute_mean(warmup_latencies_list),
+ "stdev(s/step)": compute_stdev(warmup_latencies_list),
+ }
+ ## Throughput
+ warmup_throughputs_list = overall_throughputs_list[: self.warmup_steps]
+ self.warmup["throughput"] = {
+ "list[samples/s]": warmup_throughputs_list,
+ "mean(samples/s)": compute_mean(warmup_throughputs_list),
+ "stdev(samples/s)": compute_stdev(warmup_throughputs_list),
+ }
+
+ def populate_memory(self, overall_memories_dict: Dict[str, float]) -> None:
+ self.warmup["memory"] = overall_memories_dict
+ self.overall["memory"] = overall_memories_dict
+ self.training["memory"] = overall_memories_dict
+
+ def populate_energy(self, overall_energies_dict: Dict[str, float]) -> None:
+ self.overall["energy"] = overall_energies_dict
+ # can't get training only or warmup only energies
+ # self.warmup["energy"] = overall_energies_dict
+ # self.training["energy"] = overall_energies_dict
+ # TODO: use a callback for energy instead of a tracker
+
+ # LOGGING
+ def log_latency(self):
+ for key, value in self.training["latency"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ training.latency.{key}: {value:f} (s)")
+ for key, value in self.training["throughput"].items():
+ if "list" in key:
+ continue
+ LOGGER.info(f"\t+ training.throughput.{key}: {value:f} (samples/s)")
+
+ def log_memory(self):
+ for key, value in self.training["memory"].items():
+ LOGGER.info(f"\t+ training.memory.{key}: {value:f} (MB)")
+
+ def log_energy(self):
+ for key, value in self.overall["energy"].items():
+ LOGGER.info(f"\t+ overall.energy.{key}: {value:f} (kWh)")
+
+ def log_all(self):
+ if "latency" in self.training:
+ self.log_latency()
+ if "memory" in self.training:
+ self.log_memory()
+ if "energy" in self.training:
+ self.log_energy()
+
+ # LOGIC
+ def __add__(self, other: "TrainingReport") -> "TrainingReport":
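+ # aggregation across processes: world sizes add up and each step's latency is the max across processes (slowest rank dominates)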
+ assert self.max_steps == other.max_steps, "Both reports must have the same max_steps"
+ assert self.warmup_steps == other.warmup_steps, "Both reports must have the same warmup_steps"
+ assert (
+ self.gradient_accumulation_steps == other.gradient_accumulation_steps
+ ), "Both reports must have the same gradient_accumulation_steps"
+
+ agg_report = TrainingReport(
+ max_steps=self.max_steps,
+ warmup_steps=self.warmup_steps,
+ world_size=self.world_size + other.world_size,
+ per_process_batch_size=self.per_process_batch_size,
+ gradient_accumulation_steps=self.gradient_accumulation_steps,
+ )
+
+ if "latency" in self.overall:
+ agg_overall_latencies_list = [
+ max(lat_1, lat_2)
+ for lat_1, lat_2 in zip(
+ self.overall["latency"]["list[s/step]"], other.overall["latency"]["list[s/step]"]
+ )
+ ]
+ agg_report.populate_latency(agg_overall_latencies_list)
+
+ if "memory" in self.overall:
+ agg_overall_memories_dict = {}
+ for key in self.overall["memory"]:
+ if "vram" in key:
+ # our vram measures are not process-specific
+ agg_overall_memories_dict[key] = max(self.overall["memory"][key], other.overall["memory"][key])
+ else:
+ # ram and pytorch measures are process-specific (can be accumulated)
+ agg_overall_memories_dict[key] = self.overall["memory"][key] + other.overall["memory"][key]
+
+ agg_report.populate_memory(agg_overall_memories_dict)
+
+ if "energy" in self.overall:
+ agg_overall_energies_dict = {}
+ for key in self.overall["energy"]:
+ # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code)
+ agg_overall_energies_dict[key] = self.overall["energy"][key] + other.overall["energy"][key]
+
+ agg_report.populate_energy(agg_overall_energies_dict)
+
+ return agg_report
+
+
+def compute_mean(values: List[float]) -> float:
+ return mean(values) if len(values) > 0 else 0.0
+
+
+def compute_stdev(values: List[float]) -> float:
+ return stdev(values) if len(values) > 1 else 0.0
diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py
index 052276c4..8b137891 100644
--- a/optimum_benchmark/benchmarks/utils.py
+++ b/optimum_benchmark/benchmarks/utils.py
@@ -1,55 +1 @@
-from typing import List, Dict, Any
-
-# TODO: use some kind of logic to handle this instead of this function
-def consolidate_reports(reports: List[Dict[str, Any]]) -> Dict[str, Any]:
- report = {}
-
- ## Training
-
- if "warmup.runtime(s)" in reports[0]:
- report["warmup.runtime(s)"] = reports[0]["warmup.runtime(s)"]
- report["warmup.throughput(samples/s)"] = sum(r["warmup.throughput(samples/s)"] for r in reports)
-
- if "training.runtime(s)" in reports[0]:
- report["training.runtime(s)"] = reports[0]["training.runtime(s)"]
- report["training.throughput(samples/s)"] = sum(r["training.throughput(samples/s)"] for r in reports)
-
- if "overall.runtime(s)" in reports[0]:
- report["overall.runtime(s)"] = reports[0]["overall.runtime(s)"]
- report["overall.throughput(samples/s)"] = sum(r["overall.throughput(samples/s)"] for r in reports)
-
- ## Inference
-
- if "forward.latency(s)" in reports[0]:
- report["forward.latency(s)"] = reports[0]["forward.latency(s)"]
- report["forward.throughput(samples/s)"] = sum(r["forward.throughput(samples/s)"] for r in reports)
-
- if "diffusion.throughput(images/s)" in reports[0]:
- report["diffusion.throughput(images/s)"] = sum(r["diffusion.throughput(images/s)"] for r in reports)
-
- if "forward.peak_memory(MB)" in reports[0]:
- report["forward.max_memory_used(MB)"] = reports[0]["forward.max_memory_used(MB)"]
- report["forward.max_memory_allocated(MB)"] = sum(r["forward.max_memory_allocated(MB)"] for r in reports)
- report["forward.max_memory_reserved(MB)"] = sum(r["forward.max_memory_reserved(MB)"] for r in reports)
-
- if "forward.energy_consumption(kWh/sample)" in reports[0]:
- report["forward.energy_consumption(kWh/sample)"] = reports[0]["forward.energy_consumption(kWh/sample)"]
- report["forward.carbon_emissions(kgCO2eq/sample)"] = reports[0]["forward.carbon_emissions(kgCO2eq/sample)"]
-
- if "generate.latency(s)" in reports[0]:
- report["generate.latency(s)"] = reports[0]["generate.latency(s)"]
- report["generate.throughput(tokens/s)"] = sum(r["generate.throughput(tokens/s)"] for r in reports)
- report["decode.latency(s)"] = reports[0]["decode.latency(s)"]
- report["decode.throughput(tokens/s)"] = sum(r["decode.throughput(tokens/s)"] for r in reports)
-
- if "generate.peak_memory(MB)" in reports[0]:
- report["generate.max_memory_used(MB)"] = reports[0]["generate.max_memory_used(MB)"]
- report["generate.max_memory_allocated(MB)"] = sum(r["generate.max_memory_allocated(MB)"] for r in reports)
- report["generate.max_memory_reserved(MB)"] = sum(r["generate.max_memory_reserved(MB)"] for r in reports)
-
- if "generate.energy_consumption(kWh/token)" in reports[0]:
- report["generate.energy_consumption(kWh/token)"] = reports[0]["generate.energy_consumption(kWh/token)"]
- report["generate.carbon_emissions(kgCO2eq/token)"] = reports[0]["generate.carbon_emissions(kgCO2eq/token)"]
-
- return report
diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py
index cf36855d..4961c189 100644
--- a/optimum_benchmark/cli.py
+++ b/optimum_benchmark/cli.py
@@ -1,6 +1,5 @@
import os
import glob
-import json
from logging import getLogger
import hydra
@@ -19,6 +18,7 @@
from .backends.neural_compressor.config import INCConfig
from .backends.text_generation_inference.config import TGIConfig
+from .benchmarks.report import BenchmarkReport
from .experiment import launch, ExperimentConfig
from .benchmarks.training.config import TrainingConfig
from .benchmarks.inference.config import InferenceConfig
@@ -49,6 +49,8 @@
# optimum-benchmark
@hydra.main(version_base=None)
def benchmark_cli(experiment_config: DictConfig) -> None:
+ os.environ["BENCHMARK_CLI"] = "1"
+
if glob.glob("*.csv") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1":
LOGGER.warning(
"Skipping benchmark because results already exist. "
@@ -74,10 +76,6 @@ def benchmark_cli(experiment_config: DictConfig) -> None:
experiment_config: ExperimentConfig = OmegaConf.to_object(experiment_config)
OmegaConf.save(experiment_config, "experiment_config.yaml", resolve=True)
- benchmark_report = launch(experiment_config=experiment_config)
-
- LOGGER.info("Benchmark Report:")
- for metric, value in benchmark_report.items():
- LOGGER.info(f"\t+ {metric}: {value:.3f}")
+ benchmark_report: BenchmarkReport = launch(experiment_config=experiment_config)
- json.dump(benchmark_report, open("benchmark_report.json", "w"), indent=4)
+ benchmark_report.to_json("benchmark_report.json")
diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py
index 5a714dd9..ed4b710b 100644
--- a/optimum_benchmark/env_utils.py
+++ b/optimum_benchmark/env_utils.py
@@ -1,13 +1,14 @@
+import os
import re
import platform
import subprocess
import importlib.util
-from typing import Optional
-
-import psutil
+from typing import Optional, List
from .import_utils import is_py3nvml_available, is_pyrsmi_available
+import psutil
+
def is_nvidia_system():
try:
@@ -91,20 +92,84 @@ def get_gpus():
return gpus
-def get_git_revision_hash(package_name: str, path: Optional[str] = None) -> Optional[str]:
+def get_gpu_vram_mb() -> int:
+ if is_nvidia_system():
+ if not is_py3nvml_available():
+ raise ValueError(
+ "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. "
+ "Please install it through `pip install py3nvml`."
+ )
+ import py3nvml.py3nvml as nvml
+
+ nvml.nvmlInit()
+ device_count = nvml.nvmlDeviceGetCount()
+ vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)]
+ nvml.nvmlShutdown()
+ elif is_rocm_system():
+ if not is_pyrsmi_available():
+ raise ValueError(
+ "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. "
+ "Please install it following the instructions at https://github.com/RadeonOpenCompute/pyrsmi."
+ )
+
+ from pyrsmi import rocml
+
+ rocml.smi_initialize()
+ device_count = rocml.smi_get_device_count()
+ vrams = [rocml.smi_get_device_memory_total(index) for index in range(device_count)]
+ rocml.smi_shutdown()
+ else:
+ vrams = []
+
+ return sum(vrams)
+
+
+def get_cuda_device_ids() -> str:
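+ # returns a comma-separated string of device ids, honoring CUDA_VISIBLE_DEVICES when it is set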
+ if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
+ device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
+ else:
+ if is_nvidia_system():
+ if not is_py3nvml_available():
+ raise ValueError(
+ "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. "
+ "Please install it through `pip install py3nvml`."
+ )
+ import py3nvml.py3nvml as nvml
+
+ nvml.nvmlInit()
+ device_ids = list(range(nvml.nvmlDeviceGetCount()))
+ nvml.nvmlShutdown()
+ elif is_rocm_system():
+ if not is_pyrsmi_available():
+ raise ValueError(
+ "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. "
+ "Please install it following the instructions at https://github.com/RadeonOpenCompute/pyrsmi."
+ )
+
+ from pyrsmi import rocml
+
+ rocml.smi_initialize()
+ device_ids = list(range(rocml.smi_get_device_count()))
+ rocml.smi_shutdown()
+ else:
+ raise ValueError("No NVIDIA or ROCm GPUs found.")
+
+ return ",".join(str(i) for i in device_ids)
+
+
+def get_git_revision_hash(package_name: str) -> Optional[str]:
"""
Returns the git commit SHA of a package installed from a git repository.
"""
- if path is None:
- try:
- path = importlib.util.find_spec(package_name).origin
- except Exception:
- return None
+ try:
+ path = importlib.util.find_spec(package_name).origin
+ except Exception:
+ return None
try:
git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip()
except Exception:
- git_hash = None
+ return None
return git_hash
diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py
index 1396c131..c9b6d733 100644
--- a/optimum_benchmark/experiment.py
+++ b/optimum_benchmark/experiment.py
@@ -1,11 +1,13 @@
import os
import platform
from logging import getLogger
+from tempfile import TemporaryDirectory
from dataclasses import dataclass, field
from typing import Any, Dict, Type, Optional, TYPE_CHECKING
from hydra.utils import get_class
+from .benchmarks.report import BenchmarkReport
from .benchmarks.config import BenchmarkConfig
from .launchers.config import LauncherConfig
from .backends.config import BackendConfig
@@ -19,6 +21,9 @@
)
from .env_utils import (
get_git_revision_hash,
+ is_nvidia_system,
+ is_rocm_system,
+ get_gpu_vram_mb,
get_cpu_ram_mb,
get_gpus,
get_cpu,
@@ -57,29 +62,39 @@ class ExperimentConfig:
environment: Dict = field(
default_factory=lambda: {
"cpu": get_cpu(),
- "gpus": get_gpus(),
"cpu_count": os.cpu_count(),
- "system": platform.system(),
"cpu_ram_mb": get_cpu_ram_mb(),
+ "system": platform.system(),
"python_version": platform.python_version(),
# libraries
"transformers_version": transformers_version(),
- "transformers_commit": get_git_revision_hash("transformers", os.environ.get("TRANSFORMERS_PATH", None)),
+ "transformers_commit": get_git_revision_hash("transformers"),
"accelerate_version": accelerate_version(),
- "accelerate_commit": get_git_revision_hash("accelerate", os.environ.get("ACCELERATE_PATH", None)),
- "optimum_version": optimum_version(),
- "optimum_commit": get_git_revision_hash("optimum", os.environ.get("OPTIMUM_PATH", None)),
+ "accelerate_commit": get_git_revision_hash("accelerate"),
"diffusers_version": diffusers_version(),
- "diffusers_commit": get_git_revision_hash("diffusers", os.environ.get("DIFFUSERS_PATH", None)),
+ "diffusers_commit": get_git_revision_hash("diffusers"),
+ "optimum_version": optimum_version(),
+ "optimum_commit": get_git_revision_hash("optimum"),
"timm_version": timm_version(),
- "timm_commit": get_git_revision_hash("timm", os.environ.get("TIMM_PATH", None)),
+ "timm_commit": get_git_revision_hash("timm"),
"peft_version": peft_version(),
- "peft_commit": get_git_revision_hash("peft", os.environ.get("PEFT_PATH", None)),
+ "peft_commit": get_git_revision_hash("peft"),
}
)
+ def __post_init__(self):
+ # adding GPU information to the environment
+ if is_nvidia_system() or is_rocm_system():
+ available_gpus = get_gpus()
+ if len(available_gpus) > 0:
+ self.environment["gpu"] = available_gpus[0]
+ self.environment["gpu_count"] = len(available_gpus)
+ self.environment["gpu_vram_mb"] = get_gpu_vram_mb()
+ else:
+ LOGGER.warning("Detected NVIDIA or ROCm system, but no GPUs found.")
-def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dict[str, Any]:
+
+def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> BenchmarkReport:
try:
# Allocate requested backend
backend_factory: Type[Backend] = get_class(backend_config._target_)
@@ -107,7 +122,7 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dic
raise e
try:
- report = benchmark.report()
+ report = benchmark.get_report()
except Exception as e:
LOGGER.error("Error during report generation: %s", e)
raise e
@@ -115,7 +130,13 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dic
return report
-def launch(experiment_config: ExperimentConfig) -> Dict[str, Any]:
+def launch(experiment_config: ExperimentConfig) -> BenchmarkReport:
+ if os.environ.get("BENCHMARK_CLI", "0") == "0":
+ LOGGER.info("Launching experiment in a temporary directory.")
+ temp_dir = TemporaryDirectory()
+ original_dir = os.getcwd()
+ os.chdir(temp_dir.name)
+
launcher_config: LauncherConfig = experiment_config.launcher
try:
@@ -135,4 +156,8 @@ def launch(experiment_config: ExperimentConfig) -> Dict[str, Any]:
LOGGER.error(f"Error during experiment launching: {e}")
raise e
+ if os.environ.get("BENCHMARK_CLI", "0") == "0":
+ os.chdir(original_dir)
+ temp_dir.cleanup()
+
return output
diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py
index f0ba921f..4bb9f188 100644
--- a/optimum_benchmark/generators/dataset_generator.py
+++ b/optimum_benchmark/generators/dataset_generator.py
@@ -15,7 +15,7 @@ def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict
dataset_shapes["batch_size"] = dataset_shapes["dataset_size"]
if task in TASKS_TO_GENERATORS:
- LOGGER.info(f"Using {task} task generator")
+ LOGGER.info(f"\t+ Using {task} task generator")
shapes = {**dataset_shapes, **model_shapes}
self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=True)
else:
@@ -26,7 +26,8 @@ def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict
"please submit a PR or a feature request to optimum-benchmark. \n"
)
- def generate(self) -> Dataset:
- task_dataset = self.task_generator.generate()
+ def __call__(self) -> Dataset:
+ task_dataset = self.task_generator()
task_dataset = Dataset.from_dict(task_dataset)
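+ # format the dataset so that its columns are returned as torch tensors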
+ task_dataset.set_format(type="torch", columns=list(task_dataset.features.keys()))
return task_dataset
diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py
index bc14d6c8..13f1d9aa 100644
--- a/optimum_benchmark/generators/input_generator.py
+++ b/optimum_benchmark/generators/input_generator.py
@@ -22,8 +22,8 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s
"please submit a PR or a feature request to optimum-benchmark. "
)
- def generate(self, mode: str) -> Dict[str, Any]:
- task_input = self.task_generator.generate()
+ def __call__(self, mode: str) -> Dict[str, Any]:
+ task_input = self.task_generator()
if mode == "generate":
if "pixel_values" in task_input:
@@ -46,5 +46,9 @@ def generate(self, mode: str) -> Dict[str, Any]:
task_input = {
"inputs": task_input["input_ids"],
}
+ elif mode == "call":
+ task_input = {
+ "prompt": task_input["prompt"],
+ }
return task_input
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index 9aee6ee9..1f3e9b23 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -34,8 +34,8 @@ def generate_random_strings(shape: Tuple[int]):
for _ in range(shape[0])
]
- def generate(self):
- raise NotImplementedError("Generator must implement generate method")
+ def __call__(self):
+ raise NotImplementedError("Generator must implement __call__ method")
class TextGenerator(TaskGenerator):
@@ -131,7 +131,7 @@ def labels(self):
shape=(self.shapes["batch_size"],),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_ids"] = self.input_ids()
@@ -160,7 +160,7 @@ def labels(self):
),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_ids"] = self.input_ids()
@@ -179,7 +179,7 @@ def generate(self):
class TextGenerationGenerator(TextGenerator):
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_ids"] = self.input_ids()
dummy["attention_mask"] = self.attention_mask()
@@ -211,7 +211,7 @@ def end_positions(self):
shape=(self.shapes["batch_size"],),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_ids"] = self.input_ids()
@@ -226,7 +226,7 @@ def generate(self):
class MaskedLanguageModelingGenerator(TextGenerator):
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_ids"] = self.input_ids()
@@ -252,7 +252,7 @@ def labels(self):
shape=(self.shapes["batch_size"],),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_ids"] = (
@@ -288,7 +288,7 @@ def labels(self):
shape=(self.shapes["batch_size"],),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["pixel_values"] = self.pixel_values()
@@ -316,7 +316,7 @@ def labels(self):
for _ in range(self.shapes["batch_size"])
]
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["pixel_values"] = self.pixel_values()
@@ -338,7 +338,7 @@ def labels(self):
),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["pixel_values"] = self.pixel_values()
@@ -356,7 +356,7 @@ def labels(self):
shape=(self.shapes["batch_size"],),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_values"] = self.input_values()
@@ -377,7 +377,7 @@ def labels(self):
),
)
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["input_values"] = self.input_values()
@@ -391,7 +391,7 @@ class PromptGenerator(TaskGenerator):
def prompt(self):
return self.generate_random_strings(shape=(self.shapes["batch_size"], 10))
- def generate(self):
+ def __call__(self):
dummy = {}
dummy["prompt"] = self.prompt()
@@ -399,7 +399,7 @@ def generate(self):
class FeatureExtractionGenerator(TextGenerator, ImageGenerator):
- def generate(self):
+ def __call__(self):
dummy = {}
if self.shapes["num_channels"] is not None and self.shapes["height"] is not None:
diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py
index 1c4cc7e8..f19fbda3 100644
--- a/optimum_benchmark/import_utils.py
+++ b/optimum_benchmark/import_utils.py
@@ -1,6 +1,7 @@
import importlib.metadata
import importlib.util
+
_transformers_available = importlib.util.find_spec("transformers") is not None
_accelerate_available = importlib.util.find_spec("accelerate") is not None
_diffusers_available = importlib.util.find_spec("diffusers") is not None
@@ -19,23 +20,31 @@
_amdsmi_available = importlib.util.find_spec("amdsmi") is not None
_tensorflow_available = importlib.util.find_spec("tensorflow") is not None
_timm_available = importlib.util.find_spec("timm") is not None
-_is_diffusers_available = importlib.util.find_spec("diffusers") is not None
-_is_accelerate_available = importlib.util.find_spec("accelerate") is not None
-_is_torch_ort_available = importlib.util.find_spec("torch_ort") is not None
-_is_deepspeed_available = importlib.util.find_spec("deepspeed") is not None
-_is_tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None
+_diffusers_available = importlib.util.find_spec("diffusers") is not None
+_torch_ort_available = importlib.util.find_spec("torch_ort") is not None
+_deepspeed_available = importlib.util.find_spec("deepspeed") is not None
+_tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None
+_psutil_available = importlib.util.find_spec("psutil") is not None
+
+
+def is_psutil_available():
+ return _psutil_available
+
+
+def is_transformers_available():
+ return _transformers_available
def is_tensorrt_llm_available():
- return _is_tensorrt_llm_available
+ return _tensorrt_llm_available
def is_deepspeed_available():
- return _is_deepspeed_available
+ return _deepspeed_available
def is_torch_ort_available():
- return _is_torch_ort_available
+ return _torch_ort_available
def is_accelerate_available():
@@ -43,7 +52,7 @@ def is_accelerate_available():
def is_diffusers_available():
- return _is_diffusers_available
+ return _diffusers_available
def is_timm_available():
@@ -118,7 +127,7 @@ def onnxruntime_version():
try:
return "ort-training:" + importlib.metadata.version("onnxruntime-training")
except importlib.metadata.PackageNotFoundError:
- return "ort:unknown"
+ return None
def openvino_version():
@@ -152,7 +161,7 @@ def diffusers_version():
def torch_ort_version():
- if _is_torch_ort_available:
+ if _torch_ort_available:
return importlib.metadata.version("torch_ort")
@@ -167,5 +176,5 @@ def peft_version():
def tesnorrt_llm_version():
- if _is_tensorrt_llm_available:
+ if _tensorrt_llm_available:
return importlib.metadata.version("tensorrt_llm")
diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py
index f48fc919..52006bcc 100644
--- a/optimum_benchmark/launchers/isolation_utils.py
+++ b/optimum_benchmark/launchers/isolation_utils.py
@@ -6,15 +6,15 @@
from multiprocessing import Process
from contextlib import contextmanager
-import psutil
-
from ..logging_utils import setup_logging
from ..env_utils import is_nvidia_system, is_rocm_system
-from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version
+from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version, is_psutil_available
+if is_psutil_available():
+ import psutil
if is_py3nvml_available():
- import py3nvml.py3nvml as nvml # type: ignore
+ import py3nvml.py3nvml as nvml
if is_amdsmi_available():
import amdsmi # type: ignore
@@ -172,7 +172,7 @@ def assert_system_devices_isolation(benchmark_pid: int) -> None:
@contextmanager
-def device_isolation(benchmark_pid: int, enabled: bool) -> None:
+def device_isolation(benchmark_pid: int, enabled: bool):
if not enabled:
yield
return
diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py
index 77b6b4ef..2d87ff03 100644
--- a/optimum_benchmark/launchers/torchrun/config.py
+++ b/optimum_benchmark/launchers/torchrun/config.py
@@ -1,4 +1,3 @@
-import os
import uuid
from logging import getLogger
from typing import Any, Dict, Optional
@@ -20,7 +19,7 @@ class TorchrunConfig(LauncherConfig):
# Maximum amount of nodes that the user function will be launched on.
max_nodes: int = 1
# On each node the elastic agent will launch this amount of workers that will execute user defined function.
- nproc_per_node: Optional[int] = None
+ nproc_per_node: int = 2
# User defined role of the worker (defaults to "trainer").
role: str = "benchmark_worker"
# The interval in seconds that is used by the elastic_agent as a period of monitoring workers.
@@ -61,26 +60,3 @@ def __post_init__(self) -> None:
if self.min_nodes != 1:
LOGGER.info("For multi-node benchmarks, run the benchmark on each node separately.")
LOGGER.info(f"Waiting for the other nodes to be avaialable at {self.rdzv_endpoint}...")
-
- if self.nproc_per_node is None:
- if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
- LOGGER.warning(
- "`nproc_per_node` is not set but `CUDA_VISIBLE_DEVICES` is set. "
- "Setting `nproc_per_node` to the number of visible devices."
- )
- self.nproc_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
- else:
- LOGGER.warning(
- "`nproc_per_node` is not set and `CUDA_VISIBLE_DEVICES` is not set. "
- "Setting `nproc_per_node` and `CUDA_VISIBLE_DEVICES` to 1."
- )
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
- self.nproc_per_node = 1
- else:
- if len(os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")) != self.nproc_per_node:
- LOGGER.warning(
- f"`nproc_per_node` is set to {self.nproc_per_node} but `CUDA_VISIBLE_DEVICES` "
- f"is set to {os.environ.get('CUDA_VISIBLE_DEVICES', '')}. "
- "Setting `CUDA_VISIBLE_DEVICES` to match `nproc_per_node`."
- )
- os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in range(self.nproc_per_node)])
diff --git a/optimum_benchmark/launchers/torchrun/launcher.py b/optimum_benchmark/launchers/torchrun/launcher.py
index b50f9f55..f327e85c 100644
--- a/optimum_benchmark/launchers/torchrun/launcher.py
+++ b/optimum_benchmark/launchers/torchrun/launcher.py
@@ -4,17 +4,19 @@
from multiprocessing import Queue
from typing import Callable, Dict, Any
-import torch.distributed
-from torch.distributed import FileStore
-from torch.distributed.elastic.multiprocessing import Std
-from torch.distributed.elastic.multiprocessing.errors import record
-from torch.distributed.launcher.api import LaunchConfig, launch_agent
-
from ..base import Launcher
from .config import TorchrunConfig
from ...logging_utils import setup_logging
from ..isolation_utils import device_isolation
-from ...benchmarks.utils import consolidate_reports
+from ...benchmarks.report import BenchmarkReport
+from ...import_utils import is_torch_distributed_available
+
+if is_torch_distributed_available():
+ import torch.distributed
+ from torch.distributed import FileStore
+ from torch.distributed.elastic.multiprocessing import Std
+ from torch.distributed.elastic.multiprocessing.errors import record
+ from torch.distributed.launcher.api import LaunchConfig, launch_agent
LOGGER = getLogger("torchrun")
@@ -49,8 +51,8 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]:
local_addr=self.config.local_addr,
log_dir=self.config.log_dir,
)
- current_log_level = getLogger().getEffectiveLevel()
queue = Queue()
+ current_log_level = getLogger().getEffectiveLevel()
with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()):
LOGGER.info(f"\t+ Launching torchrun agent with {self.config.nproc_per_node} workers processes")
@@ -61,10 +63,16 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]:
)
outputs = []
+
while not queue.empty():
outputs.append(queue.get())
- report = consolidate_reports(outputs)
+ if len(outputs) == 1:
+ report: BenchmarkReport = outputs[0]
+ else:
+ LOGGER.info(f"\t+ Merging benchmark reports from {len(outputs)} workers")
+ report: BenchmarkReport = sum(outputs[1:], outputs[0])
+ report.log_all()
return report
@@ -85,12 +93,12 @@ def entrypoint(fn, q, log_level, *args):
torch.cuda.set_device(rank)
if rank == 0:
- setup_logging(log_level)
+ setup_logging(level=log_level, prefix="RANK-0")
else:
- setup_logging("ERROR")
+ setup_logging(level="ERROR")
# TODO: use a tcp store instead
- store = FileStore("torchrun_filestore")
+ store = FileStore("torchrun.filestore")
store.set(f"rank_{rank}", str(os.getpid()))
output = fn(*args)
diff --git a/optimum_benchmark/logging_utils.py b/optimum_benchmark/logging_utils.py
index 398c7bf4..72f76889 100644
--- a/optimum_benchmark/logging_utils.py
+++ b/optimum_benchmark/logging_utils.py
@@ -1,11 +1,13 @@
import os
import logging
import logging.config
+from logging import Logger
+from typing import Optional
from subprocess import Popen, PIPE, STDOUT
from omegaconf import OmegaConf
-JOB_LOGGING = {
+API_JOB_LOGGING = {
"version": 1,
"formatters": {
"simple": {"format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"},
@@ -27,32 +29,34 @@
"stream": "ext://sys.stdout",
"class": "logging.StreamHandler",
},
- "file": {
- "filename": "api.log",
- "formatter": "simple",
- "class": "logging.FileHandler",
- },
},
- "root": {"level": "INFO", "handlers": ["console", "file"]},
+ "root": {"level": "INFO", "handlers": ["console"]},
"disable_existing_loggers": False,
}
-def setup_logging(level: str = "INFO"):
- if os.path.exists(".hydra/hydra.yaml"):
+def setup_logging(level: str = "INFO", prefix: Optional[str] = None):
+ if os.environ.get("BENCHMARK_CLI", "0") == "1":
hydra_config = OmegaConf.load(".hydra/hydra.yaml")
job_logging = OmegaConf.to_container(
hydra_config.hydra.job_logging,
resolve=True,
)
else:
- job_logging = JOB_LOGGING.copy()
+ job_logging = API_JOB_LOGGING.copy()
job_logging["root"]["level"] = level
+
+ if prefix is not None:
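+ # prepend the prefix (e.g. a worker rank) to every log record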
+ job_logging["formatters"]["simple"]["format"] = f"[{prefix}]" + job_logging["formatters"]["simple"]["format"]
+ job_logging["formatters"]["colorlog"]["format"] = (
+ f"[{prefix}]" + job_logging["formatters"]["colorlog"]["format"]
+ )
+
logging.config.dictConfig(job_logging)
-def run_process_and_log_stream_output(logger, args):
+def run_subprocess_and_log_stream_output(logger: Logger, args):
popen = Popen(args, stdout=PIPE, stderr=STDOUT)
for line in iter(popen.stdout.readline, b""):
if line is not None:
diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py
index b3038812..e35baae3 100644
--- a/optimum_benchmark/task_utils.py
+++ b/optimum_benchmark/task_utils.py
@@ -114,7 +114,7 @@
),
}
-DIFFUSION_TASKS = [
+IMAGE_DIFFUSION_TASKS = [
"stable-diffusion",
"stable-diffusion-xl",
]
diff --git a/optimum_benchmark/trackers/energy.py b/optimum_benchmark/trackers/energy.py
index 815abaa9..7d3bb7ad 100644
--- a/optimum_benchmark/trackers/energy.py
+++ b/optimum_benchmark/trackers/energy.py
@@ -1,53 +1,44 @@
import os
from logging import getLogger
-from typing import List, Optional
from contextlib import contextmanager
+from typing import Optional, Dict
-from ..env_utils import is_nvidia_system, is_rocm_system
-from ..import_utils import (
- is_py3nvml_available,
- is_pyrsmi_available,
- is_codecarbon_available,
-)
+from ..env_utils import get_cuda_device_ids
+from ..import_utils import is_codecarbon_available
if is_codecarbon_available():
from codecarbon import EmissionsTracker, OfflineEmissionsTracker
-if is_nvidia_system():
- if is_py3nvml_available():
- import py3nvml.py3nvml as nvml
- else:
- raise ValueError(
- "The library py3nvml is required to run energy benchmark on NVIDIA GPUs, but is not installed. "
- "Please install it through `pip install py3nvml`."
- )
-
-if is_rocm_system():
- if is_pyrsmi_available():
- # TODO: use amdsmi instead of pyrsmi
- from pyrsmi import rocml
- else:
- raise ValueError(
- "The library pyrsmi is required to run energy benchmark on ROCm-powered GPUs, but is not installed. "
- "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git."
- )
-
LOGGER = getLogger("energy")
class EnergyTracker:
- def __init__(self, device_ids: Optional[List[int]] = None):
- self.device_ids = device_ids
+ def __init__(self, device: str, device_ids: Optional[str] = None):
+ self.device = device
+ self.cpu_energy: float = 0
+ self.gpu_energy: float = 0
+ self.ram_energy: float = 0
self.total_energy: float = 0
- self.total_emissions: float = 0
- if self.device_ids is None:
- self.device_ids = infer_cuda_device_ids()
+ if self.device == "cuda":
+ if device_ids is None:
+ LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. Using all available CUDA devices.")
+ self.device_ids = list(map(int, get_cuda_device_ids().split(",")))
+ else:
+ self.device_ids = list(map(int, device_ids.split(",")))
+ else:
+ self.device_ids = []
+
+ def reset(self):
+ self.cpu_energy = 0
+ self.gpu_energy = 0
+ self.ram_energy = 0
+ self.total_energy = 0
@contextmanager
- def track(self, interval=1, file_prefix=""):
+ def track(self, interval=1, file_prefix="method"):
if not is_codecarbon_available():
raise ValueError(
"The library codecarbon is required to run energy benchmark, but is not installed. "
@@ -55,6 +46,7 @@ def track(self, interval=1, file_prefix=""):
)
try:
+ # TODO: use pynvml and amdsmi directly to get the GPU power consumption
self.emission_tracker = EmissionsTracker(
log_level="error", # "info" for more verbosity
tracking_mode="process", # "machine" for machine-level tracking
@@ -63,11 +55,11 @@ def track(self, interval=1, file_prefix=""):
output_file=f"{file_prefix}_codecarbon.csv",
)
except Exception as e:
- LOGGER.warning(f"Failed to initialize Online Emissions Tracker: {e}")
- LOGGER.warning("Falling back to Offline Emissions Tracker")
+ LOGGER.warning("\t+ Failed to initialize Online Emissions Tracker:, %s", e)
+ LOGGER.warning("\t+ Falling back to Offline Emissions Tracker")
if os.environ.get("COUNTRY_ISO_CODE", None) is None:
LOGGER.warning(
- "Offline Emissions Tracker requires COUNTRY_ISO_CODE to be set. "
+ "\t+ Offline Emissions Tracker requires COUNTRY_ISO_CODE to be set. "
"We will set it to FRA but the carbon footprint will be inaccurate."
)
@@ -83,32 +75,19 @@ def track(self, interval=1, file_prefix=""):
self.emission_tracker.start()
yield
self.emission_tracker.stop()
- self.total_energy = self.emission_tracker._total_energy.kWh
- self.total_emissions = self.emission_tracker.final_emissions
-
- def get_total_energy(self) -> float:
- return self.total_energy
- def get_total_emissions(self) -> float:
- return self.total_emissions
+ self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh
+ self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh
+ self.ram_energy = self.emission_tracker._total_ram_energy.kWh
+ self.total_energy = self.emission_tracker._total_energy.kWh
def get_elapsed_time(self) -> float:
return self.emission_tracker._last_measured_time - self.emission_tracker._start_time
-
-def infer_cuda_device_ids() -> List[int]:
- if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
- cuda_device_ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
- else:
- if is_nvidia_system() and is_py3nvml_available():
- nvml.nvmlInit()
- cuda_device_ids = list(range(nvml.nvmlDeviceGetCount()))
- nvml.nvmlShutdown()
- elif is_rocm_system() and is_pyrsmi_available():
- rocml.smi_initialize()
- cuda_device_ids = list(range(rocml.smi_get_device_count()))
- rocml.smi_shutdown()
- else:
- raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA energy tracking.")
-
- return cuda_device_ids
+ def get_energies_dict(self) -> Dict[str, float]:
+ return {
+ "cpu_energy(kHh)": self.cpu_energy,
+ "gpu_energy(kHh)": self.gpu_energy,
+ "ram_energy(kHh)": self.ram_energy,
+ "total(kHh)": self.total_energy,
+ }
diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py
index 2823919c..369c2b70 100644
--- a/optimum_benchmark/trackers/latency.py
+++ b/optimum_benchmark/trackers/latency.py
@@ -1,11 +1,12 @@
-import time
from contextlib import contextmanager
from logging import getLogger
from typing import List
+import time
-import torch
+from ..import_utils import is_torch_distributed_available, is_torch_available
-from ..import_utils import is_torch_distributed_available
+if is_torch_available():
+ import torch
if is_torch_distributed_available():
import torch.distributed
@@ -20,61 +21,71 @@ def __init__(self, device: str, backend: str):
self.latencies: List[float] = []
+ # this is logged in __init__ rather than in track() because the tracker is reused across calls
if is_torch_distributed_available() and torch.distributed.is_initialized():
- LOGGER.info("Tracking Pytorch Distributed latency")
+ LOGGER.info("\t+ Tracking Pytorch Distributed latency")
elif self.device == "cuda" and self.backend == "pytorch":
- LOGGER.info("Tracking Pytorch CUDA latency")
+ LOGGER.info("\t+ Tracking Pytorch CUDA latency")
else:
- LOGGER.info("Tracking CPU latency")
+ LOGGER.info("\t+ Tracking CPU latency")
+
+ def reset(self):
+ self.latencies = []
@contextmanager
def track(self):
if is_torch_distributed_available() and torch.distributed.is_initialized():
- yield from self._pytorch_distributed_tracker()
+ yield from self._pytorch_distributed_latency()
elif self.backend == "pytorch" and self.device == "cuda":
- yield from self._pytorch_cuda_tracker()
+ yield from self._pytorch_cuda_latency()
else:
- yield from self._cpu_tracker()
+ yield from self._cpu_latency()
- def _pytorch_distributed_tracker(self):
+ def _pytorch_distributed_latency(self):
torch.distributed.barrier() # synchronize before workload
start = time.perf_counter_ns()
yield
torch.distributed.barrier() # synchronize after workload
end = time.perf_counter_ns()
- latency_ns = end - start
- latency = latency_ns / 1e9
+ latency = (end - start) / 1e9
self.latencies.append(latency)
- LOGGER.debug(f"Tracked Pytorch Distributed latency: {latency:.2e}s")
+ LOGGER.debug(f"\t+ Tracked Pytorch distributed latency: {latency:.2e}s")
- def _pytorch_cuda_tracker(self):
+ def _pytorch_cuda_latency(self):
+ # Note: torch.cuda.Event is not used here.
+ # CUDA events are only needed to measure kernel latency without synchronizing,
+ # i.e. to time an operation without stalling the GPU.
+ # That matters for example with CUDA graphs, where synchronizing would defeat the optimization,
+ # but since we synchronize anyway, a host-side timer is sufficient.
+ # Details: https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/
torch.cuda.synchronize() # synchronize before workload
start = time.perf_counter_ns()
yield
torch.cuda.synchronize() # synchronize after workload
end = time.perf_counter_ns()
- latency_ns = end - start
- latency = latency_ns / 1e9
+ latency = (end - start) / 1e9
self.latencies.append(latency)
- LOGGER.debug(f"Tracked Pytorch CUDA latency: {latency:.2e}s")
+ LOGGER.debug(f"\t+ Tracked Pytorch CUDA latency: {latency:.2e}s")
- def _cpu_tracker(self):
+ def _cpu_latency(self):
start = time.perf_counter_ns()
yield
end = time.perf_counter_ns()
- latency_ns = end - start
- latency = latency_ns / 1e9
+ latency = (end - start) / 1e9
self.latencies.append(latency)
- LOGGER.debug(f"Tracked CPU latency: {latency:.2e}s")
+ LOGGER.debug(f"\t+ Tracked CPU latency: {latency:.2e}s")
- def get_latencies(self):
- return self.latencies
+ def get_total_count(self):
+ return len(self.latencies)
def get_total_latency(self):
return sum(self.latencies)
+
+ def get_latencies_list(self) -> List[float]:
+ return self.latencies
diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py
index 06b0683a..816f1d5a 100644
--- a/optimum_benchmark/trackers/memory.py
+++ b/optimum_benchmark/trackers/memory.py
@@ -1,18 +1,12 @@
import os
from logging import getLogger
-from typing import List, Optional
from contextlib import contextmanager
+from typing import List, Optional, Dict
from multiprocessing import Pipe, Process
from multiprocessing.connection import Connection
-import psutil
-import torch
-
-from ..env_utils import bytes_to_mega_bytes, is_nvidia_system, is_rocm_system
-from ..import_utils import (
- is_py3nvml_available,
- is_pyrsmi_available,
-)
+from ..env_utils import bytes_to_mega_bytes, get_cuda_device_ids, is_nvidia_system, is_rocm_system
+from ..import_utils import is_py3nvml_available, is_pyrsmi_available, is_torch_available
if is_nvidia_system():
if is_py3nvml_available():
@@ -25,33 +19,65 @@
if is_rocm_system():
if is_pyrsmi_available():
- # TODO: use amdsmi instead of pyrsmi
from pyrsmi import rocml
else:
raise ValueError(
- "The library pyrsmi is required to run memory benchmark on ROCm-powered GPUs, but is not installed. "
+ "The library pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. "
"Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git."
)
+if is_torch_available():
+ import torch
+
+import psutil
+
LOGGER = getLogger("memory")
class MemoryTracker:
- def __init__(self, device: str, backend: str, device_ids: Optional[List[int]] = None):
+ """
+ Memory tracker to measure max memory usage of CPU or GPU devices.
+
+ Args:
+ device (str): Device to track memory usage. Can be either "cuda" or any other device.
+ backend (str): Backend to track memory usage. Can be either "pytorch" or any other backend.
+ device_ids (List[int], optional): List of device IDs to track memory usage. Defaults to None.
+ """
+
+ def __init__(self, device: str, backend: str, device_ids: Optional[str] = None):
self.device = device
self.backend = backend
- self.device_ids = device_ids
- self.max_memory_used: int = 0
- self.max_memory_reserved: int = 0
- self.max_memory_allocated: int = 0
+ self.max_memory_used = 0
+ self.max_memory_reserved = 0
+ self.max_memory_allocated = 0
if self.device == "cuda":
- if self.device_ids is None:
- self.device_ids = infer_cuda_device_ids()
+ if device_ids is None:
+ LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. Using all available CUDA devices.")
+ self.device_ids = list(map(int, get_cuda_device_ids().split(",")))
+ else:
+ self.device_ids = list(map(int, device_ids.split(",")))
+
+ LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}")
+
+ if self.backend == "pytorch":
+ self.pytorch_device_ids = list(range(torch.cuda.device_count()))
+ LOGGER.info(f"\t+ Tracking Pytorch memory of Pytorch CUDA devices: {self.pytorch_device_ids}")
+
+ if len(self.device_ids) != len(self.pytorch_device_ids):
+ raise ValueError(
+ "The number of CUDA devices and Pytorch CUDA devices must be the same. "
+ f"Got {len(self.device_ids)} and {len(self.pytorch_device_ids)} respectively."
+ )
+ else:
+ LOGGER.info("\t+ Tracking RAM memory")
- LOGGER.info(f"Tracking CUDA devices: {self.device_ids}")
+ def reset(self):
+ self.max_memory_used = 0
+ self.max_memory_reserved = 0
+ self.max_memory_allocated = 0
@contextmanager
def track(self):
@@ -62,109 +88,122 @@ def track(self):
else:
yield from self._cpu_memory()
- def get_max_memory_used(self):
- return bytes_to_mega_bytes(self.max_memory_used)
-
- def get_max_memory_reserved(self):
- return bytes_to_mega_bytes(self.max_memory_reserved)
-
- def get_max_memory_allocated(self):
- return bytes_to_mega_bytes(self.max_memory_allocated)
-
def _cuda_pytorch_memory(self):
torch.cuda.empty_cache()
-
- for device_index in range(torch.cuda.device_count()):
+ for pytorch_device_index in self.pytorch_device_ids:
try:
- torch.cuda.reset_peak_memory_stats(device=device_index)
+ torch.cuda.reset_peak_memory_stats(device=pytorch_device_index)
except Exception as e:
- LOGGER.warning(f"Could not reset peak memory stats for device {device_index}: {e}")
+ LOGGER.warning(f"\t+ Could not reset max memory stats for device {pytorch_device_index}: {e}")
yield from self._cuda_memory()
- for device_index in range(torch.cuda.device_count()):
- self.max_memory_allocated += torch.cuda.max_memory_allocated(device=device_index)
- self.max_memory_reserved += torch.cuda.max_memory_reserved(device=device_index)
+ for pytorch_device_index in self.pytorch_device_ids:
+ self.max_memory_reserved += torch.cuda.max_memory_reserved(device=pytorch_device_index)
+ self.max_memory_allocated += torch.cuda.max_memory_allocated(device=pytorch_device_index)
- LOGGER.debug(f"Pytorch max memory allocated: {self.get_max_memory_allocated()} MB")
- LOGGER.debug(f"Pytorch max memory reserved: {self.get_max_memory_reserved()} MB")
+ LOGGER.debug(f"\t+ Pytorch max memory reserved: {self.get_max_memory_reserved_mb()} MB")
+ LOGGER.debug(f"\t+ Pytorch max memory allocated: {self.get_max_memory_allocated_mb()} MB")
- def _cuda_memory(self):
- if is_nvidia_system() and is_py3nvml_available():
- handles = []
- nvml.nvmlInit()
- for device_index in self.device_ids:
- handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
- handles.append(handle)
-
- yield
-
- for handle in handles:
- meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
- self.max_memory_used += meminfo.used
- nvml.nvmlShutdown()
- LOGGER.debug(f"PyNVML max memory used: {self.get_max_memory_used()} MB")
-
- elif is_rocm_system() and is_pyrsmi_available():
- rocml.smi_initialize()
+ def _cuda_memory(self, interval: float = 0.001):
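+ # VRAM usage is polled in a separate daemon process and the peak is collected once the workload is done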
+ child_connection, parent_connection = Pipe()
+ memory_process = Process(
+ target=monitor_gpu_max_vram_memory,
+ args=(self.device_ids, child_connection, interval),
+ daemon=True,
+ )
+ memory_process.start()
+ parent_connection.recv() # wait for memory process to be ready
- yield
+ yield
- for device_index in self.device_ids:
- meminfo_used = rocml.smi_get_device_memory_used(device_index)
- self.max_memory_used += meminfo_used
- rocml.smi_shutdown()
- LOGGER.debug(f"PyRSMI max memory used: {self.get_max_memory_used()} MB")
- else:
- raise ValueError("Only NVIDIA and AMD RoCm GPUs are supported for CUDA memory tracking.")
+ parent_connection.send(True)
+ self.max_memory_used = parent_connection.recv()
+ LOGGER.debug(f"\t+ Max memory (VRAM) used: {self.get_max_memory_used_mb()} MB")
- def _cpu_memory(self, interval: float = 0.0001):
+ def _cpu_memory(self, interval: float = 0.001):
child_connection, parent_connection = Pipe()
- # instantiate process
memory_process = Process(
- target=monitor_process_peak_memory,
+ target=monitor_cpu_max_ram_memory,
args=(os.getpid(), child_connection, interval),
daemon=True,
)
memory_process.start()
- parent_connection.recv()
+ parent_connection.recv() # wait for memory process to be ready
yield
- parent_connection.send(0)
+ parent_connection.send(True)
self.max_memory_used = parent_connection.recv()
- LOGGER.debug(f"Peak memory usage: {self.get_max_memory_used()} MB")
+ LOGGER.debug(f"\t+ Max memory (RAM) used: {self.get_max_memory_used_mb()} MB")
+ def get_max_memory_used_mb(self) -> int:
+ return bytes_to_mega_bytes(self.max_memory_used)
+
+ def get_max_memory_allocated_mb(self) -> int:
+ return bytes_to_mega_bytes(self.max_memory_allocated)
-def monitor_process_peak_memory(process_id: int, connection: Connection, interval: float):
+ def get_max_memory_reserved_mb(self) -> int:
+ return bytes_to_mega_bytes(self.max_memory_reserved)
+
+ def get_memories_dict(self) -> Dict[str, int]:
+ if self.device == "cuda" and self.backend == "pytorch":
+ return {
+ "max_vram_used(MB)": self.get_max_memory_used_mb(),
+ "max_memory_reserved(MB)": self.get_max_memory_reserved_mb(),
+ "max_memory_allocated(MB)": self.get_max_memory_allocated_mb(),
+ }
+ elif self.device == "cuda":
+ return {"max_vram_used(MB)": self.get_max_memory_used_mb()}
+ else:
+ return {"max_ram_used(MB)": self.get_max_memory_used_mb()}
+
+
+def monitor_cpu_max_ram_memory(process_id: int, connection: Connection, interval: float):
process = psutil.Process(process_id)
- peak_memory_usage = 0
+ max_memory_usage = 0
connection.send(0)
stop = False
while not stop:
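+ # older psutil releases expose get_memory_info() instead of memory_info()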
meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
current_memory_usage = getattr(process, meminfo_attr)()[0]
- peak_memory_usage = max(peak_memory_usage, current_memory_usage)
+ max_memory_usage = max(max_memory_usage, current_memory_usage)
stop = connection.poll(interval)
- connection.send(peak_memory_usage)
+ connection.send(max_memory_usage)
connection.close()
-def infer_cuda_device_ids() -> List[int]:
- if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
- cuda_device_ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
+def monitor_gpu_max_vram_memory(device_ids: List[int], connection: Connection, interval: float):
+ if is_nvidia_system() and is_py3nvml_available():
+ nvml.nvmlInit()
+ handles = [nvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids]
+ max_memory_usage = 0
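+ # signal the parent process that monitoring is ready to start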
+ connection.send(0)
+ stop = False
+
+ while not stop:
+ current_memory_usage = sum(nvml.nvmlDeviceGetMemoryInfo(handle).used for handle in handles)
+ max_memory_usage = max(max_memory_usage, current_memory_usage)
+ stop = connection.poll(interval)
+
+ connection.send(max_memory_usage)
+ nvml.nvmlShutdown()
+ connection.close()
+ elif is_rocm_system() and is_pyrsmi_available():
+ rocml.smi_initialize()
+ max_memory_usage = 0
+ connection.send(0)
+ stop = False
+
+ while not stop:
+ current_memory_usage = sum(rocml.smi_get_device_memory_used(device_id) for device_id in device_ids)
+ max_memory_usage = max(max_memory_usage, current_memory_usage)
+ stop = connection.poll(interval)
+
+ connection.send(max_memory_usage)
+ rocml.smi_shutdown()
+ connection.close()
else:
- if is_nvidia_system() and is_py3nvml_available():
- nvml.nvmlInit()
- cuda_device_ids = list(range(nvml.nvmlDeviceGetCount()))
- nvml.nvmlShutdown()
- elif is_rocm_system() and is_pyrsmi_available():
- rocml.smi_initialize()
- cuda_device_ids = list(range(rocml.smi_get_device_count()))
- rocml.smi_shutdown()
- else:
- raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.")
-
- return cuda_device_ids
+ raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.")
diff --git a/setup.py b/setup.py
index 7b618ed4..40504fd3 100644
--- a/setup.py
+++ b/setup.py
@@ -62,6 +62,7 @@
# docker-based backends
"text-generation-inference": ["docker"],
# specific settings
+ "codecarbon": ["codecarbon"],
"deepspeed": ["deepspeed"],
"diffusers": ["diffusers"],
"timm": ["timm"],
diff --git a/tests/configs/_base_.yaml b/tests/configs/_base_.yaml
index ff50aa22..d983b841 100644
--- a/tests/configs/_base_.yaml
+++ b/tests/configs/_base_.yaml
@@ -2,24 +2,27 @@ defaults:
- launcher: process # isolated process launcher
- experiment # inheriting experiment schema
- _self_ # for hydra 1.1 compatibility
- # - override hydra/hydra_logging: colorlog # colorful logging
- # - override hydra/job_logging: colorlog # colorful logging
+ - override hydra/hydra_logging: colorlog # colorful logging
+ - override hydra/job_logging: colorlog # colorful logging
- override hydra/launcher: joblib # for parallelization
experiment_name: ${device}_${benchmark.name}_${backend.name}_${task}
+# hydra/cli specific settings
hydra:
run:
- dir: tests/experiments/${experiment_name}
+ # where to store run results
+ dir: tests/runs/${experiment_name}
sweep:
- dir: tests/experiments/${experiment_name}
+ # where to store sweep results
+ dir: tests/sweeps/${experiment_name}
job:
+ # change working directory to the run directory
chdir: true
env_set:
- OVERRIDE_BENCHMARKS: 1 # to not skip if results already exist
- CUDA_VISIBLE_DEVICES: 0 # by default we only use one GPU
- CUDA_DEVICE_ORDER: PCI_BUS_ID # laking we use the right GPU
-
+ # set environment variable OVERRIDE_BENCHMARKS to 1
+ # to not skip benchmarks that have been run before
+ OVERRIDE_BENCHMARKS: 1
# we are using joblib launcher to parallelize testing since
# having correct benchmarks is not important while testing
diff --git a/tests/configs/_ddp_.yaml b/tests/configs/_ddp_.yaml
index a5a946fc..aab449e4 100644
--- a/tests/configs/_ddp_.yaml
+++ b/tests/configs/_ddp_.yaml
@@ -4,7 +4,5 @@ defaults:
launcher:
nproc_per_node: 2
-hydra:
- job:
- env_set:
- CUDA_VISIBLE_DEVICES: 0,1
+backend:
+ device_ids: 0,1
diff --git a/tests/configs/_dp_.yaml b/tests/configs/_dp_.yaml
index 4d6528f6..b7578bdf 100644
--- a/tests/configs/_dp_.yaml
+++ b/tests/configs/_dp_.yaml
@@ -1,4 +1,2 @@
-hydra:
- job:
- env_set:
- CUDA_VISIBLE_DEVICES: 0,1
+backend:
+ device_ids: 0,1
diff --git a/tests/configs/_ds_tp_.yaml b/tests/configs/_ds_tp_.yaml
index 76608e2e..6c154e4f 100644
--- a/tests/configs/_ds_tp_.yaml
+++ b/tests/configs/_ds_tp_.yaml
@@ -5,12 +5,8 @@ launcher:
nproc_per_node: 2
backend:
+ device_ids: 0,1
deepspeed_inference: true
deepspeed_inference_config:
tensor_parallel:
tp_size: 2
-
-hydra:
- job:
- env_set:
- CUDA_VISIBLE_DEVICES: 0,1
diff --git a/tests/configs/_lm_naive_mp_.yaml b/tests/configs/_lm_naive_mp_.yaml
index 20aef92a..2ac16fb8 100644
--- a/tests/configs/_lm_naive_mp_.yaml
+++ b/tests/configs/_lm_naive_mp_.yaml
@@ -1,10 +1,6 @@
backend:
- model: gpt2
+ device_ids: 0,1
+ device_map: auto
task: text-generation
library: transformers
- device_map: auto
-
-hydra:
- job:
- env_set:
- CUDA_VISIBLE_DEVICES: 0,1
+ model: gpt2
diff --git a/tests/test_api.py b/tests/test_api.py
index f388e629..0bf6ced9 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -6,7 +6,6 @@
from optimum_benchmark.trackers.memory import MemoryTracker
from optimum_benchmark.trackers.latency import LatencyTracker
-from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS
from optimum_benchmark.experiment import ExperimentConfig, launch
from optimum_benchmark.launchers.inline.config import InlineConfig
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
@@ -18,14 +17,13 @@
from optimum_benchmark.benchmarks.training.config import TrainingConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig
from optimum_benchmark.generators.dataset_generator import DatasetGenerator
+from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS
+from optimum_benchmark.backends.timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config
from optimum_benchmark.backends.transformers_utils import (
extract_transformers_shapes_from_artifacts,
get_transformers_pretrained_config,
)
-from optimum_benchmark.backends.timm_utils import (
- extract_timm_shapes_from_config,
- get_timm_pretrained_config,
-)
+
LOGGER = getLogger("test-api")
@@ -45,8 +43,15 @@
("transformers", "image-classification", "google/vit-base-patch16-224"),
("transformers", "semantic-segmentation", "google/vit-base-patch16-224"),
]
-BENCHMARK_CONFIGS = [InferenceConfig(memory=True), TrainingConfig()]
-LAUNCHER_CONFIGS = [InlineConfig(), ProcessConfig(), TorchrunConfig(nproc_per_node=2)]
+BENCHMARK_CONFIGS = [
+ InferenceConfig(latency=True, memory=True),
+ TrainingConfig(latency=True, memory=True),
+]
+LAUNCHER_CONFIGS = [
+ TorchrunConfig(nproc_per_node=2, device_isolation=False),
+ ProcessConfig(device_isolation=False),
+ InlineConfig(device_isolation=False),
+]
@pytest.mark.parametrize("device,backend", DEVICES_BACKENDS)
@@ -58,11 +63,11 @@ def test_api_latency_tracker(device, backend):
with tracker.track():
time.sleep(1)
- measured_latencies = tracker.get_latencies()
+ latencies_list = tracker.get_latencies_list()
- assert len(measured_latencies) == 2
- assert measured_latencies[0] > expected_latency * 0.9
- assert measured_latencies[0] < expected_latency * 1.1
+ assert len(latencies_list) == 2
+ assert latencies_list[0] > expected_latency * 0.9
+ assert latencies_list[0] < expected_latency * 1.1
@pytest.mark.parametrize("device,backend", DEVICES_BACKENDS)
@@ -74,18 +79,18 @@ def test_api_memory_tracker(device, backend):
# the process consumes memory that we can't control
if backend == "pytorch":
- initial_process_memory = tracker.get_max_memory_allocated()
+ initial_process_memory = tracker.get_max_memory_allocated_mb()
else:
- initial_process_memory = tracker.get_max_memory_used()
+ initial_process_memory = tracker.get_max_memory_used_mb()
with tracker.track():
array = torch.ones((10000, 10000), dtype=torch.float64, device=device)
expected_memory = array.nbytes / 1e6 # around 800 MB
if backend == "pytorch":
- final_process_memory = tracker.get_max_memory_allocated()
+ final_process_memory = tracker.get_max_memory_allocated_mb()
else:
- final_process_memory = tracker.get_max_memory_used()
+ final_process_memory = tracker.get_max_memory_used_mb()
measured_memory = final_process_memory - initial_process_memory
@@ -96,11 +101,11 @@ def test_api_memory_tracker(device, backend):
@pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS)
def test_api_input_generator(library, task, model):
if library == "transformers":
- model_config = get_transformers_pretrained_config(model=model)
- model_shapes = extract_transformers_shapes_from_artifacts(config=model_config)
+ model_config = get_transformers_pretrained_config(model)
+ model_shapes = extract_transformers_shapes_from_artifacts(model_config)
elif library == "timm":
model_config = get_timm_pretrained_config(model)
- model_shapes = extract_timm_shapes_from_config(config=model_config)
+ model_shapes = extract_timm_shapes_from_config(model_config)
else:
raise ValueError(f"Unknown library {library}")
@@ -110,9 +115,13 @@ def test_api_input_generator(library, task, model):
model_shapes=model_shapes,
)
- _ = generator.generate(mode="forward")
if task in TEXT_GENERATION_TASKS:
- _ = generator.generate(mode="generate")
+ _ = generator(mode="forward")
+ _ = generator(mode="generate")
+ elif task in IMAGE_DIFFUSION_TASKS:
+ _ = generator(mode="call")
+ else:
+ _ = generator(mode="forward")
@pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS)
@@ -132,28 +141,15 @@ def test_api_dataset_generator(library, task, model):
model_shapes=model_shapes,
)
- _ = generator.generate()
-
-
-@pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS)
-def test_api_launchers(launcher_config):
- backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cpu")
- benchmark_config = InferenceConfig(memory=True)
- experiment_config = ExperimentConfig(
- experiment_name="api-launch-experiment",
- benchmark=benchmark_config,
- launcher=launcher_config,
- backend=backend_config,
- )
- _ = launch(experiment_config)
+ _ = generator()
@pytest.mark.parametrize("benchmark_config", BENCHMARK_CONFIGS)
-def test_api_benchmarks(benchmark_config):
- backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cpu")
- launcher_config = ProcessConfig()
+@pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS)
+def test_api_launch_cpu(benchmark_config, launcher_config):
+ backend_config = PyTorchConfig(model="bert-base-uncased", no_weights=True, device="cpu")
experiment_config = ExperimentConfig(
- experiment_name="api-benchmark-experiment",
+ experiment_name="",
benchmark=benchmark_config,
launcher=launcher_config,
backend=backend_config,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b48283e1..afae3609 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -3,7 +3,7 @@
import pytest
-from optimum_benchmark.logging_utils import run_process_and_log_stream_output
+from optimum_benchmark.logging_utils import run_subprocess_and_log_stream_output
LOGGER = getLogger("test-cli")
@@ -26,7 +26,7 @@ def test_cli_configs(config_name):
"--multirun",
]
- popen = run_process_and_log_stream_output(LOGGER, args)
+ popen = run_subprocess_and_log_stream_output(LOGGER, args)
assert popen.returncode == 0, f"Failed to run {config_name}"
@@ -42,7 +42,7 @@ def test_cli_exit_code():
"backend.model=bert-base-uncased",
]
- popen_0 = run_process_and_log_stream_output(LOGGER, args_0)
+ popen_0 = run_subprocess_and_log_stream_output(LOGGER, args_0)
assert popen_0.returncode == 0
args_1 = [
@@ -56,5 +56,5 @@ def test_cli_exit_code():
"backend.model=bert-base-uncased",
]
- popen_1 = run_process_and_log_stream_output(LOGGER, args_1)
+ popen_1 = run_subprocess_and_log_stream_output(LOGGER, args_1)
assert popen_1.returncode == 1