diff --git a/.github/workflows/check_quality.yaml b/.github/workflows/check_quality.yaml index da468da3..36b99f99 100644 --- a/.github/workflows/check_quality.yaml +++ b/.github/workflows/check_quality.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install quality requirements run: | diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml index 25ba8d1a..752afab7 100644 --- a/.github/workflows/test_api_cpu.yaml +++ b/.github/workflows/test_api_cpu.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install dependencies run: | diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml index abc7aed4..df72ffb2 100644 --- a/.github/workflows/test_api_misc.yaml +++ b/.github/workflows/test_api_misc.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml similarity index 82% rename from .github/workflows/test_cpu_neural_compressor.yaml rename to .github/workflows/test_cli_cpu_neural_compressor.yaml index 7e3488d4..9150a90f 100644 --- a/.github/workflows/test_cpu_neural_compressor.yaml +++ b/.github/workflows/test_cli_cpu_neural_compressor.yaml @@ -1,4 +1,4 @@ -name: CPU Intel Neural Compressor Tests +name: CLI CPU Intel Neural Compressor Tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_neural_compressor_tests: + run_cli_cpu_neural_compressor_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install Intel Neural Compressor CPU requirements run: | diff --git a/.github/workflows/test_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml similarity index 82% rename from .github/workflows/test_cpu_onnxruntime.yaml rename to .github/workflows/test_cli_cpu_onnxruntime.yaml index 2770b23f..e7caf218 100644 --- a/.github/workflows/test_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: CPU OnnxRuntime Tests +name: CLI CPU OnnxRuntime Tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_onnxruntime_tests: + run_cli_cpu_onnxruntime_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml similarity index 83% rename from .github/workflows/test_cpu_openvino.yaml rename to .github/workflows/test_cli_cpu_openvino.yaml index d2d93cce..00b40aef 100644 --- a/.github/workflows/test_cpu_openvino.yaml +++ b/.github/workflows/test_cli_cpu_openvino.yaml @@ -1,4 +1,4 @@ -name: CPU 
OpenVINO Tests +name: CLI CPU OpenVINO Tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_openvino_tests: + run_cli_cpu_openvino_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml similarity index 83% rename from .github/workflows/test_cpu_pytorch.yaml rename to .github/workflows/test_cli_cpu_pytorch.yaml index 1c6809cc..3df5368b 100644 --- a/.github/workflows/test_cpu_pytorch.yaml +++ b/.github/workflows/test_cli_cpu_pytorch.yaml @@ -1,4 +1,4 @@ -name: CPU Pytorch tests +name: CLI CPU Pytorch tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_pytorch_tests: + run_cli_cpu_pytorch_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cuda_onnxruntime_inference.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml similarity index 86% rename from .github/workflows/test_cuda_onnxruntime_inference.yaml rename to .github/workflows/test_cli_cuda_onnxruntime.yaml index bbb81b36..0b03608e 100644 --- a/.github/workflows/test_cuda_onnxruntime_inference.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: CUDA OnnxRuntime Inference Tests +name: CLI CUDA OnnxRuntime Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_cuda_onnxruntime_inference_tests: + build_image_and_run_cli_cuda_onnxruntime_tests: runs-on: hf-dgx-01 steps: - name: Checkout @@ -40,4 +40,4 @@ jobs: --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' opt-bench-cuda:11.8.0 - -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime and inference' -x" + -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x" diff --git a/.github/workflows/test_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml similarity index 95% rename from .github/workflows/test_cuda_pytorch.yaml rename to .github/workflows/test_cli_cuda_pytorch.yaml index 49e77f8a..1b3fd99f 100644 --- a/.github/workflows/test_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -1,4 +1,4 @@ -name: CUDA Pytorch Tests +name: CLI CUDA Pytorch Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_cuda_pytorch_tests: + build_image_and_run_cli_cuda_pytorch_tests: strategy: fail-fast: false matrix: diff --git a/.github/workflows/test_cuda_torch_ort_training.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml similarity index 91% rename from .github/workflows/test_cuda_torch_ort_training.yaml rename to .github/workflows/test_cli_cuda_torch_ort.yaml index 20f87e67..71bfd33e 100644 --- a/.github/workflows/test_cuda_torch_ort_training.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort.yaml @@ -1,4 +1,4 @@ -name: CUDA Torch-ORT Training Tests +name: CLI CUDA Torch-ORT Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - 
build_image_and_run_cuda_torch_ort_training_tests: + build_image_and_run_cli_cuda_torch_ort_tests: runs-on: hf-dgx-01 steps: - name: Checkout @@ -40,4 +40,4 @@ jobs: --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' opt-bench-cuda:11.8.0 - -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort and training' -x" + -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x" diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml index c448a213..5b55c0a7 100644 --- a/.github/workflows/test_cli_misc.yaml +++ b/.github/workflows/test_cli_misc.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_rocm_onnxruntime_inference.yaml b/.github/workflows/test_cli_rocm_onnxruntime.yaml similarity index 90% rename from .github/workflows/test_rocm_onnxruntime_inference.yaml rename to .github/workflows/test_cli_rocm_onnxruntime.yaml index 5a8cc0a3..fcd0f53d 100644 --- a/.github/workflows/test_rocm_onnxruntime_inference.yaml +++ b/.github/workflows/test_cli_rocm_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: ROCm OnnxRuntime Inference Tests +name: CLI ROCm OnnxRuntime Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_rocm_onnxruntime_inference_tests: + build_image_and_run_cli_rocm_onnxruntime_tests: runs-on: hf-amd-mi210-dev steps: - name: Checkout @@ -51,4 +51,4 @@ jobs: --device /dev/dri/renderD129 --entrypoint /bin/bash opt-bench-rocm-ort:5.7 - -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime and inference' -x" + -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x" diff --git a/.github/workflows/test_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml similarity index 95% rename from .github/workflows/test_rocm_pytorch.yaml rename to .github/workflows/test_cli_rocm_pytorch.yaml index 3d14909d..11c9e77a 100644 --- a/.github/workflows/test_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -1,4 +1,4 @@ -name: ROCm Pytorch Tests +name: CLI ROCm Pytorch Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_rocm_pytorch_tests: + build_image_and_run_cli_rocm_pytorch_tests: strategy: fail-fast: false matrix: diff --git a/.github/workflows/test_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml similarity index 93% rename from .github/workflows/test_tensorrt_llm.yaml rename to .github/workflows/test_cli_tensorrt_llm.yaml index 06640699..0169fca5 100644 --- a/.github/workflows/test_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_tensorrt_llm.yaml @@ -1,4 +1,4 @@ -name: TensorRT-LLM Tests +name: CLI TensorRT-LLM Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - pull_image_and_run_tensorrt_llm_tests: + pull_image_and_run_cli_tensorrt_llm_tests: runs-on: hf-dgx-01 steps: - name: Checkout diff --git a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml similarity index 86% rename from .github/workflows/test_tensorrt_onnxruntime_inference.yaml rename to 
.github/workflows/test_cli_tensorrt_onnxruntime.yaml index 4d41313d..92f425e7 100644 --- a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml +++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: TensorRT OnnxRuntime Inference Tests +name: CLI TensorRT OnnxRuntime Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_tensorrt_onnxruntime_tests: + build_image_and_run_cli_tensorrt_onnxruntime_tests: runs-on: hf-dgx-01 steps: - name: Checkout @@ -40,4 +40,4 @@ jobs: --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-tensorrt:22.12 - -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime and inference' -x" + -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x" diff --git a/Makefile b/Makefile index c993cc7c..55e44e1e 100644 --- a/Makefile +++ b/Makefile @@ -12,8 +12,68 @@ style: install: pip install -e . -install_cpu_dev: - pip install -e .[quality,testing,openvino,onnxruntime,neural-compressor,diffusers,timm,peft] +build_docker_cpu: + docker build -f docker/cpu.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t opt-bench-cpu:latest . -install_gpu_dev: - pip install -e .[quality,testing,onnxruntime-gpu,deepspeed,diffusers,timm,peft] +build_docker_cuda: + docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_CUDA=cu118 --build-arg CUDA_VERSION=11.8.0 -t opt-bench-cuda:11.8.0 . + +build_docker_rocm: + docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_ROCM=rocm5.6 --build-arg ROCM_VERSION=5.6.1 -t opt-bench-rocm:5.6.1 .
+ +test_cli_cpu_neural_compressor: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" + +test_cli_cpu_openvino: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x" + +test_cli_cpu_onnxruntime: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" + +test_cli_cpu_pytorch: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x" + +test_api_cpu: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x" + +test_api_cuda: + docker run \ + --rm \ + --gpus '"device=0,1"' \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x" + +test_api_misc: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" diff --git a/README.md b/README.md index cc623d27..e338b888 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,13 @@

Optimum-Benchmark 🏋️

-Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with supported optimizations & quantization schemes, for [inference](https://github.com/huggingface/optimum#accelerated-inference) & [training](https://github.com/huggingface/optimum#accelerated-training), on multiple [backends & hardwares](https://github.com/huggingface/optimum-benchmark?tab=readme-ov-file#supported-backendsdevices). +Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-). ## Motivation 🤔 -- Hardware vendors wanting to know how their hardware performs compared to others on the same models. -- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc. +- HF hardware partners wanting to know how their hardware performs compared to other hardware on the same models. +- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc. compared to another model. - Experimenting with hardware & backend specific optimizations & quantization schemes that can be applied to models and improve their computational/memory/energy efficiency. -- [...]
## Current status πŸ“ˆ @@ -19,23 +18,20 @@ Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transform [![CPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml) [![CUDA](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml) [![ROCM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml) -[![MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml) ### CLI + [![CPU Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml) [![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml) [![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml) [![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml) - [![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml) -[![CUDA OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml) -[![CUDA Torch-ORT Training Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml) - -[![TensorRT OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml) +[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml) +[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml) +[![TensorRT OnnxRuntime 
Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml) [![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml) - [![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml) -[![ROCm OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml) +[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml) ## Quickstart πŸš€ @@ -44,7 +40,7 @@ Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transform You can install `optimum-benchmark` using pip: ```bash -python -m pip install git+https://github.com/huggingface/optimum-benchmark.git +pip install optimum-benchmark ``` or by cloning the repository and installing it in editable mode: @@ -66,33 +62,45 @@ Depending on the backends you want to use, you might need to install some extra - Intel Neural Compressor: `pip install optimum-benchmark[neural-compressor]` - Text Generation Inference: `pip install optimum-benchmark[text-generation-inference]` -### Running benchmarks from python API πŸ§ͺ +### Running benchmarks from Python API πŸ§ͺ -You can run benchmarks from the python API: +You can run benchmarks from the Python API, using the `launch` function from the `optimum_benchmark.experiment` module. Here's an example of how to run a benchmark using the `pytorch` backend, `process` launcher and `inference` benchmark. 
```python -import logging -logging.basicConfig(level=logging.INFO) - +from optimum_benchmark.logging_utils import setup_logging from optimum_benchmark.experiment import launch, ExperimentConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.benchmarks.inference.config import InferenceConfig + if __name__ == "__main__": - backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cuda") - launcher_config = ProcessConfig(device_isolation=True) - benchmark_config = InferenceConfig(memory=True) + setup_logging(level="INFO") + benchmark_config = InferenceConfig(latency=False, memory=True, energy=True) + launcher_config = ProcessConfig() + backend_config = PyTorchConfig( + device="cuda", + no_weights=True, + device_ids="0,1", + device_map="auto", + model="IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm", + ) experiment_config = ExperimentConfig( - experiment_name="api-launch-experiment", + experiment_name="python-api-launch-experiment", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config, ) benchmark_report = launch(experiment_config) - print("benchmark_report:", benchmark_report) + benchmark_report.log_all() + # or + print(benchmark_report.to_dict()) + # or + benchmark_report.push_to_hub("IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm") ``` +Yep, it's that simple! Check the supported backends, launchers and benchmarks in the [features](#features-) section. + ### Running benchmarks from CLI πŸƒβ€β™‚οΈ You can run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension. @@ -161,26 +169,26 @@ Other than the [examples](examples), you can also check [tests](tests/configs/). Everything else is optional or inferred at runtime, but can be configured to your needs. 
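For reference, a hydra-style CLI invocation as described above might look like the following minimal sketch, assuming the `optimum-benchmark` entry point installed by this package and the `examples/pytorch_bert.yaml` config shipped in this repository:

```bash
# resolve examples/pytorch_bert.yaml and run the benchmark it describes
optimum-benchmark --config-dir examples/ --config-name pytorch_bert
```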
-### Backends & Devices πŸ“± - -- [x] Pytorch backend for CPU (`device=cpu`, `backend=pytorch`) -- [x] Pytorch backend for CUDA (`device=cuda`, `backend=pytorch`) -- [ ] Pytorch backend for Habana Gaudi Processor (`device=hpu`, `backend=pytorch`) -- [x] OnnxRuntime backend for CPUExecutionProvider (`device=cpu`, `backend=onnxruntime`) -- [x] OnnxRuntime backend for CUDAExecutionProvider (`device=cuda`, `backend=onnxruntime`) -- [x] OnnxRuntime backend for ROCMExecutionProvider (`device=cuda`, `backend=onnxruntime`, `backend.provider=ROCMExecutionProvider`) -- [x] OnnxRuntime backend for TensorrtExecutionProvider (`device=cuda`, `backend=onnxruntime`, `backend.provider=TensorrtExecutionProvider`) -- [x] Intel Neural Compressor backend for CPU (`device=cpu`, `backend=neural-compressor`) -- [x] TensorRT-LLM backend for CUDA (`device=cuda`, `backend=tensorrt-llm`) -- [x] OpenVINO backend for CPU (`device=cpu`, `backend=openvino`) - -### Launcher features πŸš€ +### Launchers πŸš€ - [x] Process isolation between consecutive runs (`launcher=process`) -- [x] Assert devices (NVIDIA & AMD GPUs) isolation (`launcher.device_isolation=true`) -- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`, etc) +- [x] Assert GPU devices (NVIDIA & AMD) isolation (`launcher.device_isolation=true`) +- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`) + +### Backends & Devices πŸ“± -### Benchmark features πŸ‹οΈ +- [x] Pytorch backend for CPU (`backend=pytorch`, `backend.device=cpu`) +- [x] Pytorch backend for CUDA (`backend=pytorch`, `backend.device=cuda`) +- [ ] Pytorch backend for Habana Gaudi Processor (`backend=pytorch`, `backend.device=habana`) +- [x] OnnxRuntime backend for CPUExecutionProvider (`backend=onnxruntime`, `backend.device=cpu`) +- [x] OnnxRuntime backend for CUDAExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`) +- [x] OnnxRuntime backend for ROCMExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`, `backend.provider=ROCMExecutionProvider`) +- [x] OnnxRuntime backend for TensorrtExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`, `backend.provider=TensorrtExecutionProvider`) +- [x] Intel Neural Compressor backend for CPU (`backend=neural-compressor`, `backend.device=cpu`) +- [x] TensorRT-LLM backend for CUDA (`backend=tensorrt-llm`, `backend.device=cuda`) +- [x] OpenVINO backend for CPU (`backend=openvino`, `backend.device=cpu`) + +### Benchmarking πŸ‹οΈ - [x] Memory tracking (`benchmark.memory=true`) - [x] Latency and throughput tracking of forward pass (default) diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile new file mode 100644 index 00000000..371a89c8 --- /dev/null +++ b/docker/cpu.dockerfile @@ -0,0 +1,42 @@ +FROM ubuntu:latest + + +# Ignore interactive questions during `docker build` +ENV DEBIAN_FRONTEND noninteractive + +# Run as non-root user +ARG USER_ID +ARG GROUP_ID + +RUN addgroup --gid $GROUP_ID user +RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user + +# Install python +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.10 \ + python3.10-dev \ + python3-pip \ + git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 + +# Add local bin to PATH +ENV PATH="/home/user/.local/bin:${PATH}" + +# Add user to sudoers +RUN adduser user sudo +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >>/etc/sudoers + +# Change user +USER user +WORKDIR 
/home/user + +# Update pip +RUN pip install --upgrade pip + +# Install PyTorch +RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; \ + then pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \ + else pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \ + fi diff --git a/examples/neural_compressor_ptq_bert.yaml b/examples/neural_compressor_ptq_bert.yaml index 64691369..c8b0ee6e 100644 --- a/examples/neural_compressor_ptq_bert.yaml +++ b/examples/neural_compressor_ptq_bert.yaml @@ -7,25 +7,31 @@ defaults: - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging -experiment_name: openvino_static_quant_bert +experiment_name: neural_compressor_ptq_bert backend: - model: bert-base-uncased + device: cpu no_weights: true + model: bert-base-uncased ptq_quantization: true calibration: true - device: cpu benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 diff --git a/examples/onnxruntime_static_quant_vit.yaml b/examples/onnxruntime_static_quant_vit.yaml index 0b06bc0e..d324415d 100644 --- a/examples/onnxruntime_static_quant_vit.yaml +++ b/examples/onnxruntime_static_quant_vit.yaml @@ -10,23 +10,28 @@ defaults: experiment_name: onnxruntime_static_quant_vit backend: + device: cpu + no_weights: true model: google/vit-base-patch16-224 quantization: true quantization_config: is_static: true per_channel: false - device: cpu calibration: true +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/openvino_diffusion.yaml b/examples/openvino_diffusion.yaml index 3591ecd7..f9f62e64 100644 --- a/examples/openvino_diffusion.yaml +++ b/examples/openvino_diffusion.yaml @@ -10,22 +10,28 @@ defaults: model: stabilityai/stable-diffusion-2-1 backend: + device: cpu experiment_name: openvino_diffusion - export: true reshape: true + export: true half: true - device: cpu benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 diff --git a/examples/openvino_static_quant_bert.yaml b/examples/openvino_static_quant_bert.yaml index c349f3ea..83921f4c 100644 --- a/examples/openvino_static_quant_bert.yaml +++ b/examples/openvino_static_quant_bert.yaml @@ -10,24 +10,30 @@ defaults: experiment_name: openvino_static_quant_bert backend: + device: cpu + no_weights: true model: bert-base-uncased export: true - no_weights: true quantization: true 
calibration: true reshape: true - device: cpu benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml index 71a087f0..5a36147c 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/pytorch_bert.yaml @@ -10,17 +10,22 @@ defaults: experiment_name: pytorch_bert backend: - model: bert-base-uncased device: cpu + device_ids: 0 + model: bert-base-uncased +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/pytorch_llama.yaml b/examples/pytorch_llama.yaml index f6b29792..2c9e2845 100644 --- a/examples/pytorch_llama.yaml +++ b/examples/pytorch_llama.yaml @@ -10,8 +10,10 @@ defaults: experiment_name: pytorch_llama backend: - model: TheBloke/Llama-2-70B-AWQ device: cuda + device_ids: 0 + no_weights: true + model: TheBloke/Llama-2-70B-AWQ launcher: device_isolation: true @@ -22,14 +24,18 @@ benchmark: sequence_length: 256 new_tokens: 1000 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/pytorch_timm.yaml b/examples/pytorch_timm.yaml index 03125599..4b2c5295 100644 --- a/examples/pytorch_timm.yaml +++ b/examples/pytorch_timm.yaml @@ -10,8 +10,9 @@ defaults: experiment_name: pytorch_timm backend: - model: timm/mobilenetv3_large_100.ra_in1k device: cuda + device_ids: 0 + model: timm/mobilenetv3_large_100.ra_in1k launcher: device_isolation: true @@ -20,14 +21,18 @@ benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/tgi_llama.yaml b/examples/tgi_llama.yaml index 9bf8b4d1..a23c5c55 100644 --- a/examples/tgi_llama.yaml +++ b/examples/tgi_llama.yaml @@ -10,10 +10,12 @@ defaults: experiment_name: tgi_llama backend: + device: cuda + device_ids: 0,1 + device_map: true model: TheBloke/Llama-2-7B-AWQ quantization_scheme: awq sharded: false - device: cuda benchmark: input_shapes: @@ -21,14 +23,18 @@ benchmark: sequence_length: 256 new_tokens: 1000 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} 
sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/trt_llama.yaml b/examples/trt_llama.yaml index e3f8844d..702bb39e 100644 --- a/examples/trt_llama.yaml +++ b/examples/trt_llama.yaml @@ -10,8 +10,8 @@ defaults: experiment_name: trt_llama backend: - model: NousResearch/Llama-2-7b-hf device: cuda + model: NousResearch/Llama-2-7b-hf benchmark: input_shapes: @@ -19,14 +19,18 @@ benchmark: sequence_length: 64 new_tokens: 128 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/optimum_benchmark/aggregators/__init__.py b/optimum_benchmark/aggregators/__init__.py deleted file mode 100644 index a3015d55..00000000 --- a/optimum_benchmark/aggregators/__init__.py +++ /dev/null @@ -1,109 +0,0 @@ -from pathlib import Path -from typing import Tuple, List, Dict - -import pandas as pd -from rich.table import Table -from omegaconf import OmegaConf -import matplotlib.pyplot as plt -from rich.console import Console -from flatten_dict import flatten -from rich.terminal_theme import MONOKAI - - -def gather(root_folders: List[Path]) -> pd.DataFrame: - configs_dfs = {} - results_dfs = {} - - for root_folder in root_folders: - if not root_folder.exists(): - raise ValueError(f"{root_folder} does not exist") - - for f in root_folder.glob("**/hydra_config.yaml"): - parent_folder = f.parent.absolute().as_posix() - configs_dfs[parent_folder] = pd.DataFrame.from_dict( - flatten(OmegaConf.load(f), reducer="dot"), orient="index" - ).T - - for f in root_folder.glob("**/*_results.csv"): - parent_folder = f.parent.absolute().as_posix() - results_dfs[parent_folder] = pd.read_csv(f) - - if (len(results_dfs) == 0) or (len(configs_dfs) == 0): - raise ValueError(f"Results are missing in {root_folders}") - - # Merge inference and config dataframes - full_dfs = {} - for parent_folder in results_dfs: - full_df = pd.concat( - [configs_dfs[parent_folder], results_dfs[parent_folder]], - axis=1, - ) - full_df["parent_folder"] = parent_folder - full_dfs[parent_folder] = full_df - - # Concatenate all dataframes - full_report = pd.concat(full_dfs.values(), ignore_index=True, axis=0) - - return full_report - - -def format_element(element): - if isinstance(element, float): - if element != element: - formated_element = "" - elif abs(element) >= 1: - formated_element = f"{element:.2f}" - elif abs(element) > 1e-6: - formated_element = f"{element:.2e}" - else: - formated_element = f"{element}" - elif element is None: - formated_element = "" - elif isinstance(element, bool): - if element: - formated_element = "[green]βœ”[/green]" - else: - formated_element = "[red]✘[/red]" - else: - formated_element = str(element) - - return formated_element - - -def display(report: pd.DataFrame) -> Table: - table = Table(show_header=True, show_lines=True) - - for column in report.columns: - table.add_column(column, justify="right", header_style="bold") - - for _, row in 
report.iterrows(): - formated_row = [] - for element in row.values: - formated_row.append(format_element(element)) - table.add_row(*formated_row) - - console = Console(record=True, theme=MONOKAI) - console.print(table, justify="center") - - return console, table - - -def rename(report: pd.DataFrame, rename_dict: Dict[str, str]): - summarized_report = report[list(rename_dict.keys())].rename(columns=rename_dict) - - return summarized_report - - -def plot(report: pd.DataFrame, x_axis: str, y_axis: str, groupby: str) -> Tuple[plt.Figure, plt.Axes]: - fig, ax = plt.subplots() - - for group, sweep in report.groupby(groupby): - sorted_sweep = sweep.sort_values(by=x_axis) - ax.plot(sorted_sweep[x_axis], sorted_sweep[y_axis], label=group, marker="o") - - ax.set_xlabel(x_axis) - ax.set_ylabel(y_axis) - ax.set_title(f"{y_axis} per {x_axis}") - ax.legend(fancybox=True, shadow=True) - - return fig, ax diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 1c55a5ab..cf0f5087 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,46 +1,25 @@ import gc -import os import random -import shutil from abc import ABC from logging import getLogger -from typing import ( - Optional, - ClassVar, - Generic, - Dict, - Any, -) - -import numpy as np -from transformers.utils import ModelOutput -from transformers import ( - GenerationConfig, - PretrainedConfig, - PreTrainedModel, - TrainerState, - AutoModel, -) +from collections import OrderedDict +from typing import Optional, ClassVar, Generic, Dict, Any from .config import BackendConfigT from ..task_utils import get_automodel_class_for_task -from .diffusers_utils import ( - extract_diffusers_shapes_from_config, - get_diffusers_pretrained_config, -) + +from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config +from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config, get_timm_pre_processor from .transformers_utils import ( extract_transformers_shapes_from_artifacts, - get_transformers_pretrained_processor, get_transformers_generation_config, get_transformers_pretrained_config, - get_transformers_cache_dir, + get_transformers_pre_processor, PretrainedProcessor, ) -from .timm_utils import ( - extract_timm_shapes_from_config, - get_timm_pretrained_processor, - get_timm_pretrained_config, -) + +import numpy as np +from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState LOGGER = getLogger("backend") @@ -48,43 +27,38 @@ class Backend(Generic[BackendConfigT], ABC): NAME: ClassVar[str] - config: BackendConfigT - automodel_class: AutoModel - pretrained_model: PreTrainedModel + model_type: str model_shapes: Dict[str, int] + pretrained_model: PreTrainedModel pretrained_config: Optional[PretrainedConfig] - pretrained_processor: Optional[PretrainedProcessor] - pretrained_generation_config: Optional[GenerationConfig] + generation_config: Optional[GenerationConfig] + pre_processor: Optional[PretrainedProcessor] def __init__(self, config: BackendConfigT): LOGGER.info(f"َAllocating {self.NAME} backend") self.config = config + self.seed() if self.config.library == "diffusers": - self.pretrained_processor = None - self.pretrained_generation_config = None - self.pretrained_config = get_diffusers_pretrained_config(model=self.config.model, **self.config.hub_kwargs) - self.model_shapes = extract_diffusers_shapes_from_config(model=self.config.model, **self.config.hub_kwargs) + self.pretrained_config = 
get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs) + self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs) self.model_type = self.config.task + self.generation_config = None + self.pre_processor = None + elif self.config.library == "timm": - self.pretrained_processor = get_timm_pretrained_processor(self.config.model) + self.pre_processor = get_timm_pre_processor(self.config.model) self.pretrained_config = get_timm_pretrained_config(self.config.model) self.model_shapes = extract_timm_shapes_from_config(config=self.pretrained_config) self.model_type = self.pretrained_config.architecture - self.pretrained_generation_config = None + self.generation_config = None + else: + self.pre_processor = get_transformers_pre_processor(self.config.model, **self.config.hub_kwargs) + self.generation_config = get_transformers_generation_config(self.config.model, **self.config.hub_kwargs) self.pretrained_config = get_transformers_pretrained_config(self.config.model, **self.config.hub_kwargs) - self.pretrained_generation_config = get_transformers_generation_config( - self.config.model, **self.config.hub_kwargs - ) - self.pretrained_processor = get_transformers_pretrained_processor( - self.config.model, **self.config.hub_kwargs - ) - self.model_shapes = extract_transformers_shapes_from_artifacts( - config=self.pretrained_config, - processor=self.pretrained_processor, - ) + self.model_shapes = extract_transformers_shapes_from_artifacts(self.pretrained_config, self.pre_processor) self.model_type = self.pretrained_config.model_type self.automodel_class = get_automodel_class_for_task( @@ -95,6 +69,7 @@ def __init__(self, config: BackendConfigT): ) def seed(self) -> None: + LOGGER.info(f"\t+ Setting random seed to {self.config.seed}") random.seed(self.config.seed) np.random.seed(self.config.seed) @@ -112,40 +87,35 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: """ This method is used to perform the forward pass of the model. """ raise NotImplementedError("Backend must implement forward method") - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: """ This method is used to perform the generation pass of the model. """ raise NotImplementedError("Backend must implement generate method") + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + """ + This method is used to call a whole pipeline. + """ + raise NotImplementedError("Backend must implement call method") + def train(self, **kwargs) -> TrainerState: """ This method is used to train the model. 
""" raise NotImplementedError("Backend must implement train method") - def delete_hf_model_cache(self) -> None: - LOGGER.info("\t+ Deleting model cache") - transformers_cache_path = get_transformers_cache_dir() - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = os.path.join(transformers_cache_path, model_cache_folder) - shutil.rmtree(model_cache_path, ignore_errors=True) - def delete_pretrained_model(self) -> None: - LOGGER.info("\t+ Deleting pretrained model") - del self.pretrained_model - gc.collect() + if hasattr(self, "pretrained_model"): + del self.pretrained_model def clean(self) -> None: LOGGER.info(f"Cleaning {self.NAME} backend") - - if hasattr(self, "pretrained_model"): - self.delete_pretrained_model() - + self.delete_pretrained_model() gc.collect() diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index fff9bf80..a4919c15 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -4,13 +4,12 @@ from dataclasses import dataclass, field from typing import Optional, TypeVar, Dict, Any -from psutil import cpu_count +from ..import_utils import is_psutil_available +from ..env_utils import get_cuda_device_ids, is_nvidia_system, is_rocm_system +from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path -from ..env_utils import get_gpus, is_nvidia_system, is_rocm_system -from ..task_utils import ( - infer_library_from_model_name_or_path, - infer_task_from_model_name_or_path, -) +if is_psutil_available(): + from psutil import cpu_count LOGGER = getLogger("backend") @@ -18,6 +17,7 @@ "revision": "main", "force_download": False, "local_files_only": False, + "trust_remote_code": False, } @@ -31,6 +31,10 @@ class BackendConfig(ABC): model: Optional[str] = None device: Optional[str] = None + # yes we use a string here instead of a list + # it's easier to pass in a yaml or from cli + # also it's consistent with CUDA_VISIBLE_DEVICES + device_ids: Optional[str] = None task: Optional[str] = None library: Optional[str] = None @@ -48,41 +52,20 @@ def __post_init__(self): self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" if ":" in self.device: - raise ValueError( - f"Device was specified as {self.device} with a target index." - "We recommend using the main cuda device (e.g. `cuda`) and " - "specifying the target index in `CUDA_VISIBLE_DEVICES`." - ) + # using device index + self.device = self.device.split(":")[0] + self.device_ids = self.device.split(":")[1] + + if self.device == "cuda": + if self.device_ids is None: + self.device_ids = get_cuda_device_ids() + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids + # TODO: add rocm specific environment variables ? if self.device not in ["cuda", "cpu", "mps", "xla"]: - raise ValueError("`device` must be either `cuda`, `cpu`, `mps` or `xla`.") - - if self.device == "cuda" and len(get_gpus()) > 1: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is None: - LOGGER.warning( - "Multiple GPUs detected but CUDA_VISIBLE_DEVICES is not set. " - "This means that code might allocate resources from the wrong GPUs. " - "For example, with `auto_device='auto'. `We recommend setting CUDA_VISIBLE_DEVICES " - "to isolate the GPUs that will be used for this experiment. `CUDA_VISIBLE_DEVICES` will " - "be set to `0` to ensure that only the first GPU is used. 
If you want to use multiple " - "GPUs, please set `CUDA_VISIBLE_DEVICES` to the desired GPU indices." - ) - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - if os.environ.get("CUDA_DEVICE_ORDER", None) != "PCI_BUS_ID": - LOGGER.warning( - "Multiple GPUs detected but CUDA_DEVICE_ORDER is not set to `PCI_BUS_ID`. " - "This means that code might allocate resources from the wrong GPUs even if " - "`CUDA_VISIBLE_DEVICES` is set. For example pytorch uses the `FASTEST_FIRST` " - "order by default, which is not guaranteed to be the same as nvidia-smi. `CUDA_DEVICE_ORDER` " - "will be set to `PCI_BUS_ID` to ensure that the GPUs are allocated in the same order as nvidia-smi. " - ) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - - elif self.device == "cuda" and len(get_gpus()) == 1: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is None: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "0" + raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") if self.task is None: self.task = infer_task_from_model_name_or_path(self.model) diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index 49c21906..705436d3 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -4,31 +4,27 @@ from ..import_utils import is_diffusers_available - if is_diffusers_available(): import diffusers def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: - assert is_diffusers_available(), "Diffusers is not available" return diffusers.DiffusionPipeline.load_config(model, **kwargs) def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]: - assert is_diffusers_available(), "Diffusers is not available" + config = diffusers.DiffusionPipeline.load_config(model, **kwargs) shapes = {} - pip_config = diffusers.DiffusionPipeline.load_config(model, **kwargs) - - if "vae" in pip_config: - vae_import_path = pip_config["vae"] + if "vae" in config: + vae_import_path = config["vae"] vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}") vae_config = vae_class.load_config(model, subfolder="vae", **kwargs) shapes["num_channels"] = vae_config["out_channels"] shapes["height"] = vae_config["sample_size"] shapes["width"] = vae_config["sample_size"] - elif "vae_encoder" in pip_config: - vae_import_path = pip_config["vae_encoder"] + elif "vae_encoder" in config: + vae_import_path = config["vae_encoder"] vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}") vae_config = vae_class.load_config(model, subfolder="vae", **kwargs) shapes["num_channels"] = vae_config["out_channels"] diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index 092affff..dd2a7a82 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -4,22 +4,19 @@ from logging import getLogger from tempfile import TemporaryDirectory +from ...generators.dataset_generator import DatasetGenerator +from ..transformers_utils import randomize_weights +from .utils import TASKS_TO_INCMODELS +from .config import INCConfig +from ..base import Backend + import torch from hydra.utils import get_class from transformers.utils import ModelOutput from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error from 
optimum.intel.neural_compressor.quantization import INCQuantizer -from neural_compressor.config import ( - PostTrainingQuantConfig, - AccuracyCriterion, - TuningCriterion, -) - -from ...generators.dataset_generator import DatasetGenerator -from .utils import TASKS_TO_INCMODELS -from .config import INCConfig -from ..base import Backend +from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion, TuningCriterion # disable transformers logging set_verbosity_error() @@ -34,9 +31,7 @@ def __init__(self, config: INCConfig): super().__init__(config) self.validate_task() - self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) - LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}") - + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.ptq_quantization: @@ -52,57 +47,65 @@ def __init__(self, config: INCConfig): else: self.load_incmodel_from_pretrained() - self.tmpdir.cleanup() - def validate_task(self) -> None: if self.config.task not in TASKS_TO_INCMODELS: raise NotImplementedError(f"INCBackend does not support task {self.config.task}") + self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) + LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}") + def load_automodel_from_pretrained(self) -> None: LOGGER.info("\t+ Loading AutoModel from pretrained") self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) - def load_automodel_with_no_weights(self) -> None: - no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + def create_no_weights_model(self) -> None: + LOGGER.info("\t+ Creating no weights model state_dict") + state_dict = torch.nn.Linear(1, 1).state_dict() - if not os.path.exists(no_weights_model): - LOGGER.info("\t+ Creating no weights model directory") - os.makedirs(no_weights_model) + LOGGER.info("\t+ Creating no weights model directory") + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + os.makedirs(self.no_weights_model, exist_ok=True) - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=no_weights_model) + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model") - state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model state_dict") + torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) - LOGGER.info("\t+ Saving no weights model") - torch.save(state_dict, os.path.join(no_weights_model, "pytorch_model.bin")) + def load_automodel_with_no_weights(self) -> None: + self.create_no_weights_model() - LOGGER.info("\t+ Loading no weights model") with no_init_weights(): original_model = self.config.model - self.config.model = no_weights_model + self.config.model = self.no_weights_model + LOGGER.info("\t+ Loading no weights model") self.load_automodel_from_pretrained() self.config.model = original_model + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model) + LOGGER.info("\t+ Tying model weights") + self.pretrained_model.tie_weights() + def load_incmodel_from_pretrained(self) -> None: LOGGER.info("\t+ Loading INCModel from pretrained") self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) def load_incmodel_with_no_weights(self) -> None: - no_weights_model = 
os.path.join(self.tmpdir.name, "no_weights") - - LOGGER.info("\t+ Loading AutoModel with no weights") - self.load_automodel_with_no_weights() - self.delete_pretrained_model() + self.create_no_weights_model() - LOGGER.info("\t+ Loading INCModel with no weights") with no_init_weights(): original_model = self.config.model - self.config.model = no_weights_model + self.config.model = self.no_weights_model + LOGGER.info("\t+ Loading no weights model") self.load_incmodel_from_pretrained() self.config.model = original_model + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model.model) + LOGGER.info("\t+ Tying model weights") + self.pretrained_model.model.tie_weights() + def quantize_automodel(self) -> None: LOGGER.info("\t+ Attempting to quantize model") quantized_model_path = f"{self.tmpdir.name}/quantized" @@ -134,7 +137,7 @@ def quantize_automodel(self) -> None: task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes, - ).generate() + )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) else: @@ -169,6 +172,7 @@ def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 0801b000..07d5d860 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -1,16 +1,22 @@ import gc import os from logging import getLogger +from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List +from ..base import Backend +from .config import ORTConfig +from ...task_utils import TEXT_GENERATION_TASKS +from ...generators.dataset_generator import DatasetGenerator +from .utils import format_calibration_config, format_quantization_config, TASKS_TO_ORTMODELS, TASKS_TO_ORTSD + import torch from datasets import Dataset from hydra.utils import get_class from onnxruntime import SessionOptions from safetensors.torch import save_file -from transformers.utils import ModelOutput -from transformers import TrainerCallback, TrainerState +from transformers import TrainerCallback from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error from optimum.onnxruntime.configuration import ( @@ -24,19 +30,10 @@ from optimum.onnxruntime import ( ONNX_DECODER_WITH_PAST_NAME, ONNX_DECODER_NAME, + ORTTrainingArguments, ORTOptimizer, ORTQuantizer, -) - -from ...generators.dataset_generator import DatasetGenerator -from ...task_utils import TEXT_GENERATION_TASKS -from .config import ORTConfig -from ..base import Backend -from .utils import ( - format_calibration_config, - format_quantization_config, - TASKS_TO_ORTMODELS, - TASKS_TO_ORTSD, + ORTTrainer, ) # disable transformers logging @@ -61,15 +58,19 @@ def __init__(self, config: ORTConfig) -> None: else: raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") - self.set_session_options() + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() + self.session_options = SessionOptions() + for key, value in self.config.session_options.items(): + setattr(self.session_options, key, value) + if self.config.no_weights: 
self.load_ortmodel_with_no_weights() else: self.load_ortmodel_from_pretrained() - if self.is_deferred_trt_loading(): + if self.is_trt_text_generation: return if self.is_optimized or self.is_quantized: @@ -99,35 +100,30 @@ def validate_provider(self) -> None: self.pretrained_model.providers[0] == self.config.provider ), f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" - def is_deferred_trt_loading(self) -> bool: - return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS - - def set_session_options(self) -> None: - self.session_options = SessionOptions() - for key, value in self.config.session_options.items(): - setattr(self.session_options, key, value) - - def load_ortmodel_with_no_weights(self) -> None: + def create_no_weights_model(self) -> None: LOGGER.info("\t+ Creating no weights model directory") - no_weights_model = os.path.join(self.tmpdir.name, "no_weights") - os.makedirs(no_weights_model, exist_ok=True) + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + os.makedirs(self.no_weights_model, exist_ok=True) LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=no_weights_model) + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model weights") + LOGGER.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() - LOGGER.info("\t+ Saving no weights model weights") + LOGGER.info("\t+ Saving no weights model state dict") save_file( - filename=os.path.join(no_weights_model, "model.safetensors"), + filename=os.path.join(self.no_weights_model, "model.safetensors"), metadata={"format": "pt"}, tensors=state_dict, ) + def load_ortmodel_with_no_weights(self) -> None: + self.create_no_weights_model() + with no_init_weights(): original_model = self.config.model - self.config.model = no_weights_model + self.config.model = self.no_weights_model LOGGER.info("\t+ Loading no weights model") self.load_ortmodel_from_pretrained() self.config.model = original_model @@ -144,6 +140,10 @@ def load_ortmodel_from_pretrained(self) -> None: **self.ortmodel_kwargs, ) + @property + def is_trt_text_generation(self) -> bool: + return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS + @property def is_optimized(self) -> bool: return (self.config.auto_optimization is not None) or self.config.optimization @@ -252,7 +252,7 @@ def quantize_onnx_files(self) -> None: task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes, - ).generate() + )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) @@ -314,7 +314,7 @@ def quantize_onnx_files(self) -> None: self.config.model = quantized_model_path def prepare_for_inference(self, **kwargs) -> None: - if self.is_deferred_trt_loading(): + if self.is_trt_text_generation: LOGGER.info("\t+ Creating dynamic shapes for Tensorrt engine. 
Engine creation might take a while.") batch_size = kwargs["batch_size"] max_new_tokens = kwargs["max_new_tokens"] @@ -353,21 +353,22 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - return self.pretrained_model(**inputs, **kwargs) + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model.forward(**inputs, **kwargs) - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate(**inputs, **kwargs) + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model(**inputs, **kwargs) + def train( self, training_dataset: Dataset, training_arguments: Dict[str, Any], training_callbacks: List[TrainerCallback], training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], - ) -> TrainerState: - from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments - + ) -> None: LOGGER.info("\t+ Setting dataset format to `torch`") training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) LOGGER.info("\t+ Wrapping training arguments with optimum.onnxruntime.ORTTrainingArguments") @@ -384,13 +385,11 @@ def train( trainer.train() LOGGER.info("\t+ Training finished successfully") - return trainer.state - def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): - LOGGER.info("\t+ Cleaning temporary directory") + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 0f9262cc..e0191b88 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -38,6 +38,7 @@ class ORTConfig(BackendConfig): version: Optional[str] = onnxruntime_version() _target_: str = "optimum_benchmark.backends.onnxruntime.backend.ORTBackend" + # load options no_weights: bool = False # export options diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index 4140b973..73cbd63d 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -3,26 +3,25 @@ import inspect from typing import Any, Dict from logging import getLogger +from collections import OrderedDict from tempfile import TemporaryDirectory +from ..base import Backend +from .config import OVConfig +from .utils import TASKS_TO_OVMODEL +from ...task_utils import TEXT_GENERATION_TASKS +from ..transformers_utils import randomize_weights +from ...generators.dataset_generator import DatasetGenerator + import torch from hydra.utils import get_class from openvino.runtime import properties from safetensors.torch import save_file from optimum.intel.openvino import OVQuantizer from transformers.modeling_utils import no_init_weights -from transformers.utils import ModelOutput from transformers.utils.logging import set_verbosity_error from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict -from ..base import Backend -from .config import OVConfig -from .utils import TASKS_TO_OVMODEL -from ...task_utils import TEXT_GENERATION_TASKS -from ..transformers_utils import randomize_weights -from ...generators.dataset_generator import 
DatasetGenerator - - # disable transformers logging set_verbosity_error() @@ -149,7 +148,11 @@ def quantize_automodel(self) -> None: "sequence_length": 1, **self.model_shapes, } - calibration_dataset = DatasetGenerator(task=self.config.task, dataset_shapes=dataset_shapes).generate() + calibration_dataset = DatasetGenerator( + task=self.config.task, + dataset_shapes=dataset_shapes, + model_shapes=self.model_shapes, + )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) else: @@ -196,12 +199,15 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - return self.pretrained_model(**inputs, **kwargs) + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model.forward(**inputs, **kwargs) - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate(**inputs, **kwargs) + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model(**inputs, **kwargs) + def clean(self) -> None: super().clean() diff --git a/optimum_benchmark/backends/peft_utils.py b/optimum_benchmark/backends/peft_utils.py index 695e602c..1a367120 100644 --- a/optimum_benchmark/backends/peft_utils.py +++ b/optimum_benchmark/backends/peft_utils.py @@ -13,7 +13,6 @@ PromptLearningConfig, ) - PEFT_TASKS_TYPES = [ "SEQ_CLS", "SEQ_2_SEQ_LM", diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index cadfe878..268f4306 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -1,27 +1,23 @@ import gc import os from logging import getLogger +from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List +from ..base import Backend +from .config import PyTorchConfig +from ..peft_utils import get_peft_config_class +from ..transformers_utils import randomize_weights +from ...import_utils import is_deepspeed_available, is_peft_available + import torch from datasets import Dataset from safetensors.torch import save_file -from transformers.utils import ModelOutput import datasets.utils.logging as datasets_logging -from transformers import TrainerCallback, TrainerState from transformers.modeling_utils import no_init_weights import transformers.utils.logging as transformers_logging - -from ..base import Backend -from .config import PyTorchConfig -from ..peft_utils import get_peft_config_class -from ..transformers_utils import TransformersDataParallel, randomize_weights -from ...import_utils import ( - is_deepspeed_available, - is_peft_available, -) - +from transformers import TrainerCallback, TrainerState, Trainer, TrainingArguments if is_peft_available(): from peft import get_peft_model @@ -38,21 +34,13 @@ class PyTorchBackend(Backend[PyTorchConfig]): - NAME: str = "pytorch" + NAME = "pytorch" def __init__(self, config: PyTorchConfig): super().__init__(config) + self.validate_library() - if self.config.library == "timm": - LOGGER.info("\t+ Using method timm.create_model") - else: - automodel = self.automodel_class.__name__ - if self.config.library == "diffusers": - LOGGER.info(f"\t+ Using 
Pipeline class {automodel}") - else: - LOGGER.info(f"\t+ Using AutoModel class {automodel}") - - # Threading options + # Threads if self.config.inter_op_num_threads is not None: LOGGER.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") torch.set_num_threads(self.config.inter_op_num_threads) @@ -60,18 +48,23 @@ def __init__(self, config: PyTorchConfig): LOGGER.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))") torch.set_num_interop_threads(self.config.intra_op_num_threads) - # Dtypes options - self.amp_dtype = getattr(torch, self.config.amp_dtype) if self.config.amp_dtype is not None else None + # Mixed precision + if self.config.amp_dtype: + LOGGER.info(f"\t+ Setting mixed precision dtype to {self.config.amp_dtype}") + self.amp_dtype = getattr(torch, self.config.amp_dtype) + else: + self.amp_dtype = None + # Quantization if self.is_quantized: LOGGER.info("\t+ Processing quantization config") self.process_quantization_config() else: self.quantization_config = None + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - # Load model if self.config.no_weights and self.config.library == "diffusers": raise ValueError("Diffusion pipelines are not supported with no_weights=True") elif self.config.no_weights: @@ -81,8 +74,12 @@ def __init__(self, config: PyTorchConfig): LOGGER.info("\t+ Loading model with pretrained weights") self.load_model_from_pretrained() + if self.config.cache_implementation is not None: + LOGGER.info(f"\t+ Setting cache implementation to {self.config.cache_implementation}") + self.pretrained_model.generation_config.cache_implementation = self.config.cache_implementation + # Eval mode - if self.config.eval_mode and not self.config.library == "diffusers": + if self.config.eval_mode and self.config.library != "diffusers": LOGGER.info("\t+ Turning on model's eval mode") self.pretrained_model.eval() @@ -91,7 +88,7 @@ def __init__(self, config: PyTorchConfig): LOGGER.info("\t+ Enabling BetterTransformer") self.pretrained_model.to_bettertransformer() - # Compile model + # Torch compile if self.config.torch_compile: if self.config.library == "diffusers": LOGGER.info("\t+ Using torch.compile on unet forward pass") @@ -115,18 +112,21 @@ def __init__(self, config: PyTorchConfig): if self.config.deepspeed_inference: LOGGER.info("\t+ Using DeepSpeed-Inference") - self.pretrained_model = init_inference( self.pretrained_model, config=self.config.deepspeed_inference_config, dtype=getattr(self.pretrained_model, "dtype", None), ) - if self.config.data_parallel: - LOGGER.info("\t+ Using TransformersDataParallel") - self.pretrained_model = TransformersDataParallel(self.pretrained_model) - - self.tmpdir.cleanup() + def validate_library(self) -> None: + if self.config.library == "timm": + LOGGER.info(f"\t+ Using Timm method {self.automodel_class.__name__}") + elif self.config.library == "diffusers": + LOGGER.info(f"\t+ Using Pipeline class {self.automodel_class.__name__}") + elif self.config.library == "transformers": + LOGGER.info(f"\t+ Using AutoModel class {self.automodel_class.__name__}") + else: + raise ValueError(f"Library {self.config.library} not supported") def load_model_from_pretrained(self) -> None: if self.config.library == "timm": @@ -138,8 +138,8 @@ def load_model_from_pretrained(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, device_map=self.config.device_map, - **self.automodel_kwargs, 
**self.config.hub_kwargs, + **self.automodel_kwargs, ) if self.config.device_map is None: LOGGER.info(f"\t+ Moving pipeline to device: {self.config.device}") @@ -148,7 +148,6 @@ def load_model_from_pretrained(self) -> None: LOGGER.info("\t+ Loading BnB quantized model") self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, - low_cpu_mem_usage=self.config.low_cpu_mem_usage, device_map=self.config.device_map, **self.config.hub_kwargs, **self.automodel_kwargs, @@ -158,10 +157,8 @@ def load_model_from_pretrained(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, # for gptq, we need to specify the device_map to either auto - # or a cuda adevice to avoid any modules being assigned to cpu + # or a cuda adevice to avoid any modules being assigned to cpu ¯\_(ツ)_/¯ device_map=self.config.device_map or torch.device(self.config.device), - # this avoids unnecessary memory usage when loading quantized models - low_cpu_mem_usage=self.config.low_cpu_mem_usage, **self.config.hub_kwargs, **self.automodel_kwargs, ) @@ -175,39 +172,39 @@ def load_model_from_pretrained(self) -> None: ) else: # this is the fastest way to load a model on a specific device + # but not compatible with all quantization methods (and pipelines) LOGGER.info(f"\t+ Loading model directly on device: {self.config.device}") with torch.device(self.config.device): self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, - **self.automodel_kwargs, **self.config.hub_kwargs, + **self.automodel_kwargs, ) def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") - - LOGGER.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - - if self.is_quantized: - # tricking from_pretrained to load the model as if it was quantized - self.pretrained_config.quantization_config = self.quantization_config.to_dict() - - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model state_dict") state_dict = torch.nn.Linear(1, 1).state_dict() if self.is_exllamav2: - # for exllamav2 we need to add g_idx to the state_dict + # for exllamav2 we need to add g_idx to the state_dict which + # requires some information about linear layers dimensions with torch.device("meta"): meta_model = self.automodel_class.from_config(self.pretrained_config) - for name, module in meta_model.named_modules(): if hasattr(module, "in_features"): state_dict[name + ".g_idx"] = torch.ones((module.in_features,), dtype=torch.int32) if self.is_quantized: + # tricking from_pretrained to load the model as if it was quantized + self.pretrained_config.quantization_config = self.quantization_config.to_dict() + + LOGGER.info("\t+ Creating no weights model directory") + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + os.makedirs(self.no_weights_model, exist_ok=True) + + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + LOGGER.info("\t+ Saving no weights model state_dict") save_file( filename=os.path.join(self.no_weights_model, "model.safetensors"), @@ -292,10 +289,9 @@ def is_awq_quantized(self) -> bool: @property def is_exllamav2(self) -> bool: return ( - self.is_quantized - and
self.is_gptq_quantized - and "exllama_config" in self.config.quantization_config - and self.config.quantization_config["exllama_config"]["version"] == 2 + self.is_gptq_quantized + and "exllama_config" in self.quantization_config + and self.quantization_config["exllama_config"].get("version", None) == 2 ) @property @@ -305,12 +301,14 @@ def automodel_kwargs(self) -> Dict[str, Any]: if self.config.torch_dtype is not None: kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype) - if self.config.use_flash_attention_2: - kwargs["use_flash_attention_2"] = True + if self.config.attn_implementation is not None: + kwargs["attn_implementation"] = self.config.attn_implementation - if self.is_gptq_quantized or self.is_bnb_quantized: - # awq quantization doesn't support overriding the quantization - # config by passing quantization_config to from_pretrained + if self.config.low_cpu_mem_usage is not None: + kwargs["low_cpu_mem_usage"] = self.config.low_cpu_mem_usage + + if self.is_quantized: + kwargs["_fast_init"] = False kwargs["quantization_config"] = self.quantization_config return kwargs @@ -329,24 +327,19 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs @torch.inference_mode() - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - if self.config.library == "diffusers": - return self.pretrained_model(**inputs, **kwargs) - - if self.config.amp_autocast: - with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype): - return self.pretrained_model(**inputs, **kwargs) - else: - return self.pretrained_model(**inputs, **kwargs) + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype, enabled=self.config.amp_autocast): + return self.pretrained_model.forward(**inputs, **kwargs) @torch.inference_mode() - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - if self.config.amp_autocast: - with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype): - return self.pretrained_model.generate(**inputs, **kwargs) - else: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype, enabled=self.config.amp_autocast): return self.pretrained_model.generate(**inputs, **kwargs) + @torch.inference_mode() + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model(**inputs, **kwargs) + def train( self, training_dataset: Dataset, @@ -354,16 +347,12 @@ def train( training_callbacks: List[TrainerCallback], training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], ) -> TrainerState: - from transformers import Trainer, TrainingArguments - - LOGGER.info("\t+ Setting dataset format to `torch`") - training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) LOGGER.info("\t+ Wrapping training arguments with transformers.TrainingArguments") training_arguments = TrainingArguments(**training_arguments) LOGGER.info("\t+ Wrapping model with transformers.Trainer") trainer = Trainer( - model=self.pretrained_model, args=training_arguments, + model=self.pretrained_model, callbacks=training_callbacks, train_dataset=training_dataset, data_collator=training_data_collator, @@ -372,8 +361,6 @@ def train( trainer.train() LOGGER.info("\t+ Training finished successfully") - return trainer.state - def seed(self): 
super().seed() torch.manual_seed(self.config.seed) @@ -385,7 +372,7 @@ def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): - LOGGER.info("\t+ Cleaning temporary directory") + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index 1cbb04ba..d8089f60 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -42,9 +42,10 @@ class PyTorchConfig(BackendConfig): # optimization options eval_mode: bool = True - low_cpu_mem_usage: bool = False to_bettertransformer: bool = False - use_flash_attention_2: bool = False + low_cpu_mem_usage: Optional[bool] = None + attn_implementation: Optional[str] = None + cache_implementation: Optional[str] = None # compilation options torch_compile: bool = False @@ -55,7 +56,6 @@ class PyTorchConfig(BackendConfig): quantization_config: Dict[str, Any] = field(default_factory=dict) # distributed inference options - data_parallel: bool = False deepspeed_inference: bool = False deepspeed_inference_config: Dict[str, Any] = field(default_factory=dict) diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index 43a5fd75..7c86adeb 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,13 +1,13 @@ from logging import getLogger from typing import Any, Dict -from hydra.utils import get_class -from transformers.utils import ModelOutput - from ..base import Backend from .config import TRTLLMConfig from .utils import MODEL_TYPE_TO_TRTLLMMODEL +from hydra.utils import get_class +from transformers.utils import ModelOutput + LOGGER = getLogger("tensorrt-llm") @@ -18,15 +18,15 @@ def __init__(self, config: TRTLLMConfig): super().__init__(config) self.validate_model_type() - self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) - LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") - self.load_trtmodel_from_pretrained() def validate_model_type(self) -> None: if self.model_type not in MODEL_TYPE_TO_TRTLLMMODEL: raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.model_type}") + self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) + LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") + def load_trtmodel_from_pretrained(self) -> None: self.pretrained_model = self.trtmodel_class.from_pretrained( self.config.model, diff --git a/optimum_benchmark/backends/text_generation_inference/backend.py b/optimum_benchmark/backends/text_generation_inference/backend.py index fbd3d1de..538de53c 100644 --- a/optimum_benchmark/backends/text_generation_inference/backend.py +++ b/optimum_benchmark/backends/text_generation_inference/backend.py @@ -6,6 +6,11 @@ from tempfile import TemporaryDirectory from concurrent.futures import ThreadPoolExecutor +from ..base import Backend +from .config import TGIConfig +from ...task_utils import TEXT_GENERATION_TASKS +from ..transformers_utils import randomize_weights + import torch import docker import docker.types @@ -14,10 +19,6 @@ from huggingface_hub import InferenceClient, snapshot_download from huggingface_hub.inference._text_generation import TextGenerationResponse -from ..base import Backend -from .config import TGIConfig -from ..transformers_utils import randomize_weights - # bachend logger LOGGER = 
getLogger("text-generation-inference") @@ -29,8 +30,7 @@ def __init__(self, config: TGIConfig) -> None: super().__init__(config) self.validate_task() - LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}") - + LOGGER.info("\t+ Creating backend temporary directory") self.tmp_dir = TemporaryDirectory() if self.config.no_weights: @@ -40,9 +40,11 @@ def __init__(self, config: TGIConfig) -> None: self.load_model_from_pretrained() def validate_task(self) -> None: - if self.config.task not in ["text-generation", "text2text-generation"]: + if self.config.task not in TEXT_GENERATION_TASKS: raise NotImplementedError(f"TGI does not support task {self.config.task}") + LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}") + def download_pretrained_model(self) -> None: LOGGER.info("\t+ Downloading pretrained model") snapshot_download(self.config.model, **self.config.hub_kwargs) @@ -93,7 +95,7 @@ def create_no_weights_model(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained( self.no_weights_model, **self.config.hub_kwargs, - device_map="auto", + device_map="auto", # for faster/safer loading ) LOGGER.info("\t+ Randomizing weights") diff --git a/optimum_benchmark/backends/text_generation_inference/config.py b/optimum_benchmark/backends/text_generation_inference/config.py index edf37ba3..8b73617e 100644 --- a/optimum_benchmark/backends/text_generation_inference/config.py +++ b/optimum_benchmark/backends/text_generation_inference/config.py @@ -11,6 +11,9 @@ class TGIConfig(BackendConfig): version: Optional[str] = "0.0.1" _target_: str = "optimum_benchmark.backends.text_generation_inference.backend.TGIBackend" + # optimum benchmark specific + no_weights: bool = False + # docker options image: str = "ghcr.io/huggingface/text-generation-inference:latest" volume: str = f"{os.path.expanduser('~')}/.cache/huggingface/hub" @@ -28,9 +31,6 @@ class TGIConfig(BackendConfig): sharded: Optional[bool] = None # None, True, False num_shard: Optional[int] = None # None, 1, 2, 4, 8, 16, 32, 64 - # optimum benchmark specific - no_weights: bool = False # True, False - def __post_init__(self): super().__post_init__() diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 3af970a3..9e2924b2 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -1,22 +1,18 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional -from transformers import PretrainedConfig +from ..import_utils import is_timm_available, is_transformers_available, is_torch_available -from ..import_utils import is_timm_available +if is_torch_available(): + import torch if is_timm_available(): import timm - -def get_timm_pretrained_processor(model: str) -> Any: - try: - pretrained_config = get_timm_pretrained_config(model) - return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config)) - except Exception: - return None +if is_transformers_available(): + from transformers import PretrainedConfig -def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: +def get_timm_pretrained_config(model_name: str) -> "PretrainedConfig": model_source, model_name = timm.models.parse_model_name(model_name) if model_source == "hf-hub": # For model names specified in the form `hf-hub:path/architecture_name@revision`, @@ -27,13 +23,22 @@ def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: return timm.get_pretrained_cfg(model_name) -def 
extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: - shapes = {} +def get_timm_pre_processor(model: str) -> Optional["torch.nn.Module"]: + try: + pretrained_config = get_timm_pretrained_config(model) + return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config)) + except Exception: + return None + + +def extract_timm_shapes_from_config(config: "PretrainedConfig") -> Dict[str, Any]: artifacts_dict = {} config_dict = {k: v for k, v in config.to_dict().items() if v is not None} artifacts_dict.update(config_dict) + shapes = {} + # image input shapes["num_channels"] = artifacts_dict.get("num_channels", None) if shapes["num_channels"] is None: diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index aefce8ea..a7515d2f 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -4,6 +4,11 @@ from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List +from ..transformers_utils import randomize_weights +from ..peft_utils import get_peft_config_class +from .config import TorchORTConfig +from ..base import Backend + import torch from datasets import Dataset from safetensors.torch import save_file @@ -12,11 +17,6 @@ from transformers.utils.logging import set_verbosity_error from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments -from ..transformers_utils import randomize_weights -from ..peft_utils import get_peft_config_class -from .config import TorchORTConfig -from ..base import Backend - # disable transformers logging set_verbosity_error() @@ -28,9 +28,9 @@ class TorchORTBackend(Backend[TorchORTConfig]): def __init__(self, config: TorchORTConfig): super().__init__(config) + self.validate_library() - LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}") - + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: @@ -46,7 +46,11 @@ def __init__(self, config: TorchORTConfig): peft_config = peft_config_class(**self.config.peft_config) self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config) - self.tmpdir.cleanup() + def validate_library(self) -> None: + if self.config.library == "transformers": + LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}") + else: + raise NotImplementedError(f"TorchORTBackend does not support {self.config.library} library") def create_no_weights_model(self) -> None: LOGGER.info("\t+ Creating no weights model directory") @@ -76,9 +80,9 @@ def load_automodel_with_no_weights(self) -> None: self.load_automodel_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing weights") + LOGGER.info("\t+ Randomizing model weights") randomize_weights(self.pretrained_model) - LOGGER.info("\t+ Tying model weights after randomization") + LOGGER.info("\t+ Tying model weights") self.pretrained_model.tie_weights() def load_automodel_from_pretrained(self) -> None: @@ -126,7 +130,7 @@ def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): - LOGGER.info("\t+ Cleaning temporary directory") + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 488adca5..1d7ad410 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,54 +1,49 
@@ import os -import threading -from itertools import chain -from typing import Any, Dict, List, Optional, Sequence, Union, cast - -import torch -from torch.nn.modules import Module -from torch.cuda.amp import autocast -from torch._utils import ExceptionWrapper -from torch.cuda._utils import _get_device_index -from torch.nn.parallel.parallel_apply import get_a_var -from transformers import ( - FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, - GenerationConfig, - PretrainedConfig, - ProcessorMixin, - AutoProcessor, - AutoConfig, -) +from typing import Any, Dict, Optional, Union + +from ..import_utils import is_transformers_available, is_torch_available + +if is_torch_available(): + import torch + +if is_transformers_available(): + from transformers import ( + FeatureExtractionMixin, + ImageProcessingMixin, + PreTrainedTokenizer, + GenerationConfig, + PretrainedConfig, + ProcessorMixin, + AutoProcessor, + AutoConfig, + ) -PretrainedProcessor = Union[ - FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, - ProcessorMixin, -] + PretrainedProcessor = Union[ + FeatureExtractionMixin, + ImageProcessingMixin, + PreTrainedTokenizer, + ProcessorMixin, + ] -def get_transformers_cache_dir(): +def get_transformers_cache_dir() -> str: return os.path.expanduser("~/.cache/huggingface/hub") -def get_transformers_generation_config(model: str, **kwargs: Dict[str, Any]): - try: - # sometimes contains information about the model's input shapes that are not available in the config - return GenerationConfig.from_pretrained(model, **kwargs) - except Exception: - return None +def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig": + # sometimes contains information about the model's input shapes that are not available in the config + return AutoConfig.from_pretrained(model, **kwargs) -def get_transformers_pretrained_config(model: str, **kwargs: Dict[str, Any]): +def get_transformers_generation_config(model: str, **kwargs) -> Optional["GenerationConfig"]: try: # sometimes contains information about the model's input shapes that are not available in the config - return AutoConfig.from_pretrained(model, **kwargs) - except ValueError: + return GenerationConfig.from_pretrained(model, **kwargs) + except Exception: return None -def get_transformers_pretrained_processor(model: str, **kwargs: Dict[str, Any]): +def get_transformers_pre_processor(model: str, **kwargs) -> Optional["PretrainedProcessor"]: try: # sometimes contains information about the model's input shapes that are not available in the config return AutoProcessor.from_pretrained(model, **kwargs) @@ -57,9 +52,9 @@ def get_transformers_pretrained_processor(model: str, **kwargs: Dict[str, Any]): def extract_transformers_shapes_from_artifacts( - config: PretrainedConfig, processor: Optional[PretrainedProcessor] = None + config: "PretrainedConfig", + processor: Optional["PretrainedProcessor"] = None, ) -> Dict[str, Any]: - shapes = {} artifacts_dict = {} config_dict = {k: v for k, v in config.to_dict().items() if v is not None} @@ -68,6 +63,10 @@ def extract_transformers_shapes_from_artifacts( if processor is not None and hasattr(processor, "to_dict"): processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} artifacts_dict.update(processor_dict) + elif processor is not None: + processor_dict = {k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)} + + shapes = {} # text input shapes["vocab_size"] = artifacts_dict.get("vocab_size", 
None) @@ -126,142 +125,20 @@ def extract_transformers_shapes_from_artifacts( return shapes -def randomize_weights(model): +def randomize_weights(model: "torch.nn.Module") -> None: for param in model.parameters(): - if param.data.dtype in (torch.float32, torch.float16, torch.bfloat16): - if torch.cuda.is_available() and param.device.type == "cpu": + if param.data.is_floating_point(): + if torch.cuda.is_available() and param.device.type != "cuda": param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - elif torch.backends.mps.is_available() and param.device.type == "cpu": - param.data.mps_normal_(mean=0.0, std=0.2) + elif torch.backends.mps.is_available() and param.device.type != "mps": + param.data.to("mps").normal_(mean=0.0, std=0.2).cpu() else: param.data.normal_(mean=0.0, std=0.2) - elif param.data.dtype in (torch.int8, torch.int16, torch.int32, torch.int64): - if torch.cuda.is_available() and param.device.type == "cpu": - param.data.cuda().randint_(low=-127, high=127).cpu() - elif torch.backends.mps.is_available() and param.device.type == "cpu": - param.data.mps_randint_(low=-127, high=127) - else: - param.data.randint_(low=-127, high=127) - - -# adapted from torch to use generate instead of forward -def parallel_generate_apply( - modules: Sequence[Module], - inputs: Sequence[Any], - kwargs_tup: Optional[Sequence[Dict[str, Any]]] = None, - devices: Optional[Sequence[Optional[Union[int, torch.device]]]] = None, -) -> List[Any]: - assert len(modules) == len( - inputs - ), f"The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}" - if kwargs_tup is not None: - assert len(modules) == len(kwargs_tup) - else: - kwargs_tup = (cast(Dict[str, Any], {}),) * len(modules) - if devices is not None: - assert len(modules) == len(devices) - else: - devices = [None] * len(modules) - devices = [_get_device_index(x, True) for x in devices] - streams = [torch.cuda.current_stream(x) for x in devices] - lock = threading.Lock() - results = {} - grad_enabled, autocast_enabled = ( - torch.is_grad_enabled(), - torch.is_autocast_enabled(), - ) - - def _worker( - i: int, - module: Module, - input: Any, - kwargs: Dict[str, Any], - device: Optional[Union[int, torch.device]] = None, - stream: Optional[torch.cuda.Stream] = None, - ) -> None: - torch.set_grad_enabled(grad_enabled) - if device is None: - t = get_a_var(input) - if t is None: - with lock: - results[i] = ExceptionWrapper( - where=f"in replica {i}, no device was provided and no tensor input was found; " - "device cannot be resolved" - ) - return - device = t.get_device() - if stream is None: - stream = torch.cuda.current_stream(device) - try: - with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled): - # this also avoids accidental slicing of `input` if it is a Tensor - if not isinstance(input, (list, tuple)): - input = (input,) - output = module.generate(*input, **kwargs) - with lock: - results[i] = output - except Exception: - with lock: - results[i] = ExceptionWrapper(where=f"in replica {i} on device {device}") - - if len(modules) > 1: - threads = [ - threading.Thread(target=_worker, args=(i, module, input, kwargs, device, stream)) - for i, (module, input, kwargs, device, stream) in enumerate( - zip(modules, inputs, kwargs_tup, devices, streams) - ) - ] - - for thread in threads: - thread.start() - for thread in threads: - thread.join() - else: - _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0]) - - outputs = [] - for i in range(len(inputs)): - output = results[i] 
- if isinstance(output, ExceptionWrapper): - output.reraise() - outputs.append(output) - return outputs - -# adapted from torch to support generate -class TransformersDataParallel(torch.nn.DataParallel): - def generate(self, *inputs: Any, **kwargs: Any) -> Any: - with torch.autograd.profiler.record_function("DataParallel.generate"): - if not self.device_ids: - return self.module.generate(*inputs, **kwargs) - - for t in chain(self.module.parameters(), self.module.buffers()): - if t.device != self.src_device_obj: - raise RuntimeError( - "module must have its parameters and buffers " - f"on device {self.src_device_obj} (device_ids[0]) but found one of " - f"them on device: {t.device}" - ) - - inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids) - # for forward function without any inputs, empty list and dict will be created - # so the module can be executed on one device which is the first one in device_ids - if not inputs and not module_kwargs: - inputs = ((),) - module_kwargs = ({},) - - if len(self.device_ids) == 1: - return self.module.generate(*inputs[0], **module_kwargs[0]) - - replicas = self.replicate(self.module, self.device_ids[: len(inputs)]) - outputs = self.parallel_generate_apply(replicas, inputs, module_kwargs) - return self.gather(outputs, self.output_device) - - def parallel_generate_apply(self, replicas: Sequence, inputs: Sequence, kwargs: Any) -> List[Any]: - return parallel_generate_apply(replicas, inputs, kwargs, self.device_ids[: len(replicas)]) - - def __getattr__(self, name: str) -> Any: - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.module, name) + elif param.data.dtype in (torch.int32, torch.int16, torch.int8): + if torch.cuda.is_available() and param.device.type != "cuda": + param.data.copy_(torch.randint(-127, 127, param.data.shape, device="cuda")) + elif torch.backends.mps.is_available() and param.device.type != "mps": + param.data.copy_(torch.randint(-127, 127, param.data.shape, device="mps")) + else: + param.data.copy_(torch.randint(-127, 127, param.data.shape)) diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index dbc68c3c..84495a1a 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -1,19 +1,17 @@ from abc import ABC from logging import getLogger -from typing import ClassVar, Generic, Dict, Any +from typing import ClassVar, Generic from ..backends.base import Backend +from .report import BenchmarkReport from .config import BenchmarkConfigT - LOGGER = getLogger("benchmark") class Benchmark(Generic[BenchmarkConfigT], ABC): NAME: ClassVar[str] - config: BenchmarkConfigT - def __init__(self, config: BenchmarkConfigT) -> None: LOGGER.info(f"Allocating {self.NAME} benchmark") self.config = config @@ -21,5 +19,5 @@ def __init__(self, config: BenchmarkConfigT) -> None: def run(self, backend: Backend) -> None: raise NotImplementedError("Benchmark must implement run method") - def report(self) -> Dict[str, Any]: - raise NotImplementedError("Benchmark must implement save method") + def get_report(self) -> BenchmarkReport: + raise NotImplementedError("Benchmark must implement report method") diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py index 23e479b4..9cc96ee1 100644 --- a/optimum_benchmark/benchmarks/inference/benchmark.py +++ b/optimum_benchmark/benchmarks/inference/benchmark.py @@ -1,24 +1,28 @@ -import os -import statistics from logging import getLogger 
-from typing import List, Dict, Any +from typing import List, Tuple, Dict from ..base import Benchmark from .config import InferenceConfig -from ...backends.base import Backend from ...trackers.energy import EnergyTracker from ...trackers.memory import MemoryTracker from ...trackers.latency import LatencyTracker +from ...backends.base import Backend, BackendConfigT from ...generators.input_generator import InputGenerator -from ...task_utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS +from ...import_utils import is_torch_distributed_available +from ...task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS +from .report import InferenceReport, TextGenerationReport, ImageDiffusionReport + +if is_torch_distributed_available(): + import torch.distributed LOGGER = getLogger("inference") -DIFFUSION_KWARGS = { +IMAGE_DIFFUSION_KWARGS = { + "num_inference_steps": 30, "num_images_per_prompt": 1, } -GENERATE_KWARGS = { +TEXT_GENERATION_KWARGS = { "num_return_sequences": 1, "max_new_tokens": 100, "min_new_tokens": 100, @@ -36,45 +40,13 @@ class InferenceBenchmark(Benchmark[InferenceConfig]): def __init__(self, config: InferenceConfig) -> None: super().__init__(config) - self.forward_energy: float = 0 - self.forward_emissions: float = 0 - self.forward_max_memory_used: int = 0 - self.forward_max_memory_allocated: int = 0 - self.forward_max_memory_reserved: int = 0 - self.forward_latencies: List[float] = [] - - self.generate_energy: float = 0 - self.generate_emissions: float = 0 - self.generate_max_memory_used: int = 0 - self.generate_max_memory_allocated: int = 0 - self.generate_max_memory_reserved: int = 0 - self.generate_latencies: List[float] = [] - - def run(self, backend: Backend) -> None: - self.can_diffuse = backend.config.task in DIFFUSION_TASKS - self.can_generate = backend.config.task in TEXT_GENERATION_TASKS - - if self.can_diffuse: - LOGGER.info("\t+ Updating forward kwargs with default values") - self.config.forward_kwargs = { - **DIFFUSION_KWARGS, - **self.config.forward_kwargs, - } - if self.can_generate: - LOGGER.info("\t+ Updating generate kwargs with default values") - self.config.generate_kwargs = { - **GENERATE_KWARGS, - **self.config.generate_kwargs, - } - - # compile with static shapes if needed - LOGGER.info("\t+ Preparing backend for inference") - backend.prepare_for_inference( - **backend.model_shapes, - **self.config.input_shapes, - **self.config.forward_kwargs, - **self.config.generate_kwargs, - ) + def run(self, backend: Backend[BackendConfigT]) -> None: + if is_torch_distributed_available() and torch.distributed.is_initialized(): + if self.config.input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: + raise ValueError( + "The batch size must be divisible by the number of processes in a distributed environment" + ) + self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size() LOGGER.info("\t+ Creating input generator") self.input_generator = InputGenerator( @@ -83,226 +55,223 @@ def run(self, backend: Backend) -> None: input_shapes=self.config.input_shapes, ) - # run memory tracking - # we do this first to measure the memory on the first call to forward/generate - if self.config.memory: - self.run_forward_memory_tracking(backend) - if self.can_generate: - self.run_generate_memory_tracking(backend) + if backend.config.task in TEXT_GENERATION_TASKS: + LOGGER.info("\t+ Generating and preparing Text Generation input") + self.forward_inputs = self.input_generator(mode="forward") + self.generate_input = 
self.input_generator(mode="generate") + self.forward_inputs = backend.prepare_inputs(self.forward_inputs) + self.generate_input = backend.prepare_inputs(self.generate_input) + LOGGER.info("\t+ Updating Text Generation kwargs with default values") + self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs} + LOGGER.info("\t+ Initializing Text Generation report") + self.report = TextGenerationReport( + batch_size=self.config.input_shapes["batch_size"], + sequence_length=self.config.input_shapes["sequence_length"], + num_new_tokens=self.config.generate_kwargs["max_new_tokens"], + num_return_sequences=self.config.generate_kwargs["num_return_sequences"], + ) + + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + LOGGER.info("\t+ Generating and preparing Image Diffusion input") + self.diffuse_input = self.input_generator(mode="call") + self.diffuse_input = backend.prepare_inputs(self.diffuse_input) + LOGGER.info("\t+ Updating Image Diffusion kwargs with default values") + self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs} + LOGGER.info("\t+ Initializing Image Diffusion report") + self.report = ImageDiffusionReport( + batch_size=self.config.input_shapes["batch_size"], + num_images_per_prompts=self.config.forward_kwargs["num_images_per_prompt"], + ) + + else: + LOGGER.info("\t+ Generating and preparing Inference input") + self.forward_inputs = self.input_generator(mode="forward") + self.forward_inputs = backend.prepare_inputs(self.forward_inputs) + LOGGER.info("\t+ Initializing Inference report") + self.report = InferenceReport( + batch_size=self.config.input_shapes["batch_size"], + ) + + LOGGER.info("\t+ Preparing backend for Inference") + backend.prepare_for_inference( + **backend.model_shapes, + **self.config.input_shapes, + **self.config.forward_kwargs, + **self.config.generate_kwargs, + ) - # run lacency tracking - self.run_forward_latency_tracking(backend) - if self.can_generate: - self.run_generate_latency_tracking(backend) + LOGGER.info("\t+ Warming up backend for Inference") + for _ in range(self.config.warmup_runs): + if backend.config.task in TEXT_GENERATION_TASKS: + generate_warmup_kwargs = {"max_new_tokens": 2, "min_new_tokens": 2} + _ = backend.generate(self.generate_input, generate_warmup_kwargs) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + diffuse_warmup_kwargs = {"num_inference_steps": 2} + _ = backend.call(self.diffuse_input, diffuse_warmup_kwargs) + else: + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + if self.config.memory: + LOGGER.info("\t+ Creating inference memory tracker") + self.memory_tracker = MemoryTracker( + backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids + ) + if backend.config.task in TEXT_GENERATION_TASKS: + forward_memories_dict, generate_memories_dict = self.run_text_generation_memory_tracking(backend) + self.report.populate_memory(forward_memories_dict, generate_memories_dict) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + call_memories_dict = self.run_image_diffusion_memory_tracking(backend) + self.report.populate_memory(call_memories_dict) + else: + forward_memories_dict = self.run_inference_memory_tracking(backend) + self.report.populate_memory(forward_memories_dict) + + self.report.log_memory() + + if self.config.latency: + LOGGER.info("\t+ Creating inference latency tracker") + self.latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) + if backend.config.task in 
TEXT_GENERATION_TASKS: + forward_latencies_dict, generate_latencies_dict = self.run_text_generation_latency_tracking(backend) + self.report.populate_latency(forward_latencies_dict, generate_latencies_dict) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + call_latencies_dict = self.run_image_diffusion_latency_tracking(backend) + self.report.populate_latency(call_latencies_dict) + else: + forward_latencies_dict = self.run_latency_inference_tracking(backend) + self.report.populate_latency(forward_latencies_dict) + + self.report.log_latency() - # run energy tracking if self.config.energy: - self.run_forward_energy_tracking(backend) - if self.can_generate: - self.run_generate_energy_tracking(backend) + LOGGER.info("\t+ Creating inference energy tracker") + self.energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) + if backend.config.task in TEXT_GENERATION_TASKS: + forward_energies_dict, generate_energies_dict = self.run_text_generation_energy_tracking(backend) + self.report.populate_energy(forward_energies_dict, generate_energies_dict) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + call_energies_dict = self.run_image_diffusion_energy_tracking(backend) + self.report.populate_energy(call_energies_dict) + else: + forward_energies_dict = self.run_inference_energy_tracking(backend) + self.report.populate_energy(forward_energies_dict) - def run_forward_latency_tracking(self, backend: "Backend") -> None: - forward_input = self.input_generator.generate(mode="forward") + self.report.log_energy() - LOGGER.info("\t+ Preparing input for the forward pass") - forward_input = backend.prepare_inputs(forward_input) + ## Memory tracking + def run_text_generation_memory_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + LOGGER.info("\t+ Running memory tracking") + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - LOGGER.info("\t+ Warming up the forward pass") - for _ in range(self.config.warmup_runs): - _ = backend.forward(forward_input, self.config.forward_kwargs) - - LOGGER.info("\t+ Tracking forward pass latency and throughput") - latency_tracker = LatencyTracker(device=backend.config.device, backend=backend.config.name) - while sum(self.forward_latencies) < self.config.duration: - with latency_tracker.track(): - _ = backend.forward(forward_input, self.config.forward_kwargs) - self.forward_latencies = latency_tracker.get_latencies() - - LOGGER.debug(f"\t+ Forward pass latency: {self.forward_latency:.3g} (s)") - LOGGER.debug(f"\t+ Forward pass throughput: {self.forward_throughput:.3g} (samples/s)") - - def run_forward_energy_tracking(self, backend: Backend) -> None: - forward_input = self.input_generator.generate(mode="forward") - - LOGGER.info("\t+ Preparing input for the forward pass") - forward_input = backend.prepare_inputs(forward_input) - - LOGGER.info("\t+ Tracking forward pass energy consumption") - num_forward_passes = 0 - energy_tracker = EnergyTracker() - with energy_tracker.track(interval=1, file_prefix="forward"): - while energy_tracker.get_elapsed_time() < self.config.duration: - _ = backend.forward(forward_input, self.config.forward_kwargs) - num_forward_passes += 1 - num_forward_samples = num_forward_passes * self.config.input_shapes["batch_size"] - self.forward_energy = energy_tracker.get_total_energy() / num_forward_samples - self.forward_emissions = energy_tracker.get_total_emissions() / num_forward_samples - - 
LOGGER.debug(f"\t+ Forward pass energy consumption: {self.forward_energy:.3g} (kWh/sample)") - LOGGER.debug(f"\t+ Forward pass carbon emissions: {self.forward_emissions:.3g} (kgCO2eq/sample)") - LOGGER.debug(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/forward_codecarbon.csv") - - def run_forward_memory_tracking(self, backend: "Backend") -> None: - forward_input = self.input_generator.generate(mode="forward") - - LOGGER.info("\t+ Preparing input for the forward pass") - forward_input = backend.prepare_inputs(forward_input) - - LOGGER.info("\t+ Tracking forward pass peak memory") - memory_tracker = MemoryTracker(device=backend.config.device, backend=backend.config.name) - with memory_tracker.track(): - _ = backend.forward(forward_input, self.config.forward_kwargs) - self.forward_max_memory_used = memory_tracker.get_max_memory_used() - self.forward_max_memory_reserved = memory_tracker.get_max_memory_reserved() - self.forward_max_memory_allocated = memory_tracker.get_max_memory_allocated() - - LOGGER.debug(f"\t+ Forward pass max memory used: {self.forward_max_memory_used:.3g} (MB)") - LOGGER.debug(f"\t+ Forward pass max memory reserved: {self.forward_max_memory_reserved:.3g} (MB)") - LOGGER.debug(f"\t+ Forward pass max memory allocated: {self.forward_max_memory_allocated:.3g} (MB)") - - def run_generate_latency_tracking(self, backend: "Backend") -> None: - generate_input = self.input_generator.generate(mode="generate") - - LOGGER.info("\t+ Preparing input for the generation pass") - generate_input = backend.prepare_inputs(generate_input) - - LOGGER.info("\t+ Warming up the generation pass") - _ = backend.generate(generate_input, self.config.generate_kwargs) - - LOGGER.info("\t+ Tracking generation latency and throughput") - latency_tracker = LatencyTracker(device=backend.config.device, backend=backend.config.name) - while sum(self.generate_latencies) < self.config.duration: - with latency_tracker.track(): - _ = backend.generate(generate_input, self.config.generate_kwargs) - self.generate_latencies = latency_tracker.get_latencies() - - LOGGER.debug(f"\t+ Generation pass latency: {self.generate_latency:.3g} (s)") - LOGGER.debug(f"\t+ Generation pass throughput: {self.generate_throughput:.3g} (tokens/s)") - - def run_generate_energy_tracking(self, backend: Backend) -> None: - generate_input = self.input_generator.generate(mode="generate") - - LOGGER.info("\t+ Preparing input for the generation pass") - generate_input = backend.prepare_inputs(generate_input) - - LOGGER.info("\t+ Tracking generation pass energy consumption") - num_generate_passes = 0 - energy_tracker = EnergyTracker() - with energy_tracker.track(interval=1, file_prefix="generate"): - while energy_tracker.get_elapsed_time() < self.config.duration: - _ = backend.generate(generate_input, self.config.generate_kwargs) - num_generate_passes += 1 - num_generated_tokens = ( - num_generate_passes - * self.config.generate_kwargs["min_new_tokens"] - * self.config.generate_kwargs["num_return_sequences"] - * self.config.input_shapes["batch_size"] - ) - self.generate_energy = energy_tracker.get_total_energy() / num_generated_tokens - self.generate_emissions = energy_tracker.get_total_emissions() / num_generated_tokens - - LOGGER.debug(f"\t+ Generation pass energy consumption: {self.generate_energy:.3g} (kWh/token)") - LOGGER.debug(f"\t+ Generation pass carbon emissions: {self.generate_emissions:.3g} (kgCO2eq/token)") - LOGGER.debug(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/generate_codecarbon.csv") - - def 
run_generate_memory_tracking(self, backend: "Backend") -> None: - generate_input = self.input_generator.generate(mode="generate") - - LOGGER.info("\t+ Preparing input for the generation pass") - generate_input = backend.prepare_inputs(generate_input) - - LOGGER.info("\t+ Tracking generation pass peak memory") - memory_tracker = MemoryTracker(device=backend.config.device, backend=backend.config.name) - with memory_tracker.track(): - _ = backend.generate(generate_input, self.config.generate_kwargs) - self.generate_max_memory_used = memory_tracker.get_max_memory_used() - self.generate_max_memory_reserved = memory_tracker.get_max_memory_reserved() - self.generate_max_memory_allocated = memory_tracker.get_max_memory_allocated() - - LOGGER.debug(f"\t+ Generation pass max memory used: {self.generate_max_memory_used:.3g} (MB)") - LOGGER.debug(f"\t+ Generation pass max memory reserved: {self.generate_max_memory_reserved:.3g} (MB)") - LOGGER.debug(f"\t+ Generation pass max memory allocated: {self.generate_max_memory_allocated:.3g} (MB)") - - # Metrics - ## Forward pass metrics - @property - def forward_latency(self) -> float: - return statistics.mean(self.forward_latencies) - - @property - def forward_throughput(self) -> float: - return self.config.input_shapes["batch_size"] / self.forward_latency - - ## Generation pass metrics - @property - def generate_latency(self) -> float: - return statistics.mean(self.generate_latencies) - - @property - def generate_throughput(self) -> float: - return ( - self.config.generate_kwargs["min_new_tokens"] - * self.config.generate_kwargs["num_return_sequences"] - * self.config.input_shapes["batch_size"] - / self.generate_latency - ) + forward_memories_dict = self.memory_tracker.get_memories_dict() - @property - def decode_latency(self) -> float: - return self.generate_latency - self.forward_latency - - @property - def decode_throughput(self) -> float: - return ( - (self.config.generate_kwargs["min_new_tokens"] - 1) - * self.config.generate_kwargs["num_return_sequences"] - * self.config.input_shapes["batch_size"] - / self.decode_latency - ) + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.generate(self.generate_input, self.config.generate_kwargs) - ## Diffusion pass metrics - @property - def diffusion_throughput(self) -> float: - return ( - self.config.input_shapes["batch_size"] - * self.config.forward_kwargs["num_images_per_prompt"] - / self.forward_latency - ) + generate_memories_dict = self.memory_tracker.get_memories_dict() - def report(self) -> Dict[str, Any]: - report_dict = {} + return forward_memories_dict, generate_memories_dict - report_dict["forward.latency(s)"] = self.forward_latency - report_dict["forward.throughput(samples/s)"] = self.forward_throughput + def run_image_diffusion_memory_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running memory tracking") + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - if self.can_diffuse: - report_dict["diffusion.throughput(images/s)"] = self.diffusion_throughput + call_memories_dict = self.memory_tracker.get_memories_dict() - if self.config.memory: - report_dict["forward.peak_memory(MB)"] = self.forward_max_memory_used - report_dict["forward.max_memory_used(MB)"] = self.forward_max_memory_used - report_dict["forward.max_memory_allocated(MB)"] = self.forward_max_memory_allocated - report_dict["forward.max_memory_reserved(MB)"] = self.forward_max_memory_reserved + return 
call_memories_dict - if self.config.energy: - report_dict["forward.energy_consumption(kWh/sample)"] = self.forward_energy - report_dict["forward.carbon_emissions(kgCO2eq/sample)"] = self.forward_emissions + def run_inference_memory_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running memory tracking") + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_memories_dict = self.memory_tracker.get_memories_dict() + + return forward_memories_dict + + ## Latency tracking + def run_text_generation_latency_tracking(self, backend: Backend) -> Tuple[List[float], List[float]]: + LOGGER.info("\t+ Running latency tracking") + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_latencies_list = self.latency_tracker.get_latencies_list() + + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.generate(self.generate_input, self.config.generate_kwargs) + + generate_latencies_list = self.latency_tracker.get_latencies_list() + + return forward_latencies_list, generate_latencies_list + + def run_image_diffusion_latency_tracking(self, backend: Backend) -> List[float]: + LOGGER.info("\t+ Running latency tracking") + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + + call_latencies_list = self.latency_tracker.get_latencies_list() + + return call_latencies_list + + def run_latency_inference_tracking(self, backend: Backend) -> List[float]: + LOGGER.info("\t+ Running latency tracking") + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_latencies_list = self.latency_tracker.get_latencies_list() + + return forward_latencies_list + + ## Energy tracking + def run_text_generation_energy_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + LOGGER.info("\t+ Running energy tracking") + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_energies_dict = self.energy_tracker.get_energies_dict() + + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.generate(self.generate_input, self.config.generate_kwargs) + + generate_energies_dict = self.energy_tracker.get_energies_dict() + + return forward_energies_dict, generate_energies_dict + + def run_image_diffusion_energy_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running energy tracking") + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + + call_energies_dict = self.energy_tracker.get_energies_dict() - if self.can_generate: - report_dict["generate.latency(s)"] = self.generate_latency - report_dict["generate.throughput(tokens/s)"] = self.generate_throughput + return call_energies_dict - report_dict["decode.latency(s)"] = self.decode_latency - report_dict["decode.throughput(tokens/s)"] = self.decode_throughput + 
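[Editor's note: the run_*_tracking methods introduced above all follow the same reset/track/collect pattern. The standalone sketch below is not the library's tracker implementation; the class name SimpleLatencyTracker and its internals are simplified assumptions, shown only to illustrate how a context-manager-based tracker can accumulate measurements until a configured duration is reached.]

import time
from contextlib import contextmanager
from typing import List


class SimpleLatencyTracker:
    """Minimal stand-in illustrating the reset/track/collect pattern used by the benchmark trackers."""

    def __init__(self) -> None:
        self.latencies: List[float] = []

    def reset(self) -> None:
        # clear measurements from any previous tracking run
        self.latencies = []

    @contextmanager
    def track(self):
        # time the block of code executed inside the `with` statement
        start = time.perf_counter()
        yield
        self.latencies.append(time.perf_counter() - start)

    def get_total_latency(self) -> float:
        return sum(self.latencies)

    def get_latencies_list(self) -> List[float]:
        return list(self.latencies)


# usage, mirroring the shape of run_latency_inference_tracking: keep running the
# workload until the accumulated latency exceeds the configured duration
tracker = SimpleLatencyTracker()
duration = 2.0  # seconds, stands in for self.config.duration
tracker.reset()
while tracker.get_total_latency() < duration:
    with tracker.track():
        time.sleep(0.1)  # stands in for backend.forward(...)
print(tracker.get_latencies_list())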
def run_inference_energy_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running energy tracking") + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - if self.config.memory: - report_dict["generate.peak_memory(MB)"] = self.generate_max_memory_used - report_dict["generate.max_memory_used(MB)"] = self.generate_max_memory_used - report_dict["generate.max_memory_allocated(MB)"] = self.generate_max_memory_allocated - report_dict["generate.max_memory_reserved(MB)"] = self.generate_max_memory_reserved + forward_energies_dict = self.energy_tracker.get_energies_dict() - if self.config.energy: - report_dict["generate.energy_consumption(kWh/token)"] = self.generate_energy - report_dict["generate.carbon_emissions(kgCO2eq/token)"] = self.generate_emissions + return forward_energies_dict - return report_dict + def get_report(self) -> InferenceReport: + return self.report diff --git a/optimum_benchmark/benchmarks/inference/callback.py b/optimum_benchmark/benchmarks/inference/callback.py new file mode 100644 index 00000000..4871691d --- /dev/null +++ b/optimum_benchmark/benchmarks/inference/callback.py @@ -0,0 +1,25 @@ +import time + +from ...import_utils import is_torch_available + +from transformers import LogitsProcessor + +if is_torch_available(): + import torch + + +# TODO: use this class for more fine-grained latency measurements in text generation +class MeasurementProcessor(LogitsProcessor): + def __init__(self, device: str, backend: str): + self.device = device + self.backend = backend + + self.latencies = [] + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + """ + Callback to track the time it takes to generate one batch of tokens. 
+ """ + self.latencies.append(time.perf_counter_ns()) + + return scores diff --git a/optimum_benchmark/benchmarks/inference/config.py b/optimum_benchmark/benchmarks/inference/config.py index 1299ca85..d5c4a0bb 100644 --- a/optimum_benchmark/benchmarks/inference/config.py +++ b/optimum_benchmark/benchmarks/inference/config.py @@ -2,33 +2,15 @@ from typing import Any, Dict, Optional from dataclasses import dataclass, field -from ..config import BenchmarkConfig from ...env_utils import is_rocm_system +from ..config import BenchmarkConfig LOGGER = getLogger("inference") INPUT_SHAPES = { - # used with all tasks "batch_size": 2, - # used with text input tasks "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) - "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, -} - -GENERATE_CONFIG = { - "num_return_sequences": 1, - "max_new_tokens": 100, - "min_new_tokens": 100, - "do_sample": False, - "use_cache": True, - "pad_token_id": 0, - "temperature": 1.0, - "num_beams": 1, + "num_choices": 2, } @@ -38,37 +20,73 @@ class InferenceConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark" # benchmark options - duration: int = 10 - warmup_runs: int = 10 + duration: int = field(default=10, metadata={"help": "Minimum duration of the benchmark in seconds"}) + warmup_runs: int = field(default=10, metadata={"help": "Number of warmup runs to perform before benchmarking"}) - # additional/optional metrics - memory: bool = False - energy: bool = False + # input/output shapes + input_shapes: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Input shapes for the model. Missing keys will be filled with default values."}, + ) + new_tokens: Optional[int] = field( + default=None, + metadata={"help": "Deprecated. If set, `max_new_tokens` and `min_new_tokens` will be set to this value."}, + ) - # input options - input_shapes: Dict = field(default_factory=dict) - # output options - new_tokens: Optional[int] = None + # tracking options + energy: bool = field(default=False, metadata={"help": "Measure energy usage"}) + memory: bool = field(default=False, metadata={"help": "Measure max memory usage"}) + latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) - # forward options - forward_kwargs: Dict[str, Any] = field(default_factory=dict) - # generation options - generate_kwargs: Dict[str, Any] = field(default_factory=dict) + # methods kwargs + forward_kwargs: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments to pass to the forward method of the model."}, + ) + generate_kwargs: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments to pass to the generate method of the model."}, + ) + call_kwargs: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."}, + ) def __post_init__(self): super().__post_init__() self.input_shapes = {**INPUT_SHAPES, **self.input_shapes} - self.generate_kwargs = {**GENERATE_CONFIG, **self.generate_kwargs} - - if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]: - raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.") if self.new_tokens is not None: + LOGGER.warning( + "`new_tokens` is deprecated. Use `max_new_tokens` and `min_new_tokens` instead. 
" + "Setting `max_new_tokens` and `min_new_tokens` to `new_tokens`." + ) self.generate_kwargs["max_new_tokens"] = self.new_tokens self.generate_kwargs["min_new_tokens"] = self.new_tokens - else: - self.new_tokens = self.generate_kwargs["min_new_tokens"] + + if ( + "max_new_tokens" in self.generate_kwargs + and "min_new_tokens" in self.generate_kwargs + and self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"] + ): + raise ValueError( + "Setting `min_new_tokens` and `max_new_tokens` to different values results in non-deterministic behavior." + ) + + elif "max_new_tokens" in self.generate_kwargs and "min_new_tokens" not in self.generate_kwargs: + LOGGER.warning( + "Setting `max_new_tokens` without `min_new_tokens` results in non-deterministic behavior. " + "Setting `min_new_tokens` to `max_new_tokens`." + ) + self.generate_kwargs["min_new_tokens"] = self.generate_kwargs["max_new_tokens"] + + elif "min_new_tokens" in self.generate_kwargs and "max_new_tokens" not in self.generate_kwargs: + LOGGER.warning( + "Setting `min_new_tokens` without `max_new_tokens` results in non-deterministic behavior. " + "Setting `max_new_tokens` to `min_new_tokens`." + ) + self.generate_kwargs["max_new_tokens"] = self.generate_kwargs["min_new_tokens"] if self.energy and is_rocm_system(): raise ValueError("Energy measurement through codecarbon is not yet available on ROCm-powered devices.") diff --git a/optimum_benchmark/benchmarks/inference/report.py b/optimum_benchmark/benchmarks/inference/report.py new file mode 100644 index 00000000..9cd43cfc --- /dev/null +++ b/optimum_benchmark/benchmarks/inference/report.py @@ -0,0 +1,353 @@ +from dataclasses import dataclass, field +from statistics import mean, stdev +from typing import Any, Dict, List +from logging import getLogger + +from ..report import BenchmarkReport + +LOGGER = getLogger("report") + + +@dataclass +class InferenceReport(BenchmarkReport): + # Config + batch_size: int + # Metrics + forward: Dict[str, Any] = field(default_factory=dict) + + # POPULATING + def populate_latency(self, forward_latencies_list: List[float]): + ## Latency + self.forward["latency"] = { + "list[s]": forward_latencies_list, + "mean(s)": compute_mean(forward_latencies_list), + "stdev(s)": compute_stdev(forward_latencies_list), + } + ## Throughput + forward_throughputs_list = [self.batch_size / latency for latency in forward_latencies_list] + self.forward["throughput"] = { + "list[samples/s]": forward_throughputs_list, + "mean(samples/s)": compute_mean(forward_throughputs_list), + "stdev(samples/s)": compute_stdev(forward_throughputs_list), + } + + def populate_memory(self, forward_memories_dict: Dict[str, Any]): + self.forward["memory"] = forward_memories_dict + + def populate_energy(self, forward_energies_dict: Dict[str, Any]): + self.forward["energy"] = forward_energies_dict + + # LOGGING + def log_latency(self): + for key, value in self.forward["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ forward.latency.{key}: {value:f} (s)") + for key, value in self.forward["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ forward.throughput.{key}: {value:f} (samples/s)") + + def log_memory(self): + for key, value in self.forward["memory"].items(): + LOGGER.info(f"\t+ forward.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.forward["energy"].items(): + LOGGER.info(f"\t+ forward.energy.{key}: {value:f} (kWh)") + + def log_all(self) -> None: + if "latency" in self.forward: + 
self.log_latency() + if "memory" in self.forward: + self.log_memory() + if "energy" in self.forward: + self.log_energy() + + # add operator to aggregate multiple reports + def __add__(self, other: "InferenceReport") -> "InferenceReport": + agg_report = InferenceReport(batch_size=self.batch_size + other.batch_size) + if "latency" in self.forward and "latency" in other.forward: + agg_forward_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.forward["latency"]["list[s]"], other.forward["latency"]["list[s]"]) + ] + agg_report.populate_latency(agg_forward_latencies_list) + + if "memory" in self.forward and "memory" in other.forward: + agg_forward_memories_dict = {} + for key in self.forward["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_forward_memories_dict[key] = max(self.forward["memory"][key], other.forward["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_forward_memories_dict[key] = self.forward["memory"][key] + other.forward["memory"][key] + + agg_report.populate_memory(agg_forward_memories_dict) + + if "energy" in self.forward and "energy" in other.forward: + agg_forward_energies_dict = {} + for key in self.forward["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_forward_energies_dict[key] = self.forward["energy"][key] + other.forward["energy"][key] + + agg_report.populate_energy(agg_forward_energies_dict) + + return agg_report + + +@dataclass +class ImageDiffusionReport(BenchmarkReport): + # Config + batch_size: int + num_images_per_prompts: int + # Metrics + call: Dict[str, Any] = field(default_factory=dict) + + # POPULATING + def populate_latency(self, call_latencies_list: List[float]): + ## Latency + self.call["latency"] = { + "list[s]": call_latencies_list, + "mean(s)": compute_mean(call_latencies_list), + "stdev(s)": compute_stdev(call_latencies_list), + } + ## Throughput + call_throughputs_list = [ + self.batch_size * self.num_images_per_prompts / latency for latency in call_latencies_list + ] + self.call["throughput"] = { + "list[images/s]": call_throughputs_list, + "mean[images/s]": compute_mean(call_throughputs_list), + "stdev[images/s]": compute_stdev(call_throughputs_list), + } + + def populate_memory(self, call_memories_dict: Dict[str, Any]): + self.call["memory"] = call_memories_dict + + def populate_energy(self, call_energies_dict: Dict[str, Any]): + self.call["energy"] = call_energies_dict + + # LOGGING + def log_latency(self): + for key, value in self.call["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ call.latency.{key}: {value:f} (s)") + for key, value in self.call["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ call.throughput.{key}: {value:f} (images/s)") + + def log_memory(self): + for key, value in self.call["memory"].items(): + LOGGER.info(f"\t+ call.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.call["energy"].items(): + LOGGER.info(f"\t+ call.energy.{key}: {value:f} (kWh)") + + def log_all(self) -> None: + if "latency" in self.call: + self.log_latency() + if "memory" in self.call: + self.log_memory() + if "energy" in self.call: + self.log_energy() + + # add operator to aggregate multiple reports + def __add__(self, other: "ImageDiffusionReport") -> "ImageDiffusionReport": + assert self.num_images_per_prompts == other.num_images_per_prompts, "num_images_per_prompts must be the same" + + agg_report = 
ImageDiffusionReport( + batch_size=self.batch_size + other.batch_size, + num_images_per_prompts=self.num_images_per_prompts, + ) + if "latency" in self.call and "latency" in other.call: + agg_call_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.call["latency"]["list[s]"], other.call["latency"]["list[s]"]) + ] + agg_report.populate_latency(agg_call_latencies_list) + + if "memory" in self.call and "memory" in other.call: + agg_call_memories_dict = {} + for key in self.call["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_call_memories_dict[key] = max(self.call["memory"][key], other.call["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_call_memories_dict[key] = self.call["memory"][key] + other.call["memory"][key] + + agg_report.populate_memory(agg_call_memories_dict) + + if "energy" in self.call and "energy" in other.call: + agg_call_energies_dict = {} + for key in self.call["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_call_energies_dict[key] = self.call["energy"][key] + other.call["energy"][key] + + agg_report.populate_energy(agg_call_energies_dict) + + return agg_report + + +@dataclass +class TextGenerationReport(BenchmarkReport): + # Config + batch_size: int + sequence_length: int + num_new_tokens: int + num_return_sequences: int + # Prefill Metrics + prefill: Dict[str, Any] = field(default_factory=dict) + # Decode Metrics + decode: Dict[str, Any] = field(default_factory=dict) + + def populate_latency(self, forward_latencies_list: List[float], generate_latencies_list: List[float]): + ## Latency + self.prefill["latency"] = { + "list[s]": forward_latencies_list, + "mean(s)": compute_mean(forward_latencies_list), + "stdev(s)": compute_stdev(forward_latencies_list), + } + ## Throughput + prefill_throughputs_list = [ + self.batch_size * self.sequence_length / latency for latency in forward_latencies_list + ] + self.prefill["throughput"] = { + "list[tokens/s]": prefill_throughputs_list, + "mean[tokens/s]": compute_mean(prefill_throughputs_list), + "stdev[tokens/s]": compute_stdev(prefill_throughputs_list), + } + ## Latency + decode_latencies_list = [ + generate_latency - self.prefill["latency"]["mean(s)"] for generate_latency in generate_latencies_list + ] + self.decode["latency"] = { + "list[s]": decode_latencies_list, + "mean(s)": compute_mean(decode_latencies_list), + "stdev(s)": compute_stdev(decode_latencies_list), + } + ## Throughput + decode_throughputs_list = [ + self.batch_size * self.num_new_tokens * self.num_return_sequences / latency + for latency in decode_latencies_list + ] + self.decode["throughput"] = { + "list[tokens/s]": decode_throughputs_list, + "mean[tokens/s]": compute_mean(decode_throughputs_list), + "stdev[tokens/s]": compute_stdev(decode_throughputs_list), + } + + def populate_memory(self, forward_memories_dict: Dict[str, Any], generate_memories_dict: Dict[str, Any]): + self.prefill["memory"] = forward_memories_dict + self.decode["memory"] = generate_memories_dict + + def populate_energy(self, forward_energies_dict: Dict[str, Any], generate_energies_dict: Dict[str, Any]): + self.prefill["energy"] = forward_energies_dict + self.decode["energy"] = generate_energies_dict + + # LOGGING + def log_latency(self): + for key, value in self.prefill["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ prefill.latency.{key}: {value:f} (s)") + for key, value in self.prefill["throughput"].items(): + 
if "list" in key: + continue + LOGGER.info(f"\t+ prefill.throughput.{key}: {value:f} (tokens/s)") + for key, value in self.decode["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ decode.latency.{key}: {value:f} (s)") + for key, value in self.decode["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ decode.throughput.{key}: {value:f} (tokens/s)") + + def log_memory(self): + for key, value in self.prefill["memory"].items(): + LOGGER.info(f"\t+ prefill.memory.{key}: {value:f} (MB)") + for key, value in self.decode["memory"].items(): + LOGGER.info(f"\t+ decode.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.prefill["energy"].items(): + LOGGER.info(f"\t+ prefill.energy.{key}: {value:f} (kWh)") + for key, value in self.decode["energy"].items(): + LOGGER.info(f"\t+ decode.energy.{key}: {value:f} (kWh)") + + def log_all(self) -> None: + if "latency" in self.prefill: + self.log_latency() + if "memory" in self.prefill: + self.log_memory() + if "energy" in self.prefill: + self.log_energy() + + # add operator to aggregate multiple reports + def __add__(self, other: "TextGenerationReport") -> "TextGenerationReport": + agg_report = TextGenerationReport( + batch_size=self.batch_size + other.batch_size, + sequence_length=self.sequence_length, + num_new_tokens=self.num_new_tokens, + num_return_sequences=self.num_return_sequences, + ) + if "latency" in self.prefill and "latency" in other.prefill: + agg_forward_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.prefill["latency"]["list[s]"], other.prefill["latency"]["list[s]"]) + ] + agg_generate_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.decode["latency"]["list[s]"], other.decode["latency"]["list[s]"]) + ] + agg_report.populate_latency(agg_forward_latencies_list, agg_generate_latencies_list) + + if "memory" in self.prefill and "memory" in other.prefill: + agg_forward_memories_dict = {} + for key in self.prefill["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_forward_memories_dict[key] = max(self.prefill["memory"][key], other.prefill["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_forward_memories_dict[key] = self.prefill["memory"][key] + other.prefill["memory"][key] + + agg_generate_memories_dict = {} + for key in self.decode["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_generate_memories_dict[key] = max(self.decode["memory"][key], other.decode["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_generate_memories_dict[key] = self.decode["memory"][key] + other.decode["memory"][key] + + agg_report.populate_memory(agg_forward_memories_dict, agg_generate_memories_dict) + + if "energy" in self.prefill and "energy" in other.prefill: + agg_forward_energies_dict = {} + for key in self.prefill["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_forward_energies_dict[key] = self.prefill["energy"][key] + other.prefill["energy"][key] + + agg_generate_energies_dict = {} + for key in self.decode["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_generate_energies_dict[key] = self.decode["energy"][key] + other.decode["energy"][key] + + agg_report.populate_energy(agg_forward_energies_dict, agg_generate_energies_dict) + + return agg_report + + +def compute_mean(values: 
List[float]) -> float: + return mean(values) if len(values) > 0 else 0.0 + + +def compute_stdev(values: List[float]) -> float: + return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/report.py b/optimum_benchmark/benchmarks/report.py new file mode 100644 index 00000000..69491d65 --- /dev/null +++ b/optimum_benchmark/benchmarks/report.py @@ -0,0 +1,73 @@ +from dataclasses import dataclass, asdict +from typing import Union, Optional +from json import dump +import os + +from transformers.configuration_utils import PushToHubMixin +from flatten_dict import flatten +import pandas as pd + + +@dataclass +class BenchmarkReport(PushToHubMixin): + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + config_file_name: Optional[Union[str, os.PathLike]] = None, + push_to_hub: bool = False, + **kwargs, + ): + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + kwargs["token"] = use_auth_token + + config_file_name = config_file_name if config_file_name is not None else "benchmark_report.json" + + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + output_config_file = os.path.join(save_directory, config_file_name) + self.to_json(output_config_file) + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=kwargs.get("token"), + ) + + def to_dict(self) -> dict: + return asdict(self) + + def to_flat_dict(self) -> dict: + report_dict = self.to_dict() + return flatten(report_dict, reducer="dot") + + def to_json(self, path: str, flat: bool = False) -> None: + if flat: + with open(path, "w") as f: + dump(self.to_flat_dict(), f, indent=4) + else: + with open(path, "w") as f: + dump(self.to_dict(), f, indent=4) + + def to_dataframe(self) -> pd.DataFrame: + flat_report_dict = self.to_flat_dict() + return pd.DataFrame(flat_report_dict, index=[0]) + + def to_csv(self, path: str) -> None: + self.to_dataframe().to_csv(path, index=False) + + def log_all(self) -> None: + raise NotImplementedError("`log_all` method must be implemented in the child class") diff --git a/optimum_benchmark/benchmarks/training/benchmark.py b/optimum_benchmark/benchmarks/training/benchmark.py index e5eaa65f..90c231d0 100644 --- a/optimum_benchmark/benchmarks/training/benchmark.py +++ b/optimum_benchmark/benchmarks/training/benchmark.py @@ -1,19 +1,16 @@ -import time -from typing import Any, Dict from logging import getLogger +from contextlib import ExitStack -from transformers import ( - default_data_collator, - TrainingArguments, - TrainerCallback, - TrainerControl, - TrainerState, -) - -from ...generators.dataset_generator import DatasetGenerator -from ...backends.base import Backend -from .config import TrainingConfig from ..base import Benchmark +from .config import TrainingConfig +from .report import TrainingReport +from ...trackers.memory import MemoryTracker +from ...trackers.energy import EnergyTracker +from .callback import LatencyTrainerCallback +from ...backends.base import Backend, BackendConfigT +from ...generators.dataset_generator import DatasetGenerator + +from 
transformers import default_data_collator LOGGER = getLogger("training") @@ -24,9 +21,7 @@ class TrainingBenchmark(Benchmark[TrainingConfig]): def __init__(self, config: TrainingConfig) -> None: super().__init__(config) - def run(self, backend: Backend) -> None: - LOGGER.info("Running training benchmark") - + def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Creating dataset generator") dataset_generator = DatasetGenerator( task=backend.config.task, @@ -35,105 +30,57 @@ def run(self, backend: Backend) -> None: ) LOGGER.info("\t+ Generating training dataset") - training_dataset = dataset_generator.generate() - - LOGGER.info("\t+ Creating training callbacks") - training_callbacks = [MeasurementCallback(warmup_steps=self.config.warmup_steps)] - - self.trainer_state = backend.train( - training_dataset=training_dataset, - training_callbacks=training_callbacks, - training_data_collator=default_data_collator, - training_arguments=self.config.training_arguments, + training_dataset = dataset_generator() + + LOGGER.info("\t+ Initializing training report") + self.report = TrainingReport( + max_steps=self.config.max_steps, + warmup_steps=self.config.warmup_steps, + per_process_batch_size=self.config.training_arguments["per_device_train_batch_size"], + gradient_accumulation_steps=self.config.training_arguments["gradient_accumulation_steps"], ) - LOGGER.debug(f"Training runtime: {self.trainer_state.training_runtime:.3g} (s)") - LOGGER.debug(f"Training throughput: {self.trainer_state.training_throughput:.3g} (samples/s)") - - return self.report() - - def report(self) -> Dict[str, Any]: - return { - # warmup metrics - "warmup.runtime(s)": self.trainer_state.warmup_runtime, - "warmup.throughput(samples/s)": self.trainer_state.warmup_throughput, - # training metrics - "training.runtime(s)": self.trainer_state.training_runtime, - "training.throughput(samples/s)": self.trainer_state.training_throughput, - # overall metrics - "overall.runtime(s)": self.trainer_state.overall_runtime, - "overall.throughput(samples/s)": (self.trainer_state.overall_throughput), - } - - -class MeasurementCallback(TrainerCallback): - def __init__(self, warmup_steps: int): - self.warmup_steps = warmup_steps - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - state.warmup_start = time.perf_counter_ns() * 1e-9 - state.overall_start = time.perf_counter_ns() * 1e-9 - - def on_step_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if state.global_step == self.warmup_steps: - state.warmup_end = time.perf_counter_ns() * 1e-9 - state.training_start = time.perf_counter_ns() * 1e-9 - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - state.training_end = time.perf_counter_ns() * 1e-9 - state.overall_end = time.perf_counter_ns() * 1e-9 - - state.total_training_batch_size = args.train_batch_size * args.gradient_accumulation_steps - - # warmup metrics - state.warmup_runtime = state.warmup_end - state.warmup_start - state.num_warmup_samples = self.warmup_steps * state.total_training_batch_size - state.warmup_throughput = state.num_warmup_samples / state.warmup_runtime - state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime - - # training metrics - state.training_runtime = state.training_end - state.training_start - state.num_training_steps = state.max_steps - self.warmup_steps - 
state.num_training_samples = state.num_training_steps * state.total_training_batch_size - state.training_throughput = state.num_training_samples / state.training_runtime - state.training_steps_per_second = state.num_training_steps / state.training_runtime - - # overall training metrics - state.overall_runtime = state.training_end - state.warmup_start - state.num_overall_samples = state.num_warmup_samples + state.num_training_samples - state.overall_throughput = state.num_overall_samples / state.overall_runtime - state.overall_steps_per_second = state.num_overall_samples / state.overall_runtime - - -# def get_data_collator(task: str): -# if task == "object-detection": -# return object_detection_data_collator -# else: -# return default_data_collator - - -# def object_detection_data_collator(batch: List[Dict[str, Any]]) -> Dict[str, Any]: -# pixel_values = torch.stack([example["pixel_values"] for example in batch]) -# labels = [example["labels"] for example in batch] -# return { -# "pixel_values": pixel_values, -# "labels": labels, -# } + training_callbacks = [] + if self.config.latency: + LOGGER.info("\t+ Adding latency measuring callback") + latency_callback = LatencyTrainerCallback(device=backend.config.device, backend=backend.config.name) + training_callbacks.append(latency_callback) + + training_trackers = [] + if self.config.memory: + LOGGER.info("\t+ Adding memory tracking context manager") + memory_tracker = MemoryTracker( + device=backend.config.device, backend=backend.config.name, device_ids=backend.config.device_ids + ) + training_trackers.append(memory_tracker.track()) + + if self.config.energy: + LOGGER.info("\t+ Adding energy tracking context manager") + energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) + training_trackers.append(energy_tracker.track()) + + with ExitStack() as stack: + for tracker in training_trackers: + stack.enter_context(tracker) + + backend.train( + training_dataset=training_dataset, + training_callbacks=training_callbacks, + training_data_collator=default_data_collator, + training_arguments=self.config.training_arguments, + ) + + if self.config.latency: + self.report.populate_latency(overall_latencies_list=latency_callback.get_latencies_list()) + self.report.log_latency() + + if self.config.memory: + self.report.populate_memory(overall_memories_dict=memory_tracker.get_memories_dict()) + self.report.log_memory() + + if self.config.energy: + self.report.populate_energy(overall_energies_dict=energy_tracker.get_energies_dict()) + self.report.log_energy() + + def get_report(self) -> TrainingReport: + return self.report diff --git a/optimum_benchmark/benchmarks/training/callback.py b/optimum_benchmark/benchmarks/training/callback.py new file mode 100644 index 00000000..88026d79 --- /dev/null +++ b/optimum_benchmark/benchmarks/training/callback.py @@ -0,0 +1,43 @@ +import time +from typing import List + +import torch +from transformers import TrainerCallback + + +class LatencyTrainerCallback(TrainerCallback): + def __init__(self, device: str, backend: str) -> None: + self.device = device + self.backend = backend + self.all_latencies_list = [] + + def on_step_begin(self, *args, **kwargs): + # one record per step + if self.device == "cuda" and self.backend == "pytorch": + self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) + self.all_latencies_list[-1].record() + else: + self.all_latencies_list.append(time.perf_counter_ns()) + + def on_train_end(self, *args, **kwargs): + # one last record to 
measure the time of the last step + if self.device == "cuda" and self.backend == "pytorch": + self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) + self.all_latencies_list[-1].record() + else: + self.all_latencies_list.append(time.perf_counter_ns()) + + def get_latencies_list(self) -> List[float]: + if self.device == "cuda" and self.backend == "pytorch": + torch.cuda.synchronize() # synchronize the device to make sure all events have been recorded + latencies_list = [ + self.all_latencies_list[i - 1].elapsed_time(self.all_latencies_list[i]) * 1e-3 + for i in range(1, len(self.all_latencies_list)) + ] + else: + latencies_list = [ + (self.all_latencies_list[i] - self.all_latencies_list[i - 1]) * 1e-9 + for i in range(1, len(self.all_latencies_list)) + ] + + return latencies_list diff --git a/optimum_benchmark/benchmarks/training/config.py b/optimum_benchmark/benchmarks/training/config.py index 3a872684..e5d19581 100644 --- a/optimum_benchmark/benchmarks/training/config.py +++ b/optimum_benchmark/benchmarks/training/config.py @@ -8,6 +8,7 @@ TRAINING_ARGUMENT = { "per_device_train_batch_size": 2, + "gradient_accumulation_steps": 1, "output_dir": "./trainer_output", "do_train": True, "use_cpu": False, @@ -25,16 +26,9 @@ } DATASET_SHAPES = { - # used with all tasks "dataset_size": 500, - # used with text input tasks "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, } @@ -49,10 +43,14 @@ class TrainingConfig(BenchmarkConfig): # dataset options dataset_shapes: Dict[str, Any] = field(default_factory=dict) - # training options training_arguments: Dict[str, Any] = field(default_factory=dict) + # tracking options + latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) + memory: bool = field(default=False, metadata={"help": "Measure max memory usage"}) + energy: bool = field(default=False, metadata={"help": "Measure energy usage"}) + def __post_init__(self): super().__post_init__() diff --git a/optimum_benchmark/benchmarks/training/report.py b/optimum_benchmark/benchmarks/training/report.py new file mode 100644 index 00000000..9eeba211 --- /dev/null +++ b/optimum_benchmark/benchmarks/training/report.py @@ -0,0 +1,169 @@ +from dataclasses import dataclass, field +from statistics import mean, stdev +from typing import Any, Dict, List +from logging import getLogger + +from ..report import BenchmarkReport + +LOGGER = getLogger("report") + + +@dataclass +class TrainingReport(BenchmarkReport): + max_steps: int + warmup_steps: int + per_process_batch_size: int + gradient_accumulation_steps: int + + overall: Dict[str, Any] = field(default_factory=dict) + training: Dict[str, Any] = field(default_factory=dict) + warmup: Dict[str, Any] = field(default_factory=dict) + + world_size: int = 1 + + # POPULATING + def populate_latency(self, overall_latencies_list: List[float]) -> None: + assert ( + len(overall_latencies_list) == self.max_steps + ), f"Expected {self.max_steps} latencies, but got {len(overall_latencies_list)} latencies" + # Overall + ## Latency + self.overall["latency"] = { + "list[s/step]": overall_latencies_list, + "mean(s/step)": compute_mean(overall_latencies_list), + "stdev(s/step)": compute_stdev(overall_latencies_list), + } + ## Throughput + overall_throughputs_list = [ + self.world_size * self.per_process_batch_size * self.gradient_accumulation_steps / latency + for 
latency in overall_latencies_list + ] + self.overall["throughput"] = { + "list[samples/s]": overall_throughputs_list, + "mean(samples/s)": compute_mean(overall_throughputs_list), + "stdev(samples/s)": compute_stdev(overall_throughputs_list), + } + # Training + ## Latency + training_latencies_list = overall_latencies_list[self.warmup_steps :] + self.training["latency"] = { + "list[s/step]": training_latencies_list, + "mean(s/step)": compute_mean(training_latencies_list), + "stdev(s/step)": compute_stdev(training_latencies_list), + } + ## Throughput + training_throughputs_list = overall_throughputs_list[self.warmup_steps :] + self.training["throughput"] = { + "list[samples/s]": training_throughputs_list, + "mean(samples/s)": compute_mean(training_throughputs_list), + "stdev(samples/s)": compute_stdev(training_throughputs_list), + } + # Warmup + ## Latency + warmup_latencies_list = overall_latencies_list[: self.warmup_steps] + self.warmup["latency"] = { + "list[s/step]": warmup_latencies_list, + "mean(s/step)": compute_mean(warmup_latencies_list), + "stdev(s/step)": compute_stdev(warmup_latencies_list), + } + ## Throughput + warmup_throughputs_list = overall_throughputs_list[: self.warmup_steps] + self.warmup["throughput"] = { + "list[samples/s]": warmup_throughputs_list, + "mean(samples/s)": compute_mean(warmup_throughputs_list), + "stdev(samples/s)": compute_stdev(warmup_throughputs_list), + } + + def populate_memory(self, overall_memories_dict: Dict[str, float]) -> None: + self.warmup["memory"] = overall_memories_dict + self.overall["memory"] = overall_memories_dict + self.training["memory"] = overall_memories_dict + + def populate_energy(self, overall_energies_dict: Dict[str, float]) -> None: + self.overall["energy"] = overall_energies_dict + # can't get training only or warmup only energies + # self.warmup["energy"] = overall_energies_dict + # self.training["energy"] = overall_energies_dict + # TODO: use a callback for energy instead of a tracker + + # LOGGING + def log_latency(self): + for key, value in self.training["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ training.latency.{key}: {value:f} (s)") + for key, value in self.training["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ training.throughput.{key}: {value:f} (samples/s)") + + def log_memory(self): + for key, value in self.training["memory"].items(): + LOGGER.info(f"\t+ training.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.overall["energy"].items(): + LOGGER.info(f"\t+ overall.energy.{key}: {value:f} (kWh)") + + def log_all(self): + if "latency" in self.training: + self.log_latency() + if "memory" in self.training: + self.log_memory() + if "energy" in self.training: + self.log_energy() + + # LOGIC + def __add__(self, other: "TrainingReport") -> "TrainingReport": + assert self.max_steps == other.max_steps, "Both reports must have the same max_steps" + assert self.warmup_steps == other.warmup_steps, "Both reports must have the same warmup_steps" + assert ( + self.gradient_accumulation_steps == other.gradient_accumulation_steps + ), "Both reports must have the same gradient_accumulation_steps" + + agg_report = TrainingReport( + max_steps=self.max_steps, + warmup_steps=self.warmup_steps, + world_size=self.world_size + other.world_size, + per_process_batch_size=self.per_process_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + ) + + if "latency" in self.overall: + agg_overall_latencies_list = [ + max(lat_1, lat_2) + 
for lat_1, lat_2 in zip( + self.overall["latency"]["list[s/step]"], other.overall["latency"]["list[s/step]"] + ) + ] + agg_report.populate_latency(agg_overall_latencies_list) + + if "memory" in self.overall: + agg_overall_memories_dict = {} + for key in self.overall["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_overall_memories_dict[key] = max(self.overall["memory"][key], other.overall["memory"][key]) + else: + # ram and pytorch measures are process-specific (can be accumulated) + agg_overall_memories_dict[key] = self.overall["memory"][key] + other.overall["memory"][key] + + agg_report.populate_memory(agg_overall_memories_dict) + + if "energy" in self.overall: + agg_overall_energies_dict = {} + for key in self.overall["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_overall_energies_dict[key] = self.overall["energy"][key] + other.overall["energy"][key] + + agg_report.populate_energy(agg_overall_energies_dict) + + return agg_report + + +def compute_mean(values: List[float]) -> float: + return mean(values) if len(values) > 0 else 0.0 + + +def compute_stdev(values: List[float]) -> float: + return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py index 052276c4..8b137891 100644 --- a/optimum_benchmark/benchmarks/utils.py +++ b/optimum_benchmark/benchmarks/utils.py @@ -1,55 +1 @@ -from typing import List, Dict, Any - -# TODO: use some kind of logic to handle this instead of this function -def consolidate_reports(reports: List[Dict[str, Any]]) -> Dict[str, Any]: - report = {} - - ## Training - - if "warmup.runtime(s)" in reports[0]: - report["warmup.runtime(s)"] = reports[0]["warmup.runtime(s)"] - report["warmup.throughput(samples/s)"] = sum(r["warmup.throughput(samples/s)"] for r in reports) - - if "training.runtime(s)" in reports[0]: - report["training.runtime(s)"] = reports[0]["training.runtime(s)"] - report["training.throughput(samples/s)"] = sum(r["training.throughput(samples/s)"] for r in reports) - - if "overall.runtime(s)" in reports[0]: - report["overall.runtime(s)"] = reports[0]["overall.runtime(s)"] - report["overall.throughput(samples/s)"] = sum(r["overall.throughput(samples/s)"] for r in reports) - - ## Inference - - if "forward.latency(s)" in reports[0]: - report["forward.latency(s)"] = reports[0]["forward.latency(s)"] - report["forward.throughput(samples/s)"] = sum(r["forward.throughput(samples/s)"] for r in reports) - - if "diffusion.throughput(images/s)" in reports[0]: - report["diffusion.throughput(images/s)"] = sum(r["diffusion.throughput(images/s)"] for r in reports) - - if "forward.peak_memory(MB)" in reports[0]: - report["forward.max_memory_used(MB)"] = reports[0]["forward.max_memory_used(MB)"] - report["forward.max_memory_allocated(MB)"] = sum(r["forward.max_memory_allocated(MB)"] for r in reports) - report["forward.max_memory_reserved(MB)"] = sum(r["forward.max_memory_reserved(MB)"] for r in reports) - - if "forward.energy_consumption(kWh/sample)" in reports[0]: - report["forward.energy_consumption(kWh/sample)"] = reports[0]["forward.energy_consumption(kWh/sample)"] - report["forward.carbon_emissions(kgCO2eq/sample)"] = reports[0]["forward.carbon_emissions(kgCO2eq/sample)"] - - if "generate.latency(s)" in reports[0]: - report["generate.latency(s)"] = reports[0]["generate.latency(s)"] - report["generate.throughput(tokens/s)"] = sum(r["generate.throughput(tokens/s)"] for r in 
reports) - report["decode.latency(s)"] = reports[0]["decode.latency(s)"] - report["decode.throughput(tokens/s)"] = sum(r["decode.throughput(tokens/s)"] for r in reports) - - if "generate.peak_memory(MB)" in reports[0]: - report["generate.max_memory_used(MB)"] = reports[0]["generate.max_memory_used(MB)"] - report["generate.max_memory_allocated(MB)"] = sum(r["generate.max_memory_allocated(MB)"] for r in reports) - report["generate.max_memory_reserved(MB)"] = sum(r["generate.max_memory_reserved(MB)"] for r in reports) - - if "generate.energy_consumption(kWh/token)" in reports[0]: - report["generate.energy_consumption(kWh/token)"] = reports[0]["generate.energy_consumption(kWh/token)"] - report["generate.carbon_emissions(kgCO2eq/token)"] = reports[0]["generate.carbon_emissions(kgCO2eq/token)"] - - return report diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index cf36855d..4961c189 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -1,6 +1,5 @@ import os import glob -import json from logging import getLogger import hydra @@ -19,6 +18,7 @@ from .backends.neural_compressor.config import INCConfig from .backends.text_generation_inference.config import TGIConfig +from .benchmarks.report import BenchmarkReport from .experiment import launch, ExperimentConfig from .benchmarks.training.config import TrainingConfig from .benchmarks.inference.config import InferenceConfig @@ -49,6 +49,8 @@ # optimum-benchmark @hydra.main(version_base=None) def benchmark_cli(experiment_config: DictConfig) -> None: + os.environ["BENCHMARK_CLI"] = "1" + if glob.glob("*.csv") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1": LOGGER.warning( "Skipping benchmark because results already exist. " @@ -74,10 +76,6 @@ def benchmark_cli(experiment_config: DictConfig) -> None: experiment_config: ExperimentConfig = OmegaConf.to_object(experiment_config) OmegaConf.save(experiment_config, "experiment_config.yaml", resolve=True) - benchmark_report = launch(experiment_config=experiment_config) - - LOGGER.info("Benchmark Report:") - for metric, value in benchmark_report.items(): - LOGGER.info(f"\t+ {metric}: {value:.3f}") + benchmark_report: BenchmarkReport = launch(experiment_config=experiment_config) - json.dump(benchmark_report, open("benchmark_report.json", "w"), indent=4) + benchmark_report.to_json("benchmark_report.json") diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py index 5a714dd9..ed4b710b 100644 --- a/optimum_benchmark/env_utils.py +++ b/optimum_benchmark/env_utils.py @@ -1,13 +1,14 @@ +import os import re import platform import subprocess import importlib.util -from typing import Optional - -import psutil +from typing import Optional, List from .import_utils import is_py3nvml_available, is_pyrsmi_available +import psutil + def is_nvidia_system(): try: @@ -91,20 +92,84 @@ def get_gpus(): return gpus -def get_git_revision_hash(package_name: str, path: Optional[str] = None) -> Optional[str]: +def get_gpu_vram_mb() -> List[int]: + if is_nvidia_system(): + if not is_py3nvml_available(): + raise ValueError( + "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " + "Please install it through `pip install py3nvml`." 
+ ) + import py3nvml.py3nvml as nvml + + nvml.nvmlInit() + device_count = nvml.nvmlDeviceGetCount() + vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)] + nvml.nvmlShutdown() + elif is_rocm_system(): + if not is_pyrsmi_available(): + raise ValueError( + "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " + "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." + ) + + from pyrsmi import rocml + + rocml.smi_initialize() + device_count = rocml.smi_get_device_count() + vrams = [rocml.smi_get_device_memory_total(index) for index in range(device_count)] + rocml.smi_shutdown() + else: + vrams = [] + + return sum(vrams) + + +def get_cuda_device_ids() -> str: + if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"] + else: + if is_nvidia_system(): + if not is_py3nvml_available(): + raise ValueError( + "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " + "Please install it through `pip install py3nvml`." + ) + import py3nvml.py3nvml as nvml + + nvml.nvmlInit() + device_ids = list(range(nvml.nvmlDeviceGetCount())) + nvml.nvmlShutdown() + elif is_rocm_system(): + if not is_pyrsmi_available(): + raise ValueError( + "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " + "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." + ) + + from pyrsmi import rocml + + rocml.smi_initialize() + device_ids = list(range(rocml.smi_get_device_count())) + rocml.smi_shutdown() + else: + raise ValueError("No NVIDIA or ROCm GPUs found.") + + return ",".join(str(i) for i in device_ids) + + +def get_git_revision_hash(package_name: str) -> Optional[str]: """ Returns the git commit SHA of a package installed from a git repository. 
""" - if path is None: - try: - path = importlib.util.find_spec(package_name).origin - except Exception: - return None + try: + path = importlib.util.find_spec(package_name).origin + except Exception: + return None try: git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() except Exception: - git_hash = None + return None return git_hash diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index 1396c131..c9b6d733 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -1,11 +1,13 @@ import os import platform from logging import getLogger +from tempfile import TemporaryDirectory from dataclasses import dataclass, field from typing import Any, Dict, Type, Optional, TYPE_CHECKING from hydra.utils import get_class +from .benchmarks.report import BenchmarkReport from .benchmarks.config import BenchmarkConfig from .launchers.config import LauncherConfig from .backends.config import BackendConfig @@ -19,6 +21,9 @@ ) from .env_utils import ( get_git_revision_hash, + is_nvidia_system, + is_rocm_system, + get_gpu_vram_mb, get_cpu_ram_mb, get_gpus, get_cpu, @@ -57,29 +62,39 @@ class ExperimentConfig: environment: Dict = field( default_factory=lambda: { "cpu": get_cpu(), - "gpus": get_gpus(), "cpu_count": os.cpu_count(), - "system": platform.system(), "cpu_ram_mb": get_cpu_ram_mb(), + "system": platform.system(), "python_version": platform.python_version(), # libraries "transformers_version": transformers_version(), - "transformers_commit": get_git_revision_hash("transformers", os.environ.get("TRANSFORMERS_PATH", None)), + "transformers_commit": get_git_revision_hash("transformers"), "accelerate_version": accelerate_version(), - "accelerate_commit": get_git_revision_hash("accelerate", os.environ.get("ACCELERATE_PATH", None)), - "optimum_version": optimum_version(), - "optimum_commit": get_git_revision_hash("optimum", os.environ.get("OPTIMUM_PATH", None)), + "accelerate_commit": get_git_revision_hash("accelerate"), "diffusers_version": diffusers_version(), - "diffusers_commit": get_git_revision_hash("diffusers", os.environ.get("DIFFUSERS_PATH", None)), + "diffusers_commit": get_git_revision_hash("diffusers"), + "optimum_version": optimum_version(), + "optimum_commit": get_git_revision_hash("optimum"), "timm_version": timm_version(), - "timm_commit": get_git_revision_hash("timm", os.environ.get("TIMM_PATH", None)), + "timm_commit": get_git_revision_hash("timm"), "peft_version": peft_version(), - "peft_commit": get_git_revision_hash("peft", os.environ.get("PEFT_PATH", None)), + "peft_commit": get_git_revision_hash("peft"), } ) + def __post_init__(self): + # adding GPU information to the environment + if is_nvidia_system() or is_rocm_system(): + available_gpus = get_gpus() + if len(available_gpus) > 0: + self.environment["gpu"] = available_gpus[0] + self.environment["gpu_count"] = len(available_gpus) + self.environment["gpu_vram_mb"] = get_gpu_vram_mb() + else: + LOGGER.warning("Detected NVIDIA or ROCm system, but no GPUs found.") -def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dict[str, Any]: + +def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> BenchmarkReport: try: # Allocate requested backend backend_factory: Type[Backend] = get_class(backend_config._target_) @@ -107,7 +122,7 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dic raise e try: - report = benchmark.report() + report = benchmark.get_report() except Exception as 
e: LOGGER.error("Error during report generation: %s", e) raise e @@ -115,7 +130,13 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dic return report -def launch(experiment_config: ExperimentConfig) -> Dict[str, Any]: +def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: + if os.environ.get("BENCHMARK_CLI", "0") == "0": + LOGGER.info("Launching experiment in a temporary directory.") + temp_dir = TemporaryDirectory() + original_dir = os.getcwd() + os.chdir(temp_dir.name) + launcher_config: LauncherConfig = experiment_config.launcher try: @@ -135,4 +156,8 @@ def launch(experiment_config: ExperimentConfig) -> Dict[str, Any]: LOGGER.error(f"Error during experiment launching: {e}") raise e + if os.environ.get("BENCHMARK_CLI", "0") == "0": + os.chdir(original_dir) + temp_dir.cleanup() + return output diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py index f0ba921f..4bb9f188 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -15,7 +15,7 @@ def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict dataset_shapes["batch_size"] = dataset_shapes["dataset_size"] if task in TASKS_TO_GENERATORS: - LOGGER.info(f"Using {task} task generator") + LOGGER.info(f"\t+ Using {task} task generator") shapes = {**dataset_shapes, **model_shapes} self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=True) else: @@ -26,7 +26,8 @@ def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict "please submit a PR or a feature request to optimum-benchmark. \n" ) - def generate(self) -> Dataset: - task_dataset = self.task_generator.generate() + def __call__(self) -> Dataset: + task_dataset = self.task_generator() task_dataset = Dataset.from_dict(task_dataset) + task_dataset.set_format(type="torch", columns=list(task_dataset.features.keys())) return task_dataset diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index bc14d6c8..13f1d9aa 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -22,8 +22,8 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s "please submit a PR or a feature request to optimum-benchmark. 
" ) - def generate(self, mode: str) -> Dict[str, Any]: - task_input = self.task_generator.generate() + def __call__(self, mode: str) -> Dict[str, Any]: + task_input = self.task_generator() if mode == "generate": if "pixel_values" in task_input: @@ -46,5 +46,9 @@ def generate(self, mode: str) -> Dict[str, Any]: task_input = { "inputs": task_input["input_ids"], } + elif mode == "call": + task_input = { + "prompt": task_input["prompt"], + } return task_input diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 9aee6ee9..1f3e9b23 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -34,8 +34,8 @@ def generate_random_strings(shape: Tuple[int]): for _ in range(shape[0]) ] - def generate(self): - raise NotImplementedError("Generator must implement generate method") + def __call__(self): + raise NotImplementedError("Generator must implement __call__ method") class TextGenerator(TaskGenerator): @@ -131,7 +131,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -160,7 +160,7 @@ def labels(self): ), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -179,7 +179,7 @@ def generate(self): class TextGenerationGenerator(TextGenerator): - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() dummy["attention_mask"] = self.attention_mask() @@ -211,7 +211,7 @@ def end_positions(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -226,7 +226,7 @@ def generate(self): class MaskedLanguageModelingGenerator(TextGenerator): - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -252,7 +252,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = ( @@ -288,7 +288,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["pixel_values"] = self.pixel_values() @@ -316,7 +316,7 @@ def labels(self): for _ in range(self.shapes["batch_size"]) ] - def generate(self): + def __call__(self): dummy = {} dummy["pixel_values"] = self.pixel_values() @@ -338,7 +338,7 @@ def labels(self): ), ) - def generate(self): + def __call__(self): dummy = {} dummy["pixel_values"] = self.pixel_values() @@ -356,7 +356,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_values"] = self.input_values() @@ -377,7 +377,7 @@ def labels(self): ), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_values"] = self.input_values() @@ -391,7 +391,7 @@ class PromptGenerator(TaskGenerator): def prompt(self): return self.generate_random_strings(shape=(self.shapes["batch_size"], 10)) - def generate(self): + def __call__(self): dummy = {} dummy["prompt"] = self.prompt() @@ -399,7 +399,7 @@ def generate(self): class FeatureExtractionGenerator(TextGenerator, ImageGenerator): - def generate(self): + def __call__(self): dummy = {} if self.shapes["num_channels"] is not None and self.shapes["height"] is not None: diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 1c4cc7e8..f19fbda3 100644 --- a/optimum_benchmark/import_utils.py +++ 
b/optimum_benchmark/import_utils.py @@ -1,6 +1,7 @@ import importlib.metadata import importlib.util + _transformers_available = importlib.util.find_spec("transformers") is not None _accelerate_available = importlib.util.find_spec("accelerate") is not None _diffusers_available = importlib.util.find_spec("diffusers") is not None @@ -19,23 +20,31 @@ _amdsmi_available = importlib.util.find_spec("amdsmi") is not None _tensorflow_available = importlib.util.find_spec("tensorflow") is not None _timm_available = importlib.util.find_spec("timm") is not None -_is_diffusers_available = importlib.util.find_spec("diffusers") is not None -_is_accelerate_available = importlib.util.find_spec("accelerate") is not None -_is_torch_ort_available = importlib.util.find_spec("torch_ort") is not None -_is_deepspeed_available = importlib.util.find_spec("deepspeed") is not None -_is_tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None +_diffusers_available = importlib.util.find_spec("diffusers") is not None +_torch_ort_available = importlib.util.find_spec("torch_ort") is not None +_deepspeed_available = importlib.util.find_spec("deepspeed") is not None +_tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None +_psutil_available = importlib.util.find_spec("psutil") is not None + + +def is_psutil_available(): + return _psutil_available + + +def is_transformers_available(): + return _transformers_available def is_tensorrt_llm_available(): - return _is_tensorrt_llm_available + return _tensorrt_llm_available def is_deepspeed_available(): - return _is_deepspeed_available + return _deepspeed_available def is_torch_ort_available(): - return _is_torch_ort_available + return _torch_ort_available def is_accelerate_available(): @@ -43,7 +52,7 @@ def is_accelerate_available(): def is_diffusers_available(): - return _is_diffusers_available + return _diffusers_available def is_timm_available(): @@ -118,7 +127,7 @@ def onnxruntime_version(): try: return "ort-training:" + importlib.metadata.version("onnxruntime-training") except importlib.metadata.PackageNotFoundError: - return "ort:unknown" + return None def openvino_version(): @@ -152,7 +161,7 @@ def diffusers_version(): def torch_ort_version(): - if _is_torch_ort_available: + if _torch_ort_available: return importlib.metadata.version("torch_ort") @@ -167,5 +176,5 @@ def peft_version(): def tesnorrt_llm_version(): - if _is_tensorrt_llm_available: + if _tensorrt_llm_available: return importlib.metadata.version("tensorrt_llm") diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py index f48fc919..52006bcc 100644 --- a/optimum_benchmark/launchers/isolation_utils.py +++ b/optimum_benchmark/launchers/isolation_utils.py @@ -6,15 +6,15 @@ from multiprocessing import Process from contextlib import contextmanager -import psutil - from ..logging_utils import setup_logging from ..env_utils import is_nvidia_system, is_rocm_system -from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version +from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version, is_psutil_available +if is_psutil_available(): + import psutil if is_py3nvml_available(): - import py3nvml.py3nvml as nvml # type: ignore + import py3nvml.py3nvml as nvml if is_amdsmi_available(): import amdsmi # type: ignore @@ -172,7 +172,7 @@ def assert_system_devices_isolation(benchmark_pid: int) -> None: @contextmanager -def device_isolation(benchmark_pid: int, enabled: bool) -> None: +def 
device_isolation(benchmark_pid: int, enabled: bool): if not enabled: yield return diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py index 77b6b4ef..2d87ff03 100644 --- a/optimum_benchmark/launchers/torchrun/config.py +++ b/optimum_benchmark/launchers/torchrun/config.py @@ -1,4 +1,3 @@ -import os import uuid from logging import getLogger from typing import Any, Dict, Optional @@ -20,7 +19,7 @@ class TorchrunConfig(LauncherConfig): # Maximum amount of nodes that the user function will be launched on. max_nodes: int = 1 # On each node the elastic agent will launch this amount of workers that will execute user defined function. - nproc_per_node: Optional[int] = None + nproc_per_node: int = 2 # User defined role of the worker (defaults to "trainer"). role: str = "benchmark_worker" # The interval in seconds that is used by the elastic_agent as a period of monitoring workers. @@ -61,26 +60,3 @@ def __post_init__(self) -> None: if self.min_nodes != 1: LOGGER.info("For multi-node benchmarks, run the benchmark on each node separately.") LOGGER.info(f"Waiting for the other nodes to be avaialable at {self.rdzv_endpoint}...") - - if self.nproc_per_node is None: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - LOGGER.warning( - "`nproc_per_node` is not set but `CUDA_VISIBLE_DEVICES` is set. " - "Setting `nproc_per_node` to the number of visible devices." - ) - self.nproc_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) - else: - LOGGER.warning( - "`nproc_per_node` is not set and `CUDA_VISIBLE_DEVICES` is not set. " - "Setting `nproc_per_node` and `CUDA_VISIBLE_DEVICES` to 1." - ) - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - self.nproc_per_node = 1 - else: - if len(os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")) != self.nproc_per_node: - LOGGER.warning( - f"`nproc_per_node` is set to {self.nproc_per_node} but `CUDA_VISIBLE_DEVICES` " - f"is set to {os.environ.get('CUDA_VISIBLE_DEVICES', '')}. " - "Setting `CUDA_VISIBLE_DEVICES` to match `nproc_per_node`." 
- ) - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in range(self.nproc_per_node)]) diff --git a/optimum_benchmark/launchers/torchrun/launcher.py b/optimum_benchmark/launchers/torchrun/launcher.py index b50f9f55..f327e85c 100644 --- a/optimum_benchmark/launchers/torchrun/launcher.py +++ b/optimum_benchmark/launchers/torchrun/launcher.py @@ -4,17 +4,19 @@ from multiprocessing import Queue from typing import Callable, Dict, Any -import torch.distributed -from torch.distributed import FileStore -from torch.distributed.elastic.multiprocessing import Std -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.launcher.api import LaunchConfig, launch_agent - from ..base import Launcher from .config import TorchrunConfig from ...logging_utils import setup_logging from ..isolation_utils import device_isolation -from ...benchmarks.utils import consolidate_reports +from ...benchmarks.report import BenchmarkReport +from ...import_utils import is_torch_distributed_available + +if is_torch_distributed_available(): + import torch.distributed + from torch.distributed import FileStore + from torch.distributed.elastic.multiprocessing import Std + from torch.distributed.elastic.multiprocessing.errors import record + from torch.distributed.launcher.api import LaunchConfig, launch_agent LOGGER = getLogger("torchrun") @@ -49,8 +51,8 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: local_addr=self.config.local_addr, log_dir=self.config.log_dir, ) - current_log_level = getLogger().getEffectiveLevel() queue = Queue() + current_log_level = getLogger().getEffectiveLevel() with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()): LOGGER.info(f"\t+ Launching torchrun agent with {self.config.nproc_per_node} workers processes") @@ -61,10 +63,16 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: ) outputs = [] + while not queue.empty(): outputs.append(queue.get()) - report = consolidate_reports(outputs) + if len(outputs) == 1: + report: BenchmarkReport = outputs[0] + else: + LOGGER.info(f"\t+ Merging benchmark reports from {len(outputs)} workers") + report: BenchmarkReport = sum(outputs[1:], outputs[0]) + report.log_all() return report @@ -85,12 +93,12 @@ def entrypoint(fn, q, log_level, *args): torch.cuda.set_device(rank) if rank == 0: - setup_logging(log_level) + setup_logging(level=log_level, prefix="RANK-0") else: - setup_logging("ERROR") + setup_logging(level="ERROR") # TODO: use a tcp store instead - store = FileStore("torchrun_filestore") + store = FileStore("torchrun.filestore") store.set(f"rank_{rank}", str(os.getpid())) output = fn(*args) diff --git a/optimum_benchmark/logging_utils.py b/optimum_benchmark/logging_utils.py index 398c7bf4..72f76889 100644 --- a/optimum_benchmark/logging_utils.py +++ b/optimum_benchmark/logging_utils.py @@ -1,11 +1,13 @@ import os import logging import logging.config +from logging import Logger +from typing import Optional from subprocess import Popen, PIPE, STDOUT from omegaconf import OmegaConf -JOB_LOGGING = { +API_JOB_LOGGING = { "version": 1, "formatters": { "simple": {"format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"}, @@ -27,32 +29,34 @@ "stream": "ext://sys.stdout", "class": "logging.StreamHandler", }, - "file": { - "filename": "api.log", - "formatter": "simple", - "class": "logging.FileHandler", - }, }, - "root": {"level": "INFO", "handlers": ["console", "file"]}, + "root": {"level": "INFO", "handlers": ["console"]}, 
"disable_existing_loggers": False, } -def setup_logging(level: str = "INFO"): - if os.path.exists(".hydra/hydra.yaml"): +def setup_logging(level: str = "INFO", prefix: Optional[str] = None): + if os.environ.get("BENCHMARK_CLI", "0") == "1": hydra_config = OmegaConf.load(".hydra/hydra.yaml") job_logging = OmegaConf.to_container( hydra_config.hydra.job_logging, resolve=True, ) else: - job_logging = JOB_LOGGING.copy() + job_logging = API_JOB_LOGGING.copy() job_logging["root"]["level"] = level + + if prefix is not None: + job_logging["formatters"]["simple"]["format"] = f"[{prefix}]" + job_logging["formatters"]["simple"]["format"] + job_logging["formatters"]["colorlog"]["format"] = ( + f"[{prefix}]" + job_logging["formatters"]["colorlog"]["format"] + ) + logging.config.dictConfig(job_logging) -def run_process_and_log_stream_output(logger, args): +def run_subprocess_and_log_stream_output(logger: Logger, args): popen = Popen(args, stdout=PIPE, stderr=STDOUT) for line in iter(popen.stdout.readline, b""): if line is not None: diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index b3038812..e35baae3 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -114,7 +114,7 @@ ), } -DIFFUSION_TASKS = [ +IMAGE_DIFFUSION_TASKS = [ "stable-diffusion", "stable-diffusion-xl", ] diff --git a/optimum_benchmark/trackers/energy.py b/optimum_benchmark/trackers/energy.py index 815abaa9..7d3bb7ad 100644 --- a/optimum_benchmark/trackers/energy.py +++ b/optimum_benchmark/trackers/energy.py @@ -1,53 +1,44 @@ import os from logging import getLogger -from typing import List, Optional from contextlib import contextmanager +from typing import Optional, Dict -from ..env_utils import is_nvidia_system, is_rocm_system -from ..import_utils import ( - is_py3nvml_available, - is_pyrsmi_available, - is_codecarbon_available, -) +from ..env_utils import get_cuda_device_ids +from ..import_utils import is_codecarbon_available if is_codecarbon_available(): from codecarbon import EmissionsTracker, OfflineEmissionsTracker -if is_nvidia_system(): - if is_py3nvml_available(): - import py3nvml.py3nvml as nvml - else: - raise ValueError( - "The library py3nvml is required to run energy benchmark on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) - -if is_rocm_system(): - if is_pyrsmi_available(): - # TODO: use amdsmi instead of pyrsmi - from pyrsmi import rocml - else: - raise ValueError( - "The library pyrsmi is required to run energy benchmark on ROCm-powered GPUs, but is not installed. " - "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git." - ) - LOGGER = getLogger("energy") class EnergyTracker: - def __init__(self, device_ids: Optional[List[int]] = None): - self.device_ids = device_ids + def __init__(self, device: str, device_ids: Optional[str] = None): + self.device = device + self.cpu_energy: float = 0 + self.gpu_energy: float = 0 + self.ram_energy: float = 0 self.total_energy: float = 0 - self.total_emissions: float = 0 - if self.device_ids is None: - self.device_ids = infer_cuda_device_ids() + if self.device == "cuda": + if device_ids is None: + LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. 
Using all available CUDA devices.") + self.device_ids = list(map(int, get_cuda_device_ids().split(","))) + else: + self.device_ids = list(map(int, device_ids.split(","))) + else: + self.device_ids = [] + + def reset(self): + self.cpu_energy = 0 + self.gpu_energy = 0 + self.ram_energy = 0 + self.total_energy = 0 @contextmanager - def track(self, interval=1, file_prefix=""): + def track(self, interval=1, file_prefix="method"): if not is_codecarbon_available(): raise ValueError( "The library codecarbon is required to run energy benchmark, but is not installed. " @@ -55,6 +46,7 @@ def track(self, interval=1, file_prefix=""): ) try: + # TODO: use pynvml and amdsmi directly to get the GPU power consumption self.emission_tracker = EmissionsTracker( log_level="error", # "info" for more verbosity tracking_mode="process", # "machine" for machine-level tracking @@ -63,11 +55,11 @@ def track(self, interval=1, file_prefix=""): output_file=f"{file_prefix}_codecarbon.csv", ) except Exception as e: - LOGGER.warning(f"Failed to initialize Online Emissions Tracker: {e}") - LOGGER.warning("Falling back to Offline Emissions Tracker") + LOGGER.warning("\t+ Failed to initialize Online Emissions Tracker: %s", e) + LOGGER.warning("\t+ Falling back to Offline Emissions Tracker") if os.environ.get("COUNTRY_ISO_CODE", None) is None: LOGGER.warning( - "Offline Emissions Tracker requires COUNTRY_ISO_CODE to be set. " + "\t+ Offline Emissions Tracker requires COUNTRY_ISO_CODE to be set. " "We will set it to FRA but the carbon footprint will be inaccurate." ) @@ -83,32 +75,19 @@ def track(self, interval=1, file_prefix=""): self.emission_tracker.start() yield self.emission_tracker.stop() - self.total_energy = self.emission_tracker._total_energy.kWh - self.total_emissions = self.emission_tracker.final_emissions - - def get_total_energy(self) -> float: - return self.total_energy - def get_total_emissions(self) -> float: - return self.total_emissions + self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh + self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh + self.ram_energy = self.emission_tracker._total_ram_energy.kWh + self.total_energy = self.emission_tracker._total_energy.kWh def get_elapsed_time(self) -> float: return self.emission_tracker._last_measured_time - self.emission_tracker._start_time - -def infer_cuda_device_ids() -> List[int]: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - cuda_device_ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(","))) - else: - if is_nvidia_system() and is_py3nvml_available(): - nvml.nvmlInit() - cuda_device_ids = list(range(nvml.nvmlDeviceGetCount())) - nvml.nvmlShutdown() - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() - cuda_device_ids = list(range(rocml.smi_get_device_count())) - rocml.smi_shutdown() - else: - raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA energy tracking.") - - return cuda_device_ids + def get_energies_dict(self) -> Dict[str, float]: + return { + "cpu_energy(kWh)": self.cpu_energy, + "gpu_energy(kWh)": self.gpu_energy, + "ram_energy(kWh)": self.ram_energy, + "total(kWh)": self.total_energy, + } diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 2823919c..369c2b70 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,11 +1,12 @@ -import time from contextlib import contextmanager from logging import getLogger from typing import List +import time -import torch +from 
..import_utils import is_torch_distributed_available, is_torch_available -from ..import_utils import is_torch_distributed_available +if is_torch_available(): + import torch if is_torch_distributed_available(): import torch.distributed @@ -20,61 +21,71 @@ def __init__(self, device: str, backend: str): self.latencies: List[float] = [] + # this is not in track, because this tracker is used repeatedly if is_torch_distributed_available() and torch.distributed.is_initialized(): - LOGGER.info("Tracking Pytorch Distributed latency") + LOGGER.info("\t+ Tracking Pytorch Distributed latency") elif self.device == "cuda" and self.backend == "pytorch": - LOGGER.info("Tracking Pytorch CUDA latency") + LOGGER.info("\t+ Tracking Pytorch CUDA latency") else: - LOGGER.info("Tracking CPU latency") + LOGGER.info("\t+ Tracking CPU latency") + + def reset(self): + self.latencies = [] @contextmanager def track(self): if is_torch_distributed_available() and torch.distributed.is_initialized(): - yield from self._pytorch_distributed_tracker() + yield from self._pytorch_distributed_latency() elif self.backend == "pytorch" and self.device == "cuda": - yield from self._pytorch_cuda_tracker() + yield from self._pytorch_cuda_latency() else: - yield from self._cpu_tracker() + yield from self._cpu_latency() - def _pytorch_distributed_tracker(self): + def _pytorch_distributed_latency(self): torch.distributed.barrier() # synchronize before workload start = time.perf_counter_ns() yield torch.distributed.barrier() # synchronize after workload end = time.perf_counter_ns() - latency_ns = end - start - latency = latency_ns / 1e9 + latency = (end - start) / 1e9 self.latencies.append(latency) - LOGGER.debug(f"Tracked Pytorch Distributed latency: {latency:.2e}s") + LOGGER.debug(f"\t+ Tracked Pytorch distributed latency: {latency:.2e}s") - def _pytorch_cuda_tracker(self): + def _pytorch_cuda_latency(self): + # Note: torch.cuda.Event is not used here, + # there's actually no specific need to use cuda events if you're synchronizing + # it's rather a feature that can be used to measure kernel latency without synchronizing, + # allowing us to measure the time it takes to perform an operation without necessarily stalling the GPU. + # An interesting use case is with cuda graphs where synchronization makes us shoot the optimization in the foot. 
+ # details: https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/ torch.cuda.synchronize() # synchronize before workload start = time.perf_counter_ns() yield torch.cuda.synchronize() # synchronize after workload end = time.perf_counter_ns() - latency_ns = end - start - latency = latency_ns / 1e9 + latency = (end - start) / 1e9 self.latencies.append(latency) - LOGGER.debug(f"Tracked Pytorch CUDA latency: {latency:.2e}s") + LOGGER.debug(f"\t+ Tracked Pytorch CUDA latency: {latency:.2e}s") - def _cpu_tracker(self): + def _cpu_latency(self): start = time.perf_counter_ns() yield end = time.perf_counter_ns() - latency_ns = end - start - latency = latency_ns / 1e9 + latency = (end - start) / 1e9 self.latencies.append(latency) - LOGGER.debug(f"Tracked CPU latency: {latency:.2e}s") + LOGGER.debug(f"\t+ Tracked CPU latency: {latency:.2e}s") - def get_latencies(self): - return self.latencies + def get_total_count(self): + return len(self.latencies) def get_total_latency(self): return sum(self.latencies) + + def get_latencies_list(self) -> List[float]: + return self.latencies diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 06b0683a..816f1d5a 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -1,18 +1,12 @@ import os from logging import getLogger -from typing import List, Optional from contextlib import contextmanager +from typing import List, Optional, Dict from multiprocessing import Pipe, Process from multiprocessing.connection import Connection -import psutil -import torch - -from ..env_utils import bytes_to_mega_bytes, is_nvidia_system, is_rocm_system -from ..import_utils import ( - is_py3nvml_available, - is_pyrsmi_available, -) +from ..env_utils import bytes_to_mega_bytes, get_cuda_device_ids, is_nvidia_system, is_rocm_system +from ..import_utils import is_py3nvml_available, is_pyrsmi_available, is_torch_available if is_nvidia_system(): if is_py3nvml_available(): @@ -25,33 +19,65 @@ if is_rocm_system(): if is_pyrsmi_available(): - # TODO: use amdsmi instead of pyrsmi from pyrsmi import rocml else: raise ValueError( - "The library pyrsmi is required to run memory benchmark on ROCm-powered GPUs, but is not installed. " + "The library pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. " "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git." ) +if is_torch_available(): + import torch + +import psutil + LOGGER = getLogger("memory") class MemoryTracker: - def __init__(self, device: str, backend: str, device_ids: Optional[List[int]] = None): + """ + Memory tracker to measure max memory usage of CPU or GPU devices. + + Args: + device (str): Device to track memory usage. Can be either "cuda" or any other device. + backend (str): Backend to track memory usage. Can be either "pytorch" or any other backend. + device_ids (List[int], optional): List of device IDs to track memory usage. Defaults to None. 
+ """ + + def __init__(self, device: str, backend: str, device_ids: Optional[str] = None): self.device = device self.backend = backend - self.device_ids = device_ids - self.max_memory_used: int = 0 - self.max_memory_reserved: int = 0 - self.max_memory_allocated: int = 0 + self.max_memory_used = 0 + self.max_memory_reserved = 0 + self.max_memory_allocated = 0 if self.device == "cuda": - if self.device_ids is None: - self.device_ids = infer_cuda_device_ids() + if device_ids is None: + LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. Using all available CUDA devices.") + self.device_ids = list(map(int, get_cuda_device_ids().split(","))) + else: + self.device_ids = list(map(int, device_ids.split(","))) + + LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}") + + if self.backend == "pytorch": + self.pytorch_device_ids = list(range(torch.cuda.device_count())) + LOGGER.info(f"\t+ Tracking Pytorch memory of Pytorch CUDA devices: {self.pytorch_device_ids}") + + if len(self.device_ids) != len(self.pytorch_device_ids): + raise ValueError( + "The number of CUDA devices and Pytorch CUDA devices must be the same. " + f"Got {len(self.device_ids)} and {len(self.pytorch_device_ids)} respectively." + ) + else: + LOGGER.info("\t+ Tracking RAM memory") - LOGGER.info(f"Tracking CUDA devices: {self.device_ids}") + def reset(self): + self.max_memory_used = 0 + self.max_memory_reserved = 0 + self.max_memory_allocated = 0 @contextmanager def track(self): @@ -62,109 +88,122 @@ def track(self): else: yield from self._cpu_memory() - def get_max_memory_used(self): - return bytes_to_mega_bytes(self.max_memory_used) - - def get_max_memory_reserved(self): - return bytes_to_mega_bytes(self.max_memory_reserved) - - def get_max_memory_allocated(self): - return bytes_to_mega_bytes(self.max_memory_allocated) - def _cuda_pytorch_memory(self): torch.cuda.empty_cache() - - for device_index in range(torch.cuda.device_count()): + for pytorch_device_index in self.pytorch_device_ids: try: - torch.cuda.reset_peak_memory_stats(device=device_index) + torch.cuda.reset_peak_memory_stats(device=pytorch_device_index) except Exception as e: - LOGGER.warning(f"Could not reset peak memory stats for device {device_index}: {e}") + LOGGER.warning(f"\t+ Could not reset max memory stats for device {pytorch_device_index}: {e}") yield from self._cuda_memory() - for device_index in range(torch.cuda.device_count()): - self.max_memory_allocated += torch.cuda.max_memory_allocated(device=device_index) - self.max_memory_reserved += torch.cuda.max_memory_reserved(device=device_index) + for pytorch_device_index in self.pytorch_device_ids: + self.max_memory_reserved += torch.cuda.max_memory_reserved(device=pytorch_device_index) + self.max_memory_allocated += torch.cuda.max_memory_allocated(device=pytorch_device_index) - LOGGER.debug(f"Pytorch max memory allocated: {self.get_max_memory_allocated()} MB") - LOGGER.debug(f"Pytorch max memory reserved: {self.get_max_memory_reserved()} MB") + LOGGER.debug(f"\t+ Pytorch max memory reserved: {self.get_max_memory_reserved_mb()} MB") + LOGGER.debug(f"\t+ Pytorch max memory allocated: {self.get_max_memory_allocated_mb()} MB") - def _cuda_memory(self): - if is_nvidia_system() and is_py3nvml_available(): - handles = [] - nvml.nvmlInit() - for device_index in self.device_ids: - handle = nvml.nvmlDeviceGetHandleByIndex(device_index) - handles.append(handle) - - yield - - for handle in handles: - meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) - self.max_memory_used += meminfo.used - 
nvml.nvmlShutdown() - LOGGER.debug(f"PyNVML max memory used: {self.get_max_memory_used()} MB") - - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() + def _cuda_memory(self, interval: float = 0.001): + child_connection, parent_connection = Pipe() + memory_process = Process( + target=monitor_gpu_max_vram_memory, + args=(self.device_ids, child_connection, interval), + daemon=True, + ) + memory_process.start() + parent_connection.recv() # wait for memory process to be ready - yield + yield - for device_index in self.device_ids: - meminfo_used = rocml.smi_get_device_memory_used(device_index) - self.max_memory_used += meminfo_used - rocml.smi_shutdown() - LOGGER.debug(f"PyRSMI max memory used: {self.get_max_memory_used()} MB") - else: - raise ValueError("Only NVIDIA and AMD RoCm GPUs are supported for CUDA memory tracking.") + parent_connection.send(True) + self.max_memory_used = parent_connection.recv() + LOGGER.debug(f"\t+ Max memory (VRAM) used: {self.get_max_memory_used_mb()} MB") - def _cpu_memory(self, interval: float = 0.0001): + def _cpu_memory(self, interval: float = 0.001): child_connection, parent_connection = Pipe() - # instantiate process memory_process = Process( - target=monitor_process_peak_memory, + target=monitor_cpu_max_ram_memory, args=(os.getpid(), child_connection, interval), daemon=True, ) memory_process.start() - parent_connection.recv() + parent_connection.recv() # wait for memory process to be ready yield - parent_connection.send(0) + parent_connection.send(True) self.max_memory_used = parent_connection.recv() - LOGGER.debug(f"Peak memory usage: {self.get_max_memory_used()} MB") + LOGGER.debug(f"\t+ Max memory (RAM) used: {self.get_max_memory_used_mb()} MB") + def get_max_memory_used_mb(self) -> int: + return bytes_to_mega_bytes(self.max_memory_used) + + def get_max_memory_allocated_mb(self) -> int: + return bytes_to_mega_bytes(self.max_memory_allocated) -def monitor_process_peak_memory(process_id: int, connection: Connection, interval: float): + def get_max_memory_reserved_mb(self) -> int: + return bytes_to_mega_bytes(self.max_memory_reserved) + + def get_memories_dict(self) -> Dict[str, int]: + if self.device == "cuda" and self.backend == "pytorch": + return { + "max_vram_used(MB)": self.get_max_memory_used_mb(), + "max_memory_reserved(MB)": self.get_max_memory_reserved_mb(), + "max_memory_allocated(MB)": self.get_max_memory_allocated_mb(), + } + elif self.device == "cuda": + return {"max_vram_used(MB)": self.get_max_memory_used_mb()} + else: + return {"max_ram_used(MB)": self.get_max_memory_used_mb()} + + +def monitor_cpu_max_ram_memory(process_id: int, connection: Connection, interval: float): process = psutil.Process(process_id) - peak_memory_usage = 0 + max_memory_usage = 0 connection.send(0) stop = False while not stop: meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" current_memory_usage = getattr(process, meminfo_attr)()[0] - peak_memory_usage = max(peak_memory_usage, current_memory_usage) + max_memory_usage = max(max_memory_usage, current_memory_usage) stop = connection.poll(interval) - connection.send(peak_memory_usage) + connection.send(max_memory_usage) connection.close() -def infer_cuda_device_ids() -> List[int]: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - cuda_device_ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(","))) +def monitor_gpu_max_vram_memory(device_ids: List[int], connection: Connection, interval: float): + if is_nvidia_system() and 
is_py3nvml_available(): + nvml.nvmlInit() + handles = [nvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids] + max_memory_usage = 0 + connection.send(0) + stop = False + + while not stop: + current_memory_usage = sum(nvml.nvmlDeviceGetMemoryInfo(handle).used for handle in handles) + max_memory_usage = max(max_memory_usage, current_memory_usage) + stop = connection.poll(interval) + + connection.send(max_memory_usage) + nvml.nvmlShutdown() + connection.close() + elif is_rocm_system() and is_pyrsmi_available(): + rocml.smi_initialize() + max_memory_usage = 0 + connection.send(0) + stop = False + + while not stop: + current_memory_usage = sum(rocml.smi_get_device_memory_used(device_id) for device_id in device_ids) + max_memory_usage = max(max_memory_usage, current_memory_usage) + stop = connection.poll(interval) + + connection.send(max_memory_usage) + rocml.smi_shutdown() + connection.close() else: - if is_nvidia_system() and is_py3nvml_available(): - nvml.nvmlInit() - cuda_device_ids = list(range(nvml.nvmlDeviceGetCount())) - nvml.nvmlShutdown() - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() - cuda_device_ids = list(range(rocml.smi_get_device_count())) - rocml.smi_shutdown() - else: - raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.") - - return cuda_device_ids + raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.") diff --git a/setup.py b/setup.py index 7b618ed4..40504fd3 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ # docker-based backends "text-generation-inference": ["docker"], # specific settings + "codecarbon": ["codecarbon"], "deepspeed": ["deepspeed"], "diffusers": ["diffusers"], "timm": ["timm"], diff --git a/tests/configs/_base_.yaml b/tests/configs/_base_.yaml index ff50aa22..d983b841 100644 --- a/tests/configs/_base_.yaml +++ b/tests/configs/_base_.yaml @@ -2,24 +2,27 @@ defaults: - launcher: process # isolated process launcher - experiment # inheriting experiment schema - _self_ # for hydra 1.1 compatibility - # - override hydra/hydra_logging: colorlog # colorful logging - # - override hydra/job_logging: colorlog # colorful logging + - override hydra/hydra_logging: colorlog # colorful logging + - override hydra/job_logging: colorlog # colorful logging - override hydra/launcher: joblib # for parallelization experiment_name: ${device}_${benchmark.name}_${backend.name}_${task} +# hydra/cli specific settings hydra: run: - dir: tests/experiments/${experiment_name} + # where to store run results + dir: tests/runs/${experiment_name} sweep: - dir: tests/experiments/${experiment_name} + # where to store sweep results + dir: tests/sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: - OVERRIDE_BENCHMARKS: 1 # to not skip if results already exist - CUDA_VISIBLE_DEVICES: 0 # by default we only use one GPU - CUDA_DEVICE_ORDER: PCI_BUS_ID # laking we use the right GPU - + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before + OVERRIDE_BENCHMARKS: 1 # we are using joblib launcher to parallelize testing since # we're having ccorrect benchmarks is not important while testing diff --git a/tests/configs/_ddp_.yaml b/tests/configs/_ddp_.yaml index a5a946fc..aab449e4 100644 --- a/tests/configs/_ddp_.yaml +++ b/tests/configs/_ddp_.yaml @@ -4,7 +4,5 @@ defaults: launcher: nproc_per_node: 2 -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 +backend: + device_ids: 0,1 diff 
--git a/tests/configs/_dp_.yaml b/tests/configs/_dp_.yaml index 4d6528f6..b7578bdf 100644 --- a/tests/configs/_dp_.yaml +++ b/tests/configs/_dp_.yaml @@ -1,4 +1,2 @@ -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 +backend: + device_ids: 0,1 diff --git a/tests/configs/_ds_tp_.yaml b/tests/configs/_ds_tp_.yaml index 76608e2e..6c154e4f 100644 --- a/tests/configs/_ds_tp_.yaml +++ b/tests/configs/_ds_tp_.yaml @@ -5,12 +5,8 @@ launcher: nproc_per_node: 2 backend: + device_ids: 0,1 deepspeed_inference: true deepspeed_inference_config: tensor_parallel: tp_size: 2 - -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 diff --git a/tests/configs/_lm_naive_mp_.yaml b/tests/configs/_lm_naive_mp_.yaml index 20aef92a..2ac16fb8 100644 --- a/tests/configs/_lm_naive_mp_.yaml +++ b/tests/configs/_lm_naive_mp_.yaml @@ -1,10 +1,6 @@ backend: - model: gpt2 + device_ids: 0,1 + device_map: auto task: text-generation library: transformers - device_map: auto - -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 + model: gpt2 diff --git a/tests/test_api.py b/tests/test_api.py index f388e629..0bf6ced9 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -6,7 +6,6 @@ from optimum_benchmark.trackers.memory import MemoryTracker from optimum_benchmark.trackers.latency import LatencyTracker -from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS from optimum_benchmark.experiment import ExperimentConfig, launch from optimum_benchmark.launchers.inline.config import InlineConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig @@ -18,14 +17,13 @@ from optimum_benchmark.benchmarks.training.config import TrainingConfig from optimum_benchmark.benchmarks.inference.config import InferenceConfig from optimum_benchmark.generators.dataset_generator import DatasetGenerator +from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS +from optimum_benchmark.backends.timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config from optimum_benchmark.backends.transformers_utils import ( extract_transformers_shapes_from_artifacts, get_transformers_pretrained_config, ) -from optimum_benchmark.backends.timm_utils import ( - extract_timm_shapes_from_config, - get_timm_pretrained_config, -) + LOGGER = getLogger("test-api") @@ -45,8 +43,15 @@ ("transformers", "image-classification", "google/vit-base-patch16-224"), ("transformers", "semantic-segmentation", "google/vit-base-patch16-224"), ] -BENCHMARK_CONFIGS = [InferenceConfig(memory=True), TrainingConfig()] -LAUNCHER_CONFIGS = [InlineConfig(), ProcessConfig(), TorchrunConfig(nproc_per_node=2)] +BENCHMARK_CONFIGS = [ + InferenceConfig(latency=True, memory=True), + TrainingConfig(latency=True, memory=True), +] +LAUNCHER_CONFIGS = [ + TorchrunConfig(nproc_per_node=2, device_isolation=False), + ProcessConfig(device_isolation=False), + InlineConfig(device_isolation=False), +] @pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) @@ -58,11 +63,11 @@ def test_api_latency_tracker(device, backend): with tracker.track(): time.sleep(1) - measured_latencies = tracker.get_latencies() + latencies_list = tracker.get_latencies_list() - assert len(measured_latencies) == 2 - assert measured_latencies[0] > expected_latency * 0.9 - assert measured_latencies[0] < expected_latency * 1.1 + assert len(latencies_list) == 2 + assert latencies_list[0] > expected_latency * 0.9 + assert latencies_list[0] < expected_latency * 1.1 @pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) @@ -74,18 +79,18 @@ def 
test_api_memory_tracker(device, backend): # the process consumes memory that we can't control if backend == "pytorch": - initial_process_memory = tracker.get_max_memory_allocated() + initial_process_memory = tracker.get_max_memory_allocated_mb() else: - initial_process_memory = tracker.get_max_memory_used() + initial_process_memory = tracker.get_max_memory_used_mb() with tracker.track(): array = torch.ones((10000, 10000), dtype=torch.float64, device=device) expected_memory = array.nbytes / 1e6 # around 800 MB if backend == "pytorch": - final_process_memory = tracker.get_max_memory_allocated() + final_process_memory = tracker.get_max_memory_allocated_mb() else: - final_process_memory = tracker.get_max_memory_used() + final_process_memory = tracker.get_max_memory_used_mb() measured_memory = final_process_memory - initial_process_memory @@ -96,11 +101,11 @@ def test_api_memory_tracker(device, backend): @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS) def test_api_input_generator(library, task, model): if library == "transformers": - model_config = get_transformers_pretrained_config(model=model) - model_shapes = extract_transformers_shapes_from_artifacts(config=model_config) + model_config = get_transformers_pretrained_config(model) + model_shapes = extract_transformers_shapes_from_artifacts(model_config) elif library == "timm": model_config = get_timm_pretrained_config(model) - model_shapes = extract_timm_shapes_from_config(config=model_config) + model_shapes = extract_timm_shapes_from_config(model_config) else: raise ValueError(f"Unknown library {library}") @@ -110,9 +115,13 @@ def test_api_input_generator(library, task, model): model_shapes=model_shapes, ) - _ = generator.generate(mode="forward") if task in TEXT_GENERATION_TASKS: - _ = generator.generate(mode="generate") + _ = generator(mode="forward") + _ = generator(mode="generate") + elif task in IMAGE_DIFFUSION_TASKS: + _ = generator(mode="call") + else: + _ = generator(mode="forward") @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS) @@ -132,28 +141,15 @@ def test_api_dataset_generator(library, task, model): model_shapes=model_shapes, ) - _ = generator.generate() - - -@pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) -def test_api_launchers(launcher_config): - backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cpu") - benchmark_config = InferenceConfig(memory=True) - experiment_config = ExperimentConfig( - experiment_name="api-launch-experiment", - benchmark=benchmark_config, - launcher=launcher_config, - backend=backend_config, - ) - _ = launch(experiment_config) + _ = generator() @pytest.mark.parametrize("benchmark_config", BENCHMARK_CONFIGS) -def test_api_benchmarks(benchmark_config): - backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cpu") - launcher_config = ProcessConfig() +@pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) +def test_api_launch_cpu(benchmark_config, launcher_config): + backend_config = PyTorchConfig(model="bert-base-uncased", no_weights=True, device="cpu") experiment_config = ExperimentConfig( - experiment_name="api-benchmark-experiment", + experiment_name="", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config, diff --git a/tests/test_cli.py b/tests/test_cli.py index b48283e1..afae3609 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,7 +3,7 @@ import pytest -from optimum_benchmark.logging_utils import run_process_and_log_stream_output +from optimum_benchmark.logging_utils 
import run_subprocess_and_log_stream_output LOGGER = getLogger("test-cli") @@ -26,7 +26,7 @@ def test_cli_configs(config_name): "--multirun", ] - popen = run_process_and_log_stream_output(LOGGER, args) + popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}" @@ -42,7 +42,7 @@ def test_cli_exit_code(): "backend.model=bert-base-uncased", ] - popen_0 = run_process_and_log_stream_output(LOGGER, args_0) + popen_0 = run_subprocess_and_log_stream_output(LOGGER, args_0) assert popen_0.returncode == 0 args_1 = [ @@ -56,5 +56,5 @@ def test_cli_exit_code(): "backend.model=bert-base-uncased", ] - popen_1 = run_process_and_log_stream_output(LOGGER, args_1) + popen_1 = run_subprocess_and_log_stream_output(LOGGER, args_1) assert popen_1.returncode == 1