diff --git a/.github/workflows/check_quality.yaml b/.github/workflows/check_quality.yaml index da468da3..36b99f99 100644 --- a/.github/workflows/check_quality.yaml +++ b/.github/workflows/check_quality.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install quality requirements run: | diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml index 25ba8d1a..752afab7 100644 --- a/.github/workflows/test_api_cpu.yaml +++ b/.github/workflows/test_api_cpu.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install dependencies run: | diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml index abc7aed4..df72ffb2 100644 --- a/.github/workflows/test_api_misc.yaml +++ b/.github/workflows/test_api_misc.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml similarity index 82% rename from .github/workflows/test_cpu_neural_compressor.yaml rename to .github/workflows/test_cli_cpu_neural_compressor.yaml index 7e3488d4..9150a90f 100644 --- a/.github/workflows/test_cpu_neural_compressor.yaml +++ b/.github/workflows/test_cli_cpu_neural_compressor.yaml @@ -1,4 +1,4 @@ -name: CPU Intel Neural Compressor Tests +name: CLI CPU Intel Neural Compressor Tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_neural_compressor_tests: + run_cli_cpu_neural_compressor_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install Intel Neural Compressor CPU requirements run: | diff --git a/.github/workflows/test_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml similarity index 82% rename from .github/workflows/test_cpu_onnxruntime.yaml rename to .github/workflows/test_cli_cpu_onnxruntime.yaml index 2770b23f..e7caf218 100644 --- a/.github/workflows/test_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: CPU OnnxRuntime Tests +name: CLI CPU OnnxRuntime Tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_onnxruntime_tests: + run_cli_cpu_onnxruntime_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml similarity index 83% rename from .github/workflows/test_cpu_openvino.yaml rename to .github/workflows/test_cli_cpu_openvino.yaml index d2d93cce..00b40aef 100644 --- a/.github/workflows/test_cpu_openvino.yaml +++ b/.github/workflows/test_cli_cpu_openvino.yaml @@ -1,4 +1,4 @@ -name: CPU 
OpenVINO Tests +name: CLI CPU OpenVINO Tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_openvino_tests: + run_cli_cpu_openvino_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml similarity index 83% rename from .github/workflows/test_cpu_pytorch.yaml rename to .github/workflows/test_cli_cpu_pytorch.yaml index 1c6809cc..3df5368b 100644 --- a/.github/workflows/test_cpu_pytorch.yaml +++ b/.github/workflows/test_cli_cpu_pytorch.yaml @@ -1,4 +1,4 @@ -name: CPU Pytorch tests +name: CLI CPU Pytorch tests on: workflow_dispatch: @@ -12,16 +12,16 @@ concurrency: cancel-in-progress: true jobs: - run_cpu_pytorch_tests: + run_cli_cpu_pytorch_tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_cuda_onnxruntime_inference.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml similarity index 86% rename from .github/workflows/test_cuda_onnxruntime_inference.yaml rename to .github/workflows/test_cli_cuda_onnxruntime.yaml index bbb81b36..0b03608e 100644 --- a/.github/workflows/test_cuda_onnxruntime_inference.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: CUDA OnnxRuntime Inference Tests +name: CLI CUDA OnnxRuntime Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_cuda_onnxruntime_inference_tests: + build_image_and_run_cli_cuda_onnxruntime_tests: runs-on: hf-dgx-01 steps: - name: Checkout @@ -40,4 +40,4 @@ jobs: --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' opt-bench-cuda:11.8.0 - -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime and inference' -x" + -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x" diff --git a/.github/workflows/test_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml similarity index 95% rename from .github/workflows/test_cuda_pytorch.yaml rename to .github/workflows/test_cli_cuda_pytorch.yaml index 49e77f8a..1b3fd99f 100644 --- a/.github/workflows/test_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -1,4 +1,4 @@ -name: CUDA Pytorch Tests +name: CLI CUDA Pytorch Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_cuda_pytorch_tests: + build_image_and_run_cli_cuda_pytorch_tests: strategy: fail-fast: false matrix: diff --git a/.github/workflows/test_cuda_torch_ort_training.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml similarity index 91% rename from .github/workflows/test_cuda_torch_ort_training.yaml rename to .github/workflows/test_cli_cuda_torch_ort.yaml index 20f87e67..71bfd33e 100644 --- a/.github/workflows/test_cuda_torch_ort_training.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort.yaml @@ -1,4 +1,4 @@ -name: CUDA Torch-ORT Training Tests +name: CLI CUDA Torch-ORT Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - 
build_image_and_run_cuda_torch_ort_training_tests: + build_image_and_run_cli_cuda_torch_ort_tests: runs-on: hf-dgx-01 steps: - name: Checkout @@ -40,4 +40,4 @@ jobs: --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' opt-bench-cuda:11.8.0 - -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort and training' -x" + -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x" diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml index c448a213..5b55c0a7 100644 --- a/.github/workflows/test_cli_misc.yaml +++ b/.github/workflows/test_cli_misc.yaml @@ -18,10 +18,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: '3.10' - name: Install requirements run: | diff --git a/.github/workflows/test_rocm_onnxruntime_inference.yaml b/.github/workflows/test_cli_rocm_onnxruntime.yaml similarity index 90% rename from .github/workflows/test_rocm_onnxruntime_inference.yaml rename to .github/workflows/test_cli_rocm_onnxruntime.yaml index 5a8cc0a3..fcd0f53d 100644 --- a/.github/workflows/test_rocm_onnxruntime_inference.yaml +++ b/.github/workflows/test_cli_rocm_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: ROCm OnnxRuntime Inference Tests +name: CLI ROCm OnnxRuntime Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_rocm_onnxruntime_inference_tests: + build_image_and_run_cli_rocm_onnxruntime_tests: runs-on: hf-amd-mi210-dev steps: - name: Checkout @@ -51,4 +51,4 @@ jobs: --device /dev/dri/renderD129 --entrypoint /bin/bash opt-bench-rocm-ort:5.7 - -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime and inference' -x" + -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x" diff --git a/.github/workflows/test_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml similarity index 95% rename from .github/workflows/test_rocm_pytorch.yaml rename to .github/workflows/test_cli_rocm_pytorch.yaml index 3d14909d..11c9e77a 100644 --- a/.github/workflows/test_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -1,4 +1,4 @@ -name: ROCm Pytorch Tests +name: CLI ROCm Pytorch Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_rocm_pytorch_tests: + build_image_and_run_cli_rocm_pytorch_tests: strategy: fail-fast: false matrix: diff --git a/.github/workflows/test_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml similarity index 93% rename from .github/workflows/test_tensorrt_llm.yaml rename to .github/workflows/test_cli_tensorrt_llm.yaml index 06640699..0169fca5 100644 --- a/.github/workflows/test_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_tensorrt_llm.yaml @@ -1,4 +1,4 @@ -name: TensorRT-LLM Tests +name: CLI TensorRT-LLM Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - pull_image_and_run_tensorrt_llm_tests: + pull_image_and_run_cli_tensorrt_llm_tests: runs-on: hf-dgx-01 steps: - name: Checkout diff --git a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml similarity index 86% rename from .github/workflows/test_tensorrt_onnxruntime_inference.yaml rename to 
.github/workflows/test_cli_tensorrt_onnxruntime.yaml index 4d41313d..92f425e7 100644 --- a/.github/workflows/test_tensorrt_onnxruntime_inference.yaml +++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml @@ -1,4 +1,4 @@ -name: TensorRT OnnxRuntime Inference Tests +name: CLI TensorRT OnnxRuntime Tests on: workflow_dispatch: @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: true jobs: - build_image_and_run_tensorrt_onnxruntime_tests: + build_image_and_run_cli_tensorrt_onnxruntime_tests: runs-on: hf-dgx-01 steps: - name: Checkout @@ -40,4 +40,4 @@ jobs: --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-tensorrt:22.12 - -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime and inference' -x" + -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x" diff --git a/Makefile b/Makefile index c993cc7c..55e44e1e 100644 --- a/Makefile +++ b/Makefile @@ -12,8 +12,68 @@ style: install: pip install -e . -install_cpu_dev: - pip install -e .[quality,testing,openvino,onnxruntime,neural-compressor,diffusers,timm,peft] +build_docker_cpu: + docker build -f docker/cpu.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t opt-bench-cpu:latest . -install_gpu_dev: - pip install -e .[quality,testing,onnxruntime-gpu,deepspeed,diffusers,timm,peft] +build_docker_cuda: + docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_CUDA=cu118 --build-arg CUDA_VERSION=11.8.0 -t opt-bench-cuda:11.8.0 . + +build_docker_rocm: + docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_ROCM=rocm5.6 --build-arg ROCM_VERSION=5.6.1 -t opt-bench-rocm:5.6.1 .
+ +test_cli_cpu_neural_compressor: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" + +test_cli_cpu_openvino: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x" + +test_cli_cpu_onnxruntime: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" + +test_cli_cpu_pytorch: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x" + +test_api_cpu: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x" + +test_api_cuda: + docker run \ + --rm \ + --gpus '"device=0,1"' \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x" + +test_api_misc: + docker run \ + --rm \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" diff --git a/README.md b/README.md index cc623d27..e338b888 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,13 @@

Optimum-Benchmark 🏋️

-Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with supported optimizations & quantization schemes, for [inference](https://github.com/huggingface/optimum#accelerated-inference) & [training](https://github.com/huggingface/optimum#accelerated-training), on multiple [backends & hardwares](https://github.com/huggingface/optimum-benchmark?tab=readme-ov-file#supported-backendsdevices). +Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-). ## Motivation 🤔 -- Hardware vendors wanting to know how their hardware performs compared to others on the same models. -- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc. +- HF hardware partners wanting to know how their hardware performs compared to other hardware on the same models. +- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc. compared to another model. - Experimenting with hardware & backend specific optimizations & quantization schemes that can be applied to models and improve their computational/memory/energy efficiency. -- [...]
## Current status πŸ“ˆ @@ -19,23 +18,20 @@ Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transform [![CPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml) [![CUDA](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml) [![ROCM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml) -[![MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml) ### CLI + [![CPU Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml) [![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml) [![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml) [![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml) - [![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml) -[![CUDA OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml) -[![CUDA Torch-ORT Training Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml) - -[![TensorRT OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml) +[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml) +[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml) +[![TensorRT OnnxRuntime 
Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml) [![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml) - [![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml) -[![ROCm OnnxRuntime Inference Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml) +[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml) ## Quickstart πŸš€ @@ -44,7 +40,7 @@ Optimum-Benchmark is a unified multi-backend utility for benchmarking [Transform You can install `optimum-benchmark` using pip: ```bash -python -m pip install git+https://github.com/huggingface/optimum-benchmark.git +pip install optimum-benchmark ``` or by cloning the repository and installing it in editable mode: @@ -66,33 +62,45 @@ Depending on the backends you want to use, you might need to install some extra - Intel Neural Compressor: `pip install optimum-benchmark[neural-compressor]` - Text Generation Inference: `pip install optimum-benchmark[text-generation-inference]` -### Running benchmarks from python API πŸ§ͺ +### Running benchmarks from Python API πŸ§ͺ -You can run benchmarks from the python API: +You can run benchmarks from the Python API, using the `launch` function from the `optimum_benchmark.experiment` module. Here's an example of how to run a benchmark using the `pytorch` backend, `process` launcher and `inference` benchmark. 
```python -import logging -logging.basicConfig(level=logging.INFO) - +from optimum_benchmark.logging_utils import setup_logging from optimum_benchmark.experiment import launch, ExperimentConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.benchmarks.inference.config import InferenceConfig + if __name__ == "__main__": - backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cuda") - launcher_config = ProcessConfig(device_isolation=True) - benchmark_config = InferenceConfig(memory=True) + setup_logging(level="INFO") + benchmark_config = InferenceConfig(latency=False, memory=True, energy=True) + launcher_config = ProcessConfig() + backend_config = PyTorchConfig( + device="cuda", + no_weights=True, + device_ids="0,1", + device_map="auto", + model="IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm", + ) experiment_config = ExperimentConfig( - experiment_name="api-launch-experiment", + experiment_name="python-api-launch-experiment", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config, ) benchmark_report = launch(experiment_config) - print("benchmark_report:", benchmark_report) + benchmark_report.log_all() + # or + print(benchmark_report.to_dict()) + # or + benchmark_report.push_to_hub("IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm") ``` +Yep, it's that simple! Check the supported backends, launchers and benchmarks in the [features](#features-) section. + ### Running benchmarks from CLI πŸƒβ€β™‚οΈ You can run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension. @@ -161,26 +169,26 @@ Other than the [examples](examples), you can also check [tests](tests/configs/). Everything else is optional or inferred at runtime, but can be configured to your needs. 
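For reference, a hydra-style CLI invocation as described above might look like the following minimal sketch, assuming the `optimum-benchmark` entry point installed by this package and the `examples/pytorch_bert.yaml` config shipped in this repository:

```bash
# resolve examples/pytorch_bert.yaml and run the benchmark it describes
optimum-benchmark --config-dir examples/ --config-name pytorch_bert
```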
-### Backends & Devices πŸ“± - -- [x] Pytorch backend for CPU (`device=cpu`, `backend=pytorch`) -- [x] Pytorch backend for CUDA (`device=cuda`, `backend=pytorch`) -- [ ] Pytorch backend for Habana Gaudi Processor (`device=hpu`, `backend=pytorch`) -- [x] OnnxRuntime backend for CPUExecutionProvider (`device=cpu`, `backend=onnxruntime`) -- [x] OnnxRuntime backend for CUDAExecutionProvider (`device=cuda`, `backend=onnxruntime`) -- [x] OnnxRuntime backend for ROCMExecutionProvider (`device=cuda`, `backend=onnxruntime`, `backend.provider=ROCMExecutionProvider`) -- [x] OnnxRuntime backend for TensorrtExecutionProvider (`device=cuda`, `backend=onnxruntime`, `backend.provider=TensorrtExecutionProvider`) -- [x] Intel Neural Compressor backend for CPU (`device=cpu`, `backend=neural-compressor`) -- [x] TensorRT-LLM backend for CUDA (`device=cuda`, `backend=tensorrt-llm`) -- [x] OpenVINO backend for CPU (`device=cpu`, `backend=openvino`) - -### Launcher features πŸš€ +### Launchers πŸš€ - [x] Process isolation between consecutive runs (`launcher=process`) -- [x] Assert devices (NVIDIA & AMD GPUs) isolation (`launcher.device_isolation=true`) -- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`, etc) +- [x] Assert GPU devices (NVIDIA & AMD) isolation (`launcher.device_isolation=true`) +- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`) + +### Backends & Devices πŸ“± -### Benchmark features πŸ‹οΈ +- [x] Pytorch backend for CPU (`backend=pytorch`, `backend.device=cpu`) +- [x] Pytorch backend for CUDA (`backend=pytorch`, `backend.device=cuda`) +- [ ] Pytorch backend for Habana Gaudi Processor (`backend=pytorch`, `backend.device=habana`) +- [x] OnnxRuntime backend for CPUExecutionProvider (`backend=onnxruntime`, `backend.device=cpu`) +- [x] OnnxRuntime backend for CUDAExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`) +- [x] OnnxRuntime backend for ROCMExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`, `backend.provider=ROCMExecutionProvider`) +- [x] OnnxRuntime backend for TensorrtExecutionProvider (`backend=onnxruntime`, `backend.device=cuda`, `backend.provider=TensorrtExecutionProvider`) +- [x] Intel Neural Compressor backend for CPU (`backend=neural-compressor`, `backend.device=cpu`) +- [x] TensorRT-LLM backend for CUDA (`backend=tensorrt-llm`, `backend.device=cuda`) +- [x] OpenVINO backend for CPU (`backend=openvino`, `backend.device=cpu`) + +### Benchmarking πŸ‹οΈ - [x] Memory tracking (`benchmark.memory=true`) - [x] Latency and throughput tracking of forward pass (default) diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile new file mode 100644 index 00000000..371a89c8 --- /dev/null +++ b/docker/cpu.dockerfile @@ -0,0 +1,42 @@ +FROM ubuntu:latest + + +# Ignore interactive questions during `docker build` +ENV DEBIAN_FRONTEND noninteractive + +# Run as non-root user +ARG USER_ID +ARG GROUP_ID + +RUN addgroup --gid $GROUP_ID user +RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user + +# Install python +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.10 \ + python3.10-dev \ + python3-pip \ + git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 + +# Add local bin to PATH +ENV PATH="/home/user/.local/bin:${PATH}" + +# Add user to sudoers +RUN adduser user sudo +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >>/etc/sudoers + +# Change user +USER user +WORKDIR 
/home/user + +# Update pip +RUN pip install --upgrade pip + +# Install PyTorch +RUN if [ "${TORCH_PRE_RELEASE}" = "1" ]; \ + then pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \ + else pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \ + fi diff --git a/examples/neural_compressor_ptq_bert.yaml b/examples/neural_compressor_ptq_bert.yaml index 64691369..c8b0ee6e 100644 --- a/examples/neural_compressor_ptq_bert.yaml +++ b/examples/neural_compressor_ptq_bert.yaml @@ -7,25 +7,31 @@ defaults: - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging -experiment_name: openvino_static_quant_bert +experiment_name: neural_compressor_ptq_bert backend: - model: bert-base-uncased + device: cpu no_weights: true + model: bert-base-uncased ptq_quantization: true calibration: true - device: cpu benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 diff --git a/examples/onnxruntime_static_quant_vit.yaml b/examples/onnxruntime_static_quant_vit.yaml index 0b06bc0e..d324415d 100644 --- a/examples/onnxruntime_static_quant_vit.yaml +++ b/examples/onnxruntime_static_quant_vit.yaml @@ -10,23 +10,28 @@ defaults: experiment_name: onnxruntime_static_quant_vit backend: + device: cpu + no_weights: true model: google/vit-base-patch16-224 quantization: true quantization_config: is_static: true per_channel: false - device: cpu calibration: true +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/openvino_diffusion.yaml b/examples/openvino_diffusion.yaml index 3591ecd7..f9f62e64 100644 --- a/examples/openvino_diffusion.yaml +++ b/examples/openvino_diffusion.yaml @@ -10,22 +10,28 @@ defaults: model: stabilityai/stable-diffusion-2-1 backend: + device: cpu experiment_name: openvino_diffusion - export: true reshape: true + export: true half: true - device: cpu benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 diff --git a/examples/openvino_static_quant_bert.yaml b/examples/openvino_static_quant_bert.yaml index c349f3ea..83921f4c 100644 --- a/examples/openvino_static_quant_bert.yaml +++ b/examples/openvino_static_quant_bert.yaml @@ -10,24 +10,30 @@ defaults: experiment_name: openvino_static_quant_bert backend: + device: cpu + no_weights: true model: bert-base-uncased export: true - no_weights: true quantization: true 
calibration: true reshape: true - device: cpu benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml index 71a087f0..5a36147c 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/pytorch_bert.yaml @@ -10,17 +10,22 @@ defaults: experiment_name: pytorch_bert backend: - model: bert-base-uncased device: cpu + device_ids: 0 + model: bert-base-uncased +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/pytorch_llama.yaml b/examples/pytorch_llama.yaml index f6b29792..2c9e2845 100644 --- a/examples/pytorch_llama.yaml +++ b/examples/pytorch_llama.yaml @@ -10,8 +10,10 @@ defaults: experiment_name: pytorch_llama backend: - model: TheBloke/Llama-2-70B-AWQ device: cuda + device_ids: 0 + no_weights: true + model: TheBloke/Llama-2-70B-AWQ launcher: device_isolation: true @@ -22,14 +24,18 @@ benchmark: sequence_length: 256 new_tokens: 1000 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/pytorch_timm.yaml b/examples/pytorch_timm.yaml index 03125599..4b2c5295 100644 --- a/examples/pytorch_timm.yaml +++ b/examples/pytorch_timm.yaml @@ -10,8 +10,9 @@ defaults: experiment_name: pytorch_timm backend: - model: timm/mobilenetv3_large_100.ra_in1k device: cuda + device_ids: 0 + model: timm/mobilenetv3_large_100.ra_in1k launcher: device_isolation: true @@ -20,14 +21,18 @@ benchmark: input_shapes: batch_size: 1 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/tgi_llama.yaml b/examples/tgi_llama.yaml index 9bf8b4d1..a23c5c55 100644 --- a/examples/tgi_llama.yaml +++ b/examples/tgi_llama.yaml @@ -10,10 +10,12 @@ defaults: experiment_name: tgi_llama backend: + device: cuda + device_ids: 0,1 + device_map: true model: TheBloke/Llama-2-7B-AWQ quantization_scheme: awq sharded: false - device: cuda benchmark: input_shapes: @@ -21,14 +23,18 @@ benchmark: sequence_length: 256 new_tokens: 1000 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} 
sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/examples/trt_llama.yaml b/examples/trt_llama.yaml index e3f8844d..702bb39e 100644 --- a/examples/trt_llama.yaml +++ b/examples/trt_llama.yaml @@ -10,8 +10,8 @@ defaults: experiment_name: trt_llama backend: - model: NousResearch/Llama-2-7b-hf device: cuda + model: NousResearch/Llama-2-7b-hf benchmark: input_shapes: @@ -19,14 +19,18 @@ benchmark: sequence_length: 64 new_tokens: 128 +# hydra/cli specific settings hydra: run: + # where to store run results dir: runs/${experiment_name} sweep: + # where to store sweep results dir: sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before OVERRIDE_BENCHMARKS: 1 - CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_ORDER: PCI_BUS_ID diff --git a/optimum_benchmark/aggregators/__init__.py b/optimum_benchmark/aggregators/__init__.py deleted file mode 100644 index a3015d55..00000000 --- a/optimum_benchmark/aggregators/__init__.py +++ /dev/null @@ -1,109 +0,0 @@ -from pathlib import Path -from typing import Tuple, List, Dict - -import pandas as pd -from rich.table import Table -from omegaconf import OmegaConf -import matplotlib.pyplot as plt -from rich.console import Console -from flatten_dict import flatten -from rich.terminal_theme import MONOKAI - - -def gather(root_folders: List[Path]) -> pd.DataFrame: - configs_dfs = {} - results_dfs = {} - - for root_folder in root_folders: - if not root_folder.exists(): - raise ValueError(f"{root_folder} does not exist") - - for f in root_folder.glob("**/hydra_config.yaml"): - parent_folder = f.parent.absolute().as_posix() - configs_dfs[parent_folder] = pd.DataFrame.from_dict( - flatten(OmegaConf.load(f), reducer="dot"), orient="index" - ).T - - for f in root_folder.glob("**/*_results.csv"): - parent_folder = f.parent.absolute().as_posix() - results_dfs[parent_folder] = pd.read_csv(f) - - if (len(results_dfs) == 0) or (len(configs_dfs) == 0): - raise ValueError(f"Results are missing in {root_folders}") - - # Merge inference and config dataframes - full_dfs = {} - for parent_folder in results_dfs: - full_df = pd.concat( - [configs_dfs[parent_folder], results_dfs[parent_folder]], - axis=1, - ) - full_df["parent_folder"] = parent_folder - full_dfs[parent_folder] = full_df - - # Concatenate all dataframes - full_report = pd.concat(full_dfs.values(), ignore_index=True, axis=0) - - return full_report - - -def format_element(element): - if isinstance(element, float): - if element != element: - formated_element = "" - elif abs(element) >= 1: - formated_element = f"{element:.2f}" - elif abs(element) > 1e-6: - formated_element = f"{element:.2e}" - else: - formated_element = f"{element}" - elif element is None: - formated_element = "" - elif isinstance(element, bool): - if element: - formated_element = "[green]βœ”[/green]" - else: - formated_element = "[red]✘[/red]" - else: - formated_element = str(element) - - return formated_element - - -def display(report: pd.DataFrame) -> Table: - table = Table(show_header=True, show_lines=True) - - for column in report.columns: - table.add_column(column, justify="right", header_style="bold") - - for _, row in 
report.iterrows(): - formated_row = [] - for element in row.values: - formated_row.append(format_element(element)) - table.add_row(*formated_row) - - console = Console(record=True, theme=MONOKAI) - console.print(table, justify="center") - - return console, table - - -def rename(report: pd.DataFrame, rename_dict: Dict[str, str]): - summarized_report = report[list(rename_dict.keys())].rename(columns=rename_dict) - - return summarized_report - - -def plot(report: pd.DataFrame, x_axis: str, y_axis: str, groupby: str) -> Tuple[plt.Figure, plt.Axes]: - fig, ax = plt.subplots() - - for group, sweep in report.groupby(groupby): - sorted_sweep = sweep.sort_values(by=x_axis) - ax.plot(sorted_sweep[x_axis], sorted_sweep[y_axis], label=group, marker="o") - - ax.set_xlabel(x_axis) - ax.set_ylabel(y_axis) - ax.set_title(f"{y_axis} per {x_axis}") - ax.legend(fancybox=True, shadow=True) - - return fig, ax diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 1c55a5ab..cf0f5087 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,46 +1,25 @@ import gc -import os import random -import shutil from abc import ABC from logging import getLogger -from typing import ( - Optional, - ClassVar, - Generic, - Dict, - Any, -) - -import numpy as np -from transformers.utils import ModelOutput -from transformers import ( - GenerationConfig, - PretrainedConfig, - PreTrainedModel, - TrainerState, - AutoModel, -) +from collections import OrderedDict +from typing import Optional, ClassVar, Generic, Dict, Any from .config import BackendConfigT from ..task_utils import get_automodel_class_for_task -from .diffusers_utils import ( - extract_diffusers_shapes_from_config, - get_diffusers_pretrained_config, -) + +from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config +from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config, get_timm_pre_processor from .transformers_utils import ( extract_transformers_shapes_from_artifacts, - get_transformers_pretrained_processor, get_transformers_generation_config, get_transformers_pretrained_config, - get_transformers_cache_dir, + get_transformers_pre_processor, PretrainedProcessor, ) -from .timm_utils import ( - extract_timm_shapes_from_config, - get_timm_pretrained_processor, - get_timm_pretrained_config, -) + +import numpy as np +from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState LOGGER = getLogger("backend") @@ -48,43 +27,38 @@ class Backend(Generic[BackendConfigT], ABC): NAME: ClassVar[str] - config: BackendConfigT - automodel_class: AutoModel - pretrained_model: PreTrainedModel + model_type: str model_shapes: Dict[str, int] + pretrained_model: PreTrainedModel pretrained_config: Optional[PretrainedConfig] - pretrained_processor: Optional[PretrainedProcessor] - pretrained_generation_config: Optional[GenerationConfig] + generation_config: Optional[GenerationConfig] + pre_processor: Optional[PretrainedProcessor] def __init__(self, config: BackendConfigT): LOGGER.info(f"َAllocating {self.NAME} backend") self.config = config + self.seed() if self.config.library == "diffusers": - self.pretrained_processor = None - self.pretrained_generation_config = None - self.pretrained_config = get_diffusers_pretrained_config(model=self.config.model, **self.config.hub_kwargs) - self.model_shapes = extract_diffusers_shapes_from_config(model=self.config.model, **self.config.hub_kwargs) + self.pretrained_config = 
get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs) + self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs) self.model_type = self.config.task + self.generation_config = None + self.pre_processor = None + elif self.config.library == "timm": - self.pretrained_processor = get_timm_pretrained_processor(self.config.model) + self.pre_processor = get_timm_pre_processor(self.config.model) self.pretrained_config = get_timm_pretrained_config(self.config.model) self.model_shapes = extract_timm_shapes_from_config(config=self.pretrained_config) self.model_type = self.pretrained_config.architecture - self.pretrained_generation_config = None + self.generation_config = None + else: + self.pre_processor = get_transformers_pre_processor(self.config.model, **self.config.hub_kwargs) + self.generation_config = get_transformers_generation_config(self.config.model, **self.config.hub_kwargs) self.pretrained_config = get_transformers_pretrained_config(self.config.model, **self.config.hub_kwargs) - self.pretrained_generation_config = get_transformers_generation_config( - self.config.model, **self.config.hub_kwargs - ) - self.pretrained_processor = get_transformers_pretrained_processor( - self.config.model, **self.config.hub_kwargs - ) - self.model_shapes = extract_transformers_shapes_from_artifacts( - config=self.pretrained_config, - processor=self.pretrained_processor, - ) + self.model_shapes = extract_transformers_shapes_from_artifacts(self.pretrained_config, self.pre_processor) self.model_type = self.pretrained_config.model_type self.automodel_class = get_automodel_class_for_task( @@ -95,6 +69,7 @@ def __init__(self, config: BackendConfigT): ) def seed(self) -> None: + LOGGER.info(f"\t+ Setting random seed to {self.config.seed}") random.seed(self.config.seed) np.random.seed(self.config.seed) @@ -112,40 +87,35 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: """ This method is used to perform the forward pass of the model. """ raise NotImplementedError("Backend must implement forward method") - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: """ This method is used to perform the generation pass of the model. """ raise NotImplementedError("Backend must implement generate method") + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + """ + This method is used to call a whole pipeline. + """ + raise NotImplementedError("Backend must implement call method") + def train(self, **kwargs) -> TrainerState: """ This method is used to train the model. 
""" raise NotImplementedError("Backend must implement train method") - def delete_hf_model_cache(self) -> None: - LOGGER.info("\t+ Deleting model cache") - transformers_cache_path = get_transformers_cache_dir() - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = os.path.join(transformers_cache_path, model_cache_folder) - shutil.rmtree(model_cache_path, ignore_errors=True) - def delete_pretrained_model(self) -> None: - LOGGER.info("\t+ Deleting pretrained model") - del self.pretrained_model - gc.collect() + if hasattr(self, "pretrained_model"): + del self.pretrained_model def clean(self) -> None: LOGGER.info(f"Cleaning {self.NAME} backend") - - if hasattr(self, "pretrained_model"): - self.delete_pretrained_model() - + self.delete_pretrained_model() gc.collect() diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index fff9bf80..a4919c15 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -4,13 +4,12 @@ from dataclasses import dataclass, field from typing import Optional, TypeVar, Dict, Any -from psutil import cpu_count +from ..import_utils import is_psutil_available +from ..env_utils import get_cuda_device_ids, is_nvidia_system, is_rocm_system +from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path -from ..env_utils import get_gpus, is_nvidia_system, is_rocm_system -from ..task_utils import ( - infer_library_from_model_name_or_path, - infer_task_from_model_name_or_path, -) +if is_psutil_available(): + from psutil import cpu_count LOGGER = getLogger("backend") @@ -18,6 +17,7 @@ "revision": "main", "force_download": False, "local_files_only": False, + "trust_remote_code": False, } @@ -31,6 +31,10 @@ class BackendConfig(ABC): model: Optional[str] = None device: Optional[str] = None + # yes we use a string here instead of a list + # it's easier to pass in a yaml or from cli + # also it's consistent with CUDA_VISIBLE_DEVICES + device_ids: Optional[str] = None task: Optional[str] = None library: Optional[str] = None @@ -48,41 +52,20 @@ def __post_init__(self): self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" if ":" in self.device: - raise ValueError( - f"Device was specified as {self.device} with a target index." - "We recommend using the main cuda device (e.g. `cuda`) and " - "specifying the target index in `CUDA_VISIBLE_DEVICES`." - ) + # using device index + self.device = self.device.split(":")[0] + self.device_ids = self.device.split(":")[1] + + if self.device == "cuda": + if self.device_ids is None: + self.device_ids = get_cuda_device_ids() + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids + # TODO: add rocm specific environment variables ? if self.device not in ["cuda", "cpu", "mps", "xla"]: - raise ValueError("`device` must be either `cuda`, `cpu`, `mps` or `xla`.") - - if self.device == "cuda" and len(get_gpus()) > 1: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is None: - LOGGER.warning( - "Multiple GPUs detected but CUDA_VISIBLE_DEVICES is not set. " - "This means that code might allocate resources from the wrong GPUs. " - "For example, with `auto_device='auto'. `We recommend setting CUDA_VISIBLE_DEVICES " - "to isolate the GPUs that will be used for this experiment. `CUDA_VISIBLE_DEVICES` will " - "be set to `0` to ensure that only the first GPU is used. 
If you want to use multiple " - "GPUs, please set `CUDA_VISIBLE_DEVICES` to the desired GPU indices." - ) - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - if os.environ.get("CUDA_DEVICE_ORDER", None) != "PCI_BUS_ID": - LOGGER.warning( - "Multiple GPUs detected but CUDA_DEVICE_ORDER is not set to `PCI_BUS_ID`. " - "This means that code might allocate resources from the wrong GPUs even if " - "`CUDA_VISIBLE_DEVICES` is set. For example pytorch uses the `FASTEST_FIRST` " - "order by default, which is not guaranteed to be the same as nvidia-smi. `CUDA_DEVICE_ORDER` " - "will be set to `PCI_BUS_ID` to ensure that the GPUs are allocated in the same order as nvidia-smi. " - ) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - - elif self.device == "cuda" and len(get_gpus()) == 1: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is None: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "0" + raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") if self.task is None: self.task = infer_task_from_model_name_or_path(self.model) diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index 49c21906..705436d3 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -4,31 +4,27 @@ from ..import_utils import is_diffusers_available - if is_diffusers_available(): import diffusers def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: - assert is_diffusers_available(), "Diffusers is not available" return diffusers.DiffusionPipeline.load_config(model, **kwargs) def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]: - assert is_diffusers_available(), "Diffusers is not available" + config = diffusers.DiffusionPipeline.load_config(model, **kwargs) shapes = {} - pip_config = diffusers.DiffusionPipeline.load_config(model, **kwargs) - - if "vae" in pip_config: - vae_import_path = pip_config["vae"] + if "vae" in config: + vae_import_path = config["vae"] vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}") vae_config = vae_class.load_config(model, subfolder="vae", **kwargs) shapes["num_channels"] = vae_config["out_channels"] shapes["height"] = vae_config["sample_size"] shapes["width"] = vae_config["sample_size"] - elif "vae_encoder" in pip_config: - vae_import_path = pip_config["vae_encoder"] + elif "vae_encoder" in config: + vae_import_path = config["vae_encoder"] vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}") vae_config = vae_class.load_config(model, subfolder="vae", **kwargs) shapes["num_channels"] = vae_config["out_channels"] diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index 092affff..dd2a7a82 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -4,22 +4,19 @@ from logging import getLogger from tempfile import TemporaryDirectory +from ...generators.dataset_generator import DatasetGenerator +from ..transformers_utils import randomize_weights +from .utils import TASKS_TO_INCMODELS +from .config import INCConfig +from ..base import Backend + import torch from hydra.utils import get_class from transformers.utils import ModelOutput from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error from 
optimum.intel.neural_compressor.quantization import INCQuantizer -from neural_compressor.config import ( - PostTrainingQuantConfig, - AccuracyCriterion, - TuningCriterion, -) - -from ...generators.dataset_generator import DatasetGenerator -from .utils import TASKS_TO_INCMODELS -from .config import INCConfig -from ..base import Backend +from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion, TuningCriterion # disable transformers logging set_verbosity_error() @@ -34,9 +31,7 @@ def __init__(self, config: INCConfig): super().__init__(config) self.validate_task() - self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) - LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}") - + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.ptq_quantization: @@ -52,57 +47,65 @@ def __init__(self, config: INCConfig): else: self.load_incmodel_from_pretrained() - self.tmpdir.cleanup() - def validate_task(self) -> None: if self.config.task not in TASKS_TO_INCMODELS: raise NotImplementedError(f"INCBackend does not support task {self.config.task}") + self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) + LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}") + def load_automodel_from_pretrained(self) -> None: LOGGER.info("\t+ Loading AutoModel from pretrained") self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) - def load_automodel_with_no_weights(self) -> None: - no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + def create_no_weights_model(self) -> None: + LOGGER.info("\t+ Creating no weights model state_dict") + state_dict = torch.nn.Linear(1, 1).state_dict() - if not os.path.exists(no_weights_model): - LOGGER.info("\t+ Creating no weights model directory") - os.makedirs(no_weights_model) + LOGGER.info("\t+ Creating no weights model directory") + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + os.makedirs(self.no_weights_model, exist_ok=True) - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=no_weights_model) + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model") - state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model state_dict") + torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) - LOGGER.info("\t+ Saving no weights model") - torch.save(state_dict, os.path.join(no_weights_model, "pytorch_model.bin")) + def load_automodel_with_no_weights(self) -> None: + self.create_no_weights_model() - LOGGER.info("\t+ Loading no weights model") with no_init_weights(): original_model = self.config.model - self.config.model = no_weights_model + self.config.model = self.no_weights_model + LOGGER.info("\t+ Loading no weights model") self.load_automodel_from_pretrained() self.config.model = original_model + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model) + LOGGER.info("\t+ Tying model weights") + self.pretrained_model.tie_weights() + def load_incmodel_from_pretrained(self) -> None: LOGGER.info("\t+ Loading INCModel from pretrained") self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) def load_incmodel_with_no_weights(self) -> None: - no_weights_model = 
os.path.join(self.tmpdir.name, "no_weights") - - LOGGER.info("\t+ Loading AutoModel with no weights") - self.load_automodel_with_no_weights() - self.delete_pretrained_model() + self.create_no_weights_model() - LOGGER.info("\t+ Loading INCModel with no weights") with no_init_weights(): original_model = self.config.model - self.config.model = no_weights_model + self.config.model = self.no_weights_model + LOGGER.info("\t+ Loading no weights model") self.load_incmodel_from_pretrained() self.config.model = original_model + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model.model) + LOGGER.info("\t+ Tying model weights") + self.pretrained_model.model.tie_weights() + def quantize_automodel(self) -> None: LOGGER.info("\t+ Attempting to quantize model") quantized_model_path = f"{self.tmpdir.name}/quantized" @@ -134,7 +137,7 @@ def quantize_automodel(self) -> None: task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes, - ).generate() + )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) else: @@ -169,6 +172,7 @@ def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 0801b000..07d5d860 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -1,16 +1,22 @@ import gc import os from logging import getLogger +from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List +from ..base import Backend +from .config import ORTConfig +from ...task_utils import TEXT_GENERATION_TASKS +from ...generators.dataset_generator import DatasetGenerator +from .utils import format_calibration_config, format_quantization_config, TASKS_TO_ORTMODELS, TASKS_TO_ORTSD + import torch from datasets import Dataset from hydra.utils import get_class from onnxruntime import SessionOptions from safetensors.torch import save_file -from transformers.utils import ModelOutput -from transformers import TrainerCallback, TrainerState +from transformers import TrainerCallback from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error from optimum.onnxruntime.configuration import ( @@ -24,19 +30,10 @@ from optimum.onnxruntime import ( ONNX_DECODER_WITH_PAST_NAME, ONNX_DECODER_NAME, + ORTTrainingArguments, ORTOptimizer, ORTQuantizer, -) - -from ...generators.dataset_generator import DatasetGenerator -from ...task_utils import TEXT_GENERATION_TASKS -from .config import ORTConfig -from ..base import Backend -from .utils import ( - format_calibration_config, - format_quantization_config, - TASKS_TO_ORTMODELS, - TASKS_TO_ORTSD, + ORTTrainer, ) # disable transformers logging @@ -61,15 +58,19 @@ def __init__(self, config: ORTConfig) -> None: else: raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") - self.set_session_options() + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() + self.session_options = SessionOptions() + for key, value in self.config.session_options.items(): + setattr(self.session_options, key, value) + if self.config.no_weights: 
self.load_ortmodel_with_no_weights() else: self.load_ortmodel_from_pretrained() - if self.is_deferred_trt_loading(): + if self.is_trt_text_generation: return if self.is_optimized or self.is_quantized: @@ -99,35 +100,30 @@ def validate_provider(self) -> None: self.pretrained_model.providers[0] == self.config.provider ), f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" - def is_deferred_trt_loading(self) -> bool: - return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS - - def set_session_options(self) -> None: - self.session_options = SessionOptions() - for key, value in self.config.session_options.items(): - setattr(self.session_options, key, value) - - def load_ortmodel_with_no_weights(self) -> None: + def create_no_weights_model(self) -> None: LOGGER.info("\t+ Creating no weights model directory") - no_weights_model = os.path.join(self.tmpdir.name, "no_weights") - os.makedirs(no_weights_model, exist_ok=True) + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + os.makedirs(self.no_weights_model, exist_ok=True) LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=no_weights_model) + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model weights") + LOGGER.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() - LOGGER.info("\t+ Saving no weights model weights") + LOGGER.info("\t+ Saving no weights model state dict") save_file( - filename=os.path.join(no_weights_model, "model.safetensors"), + filename=os.path.join(self.no_weights_model, "model.safetensors"), metadata={"format": "pt"}, tensors=state_dict, ) + def load_ortmodel_with_no_weights(self) -> None: + self.create_no_weights_model() + with no_init_weights(): original_model = self.config.model - self.config.model = no_weights_model + self.config.model = self.no_weights_model LOGGER.info("\t+ Loading no weights model") self.load_ortmodel_from_pretrained() self.config.model = original_model @@ -144,6 +140,10 @@ def load_ortmodel_from_pretrained(self) -> None: **self.ortmodel_kwargs, ) + @property + def is_trt_text_generation(self) -> bool: + return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS + @property def is_optimized(self) -> bool: return (self.config.auto_optimization is not None) or self.config.optimization @@ -252,7 +252,7 @@ def quantize_onnx_files(self) -> None: task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes, - ).generate() + )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) @@ -314,7 +314,7 @@ def quantize_onnx_files(self) -> None: self.config.model = quantized_model_path def prepare_for_inference(self, **kwargs) -> None: - if self.is_deferred_trt_loading(): + if self.is_trt_text_generation: LOGGER.info("\t+ Creating dynamic shapes for Tensorrt engine. 
Engine creation might take a while.") batch_size = kwargs["batch_size"] max_new_tokens = kwargs["max_new_tokens"] @@ -353,21 +353,22 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - return self.pretrained_model(**inputs, **kwargs) + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model.forward(**inputs, **kwargs) - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate(**inputs, **kwargs) + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model(**inputs, **kwargs) + def train( self, training_dataset: Dataset, training_arguments: Dict[str, Any], training_callbacks: List[TrainerCallback], training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], - ) -> TrainerState: - from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments - + ) -> None: LOGGER.info("\t+ Setting dataset format to `torch`") training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) LOGGER.info("\t+ Wrapping training arguments with optimum.onnxruntime.ORTTrainingArguments") @@ -384,13 +385,11 @@ def train( trainer.train() LOGGER.info("\t+ Training finished successfully") - return trainer.state - def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): - LOGGER.info("\t+ Cleaning temporary directory") + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 0f9262cc..e0191b88 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -38,6 +38,7 @@ class ORTConfig(BackendConfig): version: Optional[str] = onnxruntime_version() _target_: str = "optimum_benchmark.backends.onnxruntime.backend.ORTBackend" + # load options no_weights: bool = False # export options diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index 4140b973..73cbd63d 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -3,26 +3,25 @@ import inspect from typing import Any, Dict from logging import getLogger +from collections import OrderedDict from tempfile import TemporaryDirectory +from ..base import Backend +from .config import OVConfig +from .utils import TASKS_TO_OVMODEL +from ...task_utils import TEXT_GENERATION_TASKS +from ..transformers_utils import randomize_weights +from ...generators.dataset_generator import DatasetGenerator + import torch from hydra.utils import get_class from openvino.runtime import properties from safetensors.torch import save_file from optimum.intel.openvino import OVQuantizer from transformers.modeling_utils import no_init_weights -from transformers.utils import ModelOutput from transformers.utils.logging import set_verbosity_error from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict -from ..base import Backend -from .config import OVConfig -from .utils import TASKS_TO_OVMODEL -from ...task_utils import TEXT_GENERATION_TASKS -from ..transformers_utils import randomize_weights -from ...generators.dataset_generator import 
DatasetGenerator - - # disable transformers logging set_verbosity_error() @@ -149,7 +148,11 @@ def quantize_automodel(self) -> None: "sequence_length": 1, **self.model_shapes, } - calibration_dataset = DatasetGenerator(task=self.config.task, dataset_shapes=dataset_shapes).generate() + calibration_dataset = DatasetGenerator( + task=self.config.task, + dataset_shapes=dataset_shapes, + model_shapes=self.model_shapes, + )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) else: @@ -196,12 +199,15 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - return self.pretrained_model(**inputs, **kwargs) + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model.forward(**inputs, **kwargs) - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate(**inputs, **kwargs) + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model(**inputs, **kwargs) + def clean(self) -> None: super().clean() diff --git a/optimum_benchmark/backends/peft_utils.py b/optimum_benchmark/backends/peft_utils.py index 695e602c..1a367120 100644 --- a/optimum_benchmark/backends/peft_utils.py +++ b/optimum_benchmark/backends/peft_utils.py @@ -13,7 +13,6 @@ PromptLearningConfig, ) - PEFT_TASKS_TYPES = [ "SEQ_CLS", "SEQ_2_SEQ_LM", diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index cadfe878..268f4306 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -1,27 +1,23 @@ import gc import os from logging import getLogger +from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List +from ..base import Backend +from .config import PyTorchConfig +from ..peft_utils import get_peft_config_class +from ..transformers_utils import randomize_weights +from ...import_utils import is_deepspeed_available, is_peft_available + import torch from datasets import Dataset from safetensors.torch import save_file -from transformers.utils import ModelOutput import datasets.utils.logging as datasets_logging -from transformers import TrainerCallback, TrainerState from transformers.modeling_utils import no_init_weights import transformers.utils.logging as transformers_logging - -from ..base import Backend -from .config import PyTorchConfig -from ..peft_utils import get_peft_config_class -from ..transformers_utils import TransformersDataParallel, randomize_weights -from ...import_utils import ( - is_deepspeed_available, - is_peft_available, -) - +from transformers import TrainerCallback, TrainerState, Trainer, TrainingArguments if is_peft_available(): from peft import get_peft_model @@ -38,21 +34,13 @@ class PyTorchBackend(Backend[PyTorchConfig]): - NAME: str = "pytorch" + NAME = "pytorch" def __init__(self, config: PyTorchConfig): super().__init__(config) + self.validate_library() - if self.config.library == "timm": - LOGGER.info("\t+ Using method timm.create_model") - else: - automodel = self.automodel_class.__name__ - if self.config.library == "diffusers": - LOGGER.info(f"\t+ Using 
Pipeline class {automodel}") - else: - LOGGER.info(f"\t+ Using AutoModel class {automodel}") - - # Threading options + # Threads if self.config.inter_op_num_threads is not None: LOGGER.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") torch.set_num_threads(self.config.inter_op_num_threads) @@ -60,18 +48,23 @@ def __init__(self, config: PyTorchConfig): LOGGER.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))") torch.set_num_interop_threads(self.config.intra_op_num_threads) - # Dtypes options - self.amp_dtype = getattr(torch, self.config.amp_dtype) if self.config.amp_dtype is not None else None + # Mixed precision + if self.config.amp_dtype: + LOGGER.info(f"\t+ Setting mixed precision dtype to {self.config.amp_dtype}") + self.amp_dtype = getattr(torch, self.config.amp_dtype) + else: + self.amp_dtype = None + # Quantization if self.is_quantized: LOGGER.info("\t+ Processing quantization config") self.process_quantization_config() else: self.quantization_config = None + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - # Load model if self.config.no_weights and self.config.library == "diffusers": raise ValueError("Diffusion pipelines are not supported with no_weights=True") elif self.config.no_weights: @@ -81,8 +74,12 @@ def __init__(self, config: PyTorchConfig): LOGGER.info("\t+ Loading model with pretrained weights") self.load_model_from_pretrained() + if self.config.cache_implementation is not None: + LOGGER.info(f"\t+ Setting cache implementation to {self.config.cache_implementation}") + self.pretrained_model.generation_config.cache_implementation = self.config.cache_implementation + # Eval mode - if self.config.eval_mode and not self.config.library == "diffusers": + if self.config.eval_mode and self.config.library != "diffusers": LOGGER.info("\t+ Turning on model's eval mode") self.pretrained_model.eval() @@ -91,7 +88,7 @@ def __init__(self, config: PyTorchConfig): LOGGER.info("\t+ Enabling BetterTransformer") self.pretrained_model.to_bettertransformer() - # Compile model + # Torch compile if self.config.torch_compile: if self.config.library == "diffusers": LOGGER.info("\t+ Using torch.compile on unet forward pass") @@ -115,18 +112,21 @@ def __init__(self, config: PyTorchConfig): if self.config.deepspeed_inference: LOGGER.info("\t+ Using DeepSpeed-Inference") - self.pretrained_model = init_inference( self.pretrained_model, config=self.config.deepspeed_inference_config, dtype=getattr(self.pretrained_model, "dtype", None), ) - if self.config.data_parallel: - LOGGER.info("\t+ Using TransformersDataParallel") - self.pretrained_model = TransformersDataParallel(self.pretrained_model) - - self.tmpdir.cleanup() + def validate_library(self) -> None: + if self.config.library == "timm": + LOGGER.info(f"\t+ Using Timm method {self.automodel_class.__name__}") + elif self.config.library == "diffusers": + LOGGER.info(f"\t+ Using Pipeline class {self.automodel_class.__name__}") + elif self.config.library == "transformers": + LOGGER.info(f"\t+ Using AutoModel class {self.automodel_class.__name__}") + else: + raise ValueError(f"Library {self.config.library} not supported") def load_model_from_pretrained(self) -> None: if self.config.library == "timm": @@ -138,8 +138,8 @@ def load_model_from_pretrained(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, device_map=self.config.device_map, - **self.automodel_kwargs, 
**self.config.hub_kwargs, + **self.automodel_kwargs, ) if self.config.device_map is None: LOGGER.info(f"\t+ Moving pipeline to device: {self.config.device}") @@ -148,7 +148,6 @@ def load_model_from_pretrained(self) -> None: LOGGER.info("\t+ Loading BnB quantized model") self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, - low_cpu_mem_usage=self.config.low_cpu_mem_usage, device_map=self.config.device_map, **self.config.hub_kwargs, **self.automodel_kwargs, @@ -158,10 +157,8 @@ def load_model_from_pretrained(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, # for gptq, we need to specify the device_map to either auto - # or a cuda adevice to avoid any modules being assigned to cpu + # or a cuda adevice to avoid any modules being assigned to cpu ¯\_(ツ)_/¯ device_map=self.config.device_map or torch.device(self.config.device), - # this avoids unnecessary memory usage when loading quantized models - low_cpu_mem_usage=self.config.low_cpu_mem_usage, **self.config.hub_kwargs, **self.automodel_kwargs, ) @@ -175,39 +172,39 @@ def load_model_from_pretrained(self) -> None: ) else: # this is the fastest way to load a model on a specific device + # but not compatible with all quantization methods (and pipelines) LOGGER.info(f"\t+ Loading model directly on device: {self.config.device}") with torch.device(self.config.device): self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, - **self.automodel_kwargs, **self.config.hub_kwargs, + **self.automodel_kwargs, ) def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") - - LOGGER.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - - if self.is_quantized: - # tricking from_pretrained to load the model as if it was quantized - self.pretrained_config.quantization_config = self.quantization_config.to_dict() - - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model state_dict") state_dict = torch.nn.Linear(1, 1).state_dict() if self.is_exllamav2: - # for exllamav2 we need to add g_idx to the state_dict + # for exllamav2 we need to add g_idx to the state_dict which + # requires some information about linear layers dimensions with torch.device("meta"): meta_model = self.automodel_class.from_config(self.pretrained_config) - for name, module in meta_model.named_modules(): if hasattr(module, "in_features"): state_dict[name + ".g_idx"] = torch.ones((module.in_features,), dtype=torch.int32) if self.is_quantized: + # tricking from_pretrained to load the model as if it was quantized + self.pretrained_config.quantization_config = self.quantization_config.to_dict() + + LOGGER.info("\t+ Creating no weights model directory") + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") + os.makedirs(self.no_weights_model, exist_ok=True) + + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + LOGGER.info("\t+ Saving no weights model state_dict") save_file( filename=os.path.join(self.no_weights_model, "model.safetensors"), @@ -292,10 +289,9 @@ def is_awq_quantized(self) -> bool: @property def is_exllamav2(self) -> bool: return ( - self.is_quantized - and
self.is_gptq_quantized - and "exllama_config" in self.config.quantization_config - and self.config.quantization_config["exllama_config"]["version"] == 2 + self.is_gptq_quantized + and "exllama_config" in self.quantization_config + and self.quantization_config["exllama_config"].get("version", None) == 2 ) @property @@ -305,12 +301,14 @@ def automodel_kwargs(self) -> Dict[str, Any]: if self.config.torch_dtype is not None: kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype) - if self.config.use_flash_attention_2: - kwargs["use_flash_attention_2"] = True + if self.config.attn_implementation is not None: + kwargs["attn_implementation"] = self.config.attn_implementation - if self.is_gptq_quantized or self.is_bnb_quantized: - # awq quantization doesn't support overriding the quantization - # config by passing quantization_config to from_pretrained + if self.config.low_cpu_mem_usage is not None: + kwargs["low_cpu_mem_usage"] = self.config.low_cpu_mem_usage + + if self.is_quantized: + kwargs["_fast_init"] = False kwargs["quantization_config"] = self.quantization_config return kwargs @@ -329,24 +327,19 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs @torch.inference_mode() - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - if self.config.library == "diffusers": - return self.pretrained_model(**inputs, **kwargs) - - if self.config.amp_autocast: - with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype): - return self.pretrained_model(**inputs, **kwargs) - else: - return self.pretrained_model(**inputs, **kwargs) + def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype, enabled=self.config.amp_autocast): + return self.pretrained_model.forward(**inputs, **kwargs) @torch.inference_mode() - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: - if self.config.amp_autocast: - with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype): - return self.pretrained_model.generate(**inputs, **kwargs) - else: + def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + with torch.autocast(device_type=self.config.device, dtype=self.amp_dtype, enabled=self.config.amp_autocast): return self.pretrained_model.generate(**inputs, **kwargs) + @torch.inference_mode() + def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + return self.pretrained_model(**inputs, **kwargs) + def train( self, training_dataset: Dataset, @@ -354,16 +347,12 @@ def train( training_callbacks: List[TrainerCallback], training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], ) -> TrainerState: - from transformers import Trainer, TrainingArguments - - LOGGER.info("\t+ Setting dataset format to `torch`") - training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) LOGGER.info("\t+ Wrapping training arguments with transformers.TrainingArguments") training_arguments = TrainingArguments(**training_arguments) LOGGER.info("\t+ Wrapping model with transformers.Trainer") trainer = Trainer( - model=self.pretrained_model, args=training_arguments, + model=self.pretrained_model, callbacks=training_callbacks, train_dataset=training_dataset, data_collator=training_data_collator, @@ -372,8 +361,6 @@ def train( trainer.train() LOGGER.info("\t+ Training finished successfully") - return trainer.state - def seed(self): 
super().seed() torch.manual_seed(self.config.seed) @@ -385,7 +372,7 @@ def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): - LOGGER.info("\t+ Cleaning temporary directory") + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index 1cbb04ba..d8089f60 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -42,9 +42,10 @@ class PyTorchConfig(BackendConfig): # optimization options eval_mode: bool = True - low_cpu_mem_usage: bool = False to_bettertransformer: bool = False - use_flash_attention_2: bool = False + low_cpu_mem_usage: Optional[bool] = None + attn_implementation: Optional[str] = None + cache_implementation: Optional[str] = None # compilation options torch_compile: bool = False @@ -55,7 +56,6 @@ class PyTorchConfig(BackendConfig): quantization_config: Dict[str, Any] = field(default_factory=dict) # distributed inference options - data_parallel: bool = False deepspeed_inference: bool = False deepspeed_inference_config: Dict[str, Any] = field(default_factory=dict) diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index 43a5fd75..7c86adeb 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,13 +1,13 @@ from logging import getLogger from typing import Any, Dict -from hydra.utils import get_class -from transformers.utils import ModelOutput - from ..base import Backend from .config import TRTLLMConfig from .utils import MODEL_TYPE_TO_TRTLLMMODEL +from hydra.utils import get_class +from transformers.utils import ModelOutput + LOGGER = getLogger("tensorrt-llm") @@ -18,15 +18,15 @@ def __init__(self, config: TRTLLMConfig): super().__init__(config) self.validate_model_type() - self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) - LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") - self.load_trtmodel_from_pretrained() def validate_model_type(self) -> None: if self.model_type not in MODEL_TYPE_TO_TRTLLMMODEL: raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.model_type}") + self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) + LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") + def load_trtmodel_from_pretrained(self) -> None: self.pretrained_model = self.trtmodel_class.from_pretrained( self.config.model, diff --git a/optimum_benchmark/backends/text_generation_inference/backend.py b/optimum_benchmark/backends/text_generation_inference/backend.py index fbd3d1de..538de53c 100644 --- a/optimum_benchmark/backends/text_generation_inference/backend.py +++ b/optimum_benchmark/backends/text_generation_inference/backend.py @@ -6,6 +6,11 @@ from tempfile import TemporaryDirectory from concurrent.futures import ThreadPoolExecutor +from ..base import Backend +from .config import TGIConfig +from ...task_utils import TEXT_GENERATION_TASKS +from ..transformers_utils import randomize_weights + import torch import docker import docker.types @@ -14,10 +19,6 @@ from huggingface_hub import InferenceClient, snapshot_download from huggingface_hub.inference._text_generation import TextGenerationResponse -from ..base import Backend -from .config import TGIConfig -from ..transformers_utils import randomize_weights - # bachend logger LOGGER = 
getLogger("text-generation-inference") @@ -29,8 +30,7 @@ def __init__(self, config: TGIConfig) -> None: super().__init__(config) self.validate_task() - LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}") - + LOGGER.info("\t+ Creating backend temporary directory") self.tmp_dir = TemporaryDirectory() if self.config.no_weights: @@ -40,9 +40,11 @@ def __init__(self, config: TGIConfig) -> None: self.load_model_from_pretrained() def validate_task(self) -> None: - if self.config.task not in ["text-generation", "text2text-generation"]: + if self.config.task not in TEXT_GENERATION_TASKS: raise NotImplementedError(f"TGI does not support task {self.config.task}") + LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}") + def download_pretrained_model(self) -> None: LOGGER.info("\t+ Downloading pretrained model") snapshot_download(self.config.model, **self.config.hub_kwargs) @@ -93,7 +95,7 @@ def create_no_weights_model(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained( self.no_weights_model, **self.config.hub_kwargs, - device_map="auto", + device_map="auto", # for faster/safer loading ) LOGGER.info("\t+ Randomizing weights") diff --git a/optimum_benchmark/backends/text_generation_inference/config.py b/optimum_benchmark/backends/text_generation_inference/config.py index edf37ba3..8b73617e 100644 --- a/optimum_benchmark/backends/text_generation_inference/config.py +++ b/optimum_benchmark/backends/text_generation_inference/config.py @@ -11,6 +11,9 @@ class TGIConfig(BackendConfig): version: Optional[str] = "0.0.1" _target_: str = "optimum_benchmark.backends.text_generation_inference.backend.TGIBackend" + # optimum benchmark specific + no_weights: bool = False + # docker options image: str = "ghcr.io/huggingface/text-generation-inference:latest" volume: str = f"{os.path.expanduser('~')}/.cache/huggingface/hub" @@ -28,9 +31,6 @@ class TGIConfig(BackendConfig): sharded: Optional[bool] = None # None, True, False num_shard: Optional[int] = None # None, 1, 2, 4, 8, 16, 32, 64 - # optimum benchmark specific - no_weights: bool = False # True, False - def __post_init__(self): super().__post_init__() diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 3af970a3..9e2924b2 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -1,22 +1,18 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional -from transformers import PretrainedConfig +from ..import_utils import is_timm_available, is_transformers_available, is_torch_available -from ..import_utils import is_timm_available +if is_torch_available(): + import torch if is_timm_available(): import timm - -def get_timm_pretrained_processor(model: str) -> Any: - try: - pretrained_config = get_timm_pretrained_config(model) - return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config)) - except Exception: - return None +if is_transformers_available(): + from transformers import PretrainedConfig -def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: +def get_timm_pretrained_config(model_name: str) -> "PretrainedConfig": model_source, model_name = timm.models.parse_model_name(model_name) if model_source == "hf-hub": # For model names specified in the form `hf-hub:path/architecture_name@revision`, @@ -27,13 +23,22 @@ def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: return timm.get_pretrained_cfg(model_name) -def 
extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: - shapes = {} +def get_timm_pre_processor(model: str) -> Optional["torch.nn.Module"]: + try: + pretrained_config = get_timm_pretrained_config(model) + return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config)) + except Exception: + return None + + +def extract_timm_shapes_from_config(config: "PretrainedConfig") -> Dict[str, Any]: artifacts_dict = {} config_dict = {k: v for k, v in config.to_dict().items() if v is not None} artifacts_dict.update(config_dict) + shapes = {} + # image input shapes["num_channels"] = artifacts_dict.get("num_channels", None) if shapes["num_channels"] is None: diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index aefce8ea..a7515d2f 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -4,6 +4,11 @@ from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List +from ..transformers_utils import randomize_weights +from ..peft_utils import get_peft_config_class +from .config import TorchORTConfig +from ..base import Backend + import torch from datasets import Dataset from safetensors.torch import save_file @@ -12,11 +17,6 @@ from transformers.utils.logging import set_verbosity_error from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments -from ..transformers_utils import randomize_weights -from ..peft_utils import get_peft_config_class -from .config import TorchORTConfig -from ..base import Backend - # disable transformers logging set_verbosity_error() @@ -28,9 +28,9 @@ class TorchORTBackend(Backend[TorchORTConfig]): def __init__(self, config: TorchORTConfig): super().__init__(config) + self.validate_library() - LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}") - + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: @@ -46,7 +46,11 @@ def __init__(self, config: TorchORTConfig): peft_config = peft_config_class(**self.config.peft_config) self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config) - self.tmpdir.cleanup() + def validate_library(self) -> None: + if self.config.library == "transformers": + LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}") + else: + raise NotImplementedError(f"TorchORTBackend does not support {self.config.library} library") def create_no_weights_model(self) -> None: LOGGER.info("\t+ Creating no weights model directory") @@ -76,9 +80,9 @@ def load_automodel_with_no_weights(self) -> None: self.load_automodel_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing weights") + LOGGER.info("\t+ Randomizing model weights") randomize_weights(self.pretrained_model) - LOGGER.info("\t+ Tying model weights after randomization") + LOGGER.info("\t+ Tying model weights") self.pretrained_model.tie_weights() def load_automodel_from_pretrained(self) -> None: @@ -126,7 +130,7 @@ def clean(self) -> None: super().clean() if hasattr(self, "tmpdir"): - LOGGER.info("\t+ Cleaning temporary directory") + LOGGER.info("\t+ Cleaning backend temporary directory") self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 488adca5..1d7ad410 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,54 +1,49 
@@ import os -import threading -from itertools import chain -from typing import Any, Dict, List, Optional, Sequence, Union, cast - -import torch -from torch.nn.modules import Module -from torch.cuda.amp import autocast -from torch._utils import ExceptionWrapper -from torch.cuda._utils import _get_device_index -from torch.nn.parallel.parallel_apply import get_a_var -from transformers import ( - FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, - GenerationConfig, - PretrainedConfig, - ProcessorMixin, - AutoProcessor, - AutoConfig, -) +from typing import Any, Dict, Optional, Union + +from ..import_utils import is_transformers_available, is_torch_available + +if is_torch_available(): + import torch + +if is_transformers_available(): + from transformers import ( + FeatureExtractionMixin, + ImageProcessingMixin, + PreTrainedTokenizer, + GenerationConfig, + PretrainedConfig, + ProcessorMixin, + AutoProcessor, + AutoConfig, + ) -PretrainedProcessor = Union[ - FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, - ProcessorMixin, -] + PretrainedProcessor = Union[ + FeatureExtractionMixin, + ImageProcessingMixin, + PreTrainedTokenizer, + ProcessorMixin, + ] -def get_transformers_cache_dir(): +def get_transformers_cache_dir() -> str: return os.path.expanduser("~/.cache/huggingface/hub") -def get_transformers_generation_config(model: str, **kwargs: Dict[str, Any]): - try: - # sometimes contains information about the model's input shapes that are not available in the config - return GenerationConfig.from_pretrained(model, **kwargs) - except Exception: - return None +def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig": + # sometimes contains information about the model's input shapes that are not available in the config + return AutoConfig.from_pretrained(model, **kwargs) -def get_transformers_pretrained_config(model: str, **kwargs: Dict[str, Any]): +def get_transformers_generation_config(model: str, **kwargs) -> Optional["GenerationConfig"]: try: # sometimes contains information about the model's input shapes that are not available in the config - return AutoConfig.from_pretrained(model, **kwargs) - except ValueError: + return GenerationConfig.from_pretrained(model, **kwargs) + except Exception: return None -def get_transformers_pretrained_processor(model: str, **kwargs: Dict[str, Any]): +def get_transformers_pre_processor(model: str, **kwargs) -> Optional["PretrainedProcessor"]: try: # sometimes contains information about the model's input shapes that are not available in the config return AutoProcessor.from_pretrained(model, **kwargs) @@ -57,9 +52,9 @@ def get_transformers_pretrained_processor(model: str, **kwargs: Dict[str, Any]): def extract_transformers_shapes_from_artifacts( - config: PretrainedConfig, processor: Optional[PretrainedProcessor] = None + config: "PretrainedConfig", + processor: Optional["PretrainedProcessor"] = None, ) -> Dict[str, Any]: - shapes = {} artifacts_dict = {} config_dict = {k: v for k, v in config.to_dict().items() if v is not None} @@ -68,6 +63,10 @@ def extract_transformers_shapes_from_artifacts( if processor is not None and hasattr(processor, "to_dict"): processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} artifacts_dict.update(processor_dict) + elif processor is not None: + processor_dict = {k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)} + + shapes = {} # text input shapes["vocab_size"] = artifacts_dict.get("vocab_size", 
None) @@ -126,142 +125,20 @@ def extract_transformers_shapes_from_artifacts( return shapes -def randomize_weights(model): +def randomize_weights(model: "torch.nn.Module") -> None: for param in model.parameters(): - if param.data.dtype in (torch.float32, torch.float16, torch.bfloat16): - if torch.cuda.is_available() and param.device.type == "cpu": + if param.data.is_floating_point(): + if torch.cuda.is_available() and param.device.type != "cuda": param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - elif torch.backends.mps.is_available() and param.device.type == "cpu": - param.data.mps_normal_(mean=0.0, std=0.2) + elif torch.backends.mps.is_available() and param.device.type != "mps": + param.data.to("mps").normal_(mean=0.0, std=0.2).cpu() else: param.data.normal_(mean=0.0, std=0.2) - elif param.data.dtype in (torch.int8, torch.int16, torch.int32, torch.int64): - if torch.cuda.is_available() and param.device.type == "cpu": - param.data.cuda().randint_(low=-127, high=127).cpu() - elif torch.backends.mps.is_available() and param.device.type == "cpu": - param.data.mps_randint_(low=-127, high=127) - else: - param.data.randint_(low=-127, high=127) - - -# adapted from torch to use generate instead of forward -def parallel_generate_apply( - modules: Sequence[Module], - inputs: Sequence[Any], - kwargs_tup: Optional[Sequence[Dict[str, Any]]] = None, - devices: Optional[Sequence[Optional[Union[int, torch.device]]]] = None, -) -> List[Any]: - assert len(modules) == len( - inputs - ), f"The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}" - if kwargs_tup is not None: - assert len(modules) == len(kwargs_tup) - else: - kwargs_tup = (cast(Dict[str, Any], {}),) * len(modules) - if devices is not None: - assert len(modules) == len(devices) - else: - devices = [None] * len(modules) - devices = [_get_device_index(x, True) for x in devices] - streams = [torch.cuda.current_stream(x) for x in devices] - lock = threading.Lock() - results = {} - grad_enabled, autocast_enabled = ( - torch.is_grad_enabled(), - torch.is_autocast_enabled(), - ) - - def _worker( - i: int, - module: Module, - input: Any, - kwargs: Dict[str, Any], - device: Optional[Union[int, torch.device]] = None, - stream: Optional[torch.cuda.Stream] = None, - ) -> None: - torch.set_grad_enabled(grad_enabled) - if device is None: - t = get_a_var(input) - if t is None: - with lock: - results[i] = ExceptionWrapper( - where=f"in replica {i}, no device was provided and no tensor input was found; " - "device cannot be resolved" - ) - return - device = t.get_device() - if stream is None: - stream = torch.cuda.current_stream(device) - try: - with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled): - # this also avoids accidental slicing of `input` if it is a Tensor - if not isinstance(input, (list, tuple)): - input = (input,) - output = module.generate(*input, **kwargs) - with lock: - results[i] = output - except Exception: - with lock: - results[i] = ExceptionWrapper(where=f"in replica {i} on device {device}") - - if len(modules) > 1: - threads = [ - threading.Thread(target=_worker, args=(i, module, input, kwargs, device, stream)) - for i, (module, input, kwargs, device, stream) in enumerate( - zip(modules, inputs, kwargs_tup, devices, streams) - ) - ] - - for thread in threads: - thread.start() - for thread in threads: - thread.join() - else: - _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0]) - - outputs = [] - for i in range(len(inputs)): - output = results[i] 
- if isinstance(output, ExceptionWrapper): - output.reraise() - outputs.append(output) - return outputs - -# adapted from torch to support generate -class TransformersDataParallel(torch.nn.DataParallel): - def generate(self, *inputs: Any, **kwargs: Any) -> Any: - with torch.autograd.profiler.record_function("DataParallel.generate"): - if not self.device_ids: - return self.module.generate(*inputs, **kwargs) - - for t in chain(self.module.parameters(), self.module.buffers()): - if t.device != self.src_device_obj: - raise RuntimeError( - "module must have its parameters and buffers " - f"on device {self.src_device_obj} (device_ids[0]) but found one of " - f"them on device: {t.device}" - ) - - inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids) - # for forward function without any inputs, empty list and dict will be created - # so the module can be executed on one device which is the first one in device_ids - if not inputs and not module_kwargs: - inputs = ((),) - module_kwargs = ({},) - - if len(self.device_ids) == 1: - return self.module.generate(*inputs[0], **module_kwargs[0]) - - replicas = self.replicate(self.module, self.device_ids[: len(inputs)]) - outputs = self.parallel_generate_apply(replicas, inputs, module_kwargs) - return self.gather(outputs, self.output_device) - - def parallel_generate_apply(self, replicas: Sequence, inputs: Sequence, kwargs: Any) -> List[Any]: - return parallel_generate_apply(replicas, inputs, kwargs, self.device_ids[: len(replicas)]) - - def __getattr__(self, name: str) -> Any: - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.module, name) + elif param.data.dtype in (torch.int32, torch.int16, torch.int8): + if torch.cuda.is_available() and param.device.type != "cuda": + param.data.copy_(torch.randint(-127, 127, param.data.shape, device="cuda")) + elif torch.backends.mps.is_available() and param.device.type != "mps": + param.data.copy_(torch.randint(-127, 127, param.data.shape, device="mps")) + else: + param.data.copy_(torch.randint(-127, 127, param.data.shape)) diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index dbc68c3c..84495a1a 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -1,19 +1,17 @@ from abc import ABC from logging import getLogger -from typing import ClassVar, Generic, Dict, Any +from typing import ClassVar, Generic from ..backends.base import Backend +from .report import BenchmarkReport from .config import BenchmarkConfigT - LOGGER = getLogger("benchmark") class Benchmark(Generic[BenchmarkConfigT], ABC): NAME: ClassVar[str] - config: BenchmarkConfigT - def __init__(self, config: BenchmarkConfigT) -> None: LOGGER.info(f"Allocating {self.NAME} benchmark") self.config = config @@ -21,5 +19,5 @@ def __init__(self, config: BenchmarkConfigT) -> None: def run(self, backend: Backend) -> None: raise NotImplementedError("Benchmark must implement run method") - def report(self) -> Dict[str, Any]: - raise NotImplementedError("Benchmark must implement save method") + def get_report(self) -> BenchmarkReport: + raise NotImplementedError("Benchmark must implement report method") diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py index 23e479b4..9cc96ee1 100644 --- a/optimum_benchmark/benchmarks/inference/benchmark.py +++ b/optimum_benchmark/benchmarks/inference/benchmark.py @@ -1,24 +1,28 @@ -import os -import statistics from logging import getLogger 
-from typing import List, Dict, Any +from typing import List, Tuple, Dict from ..base import Benchmark from .config import InferenceConfig -from ...backends.base import Backend from ...trackers.energy import EnergyTracker from ...trackers.memory import MemoryTracker from ...trackers.latency import LatencyTracker +from ...backends.base import Backend, BackendConfigT from ...generators.input_generator import InputGenerator -from ...task_utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS +from ...import_utils import is_torch_distributed_available +from ...task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS +from .report import InferenceReport, TextGenerationReport, ImageDiffusionReport + +if is_torch_distributed_available(): + import torch.distributed LOGGER = getLogger("inference") -DIFFUSION_KWARGS = { +IMAGE_DIFFUSION_KWARGS = { + "num_inference_steps": 30, "num_images_per_prompt": 1, } -GENERATE_KWARGS = { +TEXT_GENERATION_KWARGS = { "num_return_sequences": 1, "max_new_tokens": 100, "min_new_tokens": 100, @@ -36,45 +40,13 @@ class InferenceBenchmark(Benchmark[InferenceConfig]): def __init__(self, config: InferenceConfig) -> None: super().__init__(config) - self.forward_energy: float = 0 - self.forward_emissions: float = 0 - self.forward_max_memory_used: int = 0 - self.forward_max_memory_allocated: int = 0 - self.forward_max_memory_reserved: int = 0 - self.forward_latencies: List[float] = [] - - self.generate_energy: float = 0 - self.generate_emissions: float = 0 - self.generate_max_memory_used: int = 0 - self.generate_max_memory_allocated: int = 0 - self.generate_max_memory_reserved: int = 0 - self.generate_latencies: List[float] = [] - - def run(self, backend: Backend) -> None: - self.can_diffuse = backend.config.task in DIFFUSION_TASKS - self.can_generate = backend.config.task in TEXT_GENERATION_TASKS - - if self.can_diffuse: - LOGGER.info("\t+ Updating forward kwargs with default values") - self.config.forward_kwargs = { - **DIFFUSION_KWARGS, - **self.config.forward_kwargs, - } - if self.can_generate: - LOGGER.info("\t+ Updating generate kwargs with default values") - self.config.generate_kwargs = { - **GENERATE_KWARGS, - **self.config.generate_kwargs, - } - - # compile with static shapes if needed - LOGGER.info("\t+ Preparing backend for inference") - backend.prepare_for_inference( - **backend.model_shapes, - **self.config.input_shapes, - **self.config.forward_kwargs, - **self.config.generate_kwargs, - ) + def run(self, backend: Backend[BackendConfigT]) -> None: + if is_torch_distributed_available() and torch.distributed.is_initialized(): + if self.config.input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: + raise ValueError( + "The batch size must be divisible by the number of processes in a distributed environment" + ) + self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size() LOGGER.info("\t+ Creating input generator") self.input_generator = InputGenerator( @@ -83,226 +55,223 @@ def run(self, backend: Backend) -> None: input_shapes=self.config.input_shapes, ) - # run memory tracking - # we do this first to measure the memory on the first call to forward/generate - if self.config.memory: - self.run_forward_memory_tracking(backend) - if self.can_generate: - self.run_generate_memory_tracking(backend) + if backend.config.task in TEXT_GENERATION_TASKS: + LOGGER.info("\t+ Generating and preparing Text Generation input") + self.forward_inputs = self.input_generator(mode="forward") + self.generate_input = 
self.input_generator(mode="generate") + self.forward_inputs = backend.prepare_inputs(self.forward_inputs) + self.generate_input = backend.prepare_inputs(self.generate_input) + LOGGER.info("\t+ Updating Text Generation kwargs with default values") + self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs} + LOGGER.info("\t+ Initializing Text Generation report") + self.report = TextGenerationReport( + batch_size=self.config.input_shapes["batch_size"], + sequence_length=self.config.input_shapes["sequence_length"], + num_new_tokens=self.config.generate_kwargs["max_new_tokens"], + num_return_sequences=self.config.generate_kwargs["num_return_sequences"], + ) + + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + LOGGER.info("\t+ Generating and preparing Image Diffusion input") + self.diffuse_input = self.input_generator(mode="call") + self.diffuse_input = backend.prepare_inputs(self.diffuse_input) + LOGGER.info("\t+ Updating Image Diffusion kwargs with default values") + self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs} + LOGGER.info("\t+ Initializing Image Diffusion report") + self.report = ImageDiffusionReport( + batch_size=self.config.input_shapes["batch_size"], + num_images_per_prompts=self.config.forward_kwargs["num_images_per_prompt"], + ) + + else: + LOGGER.info("\t+ Generating and preparing Inference input") + self.forward_inputs = self.input_generator(mode="forward") + self.forward_inputs = backend.prepare_inputs(self.forward_inputs) + LOGGER.info("\t+ Initializing Inference report") + self.report = InferenceReport( + batch_size=self.config.input_shapes["batch_size"], + ) + + LOGGER.info("\t+ Preparing backend for Inference") + backend.prepare_for_inference( + **backend.model_shapes, + **self.config.input_shapes, + **self.config.forward_kwargs, + **self.config.generate_kwargs, + ) - # run lacency tracking - self.run_forward_latency_tracking(backend) - if self.can_generate: - self.run_generate_latency_tracking(backend) + LOGGER.info("\t+ Warming up backend for Inference") + for _ in range(self.config.warmup_runs): + if backend.config.task in TEXT_GENERATION_TASKS: + generate_warmup_kwargs = {"max_new_tokens": 2, "min_new_tokens": 2} + _ = backend.generate(self.generate_input, generate_warmup_kwargs) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + diffuse_warmup_kwargs = {"num_inference_steps": 2} + _ = backend.call(self.diffuse_input, diffuse_warmup_kwargs) + else: + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + if self.config.memory: + LOGGER.info("\t+ Creating inference memory tracker") + self.memory_tracker = MemoryTracker( + backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids + ) + if backend.config.task in TEXT_GENERATION_TASKS: + forward_memories_dict, generate_memories_dict = self.run_text_generation_memory_tracking(backend) + self.report.populate_memory(forward_memories_dict, generate_memories_dict) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + call_memories_dict = self.run_image_diffusion_memory_tracking(backend) + self.report.populate_memory(call_memories_dict) + else: + forward_memories_dict = self.run_inference_memory_tracking(backend) + self.report.populate_memory(forward_memories_dict) + + self.report.log_memory() + + if self.config.latency: + LOGGER.info("\t+ Creating inference latency tracker") + self.latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) + if backend.config.task in 
TEXT_GENERATION_TASKS: + forward_latencies_dict, generate_latencies_dict = self.run_text_generation_latency_tracking(backend) + self.report.populate_latency(forward_latencies_dict, generate_latencies_dict) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + call_latencies_dict = self.run_image_diffusion_latency_tracking(backend) + self.report.populate_latency(call_latencies_dict) + else: + forward_latencies_dict = self.run_latency_inference_tracking(backend) + self.report.populate_latency(forward_latencies_dict) + + self.report.log_latency() - # run energy tracking if self.config.energy: - self.run_forward_energy_tracking(backend) - if self.can_generate: - self.run_generate_energy_tracking(backend) + LOGGER.info("\t+ Creating inference energy tracker") + self.energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) + if backend.config.task in TEXT_GENERATION_TASKS: + forward_energies_dict, generate_energies_dict = self.run_text_generation_energy_tracking(backend) + self.report.populate_energy(forward_energies_dict, generate_energies_dict) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + call_energies_dict = self.run_image_diffusion_energy_tracking(backend) + self.report.populate_energy(call_energies_dict) + else: + forward_energies_dict = self.run_inference_energy_tracking(backend) + self.report.populate_energy(forward_energies_dict) - def run_forward_latency_tracking(self, backend: "Backend") -> None: - forward_input = self.input_generator.generate(mode="forward") + self.report.log_energy() - LOGGER.info("\t+ Preparing input for the forward pass") - forward_input = backend.prepare_inputs(forward_input) + ## Memory tracking + def run_text_generation_memory_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + LOGGER.info("\t+ Running memory tracking") + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - LOGGER.info("\t+ Warming up the forward pass") - for _ in range(self.config.warmup_runs): - _ = backend.forward(forward_input, self.config.forward_kwargs) - - LOGGER.info("\t+ Tracking forward pass latency and throughput") - latency_tracker = LatencyTracker(device=backend.config.device, backend=backend.config.name) - while sum(self.forward_latencies) < self.config.duration: - with latency_tracker.track(): - _ = backend.forward(forward_input, self.config.forward_kwargs) - self.forward_latencies = latency_tracker.get_latencies() - - LOGGER.debug(f"\t+ Forward pass latency: {self.forward_latency:.3g} (s)") - LOGGER.debug(f"\t+ Forward pass throughput: {self.forward_throughput:.3g} (samples/s)") - - def run_forward_energy_tracking(self, backend: Backend) -> None: - forward_input = self.input_generator.generate(mode="forward") - - LOGGER.info("\t+ Preparing input for the forward pass") - forward_input = backend.prepare_inputs(forward_input) - - LOGGER.info("\t+ Tracking forward pass energy consumption") - num_forward_passes = 0 - energy_tracker = EnergyTracker() - with energy_tracker.track(interval=1, file_prefix="forward"): - while energy_tracker.get_elapsed_time() < self.config.duration: - _ = backend.forward(forward_input, self.config.forward_kwargs) - num_forward_passes += 1 - num_forward_samples = num_forward_passes * self.config.input_shapes["batch_size"] - self.forward_energy = energy_tracker.get_total_energy() / num_forward_samples - self.forward_emissions = energy_tracker.get_total_emissions() / num_forward_samples - - 
LOGGER.debug(f"\t+ Forward pass energy consumption: {self.forward_energy:.3g} (kWh/sample)") - LOGGER.debug(f"\t+ Forward pass carbon emissions: {self.forward_emissions:.3g} (kgCO2eq/sample)") - LOGGER.debug(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/forward_codecarbon.csv") - - def run_forward_memory_tracking(self, backend: "Backend") -> None: - forward_input = self.input_generator.generate(mode="forward") - - LOGGER.info("\t+ Preparing input for the forward pass") - forward_input = backend.prepare_inputs(forward_input) - - LOGGER.info("\t+ Tracking forward pass peak memory") - memory_tracker = MemoryTracker(device=backend.config.device, backend=backend.config.name) - with memory_tracker.track(): - _ = backend.forward(forward_input, self.config.forward_kwargs) - self.forward_max_memory_used = memory_tracker.get_max_memory_used() - self.forward_max_memory_reserved = memory_tracker.get_max_memory_reserved() - self.forward_max_memory_allocated = memory_tracker.get_max_memory_allocated() - - LOGGER.debug(f"\t+ Forward pass max memory used: {self.forward_max_memory_used:.3g} (MB)") - LOGGER.debug(f"\t+ Forward pass max memory reserved: {self.forward_max_memory_reserved:.3g} (MB)") - LOGGER.debug(f"\t+ Forward pass max memory allocated: {self.forward_max_memory_allocated:.3g} (MB)") - - def run_generate_latency_tracking(self, backend: "Backend") -> None: - generate_input = self.input_generator.generate(mode="generate") - - LOGGER.info("\t+ Preparing input for the generation pass") - generate_input = backend.prepare_inputs(generate_input) - - LOGGER.info("\t+ Warming up the generation pass") - _ = backend.generate(generate_input, self.config.generate_kwargs) - - LOGGER.info("\t+ Tracking generation latency and throughput") - latency_tracker = LatencyTracker(device=backend.config.device, backend=backend.config.name) - while sum(self.generate_latencies) < self.config.duration: - with latency_tracker.track(): - _ = backend.generate(generate_input, self.config.generate_kwargs) - self.generate_latencies = latency_tracker.get_latencies() - - LOGGER.debug(f"\t+ Generation pass latency: {self.generate_latency:.3g} (s)") - LOGGER.debug(f"\t+ Generation pass throughput: {self.generate_throughput:.3g} (tokens/s)") - - def run_generate_energy_tracking(self, backend: Backend) -> None: - generate_input = self.input_generator.generate(mode="generate") - - LOGGER.info("\t+ Preparing input for the generation pass") - generate_input = backend.prepare_inputs(generate_input) - - LOGGER.info("\t+ Tracking generation pass energy consumption") - num_generate_passes = 0 - energy_tracker = EnergyTracker() - with energy_tracker.track(interval=1, file_prefix="generate"): - while energy_tracker.get_elapsed_time() < self.config.duration: - _ = backend.generate(generate_input, self.config.generate_kwargs) - num_generate_passes += 1 - num_generated_tokens = ( - num_generate_passes - * self.config.generate_kwargs["min_new_tokens"] - * self.config.generate_kwargs["num_return_sequences"] - * self.config.input_shapes["batch_size"] - ) - self.generate_energy = energy_tracker.get_total_energy() / num_generated_tokens - self.generate_emissions = energy_tracker.get_total_emissions() / num_generated_tokens - - LOGGER.debug(f"\t+ Generation pass energy consumption: {self.generate_energy:.3g} (kWh/token)") - LOGGER.debug(f"\t+ Generation pass carbon emissions: {self.generate_emissions:.3g} (kgCO2eq/token)") - LOGGER.debug(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/generate_codecarbon.csv") - - def 
run_generate_memory_tracking(self, backend: "Backend") -> None: - generate_input = self.input_generator.generate(mode="generate") - - LOGGER.info("\t+ Preparing input for the generation pass") - generate_input = backend.prepare_inputs(generate_input) - - LOGGER.info("\t+ Tracking generation pass peak memory") - memory_tracker = MemoryTracker(device=backend.config.device, backend=backend.config.name) - with memory_tracker.track(): - _ = backend.generate(generate_input, self.config.generate_kwargs) - self.generate_max_memory_used = memory_tracker.get_max_memory_used() - self.generate_max_memory_reserved = memory_tracker.get_max_memory_reserved() - self.generate_max_memory_allocated = memory_tracker.get_max_memory_allocated() - - LOGGER.debug(f"\t+ Generation pass max memory used: {self.generate_max_memory_used:.3g} (MB)") - LOGGER.debug(f"\t+ Generation pass max memory reserved: {self.generate_max_memory_reserved:.3g} (MB)") - LOGGER.debug(f"\t+ Generation pass max memory allocated: {self.generate_max_memory_allocated:.3g} (MB)") - - # Metrics - ## Forward pass metrics - @property - def forward_latency(self) -> float: - return statistics.mean(self.forward_latencies) - - @property - def forward_throughput(self) -> float: - return self.config.input_shapes["batch_size"] / self.forward_latency - - ## Generation pass metrics - @property - def generate_latency(self) -> float: - return statistics.mean(self.generate_latencies) - - @property - def generate_throughput(self) -> float: - return ( - self.config.generate_kwargs["min_new_tokens"] - * self.config.generate_kwargs["num_return_sequences"] - * self.config.input_shapes["batch_size"] - / self.generate_latency - ) + forward_memories_dict = self.memory_tracker.get_memories_dict() - @property - def decode_latency(self) -> float: - return self.generate_latency - self.forward_latency - - @property - def decode_throughput(self) -> float: - return ( - (self.config.generate_kwargs["min_new_tokens"] - 1) - * self.config.generate_kwargs["num_return_sequences"] - * self.config.input_shapes["batch_size"] - / self.decode_latency - ) + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.generate(self.generate_input, self.config.generate_kwargs) - ## Diffusion pass metrics - @property - def diffusion_throughput(self) -> float: - return ( - self.config.input_shapes["batch_size"] - * self.config.forward_kwargs["num_images_per_prompt"] - / self.forward_latency - ) + generate_memories_dict = self.memory_tracker.get_memories_dict() - def report(self) -> Dict[str, Any]: - report_dict = {} + return forward_memories_dict, generate_memories_dict - report_dict["forward.latency(s)"] = self.forward_latency - report_dict["forward.throughput(samples/s)"] = self.forward_throughput + def run_image_diffusion_memory_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running memory tracking") + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - if self.can_diffuse: - report_dict["diffusion.throughput(images/s)"] = self.diffusion_throughput + call_memories_dict = self.memory_tracker.get_memories_dict() - if self.config.memory: - report_dict["forward.peak_memory(MB)"] = self.forward_max_memory_used - report_dict["forward.max_memory_used(MB)"] = self.forward_max_memory_used - report_dict["forward.max_memory_allocated(MB)"] = self.forward_max_memory_allocated - report_dict["forward.max_memory_reserved(MB)"] = self.forward_max_memory_reserved + return 
call_memories_dict - if self.config.energy: - report_dict["forward.energy_consumption(kWh/sample)"] = self.forward_energy - report_dict["forward.carbon_emissions(kgCO2eq/sample)"] = self.forward_emissions + def run_inference_memory_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running memory tracking") + self.memory_tracker.reset() + with self.memory_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_memories_dict = self.memory_tracker.get_memories_dict() + + return forward_memories_dict + + ## Latency tracking + def run_text_generation_latency_tracking(self, backend: Backend) -> Tuple[List[float], List[float]]: + LOGGER.info("\t+ Running latency tracking") + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_latencies_list = self.latency_tracker.get_latencies_list() + + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.generate(self.generate_input, self.config.generate_kwargs) + + generate_latencies_list = self.latency_tracker.get_latencies_list() + + return forward_latencies_list, generate_latencies_list + + def run_image_diffusion_latency_tracking(self, backend: Backend) -> List[float]: + LOGGER.info("\t+ Running latency tracking") + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + + call_latencies_list = self.latency_tracker.get_latencies_list() + + return call_latencies_list + + def run_latency_inference_tracking(self, backend: Backend) -> List[float]: + LOGGER.info("\t+ Running latency tracking") + self.latency_tracker.reset() + while self.latency_tracker.get_total_latency() < self.config.duration: + with self.latency_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_latencies_list = self.latency_tracker.get_latencies_list() + + return forward_latencies_list + + ## Energy tracking + def run_text_generation_energy_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + LOGGER.info("\t+ Running energy tracking") + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) + + forward_energies_dict = self.energy_tracker.get_energies_dict() + + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.generate(self.generate_input, self.config.generate_kwargs) + + generate_energies_dict = self.energy_tracker.get_energies_dict() + + return forward_energies_dict, generate_energies_dict + + def run_image_diffusion_energy_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running energy tracking") + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + + call_energies_dict = self.energy_tracker.get_energies_dict() - if self.can_generate: - report_dict["generate.latency(s)"] = self.generate_latency - report_dict["generate.throughput(tokens/s)"] = self.generate_throughput + return call_energies_dict - report_dict["decode.latency(s)"] = self.decode_latency - report_dict["decode.throughput(tokens/s)"] = self.decode_throughput + 
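[Editor's note: the run_*_tracking methods introduced above all follow the same reset/track/collect pattern. The standalone sketch below is not the library's tracker implementation; the class name SimpleLatencyTracker and its internals are simplified assumptions, shown only to illustrate how a context-manager-based tracker can accumulate measurements until a configured duration is reached.]

import time
from contextlib import contextmanager
from typing import List


class SimpleLatencyTracker:
    """Minimal stand-in illustrating the reset/track/collect pattern used by the benchmark trackers."""

    def __init__(self) -> None:
        self.latencies: List[float] = []

    def reset(self) -> None:
        # clear measurements from any previous tracking run
        self.latencies = []

    @contextmanager
    def track(self):
        # time the block of code executed inside the `with` statement
        start = time.perf_counter()
        yield
        self.latencies.append(time.perf_counter() - start)

    def get_total_latency(self) -> float:
        return sum(self.latencies)

    def get_latencies_list(self) -> List[float]:
        return list(self.latencies)


# usage, mirroring the shape of run_latency_inference_tracking: keep running the
# workload until the accumulated latency exceeds the configured duration
tracker = SimpleLatencyTracker()
duration = 2.0  # seconds, stands in for self.config.duration
tracker.reset()
while tracker.get_total_latency() < duration:
    with tracker.track():
        time.sleep(0.1)  # stands in for backend.forward(...)
print(tracker.get_latencies_list())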
def run_inference_energy_tracking(self, backend: Backend) -> Dict[str, float]: + LOGGER.info("\t+ Running energy tracking") + self.energy_tracker.reset() + with self.energy_tracker.track(): + _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - if self.config.memory: - report_dict["generate.peak_memory(MB)"] = self.generate_max_memory_used - report_dict["generate.max_memory_used(MB)"] = self.generate_max_memory_used - report_dict["generate.max_memory_allocated(MB)"] = self.generate_max_memory_allocated - report_dict["generate.max_memory_reserved(MB)"] = self.generate_max_memory_reserved + forward_energies_dict = self.energy_tracker.get_energies_dict() - if self.config.energy: - report_dict["generate.energy_consumption(kWh/token)"] = self.generate_energy - report_dict["generate.carbon_emissions(kgCO2eq/token)"] = self.generate_emissions + return forward_energies_dict - return report_dict + def get_report(self) -> InferenceReport: + return self.report diff --git a/optimum_benchmark/benchmarks/inference/callback.py b/optimum_benchmark/benchmarks/inference/callback.py new file mode 100644 index 00000000..4871691d --- /dev/null +++ b/optimum_benchmark/benchmarks/inference/callback.py @@ -0,0 +1,25 @@ +import time + +from ...import_utils import is_torch_available + +from transformers import LogitsProcessor + +if is_torch_available(): + import torch + + +# TODO: use this class for more fine-grained latency measurements in text generation +class MeasurementProcessor(LogitsProcessor): + def __init__(self, device: str, backend: str): + self.device = device + self.backend = backend + + self.latencies = [] + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + """ + Callback to track the time it takes to generate one batch of tokens. 
+ """ + self.latencies.append(time.perf_counter_ns()) + + return scores diff --git a/optimum_benchmark/benchmarks/inference/config.py b/optimum_benchmark/benchmarks/inference/config.py index 1299ca85..d5c4a0bb 100644 --- a/optimum_benchmark/benchmarks/inference/config.py +++ b/optimum_benchmark/benchmarks/inference/config.py @@ -2,33 +2,15 @@ from typing import Any, Dict, Optional from dataclasses import dataclass, field -from ..config import BenchmarkConfig from ...env_utils import is_rocm_system +from ..config import BenchmarkConfig LOGGER = getLogger("inference") INPUT_SHAPES = { - # used with all tasks "batch_size": 2, - # used with text input tasks "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) - "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, -} - -GENERATE_CONFIG = { - "num_return_sequences": 1, - "max_new_tokens": 100, - "min_new_tokens": 100, - "do_sample": False, - "use_cache": True, - "pad_token_id": 0, - "temperature": 1.0, - "num_beams": 1, + "num_choices": 2, } @@ -38,37 +20,73 @@ class InferenceConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark" # benchmark options - duration: int = 10 - warmup_runs: int = 10 + duration: int = field(default=10, metadata={"help": "Minimum duration of the benchmark in seconds"}) + warmup_runs: int = field(default=10, metadata={"help": "Number of warmup runs to perform before benchmarking"}) - # additional/optional metrics - memory: bool = False - energy: bool = False + # input/output shapes + input_shapes: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Input shapes for the model. Missing keys will be filled with default values."}, + ) + new_tokens: Optional[int] = field( + default=None, + metadata={"help": "Deprecated. If set, `max_new_tokens` and `min_new_tokens` will be set to this value."}, + ) - # input options - input_shapes: Dict = field(default_factory=dict) - # output options - new_tokens: Optional[int] = None + # tracking options + energy: bool = field(default=False, metadata={"help": "Measure energy usage"}) + memory: bool = field(default=False, metadata={"help": "Measure max memory usage"}) + latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) - # forward options - forward_kwargs: Dict[str, Any] = field(default_factory=dict) - # generation options - generate_kwargs: Dict[str, Any] = field(default_factory=dict) + # methods kwargs + forward_kwargs: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments to pass to the forward method of the model."}, + ) + generate_kwargs: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments to pass to the generate method of the model."}, + ) + call_kwargs: Dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."}, + ) def __post_init__(self): super().__post_init__() self.input_shapes = {**INPUT_SHAPES, **self.input_shapes} - self.generate_kwargs = {**GENERATE_CONFIG, **self.generate_kwargs} - - if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]: - raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.") if self.new_tokens is not None: + LOGGER.warning( + "`new_tokens` is deprecated. Use `max_new_tokens` and `min_new_tokens` instead. 
" + "Setting `max_new_tokens` and `min_new_tokens` to `new_tokens`." + ) self.generate_kwargs["max_new_tokens"] = self.new_tokens self.generate_kwargs["min_new_tokens"] = self.new_tokens - else: - self.new_tokens = self.generate_kwargs["min_new_tokens"] + + if ( + "max_new_tokens" in self.generate_kwargs + and "min_new_tokens" in self.generate_kwargs + and self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"] + ): + raise ValueError( + "Setting `min_new_tokens` and `max_new_tokens` to different values results in non-deterministic behavior." + ) + + elif "max_new_tokens" in self.generate_kwargs and "min_new_tokens" not in self.generate_kwargs: + LOGGER.warning( + "Setting `max_new_tokens` without `min_new_tokens` results in non-deterministic behavior. " + "Setting `min_new_tokens` to `max_new_tokens`." + ) + self.generate_kwargs["min_new_tokens"] = self.generate_kwargs["max_new_tokens"] + + elif "min_new_tokens" in self.generate_kwargs and "max_new_tokens" not in self.generate_kwargs: + LOGGER.warning( + "Setting `min_new_tokens` without `max_new_tokens` results in non-deterministic behavior. " + "Setting `max_new_tokens` to `min_new_tokens`." + ) + self.generate_kwargs["max_new_tokens"] = self.generate_kwargs["min_new_tokens"] if self.energy and is_rocm_system(): raise ValueError("Energy measurement through codecarbon is not yet available on ROCm-powered devices.") diff --git a/optimum_benchmark/benchmarks/inference/report.py b/optimum_benchmark/benchmarks/inference/report.py new file mode 100644 index 00000000..9cd43cfc --- /dev/null +++ b/optimum_benchmark/benchmarks/inference/report.py @@ -0,0 +1,353 @@ +from dataclasses import dataclass, field +from statistics import mean, stdev +from typing import Any, Dict, List +from logging import getLogger + +from ..report import BenchmarkReport + +LOGGER = getLogger("report") + + +@dataclass +class InferenceReport(BenchmarkReport): + # Config + batch_size: int + # Metrics + forward: Dict[str, Any] = field(default_factory=dict) + + # POPULATING + def populate_latency(self, forward_latencies_list: List[float]): + ## Latency + self.forward["latency"] = { + "list[s]": forward_latencies_list, + "mean(s)": compute_mean(forward_latencies_list), + "stdev(s)": compute_stdev(forward_latencies_list), + } + ## Throughput + forward_throughputs_list = [self.batch_size / latency for latency in forward_latencies_list] + self.forward["throughput"] = { + "list[samples/s]": forward_throughputs_list, + "mean(samples/s)": compute_mean(forward_throughputs_list), + "stdev(samples/s)": compute_stdev(forward_throughputs_list), + } + + def populate_memory(self, forward_memories_dict: Dict[str, Any]): + self.forward["memory"] = forward_memories_dict + + def populate_energy(self, forward_energies_dict: Dict[str, Any]): + self.forward["energy"] = forward_energies_dict + + # LOGGING + def log_latency(self): + for key, value in self.forward["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ forward.latency.{key}: {value:f} (s)") + for key, value in self.forward["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ forward.throughput.{key}: {value:f} (samples/s)") + + def log_memory(self): + for key, value in self.forward["memory"].items(): + LOGGER.info(f"\t+ forward.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.forward["energy"].items(): + LOGGER.info(f"\t+ forward.energy.{key}: {value:f} (kWh)") + + def log_all(self) -> None: + if "latency" in self.forward: + 
self.log_latency() + if "memory" in self.forward: + self.log_memory() + if "energy" in self.forward: + self.log_energy() + + # add operator to aggregate multiple reports + def __add__(self, other: "InferenceReport") -> "InferenceReport": + agg_report = InferenceReport(batch_size=self.batch_size + other.batch_size) + if "latency" in self.forward and "latency" in other.forward: + agg_forward_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.forward["latency"]["list[s]"], other.forward["latency"]["list[s]"]) + ] + agg_report.populate_latency(agg_forward_latencies_list) + + if "memory" in self.forward and "memory" in other.forward: + agg_forward_memories_dict = {} + for key in self.forward["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_forward_memories_dict[key] = max(self.forward["memory"][key], other.forward["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_forward_memories_dict[key] = self.forward["memory"][key] + other.forward["memory"][key] + + agg_report.populate_memory(agg_forward_memories_dict) + + if "energy" in self.forward and "energy" in other.forward: + agg_forward_energies_dict = {} + for key in self.forward["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_forward_energies_dict[key] = self.forward["energy"][key] + other.forward["energy"][key] + + agg_report.populate_energy(agg_forward_energies_dict) + + return agg_report + + +@dataclass +class ImageDiffusionReport(BenchmarkReport): + # Config + batch_size: int + num_images_per_prompts: int + # Metrics + call: Dict[str, Any] = field(default_factory=dict) + + # POPULATING + def populate_latency(self, call_latencies_list: List[float]): + ## Latency + self.call["latency"] = { + "list[s]": call_latencies_list, + "mean(s)": compute_mean(call_latencies_list), + "stdev(s)": compute_stdev(call_latencies_list), + } + ## Throughput + call_throughputs_list = [ + self.batch_size * self.num_images_per_prompts / latency for latency in call_latencies_list + ] + self.call["throughput"] = { + "list[images/s]": call_throughputs_list, + "mean[images/s]": compute_mean(call_throughputs_list), + "stdev[images/s]": compute_stdev(call_throughputs_list), + } + + def populate_memory(self, call_memories_dict: Dict[str, Any]): + self.call["memory"] = call_memories_dict + + def populate_energy(self, call_energies_dict: Dict[str, Any]): + self.call["energy"] = call_energies_dict + + # LOGGING + def log_latency(self): + for key, value in self.call["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ call.latency.{key}: {value:f} (s)") + for key, value in self.call["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ call.throughput.{key}: {value:f} (images/s)") + + def log_memory(self): + for key, value in self.call["memory"].items(): + LOGGER.info(f"\t+ call.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.call["energy"].items(): + LOGGER.info(f"\t+ call.energy.{key}: {value:f} (kWh)") + + def log_all(self) -> None: + if "latency" in self.call: + self.log_latency() + if "memory" in self.call: + self.log_memory() + if "energy" in self.call: + self.log_energy() + + # add operator to aggregate multiple reports + def __add__(self, other: "ImageDiffusionReport") -> "ImageDiffusionReport": + assert self.num_images_per_prompts == other.num_images_per_prompts, "num_images_per_prompts must be the same" + + agg_report = 
ImageDiffusionReport( + batch_size=self.batch_size + other.batch_size, + num_images_per_prompts=self.num_images_per_prompts, + ) + if "latency" in self.call and "latency" in other.call: + agg_call_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.call["latency"]["list[s]"], other.call["latency"]["list[s]"]) + ] + agg_report.populate_latency(agg_call_latencies_list) + + if "memory" in self.call and "memory" in other.call: + agg_call_memories_dict = {} + for key in self.call["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_call_memories_dict[key] = max(self.call["memory"][key], other.call["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_call_memories_dict[key] = self.call["memory"][key] + other.call["memory"][key] + + agg_report.populate_memory(agg_call_memories_dict) + + if "energy" in self.call and "energy" in other.call: + agg_call_energies_dict = {} + for key in self.call["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_call_energies_dict[key] = self.call["energy"][key] + other.call["energy"][key] + + agg_report.populate_energy(agg_call_energies_dict) + + return agg_report + + +@dataclass +class TextGenerationReport(BenchmarkReport): + # Config + batch_size: int + sequence_length: int + num_new_tokens: int + num_return_sequences: int + # Prefill Metrics + prefill: Dict[str, Any] = field(default_factory=dict) + # Decode Metrics + decode: Dict[str, Any] = field(default_factory=dict) + + def populate_latency(self, forward_latencies_list: List[float], generate_latencies_list: List[float]): + ## Latency + self.prefill["latency"] = { + "list[s]": forward_latencies_list, + "mean(s)": compute_mean(forward_latencies_list), + "stdev(s)": compute_stdev(forward_latencies_list), + } + ## Throughput + prefill_throughputs_list = [ + self.batch_size * self.sequence_length / latency for latency in forward_latencies_list + ] + self.prefill["throughput"] = { + "list[tokens/s]": prefill_throughputs_list, + "mean[tokens/s]": compute_mean(prefill_throughputs_list), + "stdev[tokens/s]": compute_stdev(prefill_throughputs_list), + } + ## Latency + decode_latencies_list = [ + generate_latency - self.prefill["latency"]["mean(s)"] for generate_latency in generate_latencies_list + ] + self.decode["latency"] = { + "list[s]": decode_latencies_list, + "mean(s)": compute_mean(decode_latencies_list), + "stdev(s)": compute_stdev(decode_latencies_list), + } + ## Throughput + decode_throughputs_list = [ + self.batch_size * self.num_new_tokens * self.num_return_sequences / latency + for latency in decode_latencies_list + ] + self.decode["throughput"] = { + "list[tokens/s]": decode_throughputs_list, + "mean[tokens/s]": compute_mean(decode_throughputs_list), + "stdev[tokens/s]": compute_stdev(decode_throughputs_list), + } + + def populate_memory(self, forward_memories_dict: Dict[str, Any], generate_memories_dict: Dict[str, Any]): + self.prefill["memory"] = forward_memories_dict + self.decode["memory"] = generate_memories_dict + + def populate_energy(self, forward_energies_dict: Dict[str, Any], generate_energies_dict: Dict[str, Any]): + self.prefill["energy"] = forward_energies_dict + self.decode["energy"] = generate_energies_dict + + # LOGGING + def log_latency(self): + for key, value in self.prefill["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ prefill.latency.{key}: {value:f} (s)") + for key, value in self.prefill["throughput"].items(): + 
if "list" in key: + continue + LOGGER.info(f"\t+ prefill.throughput.{key}: {value:f} (tokens/s)") + for key, value in self.decode["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ decode.latency.{key}: {value:f} (s)") + for key, value in self.decode["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ decode.throughput.{key}: {value:f} (tokens/s)") + + def log_memory(self): + for key, value in self.prefill["memory"].items(): + LOGGER.info(f"\t+ prefill.memory.{key}: {value:f} (MB)") + for key, value in self.decode["memory"].items(): + LOGGER.info(f"\t+ decode.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.prefill["energy"].items(): + LOGGER.info(f"\t+ prefill.energy.{key}: {value:f} (kWh)") + for key, value in self.decode["energy"].items(): + LOGGER.info(f"\t+ decode.energy.{key}: {value:f} (kWh)") + + def log_all(self) -> None: + if "latency" in self.prefill: + self.log_latency() + if "memory" in self.prefill: + self.log_memory() + if "energy" in self.prefill: + self.log_energy() + + # add operator to aggregate multiple reports + def __add__(self, other: "TextGenerationReport") -> "TextGenerationReport": + agg_report = TextGenerationReport( + batch_size=self.batch_size + other.batch_size, + sequence_length=self.sequence_length, + num_new_tokens=self.num_new_tokens, + num_return_sequences=self.num_return_sequences, + ) + if "latency" in self.prefill and "latency" in other.prefill: + agg_forward_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.prefill["latency"]["list[s]"], other.prefill["latency"]["list[s]"]) + ] + agg_generate_latencies_list = [ + (lat_1 + lat_2) / 2 + for lat_1, lat_2 in zip(self.decode["latency"]["list[s]"], other.decode["latency"]["list[s]"]) + ] + agg_report.populate_latency(agg_forward_latencies_list, agg_generate_latencies_list) + + if "memory" in self.prefill and "memory" in other.prefill: + agg_forward_memories_dict = {} + for key in self.prefill["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_forward_memories_dict[key] = max(self.prefill["memory"][key], other.prefill["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_forward_memories_dict[key] = self.prefill["memory"][key] + other.prefill["memory"][key] + + agg_generate_memories_dict = {} + for key in self.decode["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_generate_memories_dict[key] = max(self.decode["memory"][key], other.decode["memory"][key]) + else: + # ram and pytorch measures are process-specific + agg_generate_memories_dict[key] = self.decode["memory"][key] + other.decode["memory"][key] + + agg_report.populate_memory(agg_forward_memories_dict, agg_generate_memories_dict) + + if "energy" in self.prefill and "energy" in other.prefill: + agg_forward_energies_dict = {} + for key in self.prefill["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_forward_energies_dict[key] = self.prefill["energy"][key] + other.prefill["energy"][key] + + agg_generate_energies_dict = {} + for key in self.decode["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_generate_energies_dict[key] = self.decode["energy"][key] + other.decode["energy"][key] + + agg_report.populate_energy(agg_forward_energies_dict, agg_generate_energies_dict) + + return agg_report + + +def compute_mean(values: 
List[float]) -> float: + return mean(values) if len(values) > 0 else 0.0 + + +def compute_stdev(values: List[float]) -> float: + return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/report.py b/optimum_benchmark/benchmarks/report.py new file mode 100644 index 00000000..69491d65 --- /dev/null +++ b/optimum_benchmark/benchmarks/report.py @@ -0,0 +1,73 @@ +from dataclasses import dataclass, asdict +from typing import Union, Optional +from json import dump +import os + +from transformers.configuration_utils import PushToHubMixin +from flatten_dict import flatten +import pandas as pd + + +@dataclass +class BenchmarkReport(PushToHubMixin): + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + config_file_name: Optional[Union[str, os.PathLike]] = None, + push_to_hub: bool = False, + **kwargs, + ): + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + kwargs["token"] = use_auth_token + + config_file_name = config_file_name if config_file_name is not None else "benchmark_report.json" + + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + output_config_file = os.path.join(save_directory, config_file_name) + self.to_json(output_config_file) + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=kwargs.get("token"), + ) + + def to_dict(self) -> dict: + return asdict(self) + + def to_flat_dict(self) -> dict: + report_dict = self.to_dict() + return flatten(report_dict, reducer="dot") + + def to_json(self, path: str, flat: bool = False) -> None: + if flat: + with open(path, "w") as f: + dump(self.to_flat_dict(), f, indent=4) + else: + with open(path, "w") as f: + dump(self.to_dict(), f, indent=4) + + def to_dataframe(self) -> pd.DataFrame: + flat_report_dict = self.to_flat_dict() + return pd.DataFrame(flat_report_dict, index=[0]) + + def to_csv(self, path: str) -> None: + self.to_dataframe().to_csv(path, index=False) + + def log_all(self) -> None: + raise NotImplementedError("`log_all` method must be implemented in the child class") diff --git a/optimum_benchmark/benchmarks/training/benchmark.py b/optimum_benchmark/benchmarks/training/benchmark.py index e5eaa65f..90c231d0 100644 --- a/optimum_benchmark/benchmarks/training/benchmark.py +++ b/optimum_benchmark/benchmarks/training/benchmark.py @@ -1,19 +1,16 @@ -import time -from typing import Any, Dict from logging import getLogger +from contextlib import ExitStack -from transformers import ( - default_data_collator, - TrainingArguments, - TrainerCallback, - TrainerControl, - TrainerState, -) - -from ...generators.dataset_generator import DatasetGenerator -from ...backends.base import Backend -from .config import TrainingConfig from ..base import Benchmark +from .config import TrainingConfig +from .report import TrainingReport +from ...trackers.memory import MemoryTracker +from ...trackers.energy import EnergyTracker +from .callback import LatencyTrainerCallback +from ...backends.base import Backend, BackendConfigT +from ...generators.dataset_generator import DatasetGenerator + +from 
transformers import default_data_collator LOGGER = getLogger("training") @@ -24,9 +21,7 @@ class TrainingBenchmark(Benchmark[TrainingConfig]): def __init__(self, config: TrainingConfig) -> None: super().__init__(config) - def run(self, backend: Backend) -> None: - LOGGER.info("Running training benchmark") - + def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Creating dataset generator") dataset_generator = DatasetGenerator( task=backend.config.task, @@ -35,105 +30,57 @@ def run(self, backend: Backend) -> None: ) LOGGER.info("\t+ Generating training dataset") - training_dataset = dataset_generator.generate() - - LOGGER.info("\t+ Creating training callbacks") - training_callbacks = [MeasurementCallback(warmup_steps=self.config.warmup_steps)] - - self.trainer_state = backend.train( - training_dataset=training_dataset, - training_callbacks=training_callbacks, - training_data_collator=default_data_collator, - training_arguments=self.config.training_arguments, + training_dataset = dataset_generator() + + LOGGER.info("\t+ Initializing training report") + self.report = TrainingReport( + max_steps=self.config.max_steps, + warmup_steps=self.config.warmup_steps, + per_process_batch_size=self.config.training_arguments["per_device_train_batch_size"], + gradient_accumulation_steps=self.config.training_arguments["gradient_accumulation_steps"], ) - LOGGER.debug(f"Training runtime: {self.trainer_state.training_runtime:.3g} (s)") - LOGGER.debug(f"Training throughput: {self.trainer_state.training_throughput:.3g} (samples/s)") - - return self.report() - - def report(self) -> Dict[str, Any]: - return { - # warmup metrics - "warmup.runtime(s)": self.trainer_state.warmup_runtime, - "warmup.throughput(samples/s)": self.trainer_state.warmup_throughput, - # training metrics - "training.runtime(s)": self.trainer_state.training_runtime, - "training.throughput(samples/s)": self.trainer_state.training_throughput, - # overall metrics - "overall.runtime(s)": self.trainer_state.overall_runtime, - "overall.throughput(samples/s)": (self.trainer_state.overall_throughput), - } - - -class MeasurementCallback(TrainerCallback): - def __init__(self, warmup_steps: int): - self.warmup_steps = warmup_steps - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - state.warmup_start = time.perf_counter_ns() * 1e-9 - state.overall_start = time.perf_counter_ns() * 1e-9 - - def on_step_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if state.global_step == self.warmup_steps: - state.warmup_end = time.perf_counter_ns() * 1e-9 - state.training_start = time.perf_counter_ns() * 1e-9 - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - state.training_end = time.perf_counter_ns() * 1e-9 - state.overall_end = time.perf_counter_ns() * 1e-9 - - state.total_training_batch_size = args.train_batch_size * args.gradient_accumulation_steps - - # warmup metrics - state.warmup_runtime = state.warmup_end - state.warmup_start - state.num_warmup_samples = self.warmup_steps * state.total_training_batch_size - state.warmup_throughput = state.num_warmup_samples / state.warmup_runtime - state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime - - # training metrics - state.training_runtime = state.training_end - state.training_start - state.num_training_steps = state.max_steps - self.warmup_steps - 
state.num_training_samples = state.num_training_steps * state.total_training_batch_size - state.training_throughput = state.num_training_samples / state.training_runtime - state.training_steps_per_second = state.num_training_steps / state.training_runtime - - # overall training metrics - state.overall_runtime = state.training_end - state.warmup_start - state.num_overall_samples = state.num_warmup_samples + state.num_training_samples - state.overall_throughput = state.num_overall_samples / state.overall_runtime - state.overall_steps_per_second = state.num_overall_samples / state.overall_runtime - - -# def get_data_collator(task: str): -# if task == "object-detection": -# return object_detection_data_collator -# else: -# return default_data_collator - - -# def object_detection_data_collator(batch: List[Dict[str, Any]]) -> Dict[str, Any]: -# pixel_values = torch.stack([example["pixel_values"] for example in batch]) -# labels = [example["labels"] for example in batch] -# return { -# "pixel_values": pixel_values, -# "labels": labels, -# } + training_callbacks = [] + if self.config.latency: + LOGGER.info("\t+ Adding latency measuring callback") + latency_callback = LatencyTrainerCallback(device=backend.config.device, backend=backend.config.name) + training_callbacks.append(latency_callback) + + training_trackers = [] + if self.config.memory: + LOGGER.info("\t+ Adding memory tracking context manager") + memory_tracker = MemoryTracker( + device=backend.config.device, backend=backend.config.name, device_ids=backend.config.device_ids + ) + training_trackers.append(memory_tracker.track()) + + if self.config.energy: + LOGGER.info("\t+ Adding energy tracking context manager") + energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) + training_trackers.append(energy_tracker.track()) + + with ExitStack() as stack: + for tracker in training_trackers: + stack.enter_context(tracker) + + backend.train( + training_dataset=training_dataset, + training_callbacks=training_callbacks, + training_data_collator=default_data_collator, + training_arguments=self.config.training_arguments, + ) + + if self.config.latency: + self.report.populate_latency(overall_latencies_list=latency_callback.get_latencies_list()) + self.report.log_latency() + + if self.config.memory: + self.report.populate_memory(overall_memories_dict=memory_tracker.get_memories_dict()) + self.report.log_memory() + + if self.config.energy: + self.report.populate_energy(overall_energies_dict=energy_tracker.get_energies_dict()) + self.report.log_energy() + + def get_report(self) -> TrainingReport: + return self.report diff --git a/optimum_benchmark/benchmarks/training/callback.py b/optimum_benchmark/benchmarks/training/callback.py new file mode 100644 index 00000000..88026d79 --- /dev/null +++ b/optimum_benchmark/benchmarks/training/callback.py @@ -0,0 +1,43 @@ +import time +from typing import List + +import torch +from transformers import TrainerCallback + + +class LatencyTrainerCallback(TrainerCallback): + def __init__(self, device: str, backend: str) -> None: + self.device = device + self.backend = backend + self.all_latencies_list = [] + + def on_step_begin(self, *args, **kwargs): + # one record per step + if self.device == "cuda" and self.backend == "pytorch": + self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) + self.all_latencies_list[-1].record() + else: + self.all_latencies_list.append(time.perf_counter_ns()) + + def on_train_end(self, *args, **kwargs): + # one last record to 
measure the time of the last step + if self.device == "cuda" and self.backend == "pytorch": + self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) + self.all_latencies_list[-1].record() + else: + self.all_latencies_list.append(time.perf_counter_ns()) + + def get_latencies_list(self) -> List[float]: + if self.device == "cuda" and self.backend == "pytorch": + torch.cuda.synchronize() # synchronize the device to make sure all events have been recorded + latencies_list = [ + self.all_latencies_list[i - 1].elapsed_time(self.all_latencies_list[i]) * 1e-3 + for i in range(1, len(self.all_latencies_list)) + ] + else: + latencies_list = [ + (self.all_latencies_list[i] - self.all_latencies_list[i - 1]) * 1e-9 + for i in range(1, len(self.all_latencies_list)) + ] + + return latencies_list diff --git a/optimum_benchmark/benchmarks/training/config.py b/optimum_benchmark/benchmarks/training/config.py index 3a872684..e5d19581 100644 --- a/optimum_benchmark/benchmarks/training/config.py +++ b/optimum_benchmark/benchmarks/training/config.py @@ -8,6 +8,7 @@ TRAINING_ARGUMENT = { "per_device_train_batch_size": 2, + "gradient_accumulation_steps": 1, "output_dir": "./trainer_output", "do_train": True, "use_cpu": False, @@ -25,16 +26,9 @@ } DATASET_SHAPES = { - # used with all tasks "dataset_size": 500, - # used with text input tasks "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, } @@ -49,10 +43,14 @@ class TrainingConfig(BenchmarkConfig): # dataset options dataset_shapes: Dict[str, Any] = field(default_factory=dict) - # training options training_arguments: Dict[str, Any] = field(default_factory=dict) + # tracking options + latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) + memory: bool = field(default=False, metadata={"help": "Measure max memory usage"}) + energy: bool = field(default=False, metadata={"help": "Measure energy usage"}) + def __post_init__(self): super().__post_init__() diff --git a/optimum_benchmark/benchmarks/training/report.py b/optimum_benchmark/benchmarks/training/report.py new file mode 100644 index 00000000..9eeba211 --- /dev/null +++ b/optimum_benchmark/benchmarks/training/report.py @@ -0,0 +1,169 @@ +from dataclasses import dataclass, field +from statistics import mean, stdev +from typing import Any, Dict, List +from logging import getLogger + +from ..report import BenchmarkReport + +LOGGER = getLogger("report") + + +@dataclass +class TrainingReport(BenchmarkReport): + max_steps: int + warmup_steps: int + per_process_batch_size: int + gradient_accumulation_steps: int + + overall: Dict[str, Any] = field(default_factory=dict) + training: Dict[str, Any] = field(default_factory=dict) + warmup: Dict[str, Any] = field(default_factory=dict) + + world_size: int = 1 + + # POPULATING + def populate_latency(self, overall_latencies_list: List[float]) -> None: + assert ( + len(overall_latencies_list) == self.max_steps + ), f"Expected {self.max_steps} latencies, but got {len(overall_latencies_list)} latencies" + # Overall + ## Latency + self.overall["latency"] = { + "list[s/step]": overall_latencies_list, + "mean(s/step)": compute_mean(overall_latencies_list), + "stdev(s/step)": compute_stdev(overall_latencies_list), + } + ## Throughput + overall_throughputs_list = [ + self.world_size * self.per_process_batch_size * self.gradient_accumulation_steps / latency + for 
latency in overall_latencies_list + ] + self.overall["throughput"] = { + "list[samples/s]": overall_throughputs_list, + "mean(samples/s)": compute_mean(overall_throughputs_list), + "stdev(samples/s)": compute_stdev(overall_throughputs_list), + } + # Training + ## Latency + training_latencies_list = overall_latencies_list[self.warmup_steps :] + self.training["latency"] = { + "list[s/step]": training_latencies_list, + "mean(s/step)": compute_mean(training_latencies_list), + "stdev(s/step)": compute_stdev(training_latencies_list), + } + ## Throughput + training_throughputs_list = overall_throughputs_list[self.warmup_steps :] + self.training["throughput"] = { + "list[samples/s]": training_throughputs_list, + "mean(samples/s)": compute_mean(training_throughputs_list), + "stdev(samples/s)": compute_stdev(training_throughputs_list), + } + # Warmup + ## Latency + warmup_latencies_list = overall_latencies_list[: self.warmup_steps] + self.warmup["latency"] = { + "list[s/step]": warmup_latencies_list, + "mean(s/step)": compute_mean(warmup_latencies_list), + "stdev(s/step)": compute_stdev(warmup_latencies_list), + } + ## Throughput + warmup_throughputs_list = overall_throughputs_list[: self.warmup_steps] + self.warmup["throughput"] = { + "list[samples/s]": warmup_throughputs_list, + "mean(samples/s)": compute_mean(warmup_throughputs_list), + "stdev(samples/s)": compute_stdev(warmup_throughputs_list), + } + + def populate_memory(self, overall_memories_dict: Dict[str, float]) -> None: + self.warmup["memory"] = overall_memories_dict + self.overall["memory"] = overall_memories_dict + self.training["memory"] = overall_memories_dict + + def populate_energy(self, overall_energies_dict: Dict[str, float]) -> None: + self.overall["energy"] = overall_energies_dict + # can't get training only or warmup only energies + # self.warmup["energy"] = overall_energies_dict + # self.training["energy"] = overall_energies_dict + # TODO: use a callback for energy instead of a tracker + + # LOGGING + def log_latency(self): + for key, value in self.training["latency"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ training.latency.{key}: {value:f} (s)") + for key, value in self.training["throughput"].items(): + if "list" in key: + continue + LOGGER.info(f"\t+ training.throughput.{key}: {value:f} (samples/s)") + + def log_memory(self): + for key, value in self.training["memory"].items(): + LOGGER.info(f"\t+ training.memory.{key}: {value:f} (MB)") + + def log_energy(self): + for key, value in self.overall["energy"].items(): + LOGGER.info(f"\t+ overall.energy.{key}: {value:f} (kWh)") + + def log_all(self): + if "latency" in self.training: + self.log_latency() + if "memory" in self.training: + self.log_memory() + if "energy" in self.training: + self.log_energy() + + # LOGIC + def __add__(self, other: "TrainingReport") -> "TrainingReport": + assert self.max_steps == other.max_steps, "Both reports must have the same max_steps" + assert self.warmup_steps == other.warmup_steps, "Both reports must have the same warmup_steps" + assert ( + self.gradient_accumulation_steps == other.gradient_accumulation_steps + ), "Both reports must have the same gradient_accumulation_steps" + + agg_report = TrainingReport( + max_steps=self.max_steps, + warmup_steps=self.warmup_steps, + world_size=self.world_size + other.world_size, + per_process_batch_size=self.per_process_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + ) + + if "latency" in self.overall: + agg_overall_latencies_list = [ + max(lat_1, lat_2) + 
for lat_1, lat_2 in zip( + self.overall["latency"]["list[s/step]"], other.overall["latency"]["list[s/step]"] + ) + ] + agg_report.populate_latency(agg_overall_latencies_list) + + if "memory" in self.overall: + agg_overall_memories_dict = {} + for key in self.overall["memory"]: + if "vram" in key: + # our vram measures are not process-specific + agg_overall_memories_dict[key] = max(self.overall["memory"][key], other.overall["memory"][key]) + else: + # ram and pytorch measures are process-specific (can be accumulated) + agg_overall_memories_dict[key] = self.overall["memory"][key] + other.overall["memory"][key] + + agg_report.populate_memory(agg_overall_memories_dict) + + if "energy" in self.overall: + agg_overall_energies_dict = {} + for key in self.overall["energy"]: + # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) + agg_overall_energies_dict[key] = self.overall["energy"][key] + other.overall["energy"][key] + + agg_report.populate_energy(agg_overall_energies_dict) + + return agg_report + + +def compute_mean(values: List[float]) -> float: + return mean(values) if len(values) > 0 else 0.0 + + +def compute_stdev(values: List[float]) -> float: + return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py index 052276c4..8b137891 100644 --- a/optimum_benchmark/benchmarks/utils.py +++ b/optimum_benchmark/benchmarks/utils.py @@ -1,55 +1 @@ -from typing import List, Dict, Any - -# TODO: use some kind of logic to handle this instead of this function -def consolidate_reports(reports: List[Dict[str, Any]]) -> Dict[str, Any]: - report = {} - - ## Training - - if "warmup.runtime(s)" in reports[0]: - report["warmup.runtime(s)"] = reports[0]["warmup.runtime(s)"] - report["warmup.throughput(samples/s)"] = sum(r["warmup.throughput(samples/s)"] for r in reports) - - if "training.runtime(s)" in reports[0]: - report["training.runtime(s)"] = reports[0]["training.runtime(s)"] - report["training.throughput(samples/s)"] = sum(r["training.throughput(samples/s)"] for r in reports) - - if "overall.runtime(s)" in reports[0]: - report["overall.runtime(s)"] = reports[0]["overall.runtime(s)"] - report["overall.throughput(samples/s)"] = sum(r["overall.throughput(samples/s)"] for r in reports) - - ## Inference - - if "forward.latency(s)" in reports[0]: - report["forward.latency(s)"] = reports[0]["forward.latency(s)"] - report["forward.throughput(samples/s)"] = sum(r["forward.throughput(samples/s)"] for r in reports) - - if "diffusion.throughput(images/s)" in reports[0]: - report["diffusion.throughput(images/s)"] = sum(r["diffusion.throughput(images/s)"] for r in reports) - - if "forward.peak_memory(MB)" in reports[0]: - report["forward.max_memory_used(MB)"] = reports[0]["forward.max_memory_used(MB)"] - report["forward.max_memory_allocated(MB)"] = sum(r["forward.max_memory_allocated(MB)"] for r in reports) - report["forward.max_memory_reserved(MB)"] = sum(r["forward.max_memory_reserved(MB)"] for r in reports) - - if "forward.energy_consumption(kWh/sample)" in reports[0]: - report["forward.energy_consumption(kWh/sample)"] = reports[0]["forward.energy_consumption(kWh/sample)"] - report["forward.carbon_emissions(kgCO2eq/sample)"] = reports[0]["forward.carbon_emissions(kgCO2eq/sample)"] - - if "generate.latency(s)" in reports[0]: - report["generate.latency(s)"] = reports[0]["generate.latency(s)"] - report["generate.throughput(tokens/s)"] = sum(r["generate.throughput(tokens/s)"] for r in 
reports) - report["decode.latency(s)"] = reports[0]["decode.latency(s)"] - report["decode.throughput(tokens/s)"] = sum(r["decode.throughput(tokens/s)"] for r in reports) - - if "generate.peak_memory(MB)" in reports[0]: - report["generate.max_memory_used(MB)"] = reports[0]["generate.max_memory_used(MB)"] - report["generate.max_memory_allocated(MB)"] = sum(r["generate.max_memory_allocated(MB)"] for r in reports) - report["generate.max_memory_reserved(MB)"] = sum(r["generate.max_memory_reserved(MB)"] for r in reports) - - if "generate.energy_consumption(kWh/token)" in reports[0]: - report["generate.energy_consumption(kWh/token)"] = reports[0]["generate.energy_consumption(kWh/token)"] - report["generate.carbon_emissions(kgCO2eq/token)"] = reports[0]["generate.carbon_emissions(kgCO2eq/token)"] - - return report diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index cf36855d..4961c189 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -1,6 +1,5 @@ import os import glob -import json from logging import getLogger import hydra @@ -19,6 +18,7 @@ from .backends.neural_compressor.config import INCConfig from .backends.text_generation_inference.config import TGIConfig +from .benchmarks.report import BenchmarkReport from .experiment import launch, ExperimentConfig from .benchmarks.training.config import TrainingConfig from .benchmarks.inference.config import InferenceConfig @@ -49,6 +49,8 @@ # optimum-benchmark @hydra.main(version_base=None) def benchmark_cli(experiment_config: DictConfig) -> None: + os.environ["BENCHMARK_CLI"] = "1" + if glob.glob("*.csv") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1": LOGGER.warning( "Skipping benchmark because results already exist. " @@ -74,10 +76,6 @@ def benchmark_cli(experiment_config: DictConfig) -> None: experiment_config: ExperimentConfig = OmegaConf.to_object(experiment_config) OmegaConf.save(experiment_config, "experiment_config.yaml", resolve=True) - benchmark_report = launch(experiment_config=experiment_config) - - LOGGER.info("Benchmark Report:") - for metric, value in benchmark_report.items(): - LOGGER.info(f"\t+ {metric}: {value:.3f}") + benchmark_report: BenchmarkReport = launch(experiment_config=experiment_config) - json.dump(benchmark_report, open("benchmark_report.json", "w"), indent=4) + benchmark_report.to_json("benchmark_report.json") diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py index 5a714dd9..ed4b710b 100644 --- a/optimum_benchmark/env_utils.py +++ b/optimum_benchmark/env_utils.py @@ -1,13 +1,14 @@ +import os import re import platform import subprocess import importlib.util -from typing import Optional - -import psutil +from typing import Optional, List from .import_utils import is_py3nvml_available, is_pyrsmi_available +import psutil + def is_nvidia_system(): try: @@ -91,20 +92,84 @@ def get_gpus(): return gpus -def get_git_revision_hash(package_name: str, path: Optional[str] = None) -> Optional[str]: +def get_gpu_vram_mb() -> List[int]: + if is_nvidia_system(): + if not is_py3nvml_available(): + raise ValueError( + "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " + "Please install it through `pip install py3nvml`." 
+ ) + import py3nvml.py3nvml as nvml + + nvml.nvmlInit() + device_count = nvml.nvmlDeviceGetCount() + vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)] + nvml.nvmlShutdown() + elif is_rocm_system(): + if not is_pyrsmi_available(): + raise ValueError( + "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " + "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." + ) + + from pyrsmi import rocml + + rocml.smi_initialize() + device_count = rocml.smi_get_device_count() + vrams = [rocml.smi_get_device_memory_total(index) for index in range(device_count)] + rocml.smi_shutdown() + else: + vrams = [] + + return sum(vrams) + + +def get_cuda_device_ids() -> str: + if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"] + else: + if is_nvidia_system(): + if not is_py3nvml_available(): + raise ValueError( + "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " + "Please install it through `pip install py3nvml`." + ) + import py3nvml.py3nvml as nvml + + nvml.nvmlInit() + device_ids = list(range(nvml.nvmlDeviceGetCount())) + nvml.nvmlShutdown() + elif is_rocm_system(): + if not is_pyrsmi_available(): + raise ValueError( + "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " + "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." + ) + + from pyrsmi import rocml + + rocml.smi_initialize() + device_ids = list(range(rocml.smi_get_device_count())) + rocml.smi_shutdown() + else: + raise ValueError("No NVIDIA or ROCm GPUs found.") + + return ",".join(str(i) for i in device_ids) + + +def get_git_revision_hash(package_name: str) -> Optional[str]: """ Returns the git commit SHA of a package installed from a git repository. 
""" - if path is None: - try: - path = importlib.util.find_spec(package_name).origin - except Exception: - return None + try: + path = importlib.util.find_spec(package_name).origin + except Exception: + return None try: git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() except Exception: - git_hash = None + return None return git_hash diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index 1396c131..c9b6d733 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -1,11 +1,13 @@ import os import platform from logging import getLogger +from tempfile import TemporaryDirectory from dataclasses import dataclass, field from typing import Any, Dict, Type, Optional, TYPE_CHECKING from hydra.utils import get_class +from .benchmarks.report import BenchmarkReport from .benchmarks.config import BenchmarkConfig from .launchers.config import LauncherConfig from .backends.config import BackendConfig @@ -19,6 +21,9 @@ ) from .env_utils import ( get_git_revision_hash, + is_nvidia_system, + is_rocm_system, + get_gpu_vram_mb, get_cpu_ram_mb, get_gpus, get_cpu, @@ -57,29 +62,39 @@ class ExperimentConfig: environment: Dict = field( default_factory=lambda: { "cpu": get_cpu(), - "gpus": get_gpus(), "cpu_count": os.cpu_count(), - "system": platform.system(), "cpu_ram_mb": get_cpu_ram_mb(), + "system": platform.system(), "python_version": platform.python_version(), # libraries "transformers_version": transformers_version(), - "transformers_commit": get_git_revision_hash("transformers", os.environ.get("TRANSFORMERS_PATH", None)), + "transformers_commit": get_git_revision_hash("transformers"), "accelerate_version": accelerate_version(), - "accelerate_commit": get_git_revision_hash("accelerate", os.environ.get("ACCELERATE_PATH", None)), - "optimum_version": optimum_version(), - "optimum_commit": get_git_revision_hash("optimum", os.environ.get("OPTIMUM_PATH", None)), + "accelerate_commit": get_git_revision_hash("accelerate"), "diffusers_version": diffusers_version(), - "diffusers_commit": get_git_revision_hash("diffusers", os.environ.get("DIFFUSERS_PATH", None)), + "diffusers_commit": get_git_revision_hash("diffusers"), + "optimum_version": optimum_version(), + "optimum_commit": get_git_revision_hash("optimum"), "timm_version": timm_version(), - "timm_commit": get_git_revision_hash("timm", os.environ.get("TIMM_PATH", None)), + "timm_commit": get_git_revision_hash("timm"), "peft_version": peft_version(), - "peft_commit": get_git_revision_hash("peft", os.environ.get("PEFT_PATH", None)), + "peft_commit": get_git_revision_hash("peft"), } ) + def __post_init__(self): + # adding GPU information to the environment + if is_nvidia_system() or is_rocm_system(): + available_gpus = get_gpus() + if len(available_gpus) > 0: + self.environment["gpu"] = available_gpus[0] + self.environment["gpu_count"] = len(available_gpus) + self.environment["gpu_vram_mb"] = get_gpu_vram_mb() + else: + LOGGER.warning("Detected NVIDIA or ROCm system, but no GPUs found.") -def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dict[str, Any]: + +def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> BenchmarkReport: try: # Allocate requested backend backend_factory: Type[Backend] = get_class(backend_config._target_) @@ -107,7 +122,7 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dic raise e try: - report = benchmark.report() + report = benchmark.get_report() except Exception as 
e: LOGGER.error("Error during report generation: %s", e) raise e @@ -115,7 +130,13 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Dic return report -def launch(experiment_config: ExperimentConfig) -> Dict[str, Any]: +def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: + if os.environ.get("BENCHMARK_CLI", "0") == "0": + LOGGER.info("Launching experiment in a temporary directory.") + temp_dir = TemporaryDirectory() + original_dir = os.getcwd() + os.chdir(temp_dir.name) + launcher_config: LauncherConfig = experiment_config.launcher try: @@ -135,4 +156,8 @@ def launch(experiment_config: ExperimentConfig) -> Dict[str, Any]: LOGGER.error(f"Error during experiment launching: {e}") raise e + if os.environ.get("BENCHMARK_CLI", "0") == "0": + os.chdir(original_dir) + temp_dir.cleanup() + return output diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py index f0ba921f..4bb9f188 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -15,7 +15,7 @@ def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict dataset_shapes["batch_size"] = dataset_shapes["dataset_size"] if task in TASKS_TO_GENERATORS: - LOGGER.info(f"Using {task} task generator") + LOGGER.info(f"\t+ Using {task} task generator") shapes = {**dataset_shapes, **model_shapes} self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=True) else: @@ -26,7 +26,8 @@ def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict "please submit a PR or a feature request to optimum-benchmark. \n" ) - def generate(self) -> Dataset: - task_dataset = self.task_generator.generate() + def __call__(self) -> Dataset: + task_dataset = self.task_generator() task_dataset = Dataset.from_dict(task_dataset) + task_dataset.set_format(type="torch", columns=list(task_dataset.features.keys())) return task_dataset diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index bc14d6c8..13f1d9aa 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -22,8 +22,8 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s "please submit a PR or a feature request to optimum-benchmark. 
" ) - def generate(self, mode: str) -> Dict[str, Any]: - task_input = self.task_generator.generate() + def __call__(self, mode: str) -> Dict[str, Any]: + task_input = self.task_generator() if mode == "generate": if "pixel_values" in task_input: @@ -46,5 +46,9 @@ def generate(self, mode: str) -> Dict[str, Any]: task_input = { "inputs": task_input["input_ids"], } + elif mode == "call": + task_input = { + "prompt": task_input["prompt"], + } return task_input diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 9aee6ee9..1f3e9b23 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -34,8 +34,8 @@ def generate_random_strings(shape: Tuple[int]): for _ in range(shape[0]) ] - def generate(self): - raise NotImplementedError("Generator must implement generate method") + def __call__(self): + raise NotImplementedError("Generator must implement __call__ method") class TextGenerator(TaskGenerator): @@ -131,7 +131,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -160,7 +160,7 @@ def labels(self): ), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -179,7 +179,7 @@ def generate(self): class TextGenerationGenerator(TextGenerator): - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() dummy["attention_mask"] = self.attention_mask() @@ -211,7 +211,7 @@ def end_positions(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -226,7 +226,7 @@ def generate(self): class MaskedLanguageModelingGenerator(TextGenerator): - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = self.input_ids() @@ -252,7 +252,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_ids"] = ( @@ -288,7 +288,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["pixel_values"] = self.pixel_values() @@ -316,7 +316,7 @@ def labels(self): for _ in range(self.shapes["batch_size"]) ] - def generate(self): + def __call__(self): dummy = {} dummy["pixel_values"] = self.pixel_values() @@ -338,7 +338,7 @@ def labels(self): ), ) - def generate(self): + def __call__(self): dummy = {} dummy["pixel_values"] = self.pixel_values() @@ -356,7 +356,7 @@ def labels(self): shape=(self.shapes["batch_size"],), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_values"] = self.input_values() @@ -377,7 +377,7 @@ def labels(self): ), ) - def generate(self): + def __call__(self): dummy = {} dummy["input_values"] = self.input_values() @@ -391,7 +391,7 @@ class PromptGenerator(TaskGenerator): def prompt(self): return self.generate_random_strings(shape=(self.shapes["batch_size"], 10)) - def generate(self): + def __call__(self): dummy = {} dummy["prompt"] = self.prompt() @@ -399,7 +399,7 @@ def generate(self): class FeatureExtractionGenerator(TextGenerator, ImageGenerator): - def generate(self): + def __call__(self): dummy = {} if self.shapes["num_channels"] is not None and self.shapes["height"] is not None: diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 1c4cc7e8..f19fbda3 100644 --- a/optimum_benchmark/import_utils.py +++ 
b/optimum_benchmark/import_utils.py @@ -1,6 +1,7 @@ import importlib.metadata import importlib.util + _transformers_available = importlib.util.find_spec("transformers") is not None _accelerate_available = importlib.util.find_spec("accelerate") is not None _diffusers_available = importlib.util.find_spec("diffusers") is not None @@ -19,23 +20,31 @@ _amdsmi_available = importlib.util.find_spec("amdsmi") is not None _tensorflow_available = importlib.util.find_spec("tensorflow") is not None _timm_available = importlib.util.find_spec("timm") is not None -_is_diffusers_available = importlib.util.find_spec("diffusers") is not None -_is_accelerate_available = importlib.util.find_spec("accelerate") is not None -_is_torch_ort_available = importlib.util.find_spec("torch_ort") is not None -_is_deepspeed_available = importlib.util.find_spec("deepspeed") is not None -_is_tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None +_diffusers_available = importlib.util.find_spec("diffusers") is not None +_torch_ort_available = importlib.util.find_spec("torch_ort") is not None +_deepspeed_available = importlib.util.find_spec("deepspeed") is not None +_tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None +_psutil_available = importlib.util.find_spec("psutil") is not None + + +def is_psutil_available(): + return _psutil_available + + +def is_transformers_available(): + return _transformers_available def is_tensorrt_llm_available(): - return _is_tensorrt_llm_available + return _tensorrt_llm_available def is_deepspeed_available(): - return _is_deepspeed_available + return _deepspeed_available def is_torch_ort_available(): - return _is_torch_ort_available + return _torch_ort_available def is_accelerate_available(): @@ -43,7 +52,7 @@ def is_accelerate_available(): def is_diffusers_available(): - return _is_diffusers_available + return _diffusers_available def is_timm_available(): @@ -118,7 +127,7 @@ def onnxruntime_version(): try: return "ort-training:" + importlib.metadata.version("onnxruntime-training") except importlib.metadata.PackageNotFoundError: - return "ort:unknown" + return None def openvino_version(): @@ -152,7 +161,7 @@ def diffusers_version(): def torch_ort_version(): - if _is_torch_ort_available: + if _torch_ort_available: return importlib.metadata.version("torch_ort") @@ -167,5 +176,5 @@ def peft_version(): def tesnorrt_llm_version(): - if _is_tensorrt_llm_available: + if _tensorrt_llm_available: return importlib.metadata.version("tensorrt_llm") diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py index f48fc919..52006bcc 100644 --- a/optimum_benchmark/launchers/isolation_utils.py +++ b/optimum_benchmark/launchers/isolation_utils.py @@ -6,15 +6,15 @@ from multiprocessing import Process from contextlib import contextmanager -import psutil - from ..logging_utils import setup_logging from ..env_utils import is_nvidia_system, is_rocm_system -from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version +from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version, is_psutil_available +if is_psutil_available(): + import psutil if is_py3nvml_available(): - import py3nvml.py3nvml as nvml # type: ignore + import py3nvml.py3nvml as nvml if is_amdsmi_available(): import amdsmi # type: ignore @@ -172,7 +172,7 @@ def assert_system_devices_isolation(benchmark_pid: int) -> None: @contextmanager -def device_isolation(benchmark_pid: int, enabled: bool) -> None: +def 
device_isolation(benchmark_pid: int, enabled: bool): if not enabled: yield return diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py index 77b6b4ef..2d87ff03 100644 --- a/optimum_benchmark/launchers/torchrun/config.py +++ b/optimum_benchmark/launchers/torchrun/config.py @@ -1,4 +1,3 @@ -import os import uuid from logging import getLogger from typing import Any, Dict, Optional @@ -20,7 +19,7 @@ class TorchrunConfig(LauncherConfig): # Maximum amount of nodes that the user function will be launched on. max_nodes: int = 1 # On each node the elastic agent will launch this amount of workers that will execute user defined function. - nproc_per_node: Optional[int] = None + nproc_per_node: int = 2 # User defined role of the worker (defaults to "trainer"). role: str = "benchmark_worker" # The interval in seconds that is used by the elastic_agent as a period of monitoring workers. @@ -61,26 +60,3 @@ def __post_init__(self) -> None: if self.min_nodes != 1: LOGGER.info("For multi-node benchmarks, run the benchmark on each node separately.") LOGGER.info(f"Waiting for the other nodes to be avaialable at {self.rdzv_endpoint}...") - - if self.nproc_per_node is None: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - LOGGER.warning( - "`nproc_per_node` is not set but `CUDA_VISIBLE_DEVICES` is set. " - "Setting `nproc_per_node` to the number of visible devices." - ) - self.nproc_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) - else: - LOGGER.warning( - "`nproc_per_node` is not set and `CUDA_VISIBLE_DEVICES` is not set. " - "Setting `nproc_per_node` and `CUDA_VISIBLE_DEVICES` to 1." - ) - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - self.nproc_per_node = 1 - else: - if len(os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")) != self.nproc_per_node: - LOGGER.warning( - f"`nproc_per_node` is set to {self.nproc_per_node} but `CUDA_VISIBLE_DEVICES` " - f"is set to {os.environ.get('CUDA_VISIBLE_DEVICES', '')}. " - "Setting `CUDA_VISIBLE_DEVICES` to match `nproc_per_node`." 
- ) - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in range(self.nproc_per_node)]) diff --git a/optimum_benchmark/launchers/torchrun/launcher.py b/optimum_benchmark/launchers/torchrun/launcher.py index b50f9f55..f327e85c 100644 --- a/optimum_benchmark/launchers/torchrun/launcher.py +++ b/optimum_benchmark/launchers/torchrun/launcher.py @@ -4,17 +4,19 @@ from multiprocessing import Queue from typing import Callable, Dict, Any -import torch.distributed -from torch.distributed import FileStore -from torch.distributed.elastic.multiprocessing import Std -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.launcher.api import LaunchConfig, launch_agent - from ..base import Launcher from .config import TorchrunConfig from ...logging_utils import setup_logging from ..isolation_utils import device_isolation -from ...benchmarks.utils import consolidate_reports +from ...benchmarks.report import BenchmarkReport +from ...import_utils import is_torch_distributed_available + +if is_torch_distributed_available(): + import torch.distributed + from torch.distributed import FileStore + from torch.distributed.elastic.multiprocessing import Std + from torch.distributed.elastic.multiprocessing.errors import record + from torch.distributed.launcher.api import LaunchConfig, launch_agent LOGGER = getLogger("torchrun") @@ -49,8 +51,8 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: local_addr=self.config.local_addr, log_dir=self.config.log_dir, ) - current_log_level = getLogger().getEffectiveLevel() queue = Queue() + current_log_level = getLogger().getEffectiveLevel() with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()): LOGGER.info(f"\t+ Launching torchrun agent with {self.config.nproc_per_node} workers processes") @@ -61,10 +63,16 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: ) outputs = [] + while not queue.empty(): outputs.append(queue.get()) - report = consolidate_reports(outputs) + if len(outputs) == 1: + report: BenchmarkReport = outputs[0] + else: + LOGGER.info(f"\t+ Merging benchmark reports from {len(outputs)} workers") + report: BenchmarkReport = sum(outputs[1:], outputs[0]) + report.log_all() return report @@ -85,12 +93,12 @@ def entrypoint(fn, q, log_level, *args): torch.cuda.set_device(rank) if rank == 0: - setup_logging(log_level) + setup_logging(level=log_level, prefix="RANK-0") else: - setup_logging("ERROR") + setup_logging(level="ERROR") # TODO: use a tcp store instead - store = FileStore("torchrun_filestore") + store = FileStore("torchrun.filestore") store.set(f"rank_{rank}", str(os.getpid())) output = fn(*args) diff --git a/optimum_benchmark/logging_utils.py b/optimum_benchmark/logging_utils.py index 398c7bf4..72f76889 100644 --- a/optimum_benchmark/logging_utils.py +++ b/optimum_benchmark/logging_utils.py @@ -1,11 +1,13 @@ import os import logging import logging.config +from logging import Logger +from typing import Optional from subprocess import Popen, PIPE, STDOUT from omegaconf import OmegaConf -JOB_LOGGING = { +API_JOB_LOGGING = { "version": 1, "formatters": { "simple": {"format": "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"}, @@ -27,32 +29,34 @@ "stream": "ext://sys.stdout", "class": "logging.StreamHandler", }, - "file": { - "filename": "api.log", - "formatter": "simple", - "class": "logging.FileHandler", - }, }, - "root": {"level": "INFO", "handlers": ["console", "file"]}, + "root": {"level": "INFO", "handlers": ["console"]}, 
"disable_existing_loggers": False, } -def setup_logging(level: str = "INFO"): - if os.path.exists(".hydra/hydra.yaml"): +def setup_logging(level: str = "INFO", prefix: Optional[str] = None): + if os.environ.get("BENCHMARK_CLI", "0") == "1": hydra_config = OmegaConf.load(".hydra/hydra.yaml") job_logging = OmegaConf.to_container( hydra_config.hydra.job_logging, resolve=True, ) else: - job_logging = JOB_LOGGING.copy() + job_logging = API_JOB_LOGGING.copy() job_logging["root"]["level"] = level + + if prefix is not None: + job_logging["formatters"]["simple"]["format"] = f"[{prefix}]" + job_logging["formatters"]["simple"]["format"] + job_logging["formatters"]["colorlog"]["format"] = ( + f"[{prefix}]" + job_logging["formatters"]["colorlog"]["format"] + ) + logging.config.dictConfig(job_logging) -def run_process_and_log_stream_output(logger, args): +def run_subprocess_and_log_stream_output(logger: Logger, args): popen = Popen(args, stdout=PIPE, stderr=STDOUT) for line in iter(popen.stdout.readline, b""): if line is not None: diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index b3038812..e35baae3 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -114,7 +114,7 @@ ), } -DIFFUSION_TASKS = [ +IMAGE_DIFFUSION_TASKS = [ "stable-diffusion", "stable-diffusion-xl", ] diff --git a/optimum_benchmark/trackers/energy.py b/optimum_benchmark/trackers/energy.py index 815abaa9..7d3bb7ad 100644 --- a/optimum_benchmark/trackers/energy.py +++ b/optimum_benchmark/trackers/energy.py @@ -1,53 +1,44 @@ import os from logging import getLogger -from typing import List, Optional from contextlib import contextmanager +from typing import Optional, Dict -from ..env_utils import is_nvidia_system, is_rocm_system -from ..import_utils import ( - is_py3nvml_available, - is_pyrsmi_available, - is_codecarbon_available, -) +from ..env_utils import get_cuda_device_ids +from ..import_utils import is_codecarbon_available if is_codecarbon_available(): from codecarbon import EmissionsTracker, OfflineEmissionsTracker -if is_nvidia_system(): - if is_py3nvml_available(): - import py3nvml.py3nvml as nvml - else: - raise ValueError( - "The library py3nvml is required to run energy benchmark on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) - -if is_rocm_system(): - if is_pyrsmi_available(): - # TODO: use amdsmi instead of pyrsmi - from pyrsmi import rocml - else: - raise ValueError( - "The library pyrsmi is required to run energy benchmark on ROCm-powered GPUs, but is not installed. " - "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git." - ) - LOGGER = getLogger("energy") class EnergyTracker: - def __init__(self, device_ids: Optional[List[int]] = None): - self.device_ids = device_ids + def __init__(self, device: str, device_ids: Optional[str] = None): + self.device = device + self.cpu_energy: float = 0 + self.gpu_energy: float = 0 + self.ram_energy: float = 0 self.total_energy: float = 0 - self.total_emissions: float = 0 - if self.device_ids is None: - self.device_ids = infer_cuda_device_ids() + if self.device == "cuda": + if device_ids is None: + LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. 
Using all available CUDA devices.") + self.device_ids = list(map(int, get_cuda_device_ids().split(","))) + else: + self.device_ids = list(map(int, device_ids.split(","))) + else: + self.device_ids = [] + + def reset(self): + self.cpu_energy = 0 + self.gpu_energy = 0 + self.ram_energy = 0 + self.total_energy = 0 @contextmanager - def track(self, interval=1, file_prefix=""): + def track(self, interval=1, file_prefix="method"): if not is_codecarbon_available(): raise ValueError( "The library codecarbon is required to run energy benchmark, but is not installed. " @@ -55,6 +46,7 @@ def track(self, interval=1, file_prefix=""): ) try: + # TODO: use pynvml and amdsmi directly to get the GPU power consumption self.emission_tracker = EmissionsTracker( log_level="error", # "info" for more verbosity tracking_mode="process", # "machine" for machine-level tracking @@ -63,11 +55,11 @@ def track(self, interval=1, file_prefix=""): output_file=f"{file_prefix}_codecarbon.csv", ) except Exception as e: - LOGGER.warning(f"Failed to initialize Online Emissions Tracker: {e}") - LOGGER.warning("Falling back to Offline Emissions Tracker") + LOGGER.warning("\t+ Failed to initialize Online Emissions Tracker: %s", e) + LOGGER.warning("\t+ Falling back to Offline Emissions Tracker") if os.environ.get("COUNTRY_ISO_CODE", None) is None: LOGGER.warning( - "Offline Emissions Tracker requires COUNTRY_ISO_CODE to be set. " + "\t+ Offline Emissions Tracker requires COUNTRY_ISO_CODE to be set. " "We will set it to FRA but the carbon footprint will be inaccurate." ) @@ -83,32 +75,19 @@ def track(self, interval=1, file_prefix=""): self.emission_tracker.start() yield self.emission_tracker.stop() - self.total_energy = self.emission_tracker._total_energy.kWh - self.total_emissions = self.emission_tracker.final_emissions - - def get_total_energy(self) -> float: - return self.total_energy - def get_total_emissions(self) -> float: - return self.total_emissions + self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh + self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh + self.ram_energy = self.emission_tracker._total_ram_energy.kWh + self.total_energy = self.emission_tracker._total_energy.kWh def get_elapsed_time(self) -> float: return self.emission_tracker._last_measured_time - self.emission_tracker._start_time - -def infer_cuda_device_ids() -> List[int]: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - cuda_device_ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(","))) - else: - if is_nvidia_system() and is_py3nvml_available(): - nvml.nvmlInit() - cuda_device_ids = list(range(nvml.nvmlDeviceGetCount())) - nvml.nvmlShutdown() - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() - cuda_device_ids = list(range(rocml.smi_get_device_count())) - rocml.smi_shutdown() - else: - raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA energy tracking.") - - return cuda_device_ids + def get_energies_dict(self) -> Dict[str, float]: + return { + "cpu_energy(kWh)": self.cpu_energy, + "gpu_energy(kWh)": self.gpu_energy, + "ram_energy(kWh)": self.ram_energy, + "total(kWh)": self.total_energy, + } diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 2823919c..369c2b70 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,11 +1,12 @@ -import time from contextlib import contextmanager from logging import getLogger from typing import List +import time -import torch +from 
..import_utils import is_torch_distributed_available, is_torch_available -from ..import_utils import is_torch_distributed_available +if is_torch_available(): + import torch if is_torch_distributed_available(): import torch.distributed @@ -20,61 +21,71 @@ def __init__(self, device: str, backend: str): self.latencies: List[float] = [] + # this is not in track, because this tracker is used repeatedly if is_torch_distributed_available() and torch.distributed.is_initialized(): - LOGGER.info("Tracking Pytorch Distributed latency") + LOGGER.info("\t+ Tracking Pytorch Distributed latency") elif self.device == "cuda" and self.backend == "pytorch": - LOGGER.info("Tracking Pytorch CUDA latency") + LOGGER.info("\t+ Tracking Pytorch CUDA latency") else: - LOGGER.info("Tracking CPU latency") + LOGGER.info("\t+ Tracking CPU latency") + + def reset(self): + self.latencies = [] @contextmanager def track(self): if is_torch_distributed_available() and torch.distributed.is_initialized(): - yield from self._pytorch_distributed_tracker() + yield from self._pytorch_distributed_latency() elif self.backend == "pytorch" and self.device == "cuda": - yield from self._pytorch_cuda_tracker() + yield from self._pytorch_cuda_latency() else: - yield from self._cpu_tracker() + yield from self._cpu_latency() - def _pytorch_distributed_tracker(self): + def _pytorch_distributed_latency(self): torch.distributed.barrier() # synchronize before workload start = time.perf_counter_ns() yield torch.distributed.barrier() # synchronize after workload end = time.perf_counter_ns() - latency_ns = end - start - latency = latency_ns / 1e9 + latency = (end - start) / 1e9 self.latencies.append(latency) - LOGGER.debug(f"Tracked Pytorch Distributed latency: {latency:.2e}s") + LOGGER.debug(f"\t+ Tracked Pytorch distributed latency: {latency:.2e}s") - def _pytorch_cuda_tracker(self): + def _pytorch_cuda_latency(self): + # Note: torch.cuda.Event is not used here, + # there's actually no specific need to use cuda events if you're synchronizing + # it's rather a feature that can be used to measure kernel latency without synchronizing, + # allowing us to measure the time it takes to perform an operation without necessarily stalling the GPU. + # An interesting use case is with cuda graphs where synchronization makes us shoot the optimization in the foot. 
+ # details: https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/ torch.cuda.synchronize() # synchronize before workload start = time.perf_counter_ns() yield torch.cuda.synchronize() # synchronize after workload end = time.perf_counter_ns() - latency_ns = end - start - latency = latency_ns / 1e9 + latency = (end - start) / 1e9 self.latencies.append(latency) - LOGGER.debug(f"Tracked Pytorch CUDA latency: {latency:.2e}s") + LOGGER.debug(f"\t+ Tracked Pytorch CUDA latency: {latency:.2e}s") - def _cpu_tracker(self): + def _cpu_latency(self): start = time.perf_counter_ns() yield end = time.perf_counter_ns() - latency_ns = end - start - latency = latency_ns / 1e9 + latency = (end - start) / 1e9 self.latencies.append(latency) - LOGGER.debug(f"Tracked CPU latency: {latency:.2e}s") + LOGGER.debug(f"\t+ Tracked CPU latency: {latency:.2e}s") - def get_latencies(self): - return self.latencies + def get_total_count(self): + return len(self.latencies) def get_total_latency(self): return sum(self.latencies) + + def get_latencies_list(self) -> List[float]: + return self.latencies diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 06b0683a..816f1d5a 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -1,18 +1,12 @@ import os from logging import getLogger -from typing import List, Optional from contextlib import contextmanager +from typing import List, Optional, Dict from multiprocessing import Pipe, Process from multiprocessing.connection import Connection -import psutil -import torch - -from ..env_utils import bytes_to_mega_bytes, is_nvidia_system, is_rocm_system -from ..import_utils import ( - is_py3nvml_available, - is_pyrsmi_available, -) +from ..env_utils import bytes_to_mega_bytes, get_cuda_device_ids, is_nvidia_system, is_rocm_system +from ..import_utils import is_py3nvml_available, is_pyrsmi_available, is_torch_available if is_nvidia_system(): if is_py3nvml_available(): @@ -25,33 +19,65 @@ if is_rocm_system(): if is_pyrsmi_available(): - # TODO: use amdsmi instead of pyrsmi from pyrsmi import rocml else: raise ValueError( - "The library pyrsmi is required to run memory benchmark on ROCm-powered GPUs, but is not installed. " + "The library pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. " "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git." ) +if is_torch_available(): + import torch + +import psutil + LOGGER = getLogger("memory") class MemoryTracker: - def __init__(self, device: str, backend: str, device_ids: Optional[List[int]] = None): + """ + Memory tracker to measure max memory usage of CPU or GPU devices. + + Args: + device (str): Device to track memory usage. Can be either "cuda" or any other device. + backend (str): Backend to track memory usage. Can be either "pytorch" or any other backend. + device_ids (List[int], optional): List of device IDs to track memory usage. Defaults to None. 
+ """ + + def __init__(self, device: str, backend: str, device_ids: Optional[str] = None): self.device = device self.backend = backend - self.device_ids = device_ids - self.max_memory_used: int = 0 - self.max_memory_reserved: int = 0 - self.max_memory_allocated: int = 0 + self.max_memory_used = 0 + self.max_memory_reserved = 0 + self.max_memory_allocated = 0 if self.device == "cuda": - if self.device_ids is None: - self.device_ids = infer_cuda_device_ids() + if device_ids is None: + LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. Using all available CUDA devices.") + self.device_ids = list(map(int, get_cuda_device_ids().split(","))) + else: + self.device_ids = list(map(int, device_ids.split(","))) + + LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}") + + if self.backend == "pytorch": + self.pytorch_device_ids = list(range(torch.cuda.device_count())) + LOGGER.info(f"\t+ Tracking Pytorch memory of Pytorch CUDA devices: {self.pytorch_device_ids}") + + if len(self.device_ids) != len(self.pytorch_device_ids): + raise ValueError( + "The number of CUDA devices and Pytorch CUDA devices must be the same. " + f"Got {len(self.device_ids)} and {len(self.pytorch_device_ids)} respectively." + ) + else: + LOGGER.info("\t+ Tracking RAM memory") - LOGGER.info(f"Tracking CUDA devices: {self.device_ids}") + def reset(self): + self.max_memory_used = 0 + self.max_memory_reserved = 0 + self.max_memory_allocated = 0 @contextmanager def track(self): @@ -62,109 +88,122 @@ def track(self): else: yield from self._cpu_memory() - def get_max_memory_used(self): - return bytes_to_mega_bytes(self.max_memory_used) - - def get_max_memory_reserved(self): - return bytes_to_mega_bytes(self.max_memory_reserved) - - def get_max_memory_allocated(self): - return bytes_to_mega_bytes(self.max_memory_allocated) - def _cuda_pytorch_memory(self): torch.cuda.empty_cache() - - for device_index in range(torch.cuda.device_count()): + for pytorch_device_index in self.pytorch_device_ids: try: - torch.cuda.reset_peak_memory_stats(device=device_index) + torch.cuda.reset_peak_memory_stats(device=pytorch_device_index) except Exception as e: - LOGGER.warning(f"Could not reset peak memory stats for device {device_index}: {e}") + LOGGER.warning(f"\t+ Could not reset max memory stats for device {pytorch_device_index}: {e}") yield from self._cuda_memory() - for device_index in range(torch.cuda.device_count()): - self.max_memory_allocated += torch.cuda.max_memory_allocated(device=device_index) - self.max_memory_reserved += torch.cuda.max_memory_reserved(device=device_index) + for pytorch_device_index in self.pytorch_device_ids: + self.max_memory_reserved += torch.cuda.max_memory_reserved(device=pytorch_device_index) + self.max_memory_allocated += torch.cuda.max_memory_allocated(device=pytorch_device_index) - LOGGER.debug(f"Pytorch max memory allocated: {self.get_max_memory_allocated()} MB") - LOGGER.debug(f"Pytorch max memory reserved: {self.get_max_memory_reserved()} MB") + LOGGER.debug(f"\t+ Pytorch max memory reserved: {self.get_max_memory_reserved_mb()} MB") + LOGGER.debug(f"\t+ Pytorch max memory allocated: {self.get_max_memory_allocated_mb()} MB") - def _cuda_memory(self): - if is_nvidia_system() and is_py3nvml_available(): - handles = [] - nvml.nvmlInit() - for device_index in self.device_ids: - handle = nvml.nvmlDeviceGetHandleByIndex(device_index) - handles.append(handle) - - yield - - for handle in handles: - meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) - self.max_memory_used += meminfo.used - 
nvml.nvmlShutdown() - LOGGER.debug(f"PyNVML max memory used: {self.get_max_memory_used()} MB") - - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() + def _cuda_memory(self, interval: float = 0.001): + child_connection, parent_connection = Pipe() + memory_process = Process( + target=monitor_gpu_max_vram_memory, + args=(self.device_ids, child_connection, interval), + daemon=True, + ) + memory_process.start() + parent_connection.recv() # wait for memory process to be ready - yield + yield - for device_index in self.device_ids: - meminfo_used = rocml.smi_get_device_memory_used(device_index) - self.max_memory_used += meminfo_used - rocml.smi_shutdown() - LOGGER.debug(f"PyRSMI max memory used: {self.get_max_memory_used()} MB") - else: - raise ValueError("Only NVIDIA and AMD RoCm GPUs are supported for CUDA memory tracking.") + parent_connection.send(True) + self.max_memory_used = parent_connection.recv() + LOGGER.debug(f"\t+ Max memory (VRAM) used: {self.get_max_memory_used_mb()} MB") - def _cpu_memory(self, interval: float = 0.0001): + def _cpu_memory(self, interval: float = 0.001): child_connection, parent_connection = Pipe() - # instantiate process memory_process = Process( - target=monitor_process_peak_memory, + target=monitor_cpu_max_ram_memory, args=(os.getpid(), child_connection, interval), daemon=True, ) memory_process.start() - parent_connection.recv() + parent_connection.recv() # wait for memory process to be ready yield - parent_connection.send(0) + parent_connection.send(True) self.max_memory_used = parent_connection.recv() - LOGGER.debug(f"Peak memory usage: {self.get_max_memory_used()} MB") + LOGGER.debug(f"\t+ Max memory (RAM) used: {self.get_max_memory_used_mb()} MB") + def get_max_memory_used_mb(self) -> int: + return bytes_to_mega_bytes(self.max_memory_used) + + def get_max_memory_allocated_mb(self) -> int: + return bytes_to_mega_bytes(self.max_memory_allocated) -def monitor_process_peak_memory(process_id: int, connection: Connection, interval: float): + def get_max_memory_reserved_mb(self) -> int: + return bytes_to_mega_bytes(self.max_memory_reserved) + + def get_memories_dict(self) -> Dict[str, int]: + if self.device == "cuda" and self.backend == "pytorch": + return { + "max_vram_used(MB)": self.get_max_memory_used_mb(), + "max_memory_reserved(MB)": self.get_max_memory_reserved_mb(), + "max_memory_allocated(MB)": self.get_max_memory_allocated_mb(), + } + elif self.device == "cuda": + return {"max_vram_used(MB)": self.get_max_memory_used_mb()} + else: + return {"max_ram_used(MB)": self.get_max_memory_used_mb()} + + +def monitor_cpu_max_ram_memory(process_id: int, connection: Connection, interval: float): process = psutil.Process(process_id) - peak_memory_usage = 0 + max_memory_usage = 0 connection.send(0) stop = False while not stop: meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" current_memory_usage = getattr(process, meminfo_attr)()[0] - peak_memory_usage = max(peak_memory_usage, current_memory_usage) + max_memory_usage = max(max_memory_usage, current_memory_usage) stop = connection.poll(interval) - connection.send(peak_memory_usage) + connection.send(max_memory_usage) connection.close() -def infer_cuda_device_ids() -> List[int]: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - cuda_device_ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(","))) +def monitor_gpu_max_vram_memory(device_ids: List[int], connection: Connection, interval: float): + if is_nvidia_system() and 
is_py3nvml_available(): + nvml.nvmlInit() + handles = [nvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids] + max_memory_usage = 0 + connection.send(0) + stop = False + + while not stop: + current_memory_usage = sum(nvml.nvmlDeviceGetMemoryInfo(handle).used for handle in handles) + max_memory_usage = max(max_memory_usage, current_memory_usage) + stop = connection.poll(interval) + + connection.send(max_memory_usage) + nvml.nvmlShutdown() + connection.close() + elif is_rocm_system() and is_pyrsmi_available(): + rocml.smi_initialize() + max_memory_usage = 0 + connection.send(0) + stop = False + + while not stop: + current_memory_usage = sum(rocml.smi_get_device_memory_used(device_id) for device_id in device_ids) + max_memory_usage = max(max_memory_usage, current_memory_usage) + stop = connection.poll(interval) + + connection.send(max_memory_usage) + rocml.smi_shutdown() + connection.close() else: - if is_nvidia_system() and is_py3nvml_available(): - nvml.nvmlInit() - cuda_device_ids = list(range(nvml.nvmlDeviceGetCount())) - nvml.nvmlShutdown() - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() - cuda_device_ids = list(range(rocml.smi_get_device_count())) - rocml.smi_shutdown() - else: - raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.") - - return cuda_device_ids + raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.") diff --git a/setup.py b/setup.py index 7b618ed4..40504fd3 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ # docker-based backends "text-generation-inference": ["docker"], # specific settings + "codecarbon": ["codecarbon"], "deepspeed": ["deepspeed"], "diffusers": ["diffusers"], "timm": ["timm"], diff --git a/tests/configs/_base_.yaml b/tests/configs/_base_.yaml index ff50aa22..d983b841 100644 --- a/tests/configs/_base_.yaml +++ b/tests/configs/_base_.yaml @@ -2,24 +2,27 @@ defaults: - launcher: process # isolated process launcher - experiment # inheriting experiment schema - _self_ # for hydra 1.1 compatibility - # - override hydra/hydra_logging: colorlog # colorful logging - # - override hydra/job_logging: colorlog # colorful logging + - override hydra/hydra_logging: colorlog # colorful logging + - override hydra/job_logging: colorlog # colorful logging - override hydra/launcher: joblib # for parallelization experiment_name: ${device}_${benchmark.name}_${backend.name}_${task} +# hydra/cli specific settings hydra: run: - dir: tests/experiments/${experiment_name} + # where to store run results + dir: tests/runs/${experiment_name} sweep: - dir: tests/experiments/${experiment_name} + # where to store sweep results + dir: tests/sweeps/${experiment_name} job: + # change working directory to the run directory chdir: true env_set: - OVERRIDE_BENCHMARKS: 1 # to not skip if results already exist - CUDA_VISIBLE_DEVICES: 0 # by default we only use one GPU - CUDA_DEVICE_ORDER: PCI_BUS_ID # laking we use the right GPU - + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before + OVERRIDE_BENCHMARKS: 1 # we are using joblib launcher to parallelize testing since # we're having ccorrect benchmarks is not important while testing diff --git a/tests/configs/_ddp_.yaml b/tests/configs/_ddp_.yaml index a5a946fc..aab449e4 100644 --- a/tests/configs/_ddp_.yaml +++ b/tests/configs/_ddp_.yaml @@ -4,7 +4,5 @@ defaults: launcher: nproc_per_node: 2 -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 +backend: + device_ids: 0,1 diff 
--git a/tests/configs/_dp_.yaml b/tests/configs/_dp_.yaml index 4d6528f6..b7578bdf 100644 --- a/tests/configs/_dp_.yaml +++ b/tests/configs/_dp_.yaml @@ -1,4 +1,2 @@ -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 +backend: + device_ids: 0,1 diff --git a/tests/configs/_ds_tp_.yaml b/tests/configs/_ds_tp_.yaml index 76608e2e..6c154e4f 100644 --- a/tests/configs/_ds_tp_.yaml +++ b/tests/configs/_ds_tp_.yaml @@ -5,12 +5,8 @@ launcher: nproc_per_node: 2 backend: + device_ids: 0,1 deepspeed_inference: true deepspeed_inference_config: tensor_parallel: tp_size: 2 - -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 diff --git a/tests/configs/_lm_naive_mp_.yaml b/tests/configs/_lm_naive_mp_.yaml index 20aef92a..2ac16fb8 100644 --- a/tests/configs/_lm_naive_mp_.yaml +++ b/tests/configs/_lm_naive_mp_.yaml @@ -1,10 +1,6 @@ backend: - model: gpt2 + device_ids: 0,1 + device_map: auto task: text-generation library: transformers - device_map: auto - -hydra: - job: - env_set: - CUDA_VISIBLE_DEVICES: 0,1 + model: gpt2 diff --git a/tests/test_api.py b/tests/test_api.py index f388e629..0bf6ced9 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -6,7 +6,6 @@ from optimum_benchmark.trackers.memory import MemoryTracker from optimum_benchmark.trackers.latency import LatencyTracker -from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS from optimum_benchmark.experiment import ExperimentConfig, launch from optimum_benchmark.launchers.inline.config import InlineConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig @@ -18,14 +17,13 @@ from optimum_benchmark.benchmarks.training.config import TrainingConfig from optimum_benchmark.benchmarks.inference.config import InferenceConfig from optimum_benchmark.generators.dataset_generator import DatasetGenerator +from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS +from optimum_benchmark.backends.timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config from optimum_benchmark.backends.transformers_utils import ( extract_transformers_shapes_from_artifacts, get_transformers_pretrained_config, ) -from optimum_benchmark.backends.timm_utils import ( - extract_timm_shapes_from_config, - get_timm_pretrained_config, -) + LOGGER = getLogger("test-api") @@ -45,8 +43,15 @@ ("transformers", "image-classification", "google/vit-base-patch16-224"), ("transformers", "semantic-segmentation", "google/vit-base-patch16-224"), ] -BENCHMARK_CONFIGS = [InferenceConfig(memory=True), TrainingConfig()] -LAUNCHER_CONFIGS = [InlineConfig(), ProcessConfig(), TorchrunConfig(nproc_per_node=2)] +BENCHMARK_CONFIGS = [ + InferenceConfig(latency=True, memory=True), + TrainingConfig(latency=True, memory=True), +] +LAUNCHER_CONFIGS = [ + TorchrunConfig(nproc_per_node=2, device_isolation=False), + ProcessConfig(device_isolation=False), + InlineConfig(device_isolation=False), +] @pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) @@ -58,11 +63,11 @@ def test_api_latency_tracker(device, backend): with tracker.track(): time.sleep(1) - measured_latencies = tracker.get_latencies() + latencies_list = tracker.get_latencies_list() - assert len(measured_latencies) == 2 - assert measured_latencies[0] > expected_latency * 0.9 - assert measured_latencies[0] < expected_latency * 1.1 + assert len(latencies_list) == 2 + assert latencies_list[0] > expected_latency * 0.9 + assert latencies_list[0] < expected_latency * 1.1 @pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) @@ -74,18 +79,18 @@ def 
test_api_memory_tracker(device, backend): # the process consumes memory that we can't control if backend == "pytorch": - initial_process_memory = tracker.get_max_memory_allocated() + initial_process_memory = tracker.get_max_memory_allocated_mb() else: - initial_process_memory = tracker.get_max_memory_used() + initial_process_memory = tracker.get_max_memory_used_mb() with tracker.track(): array = torch.ones((10000, 10000), dtype=torch.float64, device=device) expected_memory = array.nbytes / 1e6 # around 800 MB if backend == "pytorch": - final_process_memory = tracker.get_max_memory_allocated() + final_process_memory = tracker.get_max_memory_allocated_mb() else: - final_process_memory = tracker.get_max_memory_used() + final_process_memory = tracker.get_max_memory_used_mb() measured_memory = final_process_memory - initial_process_memory @@ -96,11 +101,11 @@ def test_api_memory_tracker(device, backend): @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS) def test_api_input_generator(library, task, model): if library == "transformers": - model_config = get_transformers_pretrained_config(model=model) - model_shapes = extract_transformers_shapes_from_artifacts(config=model_config) + model_config = get_transformers_pretrained_config(model) + model_shapes = extract_transformers_shapes_from_artifacts(model_config) elif library == "timm": model_config = get_timm_pretrained_config(model) - model_shapes = extract_timm_shapes_from_config(config=model_config) + model_shapes = extract_timm_shapes_from_config(model_config) else: raise ValueError(f"Unknown library {library}") @@ -110,9 +115,13 @@ def test_api_input_generator(library, task, model): model_shapes=model_shapes, ) - _ = generator.generate(mode="forward") if task in TEXT_GENERATION_TASKS: - _ = generator.generate(mode="generate") + _ = generator(mode="forward") + _ = generator(mode="generate") + elif task in IMAGE_DIFFUSION_TASKS: + _ = generator(mode="call") + else: + _ = generator(mode="forward") @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS) @@ -132,28 +141,15 @@ def test_api_dataset_generator(library, task, model): model_shapes=model_shapes, ) - _ = generator.generate() - - -@pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) -def test_api_launchers(launcher_config): - backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cpu") - benchmark_config = InferenceConfig(memory=True) - experiment_config = ExperimentConfig( - experiment_name="api-launch-experiment", - benchmark=benchmark_config, - launcher=launcher_config, - backend=backend_config, - ) - _ = launch(experiment_config) + _ = generator() @pytest.mark.parametrize("benchmark_config", BENCHMARK_CONFIGS) -def test_api_benchmarks(benchmark_config): - backend_config = PyTorchConfig(model="gpt2", no_weights=True, device="cpu") - launcher_config = ProcessConfig() +@pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) +def test_api_launch_cpu(benchmark_config, launcher_config): + backend_config = PyTorchConfig(model="bert-base-uncased", no_weights=True, device="cpu") experiment_config = ExperimentConfig( - experiment_name="api-benchmark-experiment", + experiment_name="", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config, diff --git a/tests/test_cli.py b/tests/test_cli.py index b48283e1..afae3609 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,7 +3,7 @@ import pytest -from optimum_benchmark.logging_utils import run_process_and_log_stream_output +from optimum_benchmark.logging_utils 
import run_subprocess_and_log_stream_output LOGGER = getLogger("test-cli") @@ -26,7 +26,7 @@ def test_cli_configs(config_name): "--multirun", ] - popen = run_process_and_log_stream_output(LOGGER, args) + popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}" @@ -42,7 +42,7 @@ def test_cli_exit_code(): "backend.model=bert-base-uncased", ] - popen_0 = run_process_and_log_stream_output(LOGGER, args_0) + popen_0 = run_subprocess_and_log_stream_output(LOGGER, args_0) assert popen_0.returncode == 0 args_1 = [ @@ -56,5 +56,5 @@ def test_cli_exit_code(): "backend.model=bert-base-uncased", ] - popen_1 = run_process_and_log_stream_output(LOGGER, args_1) + popen_1 = run_subprocess_and_log_stream_output(LOGGER, args_1) assert popen_1.returncode == 1