fix ort inputs filtering
IlyasMoutawwakil committed Feb 19, 2024
1 parent 06dab18 commit 2fe3d85
Showing 11 changed files with 67 additions and 82 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -168,6 +168,7 @@ data/
version.txt

.engine/
actions-runner-duplicate/
actions-runner/
experiments/
amdsmi/
10 changes: 10 additions & 0 deletions Makefile
@@ -28,6 +28,7 @@ build_docker_rocm:
test_cli_cpu_neural_compressor:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -36,6 +37,7 @@ test_cli_cpu_neural_compressor:
test_cli_cpu_onnxruntime:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -44,6 +46,7 @@ test_cli_cpu_onnxruntime:
test_cli_cpu_openvino:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -52,6 +55,7 @@ test_cli_cpu_openvino:
test_cli_cpu_pytorch:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -60,6 +64,7 @@ test_cli_cpu_pytorch:
test_cli_rocm_pytorch:
docker run \
--rm \
--pid=host \
--device=/dev/kfd \
--device /dev/dri/renderD128 \
--device /dev/dri/renderD129 \
@@ -72,6 +77,7 @@ test_cli_rocm_pytorch:
test_cli_cuda_pytorch:
docker run \
--rm \
--pid=host \
--gpus '"device=0,1"' \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
@@ -81,6 +87,7 @@ test_cli_cuda_pytorch:
test_api_cpu:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -89,6 +96,7 @@ test_api_cpu:
test_api_cuda:
docker run \
--rm \
--pid=host \
--gpus '"device=0,1"' \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
@@ -98,6 +106,7 @@ test_api_cuda:
test_api_rocm:
docker run \
--rm \
--pid=host \
--device=/dev/kfd \
--device /dev/dri/renderD128 \
--device /dev/dri/renderD129 \
@@ -110,6 +119,7 @@ test_api_rocm:
test_api_misc:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
4 changes: 2 additions & 2 deletions optimum_benchmark/backends/base.py
@@ -10,7 +10,7 @@

from ..task_utils import get_automodel_class_for_task
from .config import BackendConfigT
from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config
from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config
from .timm_utils import extract_timm_shapes_from_config, get_timm_pre_processor, get_timm_pretrained_config
from .transformers_utils import (
PretrainedProcessor,
@@ -41,7 +41,7 @@ def __init__(self, config: BackendConfigT):

if self.config.library == "diffusers":
self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs)
self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs)
self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.hub_kwargs)
self.model_type = self.config.task
self.generation_config = None
self.pre_processor = None
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/diffusers_utils.py
@@ -12,7 +12,7 @@ def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]:
return diffusers.DiffusionPipeline.load_config(model, **kwargs)


def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]:
def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
config = diffusers.DiffusionPipeline.load_config(model, **kwargs)

shapes = {}
3 changes: 2 additions & 1 deletion optimum_benchmark/backends/onnxruntime/backend.py
@@ -332,13 +332,14 @@ def prepare_for_inference(self, **kwargs) -> None:

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
if self.config.library == "diffusers":
return {"prompt": inputs["prompt"]}
return inputs

LOGGER.info(f"\t+ Moving inputs tensors to device {self.config.device}")
for key, value in list(inputs.items()):
if key in self.inputs_names:
inputs[key] = value.to(self.config.device)
else:
LOGGER.warning(f"Input {key} is not in expected inputs names. Removing it.")
inputs.pop(key)

return inputs
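Note: this is the fix the commit title refers to. For non-diffusers models, generated inputs are now filtered against self.inputs_names (presumably the input names expected by the exported ONNX model), and anything unexpected is dropped with a warning instead of being forwarded to ONNX Runtime; diffusers pipelines now receive their inputs unchanged rather than being reduced to the prompt. A rough standalone sketch of the filtering behavior, assuming torch tensors as values (the helper name and the token_type_ids example below are illustrative, not taken from the diff):

from typing import Any, Dict

import torch

def filter_and_move_inputs(inputs: Dict[str, Any], expected_names: set, device: str) -> Dict[str, Any]:
    for key, value in list(inputs.items()):
        if key in expected_names:
            inputs[key] = value.to(device)  # expected tensor: move it to the execution device
        else:
            inputs.pop(key)  # unexpected key (e.g. token_type_ids absent from the ONNX graph): drop it
    return inputs

# usage with made-up inputs:
dummy = {"input_ids": torch.ones(1, 8, dtype=torch.long), "token_type_ids": torch.zeros(1, 8, dtype=torch.long)}
print(filter_and_move_inputs(dummy, expected_names={"input_ids"}, device="cpu").keys())  # dict_keys(['input_ids'])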
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/transformers_utils.py
@@ -42,7 +42,7 @@ def get_transformers_pre_processor(model: str, **kwargs) -> Optional["Pretrained
try:
# sometimes contains information about the model's input shapes that are not available in the config
return AutoProcessor.from_pretrained(model, **kwargs)
except ValueError:
except Exception:
return None


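Note: broadening the catch from ValueError to Exception lets any failure to build a pre-processor degrade to None instead of aborting backend setup; repositories without processor or tokenizer files can raise other exception types (an OSError, for instance), which the old clause did not cover. A minimal sketch of the resulting behavior, with a hypothetical model id:

from typing import Optional

from transformers import AutoProcessor

def try_get_pre_processor(model: str, **kwargs) -> Optional[object]:
    try:
        # sometimes carries input-shape information that the model config lacks
        return AutoProcessor.from_pretrained(model, **kwargs)
    except Exception:
        return None  # no usable pre-processor; the benchmark can still run without one

# e.g. try_get_pre_processor("some-org/model-without-processor-files") now returns None
# no matter which exception type from_pretrained raises.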
61 changes: 30 additions & 31 deletions optimum_benchmark/benchmarks/inference/benchmark.py
@@ -72,8 +72,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
"The batch size must be divisible by the number of processes in a distributed environment"
)
self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size()
if backend.config.device == "cuda" and backend.config.task in TEXT_GENERATION_TASKS:
TEXT_GENERATION_TASKS["synced_gpus"] = True

LOGGER.info("\t+ Creating input generator")
self.input_generator = InputGenerator(
@@ -82,47 +80,46 @@

if backend.config.task in TEXT_GENERATION_TASKS:
LOGGER.info("\t+ Generating and preparing Text Generation input")
self.forward_inputs = self.input_generator(mode="forward")
self.generate_input = self.input_generator(mode="generate")
self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
self.generate_input = backend.prepare_inputs(self.generate_input)
self.text_generation_inputs = self.input_generator()
self.text_generation_inputs = backend.prepare_inputs(self.text_generation_inputs)
LOGGER.info("\t+ Updating Text Generation kwargs with default values")
self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs}
LOGGER.info("\t+ Initializing Text Generation report")
self.report = TextGenerationReport(prefill=BenchmarkMeasurements(), decode=BenchmarkMeasurements())

elif backend.config.task in IMAGE_DIFFUSION_TASKS:
LOGGER.info("\t+ Generating and preparing Image Diffusion input")
self.diffuse_input = self.input_generator(mode="call")
self.diffuse_input = backend.prepare_inputs(self.diffuse_input)
self.image_diffusion_inputs = self.input_generator()
self.image_diffusion_inputs = backend.prepare_inputs(self.image_diffusion_inputs)
LOGGER.info("\t+ Updating Image Diffusion kwargs with default values")
self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs}
self.config.call_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.call_kwargs}
LOGGER.info("\t+ Initializing Image Diffusion report")
self.report = ImageDiffusionReport(call=BenchmarkMeasurements())

else:
LOGGER.info("\t+ Generating and preparing Inference input")
self.forward_inputs = self.input_generator(mode="forward")
self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
self.inference_inputs = self.input_generator()
self.inference_inputs = backend.prepare_inputs(self.inference_inputs)
LOGGER.info("\t+ Initializing Inference report")
self.report = InferenceReport(forward=BenchmarkMeasurements())

LOGGER.info("\t+ Preparing backend for Inference")
backend.prepare_for_inference(
**backend.model_shapes,
**self.config.input_shapes,
**self.config.forward_kwargs,
**self.config.generate_kwargs,
**self.config.forward_kwargs,
**self.config.call_kwargs,
)

LOGGER.info("\t+ Warming up backend for Inference")
for _ in range(self.config.warmup_runs):
if backend.config.task in TEXT_GENERATION_TASKS:
_ = backend.generate(self.generate_input, {"max_new_tokens": 2, "min_new_tokens": 2})
_ = backend.generate(self.text_generation_inputs, {"max_new_tokens": 2, "min_new_tokens": 2})
elif backend.config.task in IMAGE_DIFFUSION_TASKS:
_ = backend.call(self.diffuse_input, {"num_inference_steps": 2})
_ = backend.call(self.image_diffusion_inputs, {"num_inference_steps": 2})
else:
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.inference_inputs, self.config.forward_kwargs)

if self.config.memory:
LOGGER.info("\t+ Creating inference memory tracker")
@@ -164,36 +161,34 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
self.report.log_energy()
self.report.log_efficiency()

self.report.log()

## Memory tracking
def run_text_generation_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.text_generation_inputs, self.config.forward_kwargs)

self.report.prefill.memory = self.memory_tracker.get_max_memory()

self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.text_generation_inputs, self.config.generate_kwargs)

self.report.decode.memory = self.memory_tracker.get_max_memory()

def run_image_diffusion_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.image_diffusion_inputs, self.config.forward_kwargs)

self.report.call.memory = self.memory_tracker.get_max_memory()

def run_inference_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.inference_inputs, self.config.forward_kwargs)

self.report.forward.memory = self.memory_tracker.get_max_memory()

@@ -203,19 +198,23 @@ def run_text_generation_latency_tracking(self, backend: Backend):
self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.text_generation_inputs, self.config.forward_kwargs)

self.report.prefill.latency = self.latency_tracker.get_latency()
forward_latency = self.latency_tracker.get_latency()
forward_latency.log(prefix="forward")
self.report.prefill.latency = forward_latency
self.report.prefill.throughput = self.latency_tracker.get_throughput(
volume=self.prefill_volume, unit=PREFILL_THROUGHPUT_UNIT
)

self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.text_generation_inputs, self.config.generate_kwargs)

self.report.decode.latency = self.latency_tracker.get_latency() - self.report.prefill.latency.mean
generate_latency = self.latency_tracker.get_latency()
generate_latency.log(prefix="generate")
self.report.decode.latency = generate_latency - self.report.prefill.latency.mean
self.report.decode.throughput = Throughput.from_latency(
self.report.decode.latency, self.decode_volume, unit=DECODE_THROUGHPUT_UNIT
)
@@ -225,7 +224,7 @@ def run_image_diffusion_latency_tracking(self, backend: Backend):
self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.image_diffusion_inputs, self.config.forward_kwargs)

self.report.call.latency = self.latency_tracker.get_latency()
self.report.call.throughput = Throughput.from_latency(
@@ -237,7 +236,7 @@ def run_latency_inference_tracking(self, backend: Backend):
self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.inference_inputs, self.config.forward_kwargs)

self.report.forward.latency = self.latency_tracker.get_latency()
self.report.forward.throughput = Throughput.from_latency(
@@ -249,7 +248,7 @@ def run_text_generation_energy_tracking(self, backend: Backend):
LOGGER.info("\t+ Running energy tracking")
self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.text_generation_inputs, self.config.forward_kwargs)

self.report.prefill.energy = self.energy_tracker.get_energy()
self.report.prefill.efficiency = Efficiency.from_energy(
@@ -258,7 +257,7 @@ def run_text_generation_energy_tracking(self, backend: Backend):

self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.text_generation_inputs, self.config.generate_kwargs)

self.report.decode.energy = self.energy_tracker.get_energy() - self.report.prefill.energy
self.report.decode.efficiency = Efficiency.from_energy(
@@ -269,7 +268,7 @@ def run_image_diffusion_energy_tracking(self, backend: Backend):
LOGGER.info("\t+ Running energy tracking")
self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.image_diffusion_inputs, self.config.forward_kwargs)

self.report.call.energy = self.energy_tracker.get_energy()
self.report.call.efficiency = Efficiency.from_energy(
@@ -280,7 +279,7 @@ def run_inference_energy_tracking(self, backend: Backend):
LOGGER.info("\t+ Running energy tracking")
self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)
_ = backend.forward(self.inference_inputs, self.config.forward_kwargs)

self.report.forward.energy = self.energy_tracker.get_energy()
self.report.forward.efficiency = Efficiency.from_energy(
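Note: with the mode-specific inputs gone, each workload now builds a single input set, prepares it once through backend.prepare_inputs, and reuses it for warmup, memory, latency and energy tracking; decode latency is still obtained by subtracting the mean prefill (forward) latency from the measured generate latency. A small self-contained illustration of that subtraction, with made-up timings:

# generate() spans prefill plus decoding, so the mean forward (prefill) latency is
# subtracted from the generate measurements to isolate decoding time (values are made up):
prefill_latencies = [0.050, 0.052, 0.048]    # seconds, measured around backend.forward(...)
generate_latencies = [0.550, 0.560, 0.545]   # seconds, measured around backend.generate(...)

prefill_mean = sum(prefill_latencies) / len(prefill_latencies)
decode_latencies = [t - prefill_mean for t in generate_latencies]
print([round(t, 3) for t in decode_latencies])  # roughly 0.5 s of pure decoding per call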
19 changes: 1 addition & 18 deletions optimum_benchmark/generators/input_generator.py
@@ -22,23 +22,6 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s
"please submit a PR or a feature request to optimum-benchmark. "
)

def __call__(self, mode: str) -> Dict[str, Any]:
def __call__(self) -> Dict[str, Any]:
task_input = self.task_generator()

if mode == "generate":
if "pixel_values" in task_input:
# image input
task_input = {"inputs": task_input["pixel_values"]}
elif "input_values" in task_input:
# speech input
task_input = {"inputs": task_input["input_values"]}
elif "input_features" in task_input:
# waveform input
task_input = {"inputs": task_input["input_features"]}
elif "input_ids" in task_input:
# text input
task_input = {"inputs": task_input["input_ids"]}
elif mode == "call":
task_input = {"prompt": task_input["prompt"]}

return task_input
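Note: the mode argument and the mode-specific repacking into "inputs"/"prompt" keys are gone; the generator now always returns the full task input dict and callers decide how to prepare it. A usage sketch under that assumption (the task name and shape values are illustrative and may need adjusting for a real run):

from optimum_benchmark.generators.input_generator import InputGenerator

generator = InputGenerator(
    task="text-generation",
    input_shapes={"batch_size": 2, "sequence_length": 16},
    model_shapes={"vocab_size": 32000},  # illustrative; real runs pass shapes extracted from the model config
)
inputs = generator()  # a single dict of task inputs, no mode="forward"/"generate"/"call"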
14 changes: 4 additions & 10 deletions optimum_benchmark/trackers/latency.py
@@ -140,14 +140,13 @@ def _cpu_latency(self):
self.end_events.append(end)

def get_elapsed_time(self) -> float:
# we measured in cpu to not synchronize all events
# we measure it in cpu to not synchronize all events
return time.perf_counter() - self.start_time

def get_latency(self) -> Latency:
if self.backend == "pytorch" and self.device == "cuda":
# synchronize the last event to make sure it has been recorded
self.start_events[-1].synchronize()
self.end_events[-1].synchronize()
# synchronize the device to make sure all events have been recorded
torch.cuda.synchronize()

latencies_list = [
self.start_events[i].elapsed_time(self.end_events[i]) / 1e3 for i in range(len(self.start_events))
@@ -210,12 +209,7 @@ def __init__(self, device: str, backend: str):
self.reset()

def reset(self):
if self.device == "cuda" and self.backend == "pytorch":
event = torch.cuda.Event(enable_timing=True)
event.record()
self.events = [event]
else:
self.events = [time.perf_counter()]
self.events: List[Union[float, torch.cuda.Event]] = []

def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
if self.device == "cuda" and self.backend == "pytorch":
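Note: get_latency now issues a single torch.cuda.synchronize() instead of synchronizing only the last event pair, and the per-token logits-processor resets to an empty event list. A minimal sketch of the CUDA-event timing pattern this relies on (requires a CUDA device; not the tracker's actual code):

import torch

start_events, end_events = [], []
for _ in range(3):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    torch.matmul(torch.randn(512, 512, device="cuda"), torch.randn(512, 512, device="cuda"))
    end.record()
    start_events.append(start)
    end_events.append(end)

# one device-wide synchronize guarantees every queued event has actually been recorded
# before elapsed_time() is read, instead of synchronizing only the last pair
torch.cuda.synchronize()
latencies_s = [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]  # ms -> s
print(latencies_s)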
[2 more changed files not shown]
