
Commit

awq+exllama_example
IlyasMoutawwakil committed Feb 23, 2024
1 parent 6836d1e commit 584b59f
Showing 3 changed files with 45 additions and 30 deletions.
20 changes: 0 additions & 20 deletions examples/api_launch.py

This file was deleted.

32 changes: 32 additions & 0 deletions examples/pytorch_awq_exllama.py
@@ -0,0 +1,32 @@
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig
from optimum_benchmark.experiment import ExperimentConfig, launch
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.logging_utils import setup_logging

if __name__ == "__main__":
    setup_logging(level="INFO")
    launcher_config = ProcessConfig(device_isolation=False)
    benchmark_config = InferenceConfig(
        memory=True,
        latency=True,
        input_shapes={"batch_size": 4, "sequence_length": 128},
        generate_kwargs={"max_new_tokens": 128, "min_new_tokens": 128},
    )
    backend_config = PyTorchConfig(
        model="TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
        device="cuda",
        device_ids="0",
        no_weights=True,
        quantization_scheme="awq",
        quantization_config={"version": "exllama"},
    )
    experiment_config = ExperimentConfig(
        experiment_name="awq-exllamav2",
        benchmark=benchmark_config,
        launcher=launcher_config,
        backend=backend_config,
    )
    benchmark_report = launch(experiment_config)
    experiment_config.push_to_hub("IlyasMoutawwakil/awq-benchmarks")
    benchmark_report.push_to_hub("IlyasMoutawwakil/awq-benchmarks")
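For orientation, the backend config above amounts, roughly, to loading the AWQ checkpoint with the exllama kernels enabled. The sketch below is illustrative only — it is not optimum-benchmark's actual code path — and assumes a transformers release whose AwqConfig accepts version="exllama":

# Illustrative sketch, not the backend's internals: roughly what
# quantization_scheme="awq" + quantization_config={"version": "exllama"}
# selects at the transformers level (assumes AwqConfig supports "exllama").
from transformers import AutoModelForCausalLM, AwqConfig

awq_config = AwqConfig(version="exllama")
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
    quantization_config=awq_config,
    device_map="cuda:0",
)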
23 changes: 13 additions & 10 deletions optimum_benchmark/backends/pytorch/backend.py
@@ -157,13 +157,10 @@ def load_model_from_pretrained(self) -> None:
LOGGER.info("\t+ Loading Quantized model")
self.pretrained_model = self.automodel_class.from_pretrained(
pretrained_model_name_or_path=self.config.model,
device_map=self.config.device_map,
device_map=self.config.device_map or torch.device(self.config.device),
**self.config.hub_kwargs,
**self.automodel_kwargs,
)
if self.config.device_map is None and self.config.device != "cpu":
LOGGER.info(f"\t+ Moving model to device: {self.config.device}")
self.pretrained_model.to(self.config.device)
elif self.config.device_map is not None:
# we can't use device context manager since device_map is specified
LOGGER.info(f"\t+ Loading model with device map: {self.config.device_map}")
@@ -268,13 +265,19 @@ def is_awq_quantized(self) -> bool:

     @property
     def is_exllamav2(self) -> bool:
-        dummy_exllama = {"exllama_version": None}
         return (self.is_gptq_quantized or self.is_awq_quantized) and (
-            getattr(self.quantization_config, "exllama_config", dummy_exllama)["exllama_version"]
-            or getattr(self.pretrained_config, "quantization_config", {}).get("exllama_config", dummy_exllama)[
-                "exllama_version"
-            ]
-        ) == 2
+            (
+                hasattr(self.pretrained_config, "quantization_config")
+                and hasattr(self.pretrained_config.quantization_config, "exllama_config")
+                and "exllama_version" in self.pretrained_config.quantization_config.exllama_config
+                and self.pretrained_config.quantization_config.exllama_config["exllama_version"] == 2
+            )
+            or (
+                hasattr(self.quantization_config, "exllama_config")
+                and "exllama_version" in self.quantization_config.exllama_config
+                and self.quantization_config.exllama_config["exllama_version"] == 2
+            )
+        )

     @property
     def automodel_kwargs(self) -> Dict[str, Any]:
