
Commit

awq+exllama_example
IlyasMoutawwakil committed Feb 23, 2024
1 parent 6836d1e commit 584b59f
Showing 3 changed files with 45 additions and 30 deletions.
20 changes: 0 additions & 20 deletions examples/api_launch.py

This file was deleted.

32 changes: 32 additions & 0 deletions examples/pytorch_awq_exllama.py
@@ -0,0 +1,32 @@
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig
from optimum_benchmark.experiment import ExperimentConfig, launch
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.logging_utils import setup_logging

if __name__ == "__main__":
    setup_logging(level="INFO")
    launcher_config = ProcessConfig(device_isolation=False)
    benchmark_config = InferenceConfig(
        memory=True,
        latency=True,
        input_shapes={"batch_size": 4, "sequence_length": 128},
        generate_kwargs={"max_new_tokens": 128, "min_new_tokens": 128},
    )
    backend_config = PyTorchConfig(
        model="TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
        device="cuda",
        device_ids="0",
        no_weights=True,
        quantization_scheme="awq",
        quantization_config={"version": "exllama"},
    )
    experiment_config = ExperimentConfig(
        experiment_name="awq-exllamav2",
        benchmark=benchmark_config,
        launcher=launcher_config,
        backend=backend_config,
    )
    benchmark_report = launch(experiment_config)
    experiment_config.push_to_hub("IlyasMoutawwakil/awq-benchmarks")
    benchmark_report.push_to_hub("IlyasMoutawwakil/awq-benchmarks")
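For orientation, the backend config above amounts, roughly, to loading the AWQ checkpoint with the exllama kernels enabled. The sketch below is illustrative only — it is not optimum-benchmark's actual code path — and assumes a transformers release whose AwqConfig accepts version="exllama":

# Illustrative sketch, not the backend's internals: roughly what
# quantization_scheme="awq" + quantization_config={"version": "exllama"}
# selects at the transformers level (assumes AwqConfig supports "exllama").
from transformers import AutoModelForCausalLM, AwqConfig

awq_config = AwqConfig(version="exllama")
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
    quantization_config=awq_config,
    device_map="cuda:0",
)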
23 changes: 13 additions & 10 deletions optimum_benchmark/backends/pytorch/backend.py
@@ -157,13 +157,10 @@ def load_model_from_pretrained(self) -> None:
LOGGER.info("\t+ Loading Quantized model")
self.pretrained_model = self.automodel_class.from_pretrained(
pretrained_model_name_or_path=self.config.model,
device_map=self.config.device_map,
device_map=self.config.device_map or torch.device(self.config.device),
**self.config.hub_kwargs,
**self.automodel_kwargs,
)
if self.config.device_map is None and self.config.device != "cpu":
LOGGER.info(f"\t+ Moving model to device: {self.config.device}")
self.pretrained_model.to(self.config.device)
elif self.config.device_map is not None:
# we can't use device context manager since device_map is specified
LOGGER.info(f"\t+ Loading model with device map: {self.config.device_map}")
@@ -268,13 +265,19 @@ def is_awq_quantized(self) -> bool:

     @property
     def is_exllamav2(self) -> bool:
-        dummy_exllama = {"exllama_version": None}
         return (self.is_gptq_quantized or self.is_awq_quantized) and (
-            getattr(self.quantization_config, "exllama_config", dummy_exllama)["exllama_version"]
-            or getattr(self.pretrained_config, "quantization_config", {}).get("exllama_config", dummy_exllama)[
-                "exllama_version"
-            ]
-        ) == 2
+            (
+                hasattr(self.pretrained_config, "quantization_config")
+                and hasattr(self.pretrained_config.quantization_config, "exllama_config")
+                and "exllama_version" in self.pretrained_config.quantization_config.exllama_config
+                and self.pretrained_config.quantization_config.exllama_config["exllama_version"] == 2
+            )
+            or (
+                hasattr(self.quantization_config, "exllama_config")
+                and "exllama_version" in self.quantization_config.exllama_config
+                and self.quantization_config.exllama_config["exllama_version"] == 2
+            )
+        )

     @property
     def automodel_kwargs(self) -> Dict[str, Any]:
