Fix Marlin flag being set on HPU: default use_marlin to False and only evaluate it on CUDA-alike platforms

HabanaAI · Nov 25, 2024 · c94d24c
1 parent dbde4b8
commit c94d24c
Showing 1 changed file with 8 additions and 7 deletions.
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -117,16 +117,17 @@ class Fp8LinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
+        self.use_marlin = False
         if current_platform.is_cuda_alike():
             self.cutlass_fp8_supported = cutlass_fp8_supported()
 
-        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
-        # kernel for fast weight-only FP8 quantization
-        self.use_marlin = (not current_platform.has_device_capability(89)
-                           or envs.VLLM_TEST_FORCE_FP8_MARLIN)
-        # Disable marlin for rocm
-        if current_platform.is_rocm():
-            self.use_marlin = False
+            # For GPUs that lack FP8 hardware support, we can leverage the
+            # Marlin kernel for fast weight-only FP8 quantization
+            self.use_marlin = (not current_platform.has_device_capability(89)
+                            or envs.VLLM_TEST_FORCE_FP8_MARLIN)
+            # Disable marlin for rocm
+            if current_platform.is_rocm():
+                self.use_marlin = False
 
     def create_weights(
         self,