This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit 397d661
support layerwise
Signed-off-by: changwangss <[email protected]>
changwangss committed Jun 19, 2024
1 parent 9a5a578 commit 397d661
Showing 2 changed files with 16 additions and 7 deletions.
@@ -25,6 +25,7 @@
 from datasets import load_dataset
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.utils.pytorch import load
 from neural_compressor.utils.utility import LazyImport
 from neural_compressor.config import PostTrainingQuantConfig
 from intel_extension_for_transformers.tools.utils import (
@@ -583,6 +584,10 @@ def default_calib_func(model):
     inc_model = quantization.fit(
         model, conf, calib_func=calib_func, calib_dataloader=calib_dataloader
     )
+    if config.layer_wise:
+        inc_model.save("./tmp")
+        inc_model = load("./tmp", model, weight_only=True, layer_wise=True)
+        return inc_model.eval()
     inc_model.eval()
 
     if device == "xpu" or device == torch.device("xpu"):
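For context: in the layer-wise path the quantized model is usable only after a save/reload round trip, which is why the new branch returns early instead of falling through to inc_model.eval(). Below is a minimal sketch of that flow. The layer_wise_quant recipe key and the example model are assumptions for illustration; only the save()/load() calls mirror the hunk above.

# Minimal sketch of the save/reload round trip added in this hunk.
from transformers import AutoModelForCausalLM
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.utils.pytorch import load

# Example model, not from the commit; any causal LM follows the same path.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# Assumed config: data-free RTN weight-only quantization with the layer-wise
# recipe enabled, so no calibration dataloader is needed.
conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={"layer_wise_quant": True},
)
inc_model = quantization.fit(model, conf)

# Layer-wise results are serialized to disk first, then reloaded into the
# original module graph, exactly as the new branch does.
inc_model.save("./tmp")
inc_model = load("./tmp", model, weight_only=True, layer_wise=True)
inc_model.eval()

The second changed file (below) threads the same layer_wise flag into from_pretrained so that the FP32 weights are never fully materialized.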
@@ -74,6 +74,7 @@
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model
 from neural_compressor.model.torch_model import PyTorchFXModel
 from threading import Thread
 from transformers.configuration_utils import PretrainedConfig
@@ -778,13 +779,16 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
         if quantization_config.quant_method.value in ["teq", "awq"]
         else False
     )
-    model = cls.ORIG_MODEL.from_pretrained(
-        pretrained_model_name_or_path,
-        *model_args,
-        config=config,
-        **kwargs,
-    )
-    model.config.update({"low_cpu_mem_usage": True})
+    if quantization_config.layer_wise:
+        model = load_empty_model(pretrained_model_name_or_path, torchscript=True)
+    else:
+        model = cls.ORIG_MODEL.from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            **kwargs,
+        )
+        model.config.update({"low_cpu_mem_usage": True})
     model.eval()
 
     if use_xpu:
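The branch above swaps from_pretrained for load_empty_model when layer_wise is set, so the model skeleton is built without allocating real weight tensors. A minimal sketch of what that buys, assuming load_empty_model initializes parameters on the meta device via accelerate (the model name is illustrative, not from the commit):

# Minimal sketch: build a weightless skeleton instead of a full FP32 model.
# Assumption: parameters land on the "meta" device, so host memory stays
# flat regardless of model size until real tensors are streamed in.
from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model

model = load_empty_model("facebook/opt-125m", torchscript=True)  # example name
model.eval()

# No real storage has been allocated yet; the layer-wise quantizer loads
# tensors one layer at a time during quantization.
print(next(model.parameters()).device)  # expected: meta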
