Added test llama-2-7b with GPTQ quant. scheme #141

Merged · 9 commits · Mar 21, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_pytorch.yaml
@@ -49,4 +49,4 @@ jobs:
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
           opt-bench-cuda:${{ matrix.image.cuda_version }}
-          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,bitsandbytes,autoawq] && pytest -k 'cli and cuda and pytorch' -x"
+          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,bitsandbytes,autoawq,auto-gptq-${{ matrix.image.torch_cuda }}] && pytest -k 'cli and cuda and pytorch' -x"
3 changes: 2 additions & 1 deletion .github/workflows/test_cli_rocm_pytorch.yaml
@@ -1,3 +1,4 @@
+
 name: CLI ROCm Pytorch Tests

 on:
@@ -51,4 +52,4 @@ jobs:
           --device /dev/dri/renderD129
           --entrypoint /bin/bash
           opt-bench-rocm:${{ matrix.image.rocm_version }}
-          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq] && pytest -k 'cli and cuda and pytorch and not bnb' -x"
+          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq] && pytest -k 'cli and cuda and pytorch and not bnb and not gptq' -x"
4 changes: 2 additions & 2 deletions Makefile
@@ -15,7 +15,7 @@ CLI_MISC_REQS := testing

 CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers
 CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers
-CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,bitsandbytes,autoawq
+CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,bitsandbytes,autoawq,auto-gptq-cu118
 CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,autoawq
 CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers
 CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft
@@ -143,7 +143,7 @@ test_cli_cuda_pytorch:
 	$(call test_nvidia,cuda,$(CLI_CUDA_PYTORCH_REQS),cli and cuda and pytorch)

 test_cli_rocm_pytorch:
-	$(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft and not bnb)
+	$(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft and not bnb and not gptq)

 test_cli_cuda_onnxruntime:
 	$(call test_nvidia,cuda,$(CLI_CUDA_ONNXRUNTIME_REQS),cli and cuda and onnxruntime)
5 changes: 5 additions & 0 deletions setup.py
@@ -70,6 +70,11 @@
     "peft": ["peft"],
     "autoawq": ["autoawq@git+https://github.com/casper-hansen/AutoAWQ.git"],
     "bitsandbytes": ["bitsandbytes"],
+    "auto-gptq-cu118": [
+        "optimum",
+        "auto-gptq@https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.7.1%2Bcu118-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+    ],
+    "auto-gptq-cu121": ["optimum", "auto-gptq"],
 }


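Note: two extras appear to be needed because the auto-gptq build served from PyPI presumably targets the default CUDA 12.1 torch, while the cu118 build has to be pinned from the autogptq wheel index via a direct URL. A hypothetical helper (not part of this PR) for picking the matching extra locally:

```python
# Hypothetical helper: pick the extra matching the local torch CUDA build.
# Not part of the PR; it only illustrates why two extras exist.
import torch

cuda = torch.version.cuda or ""  # e.g. "11.8" or "12.1"
extra = "auto-gptq-cu118" if cuda.startswith("11.8") else "auto-gptq-cu121"
print(f'pip install -e ".[{extra}]"')
```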
21 changes: 21 additions & 0 deletions tests/configs/cuda_inference_pytorch_llama_gptq.yaml
@@ -0,0 +1,21 @@
+defaults:
+  - backend: pytorch
+  # order of inheritance, last one overrides previous ones
+  - _base_ # inherits from base config
+  - _inference_ # inherits from inference config
+  - _cuda_ # inherits from cuda config
+  - _self_ # hydra 1.1 compatibility
+
+experiment_name: cuda_inference_pytorch_gptq
+
+backend:
+  model: TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ
+  quantization_config:
+    exllama_config:
+      version: 2
+
+# hydra/cli specific settings
+hydra:
+  sweeper:
+    params:
+      backend.no_weights: false,true
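Note: the checkpoint is already quantized, so the config only overrides which kernels serve the quantized weights. A sketch of roughly what the pytorch backend is expected to do with `quantization_config`, assuming it is forwarded as a transformers `GPTQConfig` (not verified against the backend code):

```python
# Sketch, assuming the backend forwards quantization_config as a GPTQConfig.
from transformers import AutoModelForCausalLM, GPTQConfig

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
    device_map="cuda:0",
    # version: 2 selects the exllamav2 kernels for the prequantized weights
    quantization_config=GPTQConfig(bits=4, exllama_config={"version": 2}),
)
```

The sweeper params then run the experiment twice, once with `backend.no_weights=false` and once with `true`.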