diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml
index db88fb5a..5f1ce155 100644
--- a/.github/workflows/test_cli_cuda_pytorch.yaml
+++ b/.github/workflows/test_cli_cuda_pytorch.yaml
@@ -49,4 +49,4 @@ jobs:
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
           opt-bench-cuda:${{ matrix.image.cuda_version }}
-          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,bitsandbytes,autoawq] && pytest -k 'cli and cuda and pytorch' -x"
+          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,bitsandbytes,autoawq,auto-gptq-${{ matrix.image.torch_cuda }}] && pytest -k 'cli and cuda and pytorch' -x"
\ No newline at end of file
diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml
index 1006b2e1..9531ca0d 100644
--- a/.github/workflows/test_cli_rocm_pytorch.yaml
+++ b/.github/workflows/test_cli_rocm_pytorch.yaml
@@ -1,3 +1,4 @@
+
 name: CLI ROCm Pytorch Tests
 
 on:
@@ -51,4 +52,4 @@ jobs:
           --device /dev/dri/renderD129
           --entrypoint /bin/bash
           opt-bench-rocm:${{ matrix.image.rocm_version }}
-          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq] && pytest -k 'cli and cuda and pytorch and not bnb' -x"
+          -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq] && pytest -k 'cli and cuda and pytorch and not bnb and not gptq' -x"
diff --git a/Makefile b/Makefile
index 622588ae..ccfd028d 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@ CLI_MISC_REQS := testing
 
 CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers
 CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers
-CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,bitsandbytes,autoawq
+CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,bitsandbytes,autoawq,auto-gptq-cu118
 CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,autoawq
 CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers
 CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft
@@ -143,7 +143,7 @@ test_cli_cuda_pytorch:
 	$(call test_nvidia,cuda,$(CLI_CUDA_PYTORCH_REQS),cli and cuda and pytorch)
 
 test_cli_rocm_pytorch:
-	$(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft and not bnb)
+	$(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft and not bnb and not gptq)
 
 test_cli_cuda_onnxruntime:
 	$(call test_nvidia,cuda,$(CLI_CUDA_ONNXRUNTIME_REQS),cli and cuda and onnxruntime)
diff --git a/setup.py b/setup.py
index 710c3676..4e892bcb 100644
--- a/setup.py
+++ b/setup.py
@@ -70,6 +70,11 @@
     "peft": ["peft"],
     "autoawq": ["autoawq@git+https://github.com/casper-hansen/AutoAWQ.git"],
     "bitsandbytes": ["bitsandbytes"],
+    "auto-gptq-cu118": [
+        "optimum",
+        "auto-gptq@https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.7.1%2Bcu118-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+    ],
+    "auto-gptq-cu121": ["optimum", "auto-gptq"],
 }
diff --git a/tests/configs/cuda_inference_pytorch_llama_gptq.yaml b/tests/configs/cuda_inference_pytorch_llama_gptq.yaml
new file mode 100644
index 00000000..2d0f8e0d
--- /dev/null
+++ b/tests/configs/cuda_inference_pytorch_llama_gptq.yaml
@@ -0,0 +1,21 @@
+defaults:
+  - backend: pytorch
+  # order of inheritance, last one overrides previous ones
+  - _base_ # inherits from base config
+  - _inference_ # inherits from inference config
+  - _cuda_ # inherits from cuda config
+  - _self_ # hydra 1.1 compatibility
+
+experiment_name: cuda_inference_pytorch_gptq
+
+backend:
+  model: TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ
+  quantization_config:
+    exllama_config:
+      version: 2
+
+# hydra/cli specific settings
+hydra:
+  sweeper:
+    params:
+      backend.no_weights: false,true
\ No newline at end of file
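
For context, the `quantization_config` block in the new test config corresponds to the GPTQ loading path in `transformers`. A minimal sketch of the equivalent Python, assuming the stock `GPTQConfig` API; `bits=4` and `device_map="auto"` are illustrative assumptions, not values taken from this diff:

```python
# Sketch of what the YAML backend config exercises, assuming transformers'
# GPTQConfig API. bits=4 and device_map="auto" are illustrative assumptions.
from transformers import AutoModelForCausalLM, GPTQConfig

# exllama_config version 2 selects the ExLlamaV2 kernels for the
# already-quantized GPTQ checkpoint (requires optimum and auto-gptq).
quantization_config = GPTQConfig(bits=4, exllama_config={"version": 2})

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
    quantization_config=quantization_config,
    device_map="auto",
)
```

The ROCm workflow and Makefile target deselect these tests (`not gptq`), presumably because the pinned auto-gptq wheels above are built for CUDA (cu118/cu121) only.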