
chore: remove deprecated exllama (#396)
Signed-off-by: Sertac Ozercan <[email protected]>
sozercan authored Sep 27, 2024
1 parent 72bcc36 commit 1fce06c
Showing 16 changed files with 26 additions and 103 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test-docker-gpu.yaml
@@ -18,7 +18,6 @@ jobs:
- exllama2-gptq
- exllama2-exl2
- diffusers
# - exllama
# - mamba
steps:
- name: cleanup workspace
@@ -63,7 +62,7 @@ jobs:
fi
- name: run test (exl2/gptq)
if: matrix.backend == 'exllama2-gptq' || matrix.backend == 'exllama2-exl2' || matrix.backend == 'exllama'
if: matrix.backend == 'exllama2-gptq' || matrix.backend == 'exllama2-exl2'
run: |
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-2-7b-chat",
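For context, the truncated curl payload above follows the OpenAI-compatible chat completions format. A complete request body would look roughly like the sketch below, shown in YAML form for readability (the workflow sends the equivalent JSON); the model name comes from the test matrix, and the prompt text is illustrative.

```yaml
# Hypothetical /v1/chat/completions request body (YAML form of the JSON payload)
model: llama-2-7b-chat
messages:
  - role: user
    content: "Reply with one word to confirm you are up."
```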
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ AIKit offers two main capabilities:
- ✨ OpenAI API compatible to use with any OpenAI API compatible client
- 📸 [Multi-modal model support](https://sozercan.github.io/aikit/docs/vision)
- 🖼️ [Image generation support](https://sozercan.github.io/aikit/docs/diffusion)
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ or EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🚢 [Kubernetes deployment ready](https://sozercan.github.io/aikit/docs/kubernetes)
- 📦 Supports multiple models with a single image
- 🖥️ Supports [AMD64 and ARM64](https://sozercan.github.io/aikit/docs/create-images#multi-platform-support) CPUs and [GPU-accelerated inferencing with NVIDIA GPUs](https://sozercan.github.io/aikit/docs/gpu)
8 changes: 4 additions & 4 deletions pkg/aikit/config/specs_test.go
@@ -21,19 +21,19 @@ func TestNewFromBytes(t *testing.T) {
name: "valid yaml",
args: args{b: []byte(`
apiVersion: v1alpha1
runtime: avx512
runtime: cuda
backends:
- exllama
- exllama2
- stablediffusion
models:
- name: test
source: foo
`)},
want: &InferenceConfig{
APIVersion: utils.APIv1alpha1,
Runtime: utils.RuntimeCPUAVX512,
Runtime: utils.RuntimeNVIDIA,
Backends: []string{
utils.BackendExllama,
utils.BackendExllamaV2,
utils.BackendStableDiffusion,
},
Models: []Model{
12 changes: 4 additions & 8 deletions pkg/aikit2llb/inference/convert.go
@@ -43,8 +43,8 @@ func Aikit2LLB(c *config.InferenceConfig, platform *specs.Platform) (llb.State,
// install backend dependencies
for b := range c.Backends {
switch c.Backends[b] {
case utils.BackendExllama, utils.BackendExllamaV2:
merge = installExllama(c, state, merge)
case utils.BackendExllamaV2:
merge = installExllama(state, merge)
case utils.BackendStableDiffusion:
merge = installOpenCV(state, merge)
case utils.BackendMamba:
@@ -131,12 +131,8 @@ func installCuda(c *config.InferenceConfig, s llb.State, merge llb.State) (llb.S

// installing dev dependencies used for exllama
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
var exllama2Dep string
if c.Backends[b] == utils.BackendExllamaV2 {
exllama2Dep = fmt.Sprintf("libcurand-dev-%[1]s", cudaVersion)
}
exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s %[2]s && apt-get clean", cudaVersion, exllama2Dep)
if c.Backends[b] == utils.BackendExllamaV2 {
exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s libcurand-dev-%[1]s && apt-get clean", cudaVersion)

s = s.Run(utils.Sh(exllamaDeps)).Root()
}
14 changes: 2 additions & 12 deletions pkg/aikit2llb/inference/exllama.go
@@ -2,26 +2,16 @@ package inference

import (
"github.com/moby/buildkit/client/llb"
"github.com/sozercan/aikit/pkg/aikit/config"
"github.com/sozercan/aikit/pkg/utils"
)

func installExllama(c *config.InferenceConfig, s llb.State, merge llb.State) llb.State {
backend := utils.BackendExllama
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllamaV2 {
backend = utils.BackendExllamaV2
}
}

func installExllama(s llb.State, merge llb.State) llb.State {
savedState := s
s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y bash git ca-certificates python3-pip python3-dev python3-venv python-is-python3 make g++ curl && pip install uv grpcio-tools && apt-get clean"), llb.IgnoreCache).Root()

s = cloneLocalAI(s)

// TODO: remove sed for grpcio with localai v2.20.2+
// https://github.com/mudler/LocalAI/pull/3428/files
s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && sed -i 's/grpcio==1.66.0/grpcio==1.66.1/g' requirements.txt && make %[1]s", backend)).Root()
s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && make %[1]s", utils.BackendExllamaV2)).Root()

diff := llb.Diff(savedState, s)
return llb.Merge([]llb.State{merge, diff})
4 changes: 2 additions & 2 deletions pkg/aikit2llb/inference/image.go
@@ -50,9 +50,9 @@ func emptyImage(c *config.InferenceConfig, platform *specs.Platform) *specs.Imag

for b := range c.Backends {
switch c.Backends[b] {
case utils.BackendExllama, utils.BackendExllamaV2:
case utils.BackendExllamaV2:
exllamaEnv := []string{
"EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/run.sh,exllama2:/tmp/localai/backend/python/exllama2/run.sh",
"EXTERNAL_GRPC_BACKENDS=exllama2:/tmp/localai/backend/python/exllama2/run.sh",
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, exllamaEnv...)
10 changes: 5 additions & 5 deletions pkg/build/build.go
@@ -444,22 +444,22 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
return errors.New("only one backend is supported at this time")
}

if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama2 at this time")
}

if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
if (slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
return errors.New("exllama, mamba, and diffusers backends only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
}

backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
backends := []string{utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
for _, b := range c.Backends {
if !slices.Contains(backends, b) {
return errors.Errorf("backend %s is not supported", b)
}
}

runtimes := []string{"", utils.RuntimeNVIDIA, utils.RuntimeCPUAVX, utils.RuntimeCPUAVX2, utils.RuntimeCPUAVX512}
runtimes := []string{"", utils.RuntimeNVIDIA}
if !slices.Contains(runtimes, c.Runtime) {
return errors.Errorf("runtime %s is not supported", c.Runtime)
}
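With the CPU runtime values gone from the allowed list, the checks visible in this hunk mean an exllama2 aikitfile must declare the CUDA runtime and a single backend. A minimal sketch that should pass `validateInferenceConfig` under those rules (model name and source URL are hypothetical):

```yaml
apiVersion: v1alpha1
runtime: cuda            # required: exllama2 only validates with the nvidia/cuda runtime
backends:
  - exllama2             # a single backend; "exllama" is no longer accepted
models:
  - name: example-gptq                               # hypothetical model name
    source: https://example.com/model.safetensors    # hypothetical source URL
```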
21 changes: 3 additions & 18 deletions pkg/build/build_test.go
@@ -41,7 +41,7 @@ func Test_validateConfig(t *testing.T) {
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Runtime: "cuda",
Backends: []string{"exllama"},
Backends: []string{"exllama2"},
Models: []config.Model{
{
Name: "test",
@@ -69,7 +69,7 @@ func Test_validateConfig(t *testing.T) {
name: "valid backend but no cuda runtime",
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Backends: []string{"exllama"},
Backends: []string{"exllama2"},
Models: []config.Model{
{
Name: "test",
@@ -80,22 +80,7 @@ func Test_validateConfig(t *testing.T) {
wantErr: true,
},
{
name: "invalid backend combination 1",
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Runtime: "cuda",
Backends: []string{"exllama", "exllama2"},
Models: []config.Model{
{
Name: "test",
Source: "foo",
},
},
}},
wantErr: true,
},
{
name: "invalid backend combination 2",
name: "invalid backend combination",
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Runtime: "cuda",
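The body of the renamed "invalid backend combination" case is cut off above. For reference, this is the kind of configuration the validator rejects, expressed here as user-facing YAML rather than the Go struct literal used in the test (model fields copied from the neighboring cases):

```yaml
apiVersion: v1alpha1
runtime: cuda
backends:            # two backends in one file is rejected: only one backend is
  - stablediffusion  # supported at a time, and stablediffusion cannot be combined
  - exllama2         # with exllama2 per validateInferenceConfig
models:
  - name: test
    source: foo
```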
6 changes: 1 addition & 5 deletions pkg/utils/const.go
@@ -1,13 +1,9 @@
package utils

const (
RuntimeNVIDIA = "cuda"
RuntimeCPUAVX = "avx"
RuntimeCPUAVX2 = "avx2"
RuntimeCPUAVX512 = "avx512"
RuntimeNVIDIA = "cuda"

BackendStableDiffusion = "stablediffusion"
BackendExllama = "exllama"
BackendExllamaV2 = "exllama2"
BackendMamba = "mamba"
BackendDiffusers = "diffusers"
20 changes: 0 additions & 20 deletions test/aikitfile-exllama.yaml

This file was deleted.

1 change: 0 additions & 1 deletion test/aikitfile-stablediffusion.yaml
@@ -1,7 +1,6 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: avx2
backends:
- stablediffusion
models:
21 changes: 0 additions & 21 deletions website/docs/exllama.md

This file was deleted.

2 changes: 1 addition & 1 deletion website/docs/gpu.md
@@ -31,7 +31,7 @@ Make sure to customize these values based on your model and GPU specs.
:::

:::note
For `exllama` and `exllama2` backends, GPU acceleration is enabled by default and cannot be disabled.
For `exllama2` backend, GPU acceleration is enabled by default and cannot be disabled.
:::

After building the model, you can run it with [`--gpus all`](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html#gpu-enumeration) flag to enable GPU support:
2 changes: 1 addition & 1 deletion website/docs/intro.md
@@ -23,7 +23,7 @@ AIKit offers two main capabilities:
- ✨ OpenAI API compatible to use with any OpenAI API compatible client
- 📸 [Multi-modal model support](vision.md)
- 🖼️ [Image generation support](diffusion.md)
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ or EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🚢 [Kubernetes deployment ready](#kubernetes-deployment)
- 📦 Supports multiple models with a single image
- 🖥️ Supports [AMD64 and ARM64](create-images.md#multi-platform-support) CPUs and [GPU-accelerated inferencing with NVIDIA GPUs](gpu.md)
2 changes: 1 addition & 1 deletion website/docs/specs-inference.md
@@ -8,7 +8,7 @@ title: Inference API Specifications
apiVersion: # required. only v1alpha1 is supported at the moment
debug: # optional. if set to true, debug logs will be printed
runtime: # optional. defaults to avx. can be "avx", "avx2", "avx512", "cuda"
backends: # optional. list of additional backends. can be "stablediffusion", "exllama" or "exllama2"
backends: # optional. list of additional backends. can be "stablediffusion", "exllama2", "diffusers", "mamba"
models: # required. list of models to build
- name: # required. name of the model
source: # required. source of the model. can be a url or a local file
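As a concrete illustration of the updated spec, here is a sketch of a stablediffusion aikitfile after this change; mirroring the edit to `test/aikitfile-stablediffusion.yaml` above, it simply omits `runtime`, since per the validation change in `pkg/build/build.go` the `avx`, `avx2`, and `avx512` values are no longer accepted (model name and source are hypothetical):

```yaml
apiVersion: v1alpha1
debug: true                           # optional
backends:
  - stablediffusion
models:
  - name: example-sd                  # hypothetical model name
    source: https://example.com/model # hypothetical source URL
```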
1 change: 0 additions & 1 deletion website/sidebars.js
@@ -53,7 +53,6 @@ const sidebars = {
collapsed: false,
items: [
'llama-cpp',
'exllama',
'exllama2',
'mamba',
'diffusion',
