feat: add exllama and exllamav2 support (#49)
Signed-off-by: Sertac Ozercan <[email protected]>
sozercan authored Dec 24, 2023
1 parent c8e7c98 commit a5066d3
Showing 10 changed files with 174 additions and 26 deletions.
3 changes: 2 additions & 1 deletion Makefile
@@ -19,7 +19,8 @@ build-aikit:

.PHONY: build-test-model
build-test-model:
docker buildx build . -t ${REGISTRY}/testmodel:${TAG} -f ${TEST_FILE} --output=${OUTPUT_TYPE}
docker buildx build . -t ${REGISTRY}/testmodel:${TAG} -f ${TEST_FILE} --output=${OUTPUT_TYPE} \
--progress=plain --provenance=false

.PHONY: run-test-model
run-test-model:
18 changes: 12 additions & 6 deletions README.md
@@ -8,9 +8,6 @@ AIKit is a quick, easy, and local or cloud-agnostic way to get started to host a

AIKit uses [LocalAI](https://localai.io/) under-the-hood to run inference. LocalAI provides a drop-in replacement REST API that is OpenAI API compatible, so you can use any OpenAI API compatible client, such as [Kubectl AI](https://github.com/sozercan/kubectl-ai), [Chatbot-UI](https://github.com/sozercan/chatbot-ui) and many more, to send requests to open-source LLMs powered by AIKit!

> [!NOTE]
> At this time, AIKit is tested with LocalAI `llama` and `stablediffusion` backends. Other backends may work but are not tested. Please open an issue if you'd like to see support for other backends.
## Features

- 🐳 No GPU, Internet access or additional tools needed except for [Docker](https://docs.docker.com/desktop/install/linux-install/)!
@@ -19,6 +16,7 @@ AIKit uses [LocalAI](https://localai.io/) under-the-hood to run inference. Local
- ✨ OpenAI API compatible to use with any OpenAI API compatible client
- 📸 [Multi-modal model support](./docs/demo.md#vision-with-llava)
- 🖼️ Image generation support with Stable Diffusion
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) formats
- 🚢 [Kubernetes deployment ready](#kubernetes-deployment)
- 📦 Supports multiple models with a single image
- 🖥️ [Supports GPU-accelerated inferencing with NVIDIA GPUs](#nvidia)
@@ -30,7 +28,9 @@ You can get started with AIKit quickly on your local machine without a GPU!

```bash
docker run -d --rm -p 8080:8080 ghcr.io/sozercan/llama2:7b
```

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-2-7b-chat",
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
@@ -39,9 +39,7 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso

Output should be similar to:

```json
{"created":1701236489,"object":"chat.completion","id":"dd1ff40b-31a7-4418-9e32-42151ab6875a","model":"llama-2-7b-chat","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"\nKubernetes is a container orchestration system that automates the deployment, scaling, and management of containerized applications in a microservices architecture."}}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
```
`{"created":1701236489,"object":"chat.completion","id":"dd1ff40b-31a7-4418-9e32-42151ab6875a","model":"llama-2-7b-chat","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"\nKubernetes is a container orchestration system that automates the deployment, scaling, and management of containerized applications in a microservices architecture."}}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}`

That's it! 🎉 API is OpenAI compatible so this is a drop-in replacement for any OpenAI API compatible client.

@@ -176,6 +174,11 @@ To get started with GPU-accelerated inferencing, make sure to set the following

```yaml
runtime: cuda # use NVIDIA CUDA runtime
```

For `llama` backend, set the following in your `config`:

```yaml
f16: true # use float16 precision
gpu_layers: 35 # number of layers to offload to GPU
low_vram: true # for devices with low VRAM
@@ -184,6 +187,9 @@ low_vram: true # for devices with low VRAM
> [!TIP]
> Make sure to customize these values based on your model and GPU specs.

> [!NOTE]
> For `exllama` and `exllama2` backends, GPU acceleration is enabled by default and cannot be disabled.
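
For example, a rough aikitfile sketch that enables one of these backends pairs it with the CUDA runtime (model entries and the `config` section are omitted here; the test aikitfiles in this commit show complete examples):

```yaml
apiVersion: v1alpha1
runtime: cuda   # required: exllama and exllama2 are GPU-only
backends:
  - exllama     # or "exllama2" for GPTQ/EXL2 via exllamav2
```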

After building the model, you can run it with [`--gpus all`](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html#gpu-enumeration) flag to enable GPU support:

```bash
4 changes: 2 additions & 2 deletions docs/specs.md
@@ -4,9 +4,9 @@

```yaml
apiVersion: # required. only v1alpha1 is supported at the moment
debug: # optional. if set to true, will print debug logs
debug: # optional. if set to true, debug logs will be printed
runtime: # optional. defaults to avx. can be "avx", "avx2", "avx512", "cuda"
backends: # optional. list of additional backends. can be "stablediffusion"
backends: # optional. list of additional backends. can be "stablediffusion", "exllama" or "exllama2"
models: # required. list of models to build
- name: # required. name of the model
source: # required. source of the model. must be a url
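
Filled in for one of the new backends, a spec might look like this rough example (trimmed for brevity and with illustrative values; the test aikitfiles added in this commit are complete, working versions):

```yaml
apiVersion: v1alpha1
runtime: cuda
backends:
  - exllama2
models:
  - name: Llama2-7B-chat-exl2/output.safetensors
    source: https://huggingface.co/turboderp/Llama2-7B-chat-exl2/resolve/2.5bpw/output.safetensors
config: |
  - name: llama-2-7b-chat
    backend: exllama2
    context_size: 4096
    parameters:
      model: "Llama2-7B-chat-exl2"
```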
80 changes: 68 additions & 12 deletions pkg/aikit2llb/convert.go
@@ -16,7 +16,9 @@ import (
const (
debianSlim = "docker.io/library/debian:12-slim"
distrolessBase = "gcr.io/distroless/cc-debian12:latest"

localAIVersion = "v2.1.0"
localAIRepo = "https://github.com/mudler/LocalAI"
cudaVersion = "12-3"
)

@@ -30,12 +32,16 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) {

// install cuda if runtime is nvidia
if c.Runtime == utils.RuntimeNVIDIA {
merge = installCuda(state, merge)
merge = installCuda(c, state, merge)
}

// install opencv and friends if stable diffusion backend is being used
for b := range c.Backends {
if strings.Contains(c.Backends[b], "stablediffusion") {
switch c.Backends[b] {
case utils.BackendExllama:
case utils.BackendExllamaV2:
merge = installExllama(c, state, merge)
case utils.BackendStableDiffusion:
merge = installOpenCV(state, merge)
}
}
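
This dispatch is driven by the `backends` list in the aikitfile, so a spec containing the snippet below (illustrative) routes through `installExllama`:

```yaml
backends:
  - exllama2
```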
@@ -46,8 +52,11 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) {

func getBaseImage(c *config.Config) llb.State {
for b := range c.Backends {
if strings.Contains(c.Backends[b], "stablediffusion") {
// due to too many dependencies, using debian slim as base for stable diffusion
switch c.Backends[b] {
case utils.BackendExllama:
case utils.BackendExllamaV2:
return llb.Image(debianSlim)
case utils.BackendStableDiffusion:
return llb.Image(debianSlim)
}
}
@@ -108,30 +117,73 @@ func fileNameFromURL(urlString string) string {
return path.Base(parsedURL.Path)
}

func installCuda(s llb.State, merge llb.State) llb.State {
func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State {
cudaKeyringURL := "https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb"
cudaKeyring := llb.HTTP(cudaKeyringURL)
s = s.File(
llb.Copy(cudaKeyring, fileNameFromURL(cudaKeyringURL), "/"),
llb.WithCustomName("Copying "+fileNameFromURL(cudaKeyringURL)), //nolint: goconst
)
s = s.Run(shf("dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb")).Root()
s = s.Run(sh("dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb")).Root()
// running apt-get update twice due to nvidia repo
s = s.Run(shf("apt-get update && apt-get install -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()
s = s.Run(sh("apt-get update && apt-get install -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()
savedState := s
s = s.Run(shf("apt-get install -y libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()
s = s.Run(shf("apt-get install -y --no-install-recommends libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()

// installing dev dependencies used for exllama
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
var exllama2Dep string
if c.Backends[b] == utils.BackendExllamaV2 {
exllama2Dep = fmt.Sprintf("libcurand-dev-%[1]s", cudaVersion)
}
exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s %[2]s && apt-get clean", cudaVersion, exllama2Dep)

s = s.Run(sh(exllamaDeps)).Root()
}
}

diff := llb.Diff(savedState, s)
merge = llb.Merge([]llb.State{merge, diff})
return merge
return llb.Merge([]llb.State{merge, diff})
}

func installExllama(c *config.Config, s llb.State, merge llb.State) llb.State {
backend := "exllama"
exllamaRepo := "https://github.com/turboderp/exllama"
exllamaTag := "master"
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllamaV2 {
exllamaRepo = "https://github.com/turboderp/exllamav2"
backend = "exllama2"
exllamaTag = "v0.0.11"
}
}

savedState := s
s = s.Run(sh("apt-get update && apt-get install --no-install-recommends -y git ca-certificates python3-pip python3-dev g++ && apt-get clean"), llb.IgnoreCache).Root()

// clone localai exllama backend only
s = s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()

// workaround until https://github.com/mudler/LocalAI/pull/1484 is merged
if backend == utils.BackendExllamaV2 {
s = s.Run(sh("sed -i 's/self.seed/None/g' /tmp/localai/backend/python/exllama2/exllama2_backend.py && sed -i 's/bytes(t/bytes(output/g' /tmp/localai/backend/python/exllama2/exllama2_backend.py")).Root()
}

// clone exllama to localai exllama backend path and install python dependencies
s = s.Run(shf("git clone --depth 1 %[1]s --branch %[2]s /tmp/%[3]s && mv /tmp/%[3]s/* /tmp/localai/backend/python/%[3]s && rm -rf /tmp/%[3]s && cd /tmp/localai/backend/python/%[3]s && rm -rf .git && pip3 install grpcio protobuf typing-extensions sympy mpmath setuptools numpy --break-system-packages && pip3 install -r /tmp/localai/backend/python/%[3]s/requirements.txt --break-system-packages", exllamaRepo, exllamaTag, backend)).Root()

diff := llb.Diff(savedState, s)
return llb.Merge([]llb.State{merge, diff})
}

func installOpenCV(s llb.State, merge llb.State) llb.State {
savedState := s
// adding debian 11 (bullseye) repo due to opencv 4.5 requirement
s = s.Run(shf("echo 'deb http://deb.debian.org/debian bullseye main' | tee -a /etc/apt/sources.list")).Root()
s = s.Run(sh("echo 'deb http://deb.debian.org/debian bullseye main' | tee -a /etc/apt/sources.list")).Root()
// pinning libdap packages to bullseye version due to symbol error
s = s.Run(shf("apt-get update && mkdir -p /tmp/generated/images && apt-get install -y libopencv-imgcodecs4.5 libgomp1 libdap27=3.20.7-6 libdapclient6v5=3.20.7-6 && apt-get clean"), llb.IgnoreCache).Root()
libdapVersion := "3.20.7-6"
s = s.Run(shf("apt-get update && mkdir -p /tmp/generated/images && apt-get install -y libopencv-imgcodecs4.5 libgomp1 libdap27=%[1]s libdapclient6v5=%[1]s && apt-get clean", libdapVersion), llb.IgnoreCache).Root()
diff := llb.Diff(savedState, s)
merge = llb.Merge([]llb.State{merge, diff})

@@ -181,3 +233,7 @@ func addLocalAI(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.
func shf(cmd string, v ...interface{}) llb.RunOption {
return llb.Args([]string{"/bin/sh", "-c", fmt.Sprintf(cmd, v...)})
}

func sh(cmd string) llb.RunOption {
return llb.Args([]string{"/bin/sh", "-c", cmd})
}
19 changes: 14 additions & 5 deletions pkg/aikit2llb/image.go
@@ -30,19 +30,28 @@ func emptyImage(c *config.Config) *specs.Image {
img.RootFS.Type = "layers"
img.Config.WorkingDir = "/"

img.Config.Env = []string{
"PATH=" + system.DefaultPathEnv("linux"),
}

cudaEnv := []string{
"PATH=" + system.DefaultPathEnv("linux") + ":/usr/local/cuda/bin",
"NVIDIA_REQUIRE_CUDA=cuda>=12.0",
"NVIDIA_DRIVER_CAPABILITIES=compute,utility",
"NVIDIA_VISIBLE_DEVICES=all",
"LD_LIBRARY_PATH=/usr/local/cuda/lib64",
}

if c.Runtime == utils.RuntimeNVIDIA {
img.Config.Env = cudaEnv
} else {
img.Config.Env = []string{
"PATH=" + system.DefaultPathEnv("linux"),
img.Config.Env = append(img.Config.Env, cudaEnv...)
}

for b := range c.Backends {
if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
exllamaEnv := []string{
"EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/exllama.py,exllama2:/tmp/localai/backend/python/exllama2/exllama2_backend.py",
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, exllamaEnv...)
}
}

12 changes: 12 additions & 0 deletions pkg/build/build.go
@@ -128,6 +128,18 @@ func validateConfig(c *config.Config) error {
return errors.New("no models defined")
}

if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
}

if slices.Contains(c.Backends, utils.BackendExllama) && slices.Contains(c.Backends, utils.BackendExllamaV2) {
return errors.New("cannot specify both exllama and exllamav2 at this time")
}

if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) && c.Runtime != utils.RuntimeNVIDIA {
return errors.New("exllama only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
}

runtimes := []string{"", utils.RuntimeNVIDIA, utils.RuntimeCPUAVX, utils.RuntimeCPUAVX2, utils.RuntimeCPUAVX512}
if !slices.Contains(runtimes, c.Runtime) {
return errors.Errorf("runtime %s is not supported", c.Runtime)
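
As a rough illustration of the new checks, an aikitfile sketch like the following (values are placeholders) would be rejected: it combines `exllama` with `exllama2`, and it uses a CPU runtime where these backends require CUDA. Dropping one of the two backends and setting `runtime: cuda` satisfies the validation.

```yaml
apiVersion: v1alpha1
runtime: avx2          # rejected: exllama and exllama2 only support the cuda runtime
backends:
  - exllama
  - exllama2           # rejected: exllama and exllama2 cannot be combined
models:
  - name: example-model                             # placeholder
    source: https://example.com/model.safetensors   # placeholder
```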
4 changes: 4 additions & 0 deletions pkg/utils/const.go
@@ -6,5 +6,9 @@ const (
RuntimeCPUAVX2 = "avx2"
RuntimeCPUAVX512 = "avx512"

BackendStableDiffusion = "stablediffusion"
BackendExllama = "exllama"
BackendExllamaV2 = "exllama2"

APIv1alpha1 = "v1alpha1"
)
20 changes: 20 additions & 0 deletions test/aikitfile-exllama.yaml
@@ -0,0 +1,20 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: cuda
backends:
- exllama
models:
- name: Llama-2-7B-Chat-GPTQ/model.safetensors
source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/model.safetensors
- name: Llama-2-7B-Chat-GPTQ/tokenizer.model
source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/tokenizer.model
- name: Llama-2-7B-Chat-GPTQ/config.json
source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/config.json
config: |
- name: llama-2-7b-chat
backend: exllama
context_size: 4096
parameters:
model: "Llama-2-7B-Chat-GPTQ"
temperature: 0.2
20 changes: 20 additions & 0 deletions test/aikitfile-exllama2-exl2.yaml
@@ -0,0 +1,20 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: cuda
backends:
- exllama2
models:
- name: Llama2-7B-chat-exl2/output.safetensors
source: https://huggingface.co/turboderp/Llama2-7B-chat-exl2/resolve/2.5bpw/output.safetensors
- name: Llama2-7B-chat-exl2/tokenizer.model
source: https://huggingface.co/turboderp/Llama2-7B-chat-exl2/resolve/2.5bpw/tokenizer.model
- name: Llama2-7B-chat-exl2/config.json
source: https://huggingface.co/turboderp/Llama2-7B-chat-exl2/raw/2.5bpw/config.json
config: |
- name: llama-2-7b-chat
backend: exllama2
context_size: 4096
parameters:
model: "Llama2-7B-chat-exl2"
temperature: 0.2
20 changes: 20 additions & 0 deletions test/aikitfile-exllama2-gptq.yaml
@@ -0,0 +1,20 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: cuda
backends:
- exllama2
models:
- name: Llama-2-7B-Chat-GPTQ/model.safetensors
source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/model.safetensors
- name: Llama-2-7B-Chat-GPTQ/tokenizer.model
source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/tokenizer.model
- name: Llama-2-7B-Chat-GPTQ/config.json
source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/config.json
config: |
- name: llama-2-7b-chat
backend: exllama2
context_size: 4096
parameters:
model: "Llama-2-7B-Chat-GPTQ"
temperature: 0.2
