diff --git a/.github/workflows/test-docker-gpu.yaml b/.github/workflows/test-docker-gpu.yaml
index 968ff3af..c1fd5f7f 100644
--- a/.github/workflows/test-docker-gpu.yaml
+++ b/.github/workflows/test-docker-gpu.yaml
@@ -18,7 +18,6 @@ jobs:
           - exllama2-gptq
           - exllama2-exl2
           - diffusers
-          # - exllama
           # - mamba
     steps:
       - name: cleanup workspace
@@ -63,7 +62,7 @@ jobs:
           fi

       - name: run test (exl2/gptq)
-        if: matrix.backend == 'exllama2-gptq' || matrix.backend == 'exllama2-exl2' || matrix.backend == 'exllama'
+        if: matrix.backend == 'exllama2-gptq' || matrix.backend == 'exllama2-exl2'
         run: |
           result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
             "model": "llama-2-7b-chat",
diff --git a/README.md b/README.md
index 26e396fa..1d1d5a88 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ AIKit offers two main capabilities:
 - ✨ OpenAI API compatible to use with any OpenAI API compatible client
 - 📸 [Multi-modal model support](https://sozercan.github.io/aikit/docs/vision)
 - 🖼️ [Image generation support](https://sozercan.github.io/aikit/docs/diffusion)
-- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
+- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ or EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
 - 🚢 [Kubernetes deployment ready](https://sozercan.github.io/aikit/docs/kubernetes)
 - 📦 Supports multiple models with a single image
 - 🖥️ Supports [AMD64 and ARM64](https://sozercan.github.io/aikit/docs/create-images#multi-platform-support) CPUs and [GPU-accelerated inferencing with NVIDIA GPUs](https://sozercan.github.io/aikit/docs/gpu)
diff --git a/pkg/aikit/config/specs_test.go b/pkg/aikit/config/specs_test.go
index 0e434f24..5944c90b 100644
--- a/pkg/aikit/config/specs_test.go
+++ b/pkg/aikit/config/specs_test.go
@@ -21,9 +21,9 @@ func TestNewFromBytes(t *testing.T) {
             name: "valid yaml",
             args: args{b: []byte(`
 apiVersion: v1alpha1
-runtime: avx512
+runtime: cuda
 backends:
-- exllama
+- exllama2
 - stablediffusion
 models:
 - name: test
@@ -31,9 +31,9 @@ models:
 `)},
             want: &InferenceConfig{
                 APIVersion: utils.APIv1alpha1,
-                Runtime:    utils.RuntimeCPUAVX512,
+                Runtime:    utils.RuntimeNVIDIA,
                 Backends: []string{
-                    utils.BackendExllama,
+                    utils.BackendExllamaV2,
                     utils.BackendStableDiffusion,
                 },
                 Models: []Model{
diff --git a/pkg/aikit2llb/inference/convert.go b/pkg/aikit2llb/inference/convert.go
index ab9583cd..0852cdc8 100644
--- a/pkg/aikit2llb/inference/convert.go
+++ b/pkg/aikit2llb/inference/convert.go
@@ -43,8 +43,8 @@ func Aikit2LLB(c *config.InferenceConfig, platform *specs.Platform) (llb.State,
     // install backend dependencies
     for b := range c.Backends {
         switch c.Backends[b] {
-        case utils.BackendExllama, utils.BackendExllamaV2:
-            merge = installExllama(c, state, merge)
+        case utils.BackendExllamaV2:
+            merge = installExllama(state, merge)
         case utils.BackendStableDiffusion:
             merge = installOpenCV(state, merge)
         case utils.BackendMamba:
@@ -131,12 +131,8 @@ func installCuda(c *config.InferenceConfig, s llb.State, merge llb.State) (llb.S
     // installing dev dependencies used for exllama
     for b := range c.Backends {
-        if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
-            var exllama2Dep string
-            if c.Backends[b] == utils.BackendExllamaV2 {
-                exllama2Dep = fmt.Sprintf("libcurand-dev-%[1]s", cudaVersion)
-            }
-            exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s %[2]s && apt-get clean", cudaVersion, exllama2Dep)
+        if c.Backends[b] == utils.BackendExllamaV2 {
+            exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s libcurand-dev-%[1]s && apt-get clean", cudaVersion)
             s = s.Run(utils.Sh(exllamaDeps)).Root()
         }
diff --git a/pkg/aikit2llb/inference/exllama.go b/pkg/aikit2llb/inference/exllama.go
index 852bf365..efbdde78 100644
--- a/pkg/aikit2llb/inference/exllama.go
+++ b/pkg/aikit2llb/inference/exllama.go
@@ -2,26 +2,16 @@ package inference

 import (
     "github.com/moby/buildkit/client/llb"
-    "github.com/sozercan/aikit/pkg/aikit/config"
     "github.com/sozercan/aikit/pkg/utils"
 )

-func installExllama(c *config.InferenceConfig, s llb.State, merge llb.State) llb.State {
-    backend := utils.BackendExllama
-    for b := range c.Backends {
-        if c.Backends[b] == utils.BackendExllamaV2 {
-            backend = utils.BackendExllamaV2
-        }
-    }
-
+func installExllama(s llb.State, merge llb.State) llb.State {
     savedState := s
     s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y bash git ca-certificates python3-pip python3-dev python3-venv python-is-python3 make g++ curl && pip install uv grpcio-tools && apt-get clean"), llb.IgnoreCache).Root()

     s = cloneLocalAI(s)

-    // TODO: remove sed for grpcio with localai v2.20.2+
-    // https://github.com/mudler/LocalAI/pull/3428/files
-    s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && sed -i 's/grpcio==1.66.0/grpcio==1.66.1/g' requirements.txt && make %[1]s", backend)).Root()
+    s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && make %[1]s", utils.BackendExllamaV2)).Root()

     diff := llb.Diff(savedState, s)
     return llb.Merge([]llb.State{merge, diff})
diff --git a/pkg/aikit2llb/inference/image.go b/pkg/aikit2llb/inference/image.go
index bee0b519..fb82f762 100644
--- a/pkg/aikit2llb/inference/image.go
+++ b/pkg/aikit2llb/inference/image.go
@@ -50,9 +50,9 @@ func emptyImage(c *config.InferenceConfig, platform *specs.Platform) *specs.Imag
     for b := range c.Backends {
         switch c.Backends[b] {
-        case utils.BackendExllama, utils.BackendExllamaV2:
+        case utils.BackendExllamaV2:
             exllamaEnv := []string{
-                "EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/run.sh,exllama2:/tmp/localai/backend/python/exllama2/run.sh",
+                "EXTERNAL_GRPC_BACKENDS=exllama2:/tmp/localai/backend/python/exllama2/run.sh",
                 "CUDA_HOME=/usr/local/cuda",
             }
             img.Config.Env = append(img.Config.Env, exllamaEnv...)
diff --git a/pkg/build/build.go b/pkg/build/build.go
index 106d60b2..7ca8dc4d 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -444,22 +444,22 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
         return errors.New("only one backend is supported at this time")
     }

-    if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
-        return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
+    if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllamaV2)) {
+        return errors.New("cannot specify both stablediffusion with exllama2 at this time")
     }

-    if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
+    if (slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
         return errors.New("exllama, mamba, and diffusers backends only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
     }

-    backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
+    backends := []string{utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
     for _, b := range c.Backends {
         if !slices.Contains(backends, b) {
             return errors.Errorf("backend %s is not supported", b)
         }
     }

-    runtimes := []string{"", utils.RuntimeNVIDIA, utils.RuntimeCPUAVX, utils.RuntimeCPUAVX2, utils.RuntimeCPUAVX512}
+    runtimes := []string{"", utils.RuntimeNVIDIA}
     if !slices.Contains(runtimes, c.Runtime) {
         return errors.Errorf("runtime %s is not supported", c.Runtime)
     }
diff --git a/pkg/build/build_test.go b/pkg/build/build_test.go
index 12002e08..792ece8c 100644
--- a/pkg/build/build_test.go
+++ b/pkg/build/build_test.go
@@ -41,7 +41,7 @@ func Test_validateConfig(t *testing.T) {
             args: args{c: &config.InferenceConfig{
                 APIVersion: "v1alpha1",
                 Runtime:    "cuda",
-                Backends:   []string{"exllama"},
+                Backends:   []string{"exllama2"},
                 Models: []config.Model{
                     {
                         Name: "test",
@@ -69,7 +69,7 @@ func Test_validateConfig(t *testing.T) {
             name: "valid backend but no cuda runtime",
             args: args{c: &config.InferenceConfig{
                 APIVersion: "v1alpha1",
-                Backends:   []string{"exllama"},
+                Backends:   []string{"exllama2"},
                 Models: []config.Model{
                     {
                         Name: "test",
@@ -80,22 +80,7 @@ func Test_validateConfig(t *testing.T) {
             wantErr: true,
         },
         {
-            name: "invalid backend combination 1",
-            args: args{c: &config.InferenceConfig{
-                APIVersion: "v1alpha1",
-                Runtime:    "cuda",
-                Backends:   []string{"exllama", "exllama2"},
-                Models: []config.Model{
-                    {
-                        Name:   "test",
-                        Source: "foo",
-                    },
-                },
-            }},
-            wantErr: true,
-        },
-        {
-            name: "invalid backend combination 2",
+            name: "invalid backend combination",
             args: args{c: &config.InferenceConfig{
                 APIVersion: "v1alpha1",
                 Runtime:    "cuda",
diff --git a/pkg/utils/const.go b/pkg/utils/const.go
index c56c45cd..0ebd3bea 100644
--- a/pkg/utils/const.go
+++ b/pkg/utils/const.go
@@ -1,13 +1,9 @@
 package utils

 const (
-    RuntimeNVIDIA    = "cuda"
-    RuntimeCPUAVX    = "avx"
-    RuntimeCPUAVX2   = "avx2"
-    RuntimeCPUAVX512 = "avx512"
+    RuntimeNVIDIA = "cuda"

     BackendStableDiffusion = "stablediffusion"
-    BackendExllama         = "exllama"
     BackendExllamaV2       = "exllama2"
     BackendMamba           = "mamba"
     BackendDiffusers       = "diffusers"
diff --git a/test/aikitfile-exllama.yaml b/test/aikitfile-exllama.yaml
deleted file mode 100644
index d78883c7..00000000
--- a/test/aikitfile-exllama.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-#syntax=aikit:test
-apiVersion: v1alpha1
-debug: true
-runtime: cuda
-backends:
-  - exllama
-models:
-  - name: Llama-2-7B-Chat-GPTQ/model.safetensors
-    source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/model.safetensors
-  - name: Llama-2-7B-Chat-GPTQ/tokenizer.model
-    source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/tokenizer.model
-  - name: Llama-2-7B-Chat-GPTQ/config.json
-    source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/config.json
-config: |
-  - name: llama-2-7b-chat
-    backend: exllama
-    context_size: 4096
-    parameters:
-      model: "Llama-2-7B-Chat-GPTQ"
-      temperature: 0.2
diff --git a/test/aikitfile-stablediffusion.yaml b/test/aikitfile-stablediffusion.yaml
index b7f199c3..52f79c9a 100644
--- a/test/aikitfile-stablediffusion.yaml
+++ b/test/aikitfile-stablediffusion.yaml
@@ -1,7 +1,6 @@
 #syntax=aikit:test
 apiVersion: v1alpha1
 debug: true
-runtime: avx2
 backends:
   - stablediffusion
 models:
diff --git a/website/docs/exllama.md b/website/docs/exllama.md
deleted file mode 100644
index 7cbbc5f5..00000000
--- a/website/docs/exllama.md
+++ /dev/null
@@ -1,21 +0,0 @@
----
-title: Exllama (GPTQ)
----
-
-[Exllama](https://github.com/turboderp/exllama) is a standalone Python/C++/CUDA implementation of Llama for use with 4-bit GPTQ weights, designed to be fast and memory-efficient on modern GPUs.
-
-This backend:
-- provides support for GPTQ models
-- requires CUDA runtime
-
-:::note
-This is an experimental backend and it may change in the future.
-:::
-
-## Example
-
-:::warning
-Please make sure to change syntax to `#syntax=ghcr.io/sozercan/aikit:latest` in the examples below.
-:::
-
-https://github.com/sozercan/aikit/blob/main/test/aikitfile-exllama.yaml
diff --git a/website/docs/gpu.md b/website/docs/gpu.md
index 1217b4bb..f0ca4023 100644
--- a/website/docs/gpu.md
+++ b/website/docs/gpu.md
@@ -31,7 +31,7 @@ Make sure to customize these values based on your model and GPU specs.
 :::

 :::note
-For `exllama` and `exllama2` backends, GPU acceleration is enabled by default and cannot be disabled.
+For `exllama2` backend, GPU acceleration is enabled by default and cannot be disabled.
 :::

 After building the model, you can run it with [`--gpus all`](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html#gpu-enumeration) flag to enable GPU support:
diff --git a/website/docs/intro.md b/website/docs/intro.md
index 6d815aea..813bb03d 100644
--- a/website/docs/intro.md
+++ b/website/docs/intro.md
@@ -23,7 +23,7 @@ AIKit offers two main capabilities:
 - ✨ OpenAI API compatible to use with any OpenAI API compatible client
 - 📸 [Multi-modal model support](vision.md)
 - 🖼️ [Image generation support](diffusion.md)
-- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
+- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ or EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
 - 🚢 [Kubernetes deployment ready](#kubernetes-deployment)
 - 📦 Supports multiple models with a single image
 - 🖥️ Supports [AMD64 and ARM64](create-images.md#multi-platform-support) CPUs and [GPU-accelerated inferencing with NVIDIA GPUs](gpu.md)
diff --git a/website/docs/specs-inference.md b/website/docs/specs-inference.md
index df151aeb..c49e1947 100644
--- a/website/docs/specs-inference.md
+++ b/website/docs/specs-inference.md
@@ -8,7 +8,7 @@ title: Inference API Specifications
 apiVersion: # required. only v1alpha1 is supported at the moment
 debug: # optional. if set to true, debug logs will be printed
 runtime: # optional. defaults to avx. can be "avx", "avx2", "avx512", "cuda"
-backends: # optional. list of additional backends. can be "stablediffusion", "exllama" or "exllama2"
+backends: # optional. list of additional backends. can be "stablediffusion", "exllama2", "diffusers", "mamba"
 models: # required. list of models to build
 - name: # required. name of the model
   source: # required. source of the model. can be a url or a local file
diff --git a/website/sidebars.js b/website/sidebars.js
index 8c5c9cef..8a9e27a3 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -53,7 +53,6 @@ const sidebars = {
       collapsed: false,
      items: [
         'llama-cpp',
-        'exllama',
         'exllama2',
         'mamba',
         'diffusion',
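
For reference, a minimal aikitfile for the remaining exllama2 backend could look like the sketch below. It is adapted from the removed test/aikitfile-exllama.yaml shown in this patch, with the backend fields switched from exllama to exllama2; that substitution, and the assumption that the repository's replacement test file looks like this, are mine and not part of the recorded change.

#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: cuda
backends:
  - exllama2  # assumed: only change from the removed file is exllama -> exllama2
models:
  - name: Llama-2-7B-Chat-GPTQ/model.safetensors
    source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/model.safetensors
  - name: Llama-2-7B-Chat-GPTQ/tokenizer.model
    source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/tokenizer.model
  - name: Llama-2-7B-Chat-GPTQ/config.json
    source: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/resolve/main/config.json
config: |
  - name: llama-2-7b-chat
    backend: exllama2  # assumed backend name, matching the EXTERNAL_GRPC_BACKENDS entry added in image.go
    context_size: 4096
    parameters:
      model: "Llama-2-7B-Chat-GPTQ"
      temperature: 0.2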