
chore: remove deprecated exllama (#396)
Signed-off-by: Sertac Ozercan <[email protected]>
sozercan authored Sep 27, 2024
1 parent 72bcc36 commit 1fce06c
Showing 16 changed files with 26 additions and 103 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test-docker-gpu.yaml
@@ -18,7 +18,6 @@ jobs:
- exllama2-gptq
- exllama2-exl2
- diffusers
# - exllama
# - mamba
steps:
- name: cleanup workspace
@@ -63,7 +62,7 @@ jobs:
fi
- name: run test (exl2/gptq)
if: matrix.backend == 'exllama2-gptq' || matrix.backend == 'exllama2-exl2' || matrix.backend == 'exllama'
if: matrix.backend == 'exllama2-gptq' || matrix.backend == 'exllama2-exl2'
run: |
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-2-7b-chat",
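For context, the truncated curl payload above follows the OpenAI-compatible chat completions format. A complete request body would look roughly like the sketch below, shown in YAML form for readability (the workflow sends the equivalent JSON); the model name comes from the test matrix, and the prompt text is illustrative.

```yaml
# Hypothetical /v1/chat/completions request body (YAML form of the JSON payload)
model: llama-2-7b-chat
messages:
  - role: user
    content: "Reply with one word to confirm you are up."
```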
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ AIKit offers two main capabilities:
- ✨ OpenAI API compatible to use with any OpenAI API compatible client
- 📸 [Multi-modal model support](https://sozercan.github.io/aikit/docs/vision)
- 🖼️ [Image generation support](https://sozercan.github.io/aikit/docs/diffusion)
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ or EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🚢 [Kubernetes deployment ready](https://sozercan.github.io/aikit/docs/kubernetes)
- 📦 Supports multiple models with a single image
- 🖥️ Supports [AMD64 and ARM64](https://sozercan.github.io/aikit/docs/create-images#multi-platform-support) CPUs and [GPU-accelerated inferencing with NVIDIA GPUs](https://sozercan.github.io/aikit/docs/gpu)
8 changes: 4 additions & 4 deletions pkg/aikit/config/specs_test.go
@@ -21,19 +21,19 @@ func TestNewFromBytes(t *testing.T) {
name: "valid yaml",
args: args{b: []byte(`
apiVersion: v1alpha1
runtime: avx512
runtime: cuda
backends:
- exllama
- exllama2
- stablediffusion
models:
- name: test
source: foo
`)},
want: &InferenceConfig{
APIVersion: utils.APIv1alpha1,
Runtime: utils.RuntimeCPUAVX512,
Runtime: utils.RuntimeNVIDIA,
Backends: []string{
utils.BackendExllama,
utils.BackendExllamaV2,
utils.BackendStableDiffusion,
},
Models: []Model{
12 changes: 4 additions & 8 deletions pkg/aikit2llb/inference/convert.go
@@ -43,8 +43,8 @@ func Aikit2LLB(c *config.InferenceConfig, platform *specs.Platform) (llb.State,
// install backend dependencies
for b := range c.Backends {
switch c.Backends[b] {
case utils.BackendExllama, utils.BackendExllamaV2:
merge = installExllama(c, state, merge)
case utils.BackendExllamaV2:
merge = installExllama(state, merge)
case utils.BackendStableDiffusion:
merge = installOpenCV(state, merge)
case utils.BackendMamba:
@@ -131,12 +131,8 @@ func installCuda(c *config.InferenceConfig, s llb.State, merge llb.State) (llb.S

// installing dev dependencies used for exllama
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
var exllama2Dep string
if c.Backends[b] == utils.BackendExllamaV2 {
exllama2Dep = fmt.Sprintf("libcurand-dev-%[1]s", cudaVersion)
}
exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s %[2]s && apt-get clean", cudaVersion, exllama2Dep)
if c.Backends[b] == utils.BackendExllamaV2 {
exllamaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-cudart-dev-%[1]s cuda-crt-%[1]s libcusparse-dev-%[1]s libcublas-dev-%[1]s libcusolver-dev-%[1]s cuda-nvcc-%[1]s libcurand-dev-%[1]s && apt-get clean", cudaVersion)

s = s.Run(utils.Sh(exllamaDeps)).Root()
}
14 changes: 2 additions & 12 deletions pkg/aikit2llb/inference/exllama.go
@@ -2,26 +2,16 @@ package inference

import (
"github.com/moby/buildkit/client/llb"
"github.com/sozercan/aikit/pkg/aikit/config"
"github.com/sozercan/aikit/pkg/utils"
)

func installExllama(c *config.InferenceConfig, s llb.State, merge llb.State) llb.State {
backend := utils.BackendExllama
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllamaV2 {
backend = utils.BackendExllamaV2
}
}

func installExllama(s llb.State, merge llb.State) llb.State {
savedState := s
s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y bash git ca-certificates python3-pip python3-dev python3-venv python-is-python3 make g++ curl && pip install uv grpcio-tools && apt-get clean"), llb.IgnoreCache).Root()

s = cloneLocalAI(s)

// TODO: remove sed for grpcio with localai v2.20.2+
// https://github.com/mudler/LocalAI/pull/3428/files
s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && sed -i 's/grpcio==1.66.0/grpcio==1.66.1/g' requirements.txt && make %[1]s", backend)).Root()
s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && make %[1]s", utils.BackendExllamaV2)).Root()

diff := llb.Diff(savedState, s)
return llb.Merge([]llb.State{merge, diff})
4 changes: 2 additions & 2 deletions pkg/aikit2llb/inference/image.go
@@ -50,9 +50,9 @@ func emptyImage(c *config.InferenceConfig, platform *specs.Platform) *specs.Imag

for b := range c.Backends {
switch c.Backends[b] {
case utils.BackendExllama, utils.BackendExllamaV2:
case utils.BackendExllamaV2:
exllamaEnv := []string{
"EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/run.sh,exllama2:/tmp/localai/backend/python/exllama2/run.sh",
"EXTERNAL_GRPC_BACKENDS=exllama2:/tmp/localai/backend/python/exllama2/run.sh",
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, exllamaEnv...)
10 changes: 5 additions & 5 deletions pkg/build/build.go
@@ -444,22 +444,22 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
return errors.New("only one backend is supported at this time")
}

if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama2 at this time")
}

if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
if (slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
return errors.New("exllama, mamba, and diffusers backends only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
}

backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
backends := []string{utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
for _, b := range c.Backends {
if !slices.Contains(backends, b) {
return errors.Errorf("backend %s is not supported", b)
}
}

runtimes := []string{"", utils.RuntimeNVIDIA, utils.RuntimeCPUAVX, utils.RuntimeCPUAVX2, utils.RuntimeCPUAVX512}
runtimes := []string{"", utils.RuntimeNVIDIA}
if !slices.Contains(runtimes, c.Runtime) {
return errors.Errorf("runtime %s is not supported", c.Runtime)
}
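With the CPU runtime values gone from the allowed list, the checks visible in this hunk mean an exllama2 aikitfile must declare the CUDA runtime and a single backend. A minimal sketch that should pass `validateInferenceConfig` under those rules (model name and source URL are hypothetical):

```yaml
apiVersion: v1alpha1
runtime: cuda            # required: exllama2 only validates with the nvidia/cuda runtime
backends:
  - exllama2             # a single backend; "exllama" is no longer accepted
models:
  - name: example-gptq                               # hypothetical model name
    source: https://example.com/model.safetensors    # hypothetical source URL
```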
21 changes: 3 additions & 18 deletions pkg/build/build_test.go
@@ -41,7 +41,7 @@ func Test_validateConfig(t *testing.T) {
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Runtime: "cuda",
Backends: []string{"exllama"},
Backends: []string{"exllama2"},
Models: []config.Model{
{
Name: "test",
@@ -69,7 +69,7 @@ func Test_validateConfig(t *testing.T) {
name: "valid backend but no cuda runtime",
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Backends: []string{"exllama"},
Backends: []string{"exllama2"},
Models: []config.Model{
{
Name: "test",
@@ -80,22 +80,7 @@ func Test_validateConfig(t *testing.T) {
wantErr: true,
},
{
name: "invalid backend combination 1",
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Runtime: "cuda",
Backends: []string{"exllama", "exllama2"},
Models: []config.Model{
{
Name: "test",
Source: "foo",
},
},
}},
wantErr: true,
},
{
name: "invalid backend combination 2",
name: "invalid backend combination",
args: args{c: &config.InferenceConfig{
APIVersion: "v1alpha1",
Runtime: "cuda",
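The body of the renamed "invalid backend combination" case is cut off above. For reference, this is the kind of configuration the validator rejects, expressed here as user-facing YAML rather than the Go struct literal used in the test (model fields copied from the neighboring cases):

```yaml
apiVersion: v1alpha1
runtime: cuda
backends:            # two backends in one file is rejected: only one backend is
  - stablediffusion  # supported at a time, and stablediffusion cannot be combined
  - exllama2         # with exllama2 per validateInferenceConfig
models:
  - name: test
    source: foo
```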
6 changes: 1 addition & 5 deletions pkg/utils/const.go
@@ -1,13 +1,9 @@
package utils

const (
RuntimeNVIDIA = "cuda"
RuntimeCPUAVX = "avx"
RuntimeCPUAVX2 = "avx2"
RuntimeCPUAVX512 = "avx512"
RuntimeNVIDIA = "cuda"

BackendStableDiffusion = "stablediffusion"
BackendExllama = "exllama"
BackendExllamaV2 = "exllama2"
BackendMamba = "mamba"
BackendDiffusers = "diffusers"
20 changes: 0 additions & 20 deletions test/aikitfile-exllama.yaml

This file was deleted.

1 change: 0 additions & 1 deletion test/aikitfile-stablediffusion.yaml
@@ -1,7 +1,6 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: avx2
backends:
- stablediffusion
models:
21 changes: 0 additions & 21 deletions website/docs/exllama.md

This file was deleted.

2 changes: 1 addition & 1 deletion website/docs/gpu.md
@@ -31,7 +31,7 @@ Make sure to customize these values based on your model and GPU specs.
:::

:::note
For `exllama` and `exllama2` backends, GPU acceleration is enabled by default and cannot be disabled.
For `exllama2` backend, GPU acceleration is enabled by default and cannot be disabled.
:::

After building the model, you can run it with [`--gpus all`](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html#gpu-enumeration) flag to enable GPU support:
2 changes: 1 addition & 1 deletion website/docs/intro.md
@@ -23,7 +23,7 @@ AIKit offers two main capabilities:
- ✨ OpenAI API compatible to use with any OpenAI API compatible client
- 📸 [Multi-modal model support](vision.md)
- 🖼️ [Image generation support](diffusion.md)
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ ([`exllama`](https://github.com/turboderp/exllama) or [`exllama2`](https://github.com/turboderp/exllamav2)), EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🦙 Support for GGUF ([`llama`](https://github.com/ggerganov/llama.cpp)), GPTQ or EXL2 ([`exllama2`](https://github.com/turboderp/exllamav2)), and GGML ([`llama-ggml`](https://github.com/ggerganov/llama.cpp)) and [Mamba](https://github.com/state-spaces/mamba) models
- 🚢 [Kubernetes deployment ready](#kubernetes-deployment)
- 📦 Supports multiple models with a single image
- 🖥️ Supports [AMD64 and ARM64](create-images.md#multi-platform-support) CPUs and [GPU-accelerated inferencing with NVIDIA GPUs](gpu.md)
2 changes: 1 addition & 1 deletion website/docs/specs-inference.md
@@ -8,7 +8,7 @@ title: Inference API Specifications
apiVersion: # required. only v1alpha1 is supported at the moment
debug: # optional. if set to true, debug logs will be printed
runtime: # optional. defaults to avx. can be "avx", "avx2", "avx512", "cuda"
backends: # optional. list of additional backends. can be "stablediffusion", "exllama" or "exllama2"
backends: # optional. list of additional backends. can be "stablediffusion", "exllama2", "diffusers", "mamba"
models: # required. list of models to build
- name: # required. name of the model
source: # required. source of the model. can be a url or a local file
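As a concrete illustration of the updated spec, here is a sketch of a stablediffusion aikitfile after this change; mirroring the edit to `test/aikitfile-stablediffusion.yaml` above, it simply omits `runtime`, since per the validation change in `pkg/build/build.go` the `avx`, `avx2`, and `avx512` values are no longer accepted (model name and source are hypothetical):

```yaml
apiVersion: v1alpha1
debug: true                           # optional
backends:
  - stablediffusion
models:
  - name: example-sd                  # hypothetical model name
    source: https://example.com/model # hypothetical source URL
```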
1 change: 0 additions & 1 deletion website/sidebars.js
@@ -53,7 +53,6 @@ const sidebars = {
collapsed: false,
items: [
'llama-cpp',
'exllama',
'exllama2',
'mamba',
'diffusion',
