feat: transformers backend #382

Draft: wants to merge 2 commits into main
5 changes: 3 additions & 2 deletions .github/workflows/test-docker-gpu.yaml
@@ -18,6 +18,7 @@ jobs:
- exllama2-gptq
- exllama2-exl2
- diffusers
- transformers
# - exllama
# - mamba
steps:
@@ -48,8 +49,8 @@
- name: run test model
run: docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test

- name: run test (gguf)
if: matrix.backend == 'llama-cuda'
- name: run test (${{ matrix.backend }})
if: matrix.backend == 'llama-cuda' || matrix.backend == 'transformers'
run: |
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-3.1-8b-instruct",
2 changes: 2 additions & 0 deletions pkg/aikit2llb/inference/convert.go
@@ -51,6 +51,8 @@ func Aikit2LLB(c *config.InferenceConfig, platform *specs.Platform) (llb.State,
merge = installMamba(state, merge)
case utils.BackendDiffusers:
merge = installDiffusers(state, merge)
case utils.BackendTransformers:
merge = installTransformers(state, merge)
}
}

6 changes: 6 additions & 0 deletions pkg/aikit2llb/inference/image.go
@@ -68,6 +68,12 @@ func emptyImage(c *config.InferenceConfig, platform *specs.Platform) *specs.Imag
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, diffusersEnv...)
case utils.BackendTransformers:
transformersEnv := []string{
"EXTERNAL_GRPC_BACKENDS=transformers:/tmp/localai/backend/python/transformers/run.sh",
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, transformersEnv...)
}
}

18 changes: 18 additions & 0 deletions pkg/aikit2llb/inference/transformers.go
@@ -0,0 +1,18 @@
package inference

import (
"github.com/moby/buildkit/client/llb"
"github.com/sozercan/aikit/pkg/utils"
)

func installTransformers(s llb.State, merge llb.State) llb.State {
savedState := s
s = s.Run(utils.Sh("apt-get install --no-install-recommends -y git python3 python3-pip python3-venv python-is-python3 make && pip install uv grpcio-tools && apt-get clean"), llb.IgnoreCache).Root()

s = cloneLocalAI(s)

s = s.Run(utils.Bashf("export BUILD_TYPE=cublas && export CUDA_MAJOR_VERSION=12 && cd /tmp/localai/backend/python/%[1]s && sed -i 's/grpcio==1.66.0/grpcio==1.66.1/g' requirements.txt && make %[1]s", utils.BackendTransformers)).Root()

diff := llb.Diff(savedState, s)
return llb.Merge([]llb.State{merge, diff})
}
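Note that installTransformers follows the same diff-and-merge pattern as the other backend installers in this package: run the setup commands against a copy of the base state, then graft only the resulting filesystem changes onto the merge target. A minimal sketch of that pattern with BuildKit's LLB client is shown below (the function name and command are illustrative only; the PR itself uses the repo's utils.Sh and utils.Bashf helpers rather than llb.Shlex directly):

package inference

import (
	"github.com/moby/buildkit/client/llb"
)

// installSketch reduces the shared installer pattern to its core steps.
// It is an illustration, not code from this PR.
func installSketch(s llb.State, merge llb.State) llb.State {
	savedState := s

	// Run an install step; Root() returns the filesystem state after the command.
	s = s.Run(llb.Shlex("apt-get install --no-install-recommends -y python3")).Root()

	// Diff keeps only what the steps above changed relative to savedState,
	// so base-image layers are not duplicated when merging.
	diff := llb.Diff(savedState, s)
	return llb.Merge([]llb.State{merge, diff})
}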
4 changes: 2 additions & 2 deletions pkg/build/build.go
@@ -448,11 +448,11 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
}

if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers)) && c.Runtime != utils.RuntimeNVIDIA {
if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba) || slices.Contains(c.Backends, utils.BackendDiffusers) || slices.Contains(c.Backends, utils.BackendTransformers)) && c.Runtime != utils.RuntimeNVIDIA {
return errors.New("exllama, mamba, and diffusers backends only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
}

backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers}
backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba, utils.BackendDiffusers, utils.BackendTransformers}
for _, b := range c.Backends {
if !slices.Contains(backends, b) {
return errors.Errorf("backend %s is not supported", b)
1 change: 1 addition & 0 deletions pkg/utils/const.go
@@ -11,6 +11,7 @@ const (
BackendExllamaV2 = "exllama2"
BackendMamba = "mamba"
BackendDiffusers = "diffusers"
BackendTransformers = "transformers"

TargetUnsloth = "unsloth"

26 changes: 26 additions & 0 deletions test/aikitfile-transformers.yaml
@@ -0,0 +1,26 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: cuda
backends:
- transformers
config: |
- name: llama-3.1-8b-instruct
backend: transformers
type: AutoModelForCausalLM
parameters:
model: microsoft/Phi-3.5-mini-instruct
f16: true
stopwords:
- <|user|>
- <|assistant|>
- <|end|>
template:
chat_message: |
<|{{ .RoleName }}|>
{{.Content}}<|end|>
chat: |
{{.Input}}
<|assistant|>
completion: |
{{.Input}}
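For local verification outside CI, a rough equivalent of the workflow's test step might look like the following. This is a sketch only: image and tag names are illustrative, the buildx invocation assumes aikit's usual BuildKit-frontend build flow, and the prompt body is a placeholder since the workflow's full request body is truncated above.

# Build an image from the test aikitfile (assumes the aikit frontend
# referenced by the #syntax line is available).
docker buildx build . -t testmodel:test -f test/aikitfile-transformers.yaml --load

# Run it with GPU access, matching the CI step.
docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test

# Query the OpenAI-compatible endpoint; the model name matches the
# aikitfile's `name` field.
curl http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama-3.1-8b-instruct", "messages": [{"role": "user", "content": "Say hello"}]}'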