diff --git a/ai-ml/llm-multihost-gpus/vllm-llama3-405b-A3.yaml b/ai-ml/llm-multihost-gpus/vllm-llama3-405b-A3.yaml new file mode 100644 index 0000000000..3c8a25ec52 --- /dev/null +++ b/ai-ml/llm-multihost-gpus/vllm-llama3-405b-A3.yaml @@ -0,0 +1,131 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_llm_serving_multihost_gpus_vllm_llama3_405b_a4] + +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: vllm +spec: + replicas: 1 + leaderWorkerTemplate: + size: 2 + restartPolicy: RecreateGroupOnPodRestart + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - name: vllm-leader + image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240821_1034_RC00 + env: + - name: RAY_CLUSTER_SIZE + valueFrom: + fieldRef: + fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/size'] + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + command: + - sh + - -c + - "/workspace/vllm/examples/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE; + python3 -m vllm.entrypoints.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2" + resources: + limits: + nvidia.com/gpu: "8" + memory: 1124Gi + ephemeral-storage: 800Gi + requests: + ephemeral-storage: 800Gi + cpu: 125 + ports: + - containerPort: 8080 + readinessProbe: + tcpSocket: + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi + workerTemplate: + spec: + containers: + - name: vllm-worker + image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240821_1034_RC00 + command: + - sh + - -c + - "/workspace/vllm/examples/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local" + resources: + limits: + nvidia.com/gpu: "8" + memory: 1124Gi + ephemeral-storage: 800Gi + requests: + ephemeral-storage: 800Gi + cpu: 125 + env: + - name: LEADER_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/leader-name'] + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: LWS_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name'] + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-leader +spec: + ports: + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + leaderworkerset.sigs.k8s.io/name: vllm + role: leader + type: ClusterIP + + # [END gke_ai_ml_llm_serving_multihost_gpus_vllm_llama3_405b_a4] \ No newline at end of file