Skip to content

Commit

Permalink
Add samples for LLM multi-host GPUs tutorials (#1409)
Browse files Browse the repository at this point in the history
* added yaml file for multihost gpus user guide

* updated image with publicly available one

* fixed separator

---------

Co-authored-by: Nim Jayawardena <[email protected]>
  • Loading branch information
Edwinhr716 and NimJay authored Aug 22, 2024
1 parent 974050a commit 7d06c7f
Showing 1 changed file with 131 additions and 0 deletions.
131 changes: 131 additions & 0 deletions ai-ml/llm-multihost-gpus/vllm-llama3-405b-A3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_llm_serving_multihost_gpus_vllm_llama3_405b_a4]

# LeaderWorkerSet "vllm": a single replica group of `size: 2` pods that serves
# meta-llama/Meta-Llama-3.1-405B-Instruct with vLLM. Each pod is limited to
# 8 GPUs (matching --tensor-parallel-size 8) and the group size of 2 matches
# --pipeline-parallel-size 2.
# NOTE(review): per the LeaderWorkerSet API, `size` is expected to count the
# leader pod as well (i.e. one leader + one worker) - confirm against LWS docs.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: vllm
spec:
  replicas: 1
  leaderWorkerTemplate:
    size: 2
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          # The vllm-leader Service selects on this label.
          role: leader
      spec:
        volumes:
          # Memory-backed scratch volume mounted at /dev/shm in the container.
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
        containers:
          - name: vllm-leader
            image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240821_1034_RC00
            command:
              - sh
              - -c
              # Folded scalar: the two lines below are joined with one space
              # into a single shell command line. It runs the image's
              # ray_init.sh in leader mode and then starts the vLLM API server
              # on port 8080. $RAY_CLUSTER_SIZE is expanded by the shell.
              - >-
                /workspace/vllm/examples/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
                python3 -m vllm.entrypoints.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2
            env:
              # Taken from the pod's LeaderWorkerSet "size" annotation.
              - name: RAY_CLUSTER_SIZE
                valueFrom:
                  fieldRef:
                    fieldPath: "metadata.annotations['leaderworkerset.sigs.k8s.io/size']"
              # Hugging Face token read from key hf_api_token of Secret hf-secret.
              - name: HUGGING_FACE_HUB_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-secret
                    key: hf_api_token
            ports:
              - containerPort: 8080
            # The pod is reported ready once TCP port 8080 accepts connections.
            readinessProbe:
              tcpSocket:
                port: 8080
              initialDelaySeconds: 15
              periodSeconds: 10
            resources:
              requests:
                cpu: 125
                ephemeral-storage: 800Gi
              limits:
                nvidia.com/gpu: "8"
                memory: 1124Gi
                ephemeral-storage: 800Gi
            volumeMounts:
              - name: dshm
                mountPath: /dev/shm
    workerTemplate:
      spec:
        volumes:
          # Memory-backed scratch volume mounted at /dev/shm in the container.
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
        containers:
          - name: vllm-worker
            image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240821_1034_RC00
            command:
              - sh
              - -c
              # Runs the image's ray_init.sh in worker mode, pointing it at
              # <leader>.<lws>.<namespace>.svc.cluster.local. The $(VAR)
              # references are Kubernetes-style and refer to the env entries
              # defined below.
              - '/workspace/vllm/examples/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local'
            env:
              # Taken from the pod's LeaderWorkerSet "leader-name" annotation.
              - name: LEADER_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: "metadata.annotations['leaderworkerset.sigs.k8s.io/leader-name']"
              # Namespace the pod is running in.
              - name: NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
              # Taken from the pod's LeaderWorkerSet "name" label.
              - name: LWS_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: "metadata.labels['leaderworkerset.sigs.k8s.io/name']"
              # Hugging Face token read from key hf_api_token of Secret hf-secret.
              - name: HUGGING_FACE_HUB_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-secret
                    key: hf_api_token
            resources:
              requests:
                cpu: 125
                ephemeral-storage: 800Gi
              limits:
                nvidia.com/gpu: "8"
                memory: 1124Gi
                ephemeral-storage: 800Gi
            volumeMounts:
              - name: dshm
                mountPath: /dev/shm
---
# ClusterIP Service "vllm-leader": exposes TCP port 8080 and forwards it to
# port 8080 on pods labelled with both leaderworkerset.sigs.k8s.io/name=vllm
# and role=leader.
apiVersion: v1
kind: Service
metadata:
  name: vllm-leader
spec:
  type: ClusterIP
  selector:
    leaderworkerset.sigs.k8s.io/name: vllm
    role: leader
  ports:
    - name: http
      protocol: TCP
      port: 8080
      targetPort: 8080

# [END gke_ai_ml_llm_serving_multihost_gpus_vllm_llama3_405b_a4]

0 comments on commit 7d06c7f

Please sign in to comment.