-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathk8s_gpu.yaml
183 lines (181 loc) · 6.17 KB
/
k8s_gpu.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
---
# The ingress says "please HTTP proxy PATH on HOSTNAME to the
# respective service I am specifying."
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: llm-ingress-gpu
  # Everything we do will be in the "rse" namespace, which logically
  # separates different uses (like with permissions, resource limits,
  # etc). Every resource in this file is in the rse namespace.
  namespace: rse
  annotations:
    # The line below gives us letsencrypt certificates for our service
    # via cert-manager. This is for the CSIT cluster, other clusters
    # may be different.
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # We can define various things declaratively... this tells the
    # maximum HTTP body size to the ingress.
    # TODO: Might need to adapt this...
    nginx.ingress.kubernetes.io/proxy-body-size: "20m"
    # Proxy read/send timeouts (seconds).
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
    # Rewrite the request for the target pod: the path below is a regex,
    # and only its second capture group (everything after the
    # "/llama2-gpu" prefix) is forwarded to the backend.
    nginx.ingress.kubernetes.io/use-regex: "true"
    nginx.ingress.kubernetes.io/rewrite-target: "/$2"
# Spec tells the actual parameters: hosts to listen on, path prefixes.
spec:
  tls:
    - hosts:
        - llm.k8s-test.cs.aalto.fi
      secretName: llm-fastchat-deployment
  rules:
    - host: llm.k8s-test.cs.aalto.fi
      http:
        paths:
          - path: /llama2-gpu(/|$)(.*)
            # The path is a regular expression, so it must not be
            # declared as "Prefix": Prefix is defined as a literal,
            # element-wise path match, and ingress-nginx with strict
            # path-type validation rejects regex characters in
            # Prefix/Exact paths. Regex paths are ImplementationSpecific.
            pathType: ImplementationSpecific
            # To where do we send these incoming requests? This
            # defines the target of these requests, and corresponds to
            # the Service defined in this file (matching the name).
            backend:
              service:
                name: llm-lama2-gpu-svc
                port:
                  number: 80
---
# A service defines a network target, basically: you could think of it
# as a load-balanced internal DNS of some sort (it's more fancy since
# it has ports and stuff like that).
#
# This service points (the "selector") to the app with a certain name
# (defined below).
apiVersion: v1
kind: Service
metadata:
  namespace: rse
  name: llm-lama2-gpu-svc
  labels:
    app.kubernetes.io/component: server
    app.kubernetes.io/name: llm-lama2-gpu-svc
spec:
  # type: inference server
  # Pods that receive the traffic: every pod labelled with this app
  # name (the Deployment in this file sets that label on its pods).
  selector:
    app.kubernetes.io/name: llm-lama2-gpu
    # app.kubernetes.io/component: server
  # Ports exposed by the service. Port 80 of the service is forwarded
  # to port 80 of the selected pods.
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 80
---
# This is the actual application: actually it's a **deployment**,
# which means:
# - Define a container that runs
# - It can run multiple copies of this in parallel
# - Keeps them in sync, scale up and down as needed, etc.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: rse
  name: llm-lama2-gpu
  labels:
    app.kubernetes.io/name: llm-lama2-gpu
    app.kubernetes.io/component: server
# Desired state of the deployment.
spec:
  # A single replica while testing.
  replicas: 1
  # Pods carrying these labels belong to this deployment. They have to
  # agree with the labels set in the pod template below.
  selector:
    matchLabels:
      app.kubernetes.io/name: llm-lama2-gpu
      app.kubernetes.io/component: server
  # Blueprint for each pod of the deployment; spec.replicas copies of
  # it are created.
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llm-lama2-gpu
        app.kubernetes.io/component: server
    spec:
      restartPolicy: Always
      # Run on one specific host only: the nodeSelector pins the pod to
      # that hostname, and the toleration permits scheduling onto nodes
      # tainted with "cs-aalto/gpu" (effect NoSchedule).
      nodeSelector:
        kubernetes.io/hostname: k8s-gpu.cs.aalto.fi
      tolerations:
        - key: "cs-aalto/gpu"
          operator: Exists
          effect: NoSchedule
      # Pod-wide default: containers run as UID/GID 1000 unless they
      # override it themselves.
      securityContext:
        runAsUser: 1000
        runAsGroup: 1000
        fsGroup: 1000
      # Model files come from a directory on the host.
      volumes:
        - name: llm-models
          hostPath:
            path: /srv/models/
            type: Directory
      containers:
        # Container 1: the model server, listening on port 8000.
        - name: llm-llama2-cont-gpu
          image: harbor.cs.aalto.fi/aaltorse-public/llm_llama_cpp_gpu:latest
          # imagePullPolicy: Always
          ports:
            - containerPort: 8000
          # Restrict to one GPU.
          resources:
            limits:
              nvidia.com/gpu: 1
          # Environment variables inside the container, given here as
          # literal values.
          #
          # A **secret** can be used instead to logically separate
          # secrets from config (see the commented-out example further
          # down, and AUTH_TOKEN in the second container).
          # `ConfigMaps` can also be useful: you can set environment
          # variables or even dynamically mount config files inside of
          # the container.
          env:
            - name: MODEL
              value: "llama-2-7b-chat.gguf.q4_0.bin"
            - name: HOST
              value: "0.0.0.0"
            - name: PORT
              value: "8000"
          # The host's model directory, mounted read-only at /models.
          volumeMounts:
            - name: llm-models
              mountPath: /models
              readOnly: true
          # Example of an environment variable taken from a secret.
          # created with:
          #   kubectl -n rse create secret generic scicomp-docs-search-update --from-literal=token=TOKEN
          # env:
          #   - name: SEARCH_UPDATE_AUTHORIZATION
          #     valueFrom:
          #       secretKeyRef:
          #         name: scicomp-docs-search-update
          #         key: token
        # Container 2: nginx, listening on port 80 (the port the
        # Service in this file targets).
        - name: llm-nginx-cont
          image: harbor.cs.aalto.fi/aaltorse-public/llm_nginx:latest
          ports:
            - containerPort: 80
          env:
            - name: LLM_MODEL
              value: "llama-2"
            # Read from key "inference_key" of the secret "llm-gateway".
            - name: AUTH_TOKEN
              valueFrom:
                secretKeyRef:
                  name: llm-gateway
                  key: inference_key
          # Overrides the pod-level UID/GID 1000: this container runs as
          # UID/GID 0 (root), with privilege escalation disabled.
          securityContext:
            runAsUser: 0
            runAsGroup: 0
            allowPrivilegeEscalation: false
      # Since the image is private, we need permission to pull it. I
      # somehow got this from harbor config. (AaltoRSE people can
      # probably re-use this existing secret).
      #
      # created with:
      #   kubectl -n rse create secret docker-registry robot-scicomp-docs-search-pull \
      #     --docker-server=DOMAIN --docker-username='robot$NAME' --docker-password='SECRET'
      # imagePullSecrets:
      #   - name: robot-scicomp-docs-search-pull