# k8s.yaml
---
# The ingress says "please HTTP-proxy PATH on HOSTNAME to the
# respective service I am specifying."
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: llm-ingress
  # Everything we do will be in the "rse" namespace, which logically
  # separates different uses (like with permissions, resource limits,
  # etc.). Everything below is in the rse namespace.
  namespace: rse
  annotations:
    # The line below magically gives us letsencrypt certificates for
    # our service! This is for the CSIT cluster; other clusters may
    # be different.
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # We can define various things declaratively... this tells the
    # ingress the maximum HTTP body size.
    # TODO: Might need to adapt this...
    nginx.ingress.kubernetes.io/proxy-body-size: "20m"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
    # Rewrite the request for the target pod: of the regex paths
    # below, only the second capture group ($2) is forwarded.
    nginx.ingress.kubernetes.io/use-regex: "true"
    nginx.ingress.kubernetes.io/rewrite-target: /$2
# The spec gives the actual parameters: hosts to listen on, path prefixes.
spec:
  tls:
    - hosts:
        - llm.k8s-test.cs.aalto.fi
      secretName: llm-fastchat-deployment
  rules:
    - host: llm.k8s-test.cs.aalto.fi
      http:
        paths:
          - path: /llama2(/|$)(.*)
            # Regex paths need ImplementationSpecific (per the
            # ingress-nginx use-regex documentation).
            pathType: ImplementationSpecific
            # To where do we send these incoming requests? This
            # defines the target of these requests, and corresponds to
            # the service defined below (matching the name).
            backend:
              service:
                name: llm-lama2-svc
                port:
                  number: 80
          - path: /llama2-13b-chat(/|$)(.*)
            pathType: ImplementationSpecific
            # Same as above, but routed to the 13B chat service.
            backend:
              service:
                name: llm-lama2-chat-svc
                port:
                  number: 80
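# With the rewrite rule above, a request to
#   https://llm.k8s-test.cs.aalto.fi/llama2/v1/completions
# matches /llama2(/|$)(.*), and the backend pod sees /v1/completions
# (the second capture group, per rewrite-target: /$2). A quick sanity
# check from outside the cluster, assuming the backend speaks an
# OpenAI-style HTTP API (an assumption; that API is not defined in
# this file):
#
#   curl -i https://llm.k8s-test.cs.aalto.fi/llama2/v1/models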
---
# A service defines a network target; basically, you could think of
# it as a load-balanced internal DNS name of sorts (it's fancier than
# that, since it has ports and so on).
#
# This service points (via the "selector") to the app with a certain
# name (defined below).
apiVersion: v1
kind: Service
metadata:
  name: llm-lama2-svc
  namespace: rse
  labels:
    app.kubernetes.io/name: llm-lama2-svc
    app.kubernetes.io/component: server
spec:
  # type: inference server
  # What ports and so on does this service have?
  ports:
    - name: http
      port: 80
      targetPort: 80
      protocol: TCP
  # This defines where incoming connections are routed to: the app
  # with a certain name, defined below.
  selector:
    app.kubernetes.io/name: llm-lama2
    # app.kubernetes.io/component: server
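# Inside the cluster, the service above is also reachable by its DNS
# name, following the standard Kubernetes scheme:
#
#   http://llm-lama2-svc.rse.svc.cluster.local/
#
# (or just http://llm-lama2-svc/ from within the rse namespace). For
# example, to poke it from a throwaway debug pod:
#
#   kubectl -n rse run -it --rm debug --image=curlimages/curl \
#     --restart=Never -- curl -i http://llm-lama2-svc/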
---
# Same as the service above, but pointing to the 13B chat deployment.
apiVersion: v1
kind: Service
metadata:
  name: llm-lama2-chat-svc
  namespace: rse
  labels:
    app.kubernetes.io/name: llm-lama2-chat-svc
    app.kubernetes.io/component: server
spec:
  ports:
    - name: http
      port: 80
      targetPort: 80
      protocol: TCP
  selector:
    app.kubernetes.io/name: llm-lama2-chat
    # app.kubernetes.io/component: server
---
# This is the actual application: actually it's a **deployment**,
# which means:
# - it defines a container that runs,
# - it can run multiple copies of it in parallel,
# - it keeps them in sync, scales up and down as needed, etc.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-lama2
  namespace: rse
  labels:
    app.kubernetes.io/name: llm-lama2
    app.kubernetes.io/component: server
# The definition of the deployment itself
spec:
  # Test with one replica
  replicas: 1
  # This is how the deployment knows which pods are part of it. These
  # labels should match the template labels.
  selector:
    matchLabels:
      app.kubernetes.io/name: llm-lama2
      app.kubernetes.io/component: server
  # The template is used to make each pod of the deployment - as many
  # as needed to match spec.replicas.
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llm-lama2
        app.kubernetes.io/component: server
    # You can probably figure out what most of these mean...
    spec:
      # Use a specific host only
      nodeSelector:
        kubernetes.io/hostname: k8s-node21.cs.aalto.fi
      restartPolicy: Always
      securityContext:
        runAsUser: 1000
        runAsGroup: 1000
        fsGroup: 1000
      volumes:
        - name: llm-models
          hostPath:
            path: /srv/models/
            type: Directory
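      # Note: a hostPath volume only works on nodes where /srv/models/
      # actually exists (with type: Directory the pod won't start
      # otherwise), which is presumably why the nodeSelector above
      # pins the pod to k8s-node21.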
      containers:
        - name: llm-llama2-cont
          image: harbor.cs.aalto.fi/aaltorse-public/llm_llama_cpp:latest
          #imagePullPolicy: Always
          ports:
            - containerPort: 8000
          # This defines environment variables inside the container.
          # Values can also come from a **secret** (see the commented
          # example below), which can be used to logically separate
          # secrets from config.
          #
          # `ConfigMaps` can also be useful: you can set environment
          # variables or even dynamically mount config files inside of
          # the container.
          env:
            - name: MODEL
              value: "llama-2-7b-chat.gguf.q4_0.bin"
            - name: HOST
              value: "0.0.0.0"
            - name: PORT
              value: "8000"
          volumeMounts:
            - mountPath: /models
              name: llm-models
              readOnly: true
          # An env var taken from a secret would look like this; the
          # secret was created with:
          #   kubectl -n rse create secret generic scicomp-docs-search-update --from-literal=token=TOKEN
          #env:
          # - name: SEARCH_UPDATE_AUTHORIZATION
          #   valueFrom:
          #     secretKeyRef:
          #       name: scicomp-docs-search-update
          #       key: token
        - name: llm-nginx-cont
          image: harbor.cs.aalto.fi/aaltorse-public/llm_nginx:latest
          ports:
            - containerPort: 80
          env:
            - name: LLM_MODEL
              value: "llama-2"
            - name: AUTH_TOKEN
              valueFrom:
                secretKeyRef:
                  name: llm-gateway
                  key: inference_key
          securityContext:
            runAsUser: 0
            runAsGroup: 0
            allowPrivilegeEscalation: false
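          # The llm-gateway secret referenced above is assumed to have
          # been created along these lines (TOKEN is a placeholder):
          #   kubectl -n rse create secret generic llm-gateway \
          #     --from-literal=inference_key=TOKEN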
      # Since the image is private, we need permission to pull it. I
      # somehow got this from harbor config. (AaltoRSE people can
      # probably re-use this existing secret.)
      #
      # created with: `kubectl -n rse create secret docker-registry robot-scicomp-docs-search-pull --docker-server=DOMAIN --docker-username='robot$NAME' --docker-password='SECRET'`
      # imagePullSecrets:
      #   - name: robot-scicomp-docs-search-pull
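# Once this file is applied, the rollout can be followed with the
# standard kubectl commands, e.g.:
#
#   kubectl -n rse rollout status deployment/llm-lama2
#   kubectl -n rse get pods -l app.kubernetes.io/name=llm-lama2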
---
# Same as the deployment above, but serving the 13B chat model.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-lama2-chat
  namespace: rse
  labels:
    app.kubernetes.io/name: llm-lama2-chat
    app.kubernetes.io/component: server
spec:
  # Test with one replica
  replicas: 1
  # These labels should match the template labels below.
  selector:
    matchLabels:
      app.kubernetes.io/name: llm-lama2-chat
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llm-lama2-chat
        app.kubernetes.io/component: server
    spec:
      # Use a specific host only
      nodeSelector:
        kubernetes.io/hostname: k8s-node21.cs.aalto.fi
      restartPolicy: Always
      securityContext:
        runAsUser: 1000
        runAsGroup: 1000
        fsGroup: 1000
      volumes:
        - name: llm-models
          hostPath:
            path: /srv/models/
            type: Directory
      containers:
        - name: llm-llama2-chat-cont
          image: harbor.cs.aalto.fi/aaltorse-public/llm_llama_cpp:latest
          #imagePullPolicy: Always
          ports:
            - containerPort: 8000
          # (See the first deployment for comments on env vars,
          # secrets, and ConfigMaps.)
          env:
            - name: MODEL
              value: "llama2-llama.cpp-2023-12-04/llama-2-13b-chat/ggml-model-f16-v2.gguf"
            - name: HOST
              value: "0.0.0.0"
            - name: PORT
              value: "8000"
          volumeMounts:
            - mountPath: /models
              name: llm-models
              readOnly: true
        - name: llm-nginx-cont
          image: harbor.cs.aalto.fi/aaltorse-public/llm_nginx:latest
          ports:
            - containerPort: 80
          env:
            - name: LLM_MODEL
              value: "llama-2"
            - name: AUTH_TOKEN
              valueFrom:
                secretKeyRef:
                  name: llm-gateway
                  key: inference_key
          securityContext:
            runAsUser: 0
            runAsGroup: 0
            allowPrivilegeEscalation: false
      # imagePullSecrets would go here if the image were private; see
      # the first deployment for how the pull secret was created.
      # imagePullSecrets:
      #   - name: robot-scicomp-docs-search-pull
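# A typical end-to-end workflow for this file would be something like
# the following. The request path and header are a sketch: they assume
# the nginx sidecar checks AUTH_TOKEN via an Authorization header and
# proxies to the llama.cpp server's OpenAI-compatible API, which is
# not verifiable from this file alone.
#
#   kubectl apply -f k8s.yaml
#   curl https://llm.k8s-test.cs.aalto.fi/llama2/v1/models \
#     -H "Authorization: Bearer TOKEN"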