Skip to content

Commit

Permalink
Kernel and trainer updates
Browse files Browse the repository at this point in the history
  • Loading branch information
Celes Renata committed Dec 9, 2024
1 parent cb46b83 commit 7083cbf
Show file tree
Hide file tree
Showing 50 changed files with 761 additions and 199 deletions.
4 changes: 2 additions & 2 deletions OneTrainer/bin/create-users.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ while read id username hash groups; do
# Create group
addgroup --gid $id $username
# Create user
useradd -m -u $id -s /bin/bash -g $username $username
useradd -m -u $id -s /bin/bash -g $username -d /workspace $username
# Set password
echo "$username:$hash" | /usr/sbin/chpasswd -e
# Add supplemental groups
if [ $groups ]; then
usermod -aG $groups $username
fi
chown -R $username /home/workspace
chown -R $username /workspace
done < /etc/users.list
3 changes: 3 additions & 0 deletions OneTrainer/bin/docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
# Add users
bash /usr/bin/create-users.sh

# Clone Static Homedir
rsync -aHx /home/workspace /

# Add the ssh config if needed

if [ ! -f "/etc/ssh/sshd_config" ];
Expand Down
67 changes: 67 additions & 0 deletions OneTrainer/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Deployment running a single OneTrainer pod. The kompose.* annotations record
# that the manifest was produced by `kompose convert`.
apiVersion: apps/v1
kind: Deployment
metadata:
  annotations:
    kompose.cmd: kompose convert
    kompose.version: 1.34.0 (HEAD)
  labels:
    io.kompose.service: onetrainer
  name: onetrainer
spec:
  replicas: 1
  selector:
    matchLabels:
      io.kompose.service: onetrainer
  # Recreate: the existing pod is stopped before its replacement is started.
  strategy:
    type: Recreate
  template:
    metadata:
      annotations:
        kompose.cmd: kompose convert
        kompose.version: 1.34.0 (HEAD)
      labels:
        # Must match spec.selector above and the selector of the onetrainer Service.
        io.kompose.service: onetrainer
    spec:
      containers:
        # Explicit empty list instead of a bare `env:` key (which parses as null).
        - env: []
          image: ghcr.io/celesrenata/onetrainer:latest
          name: onetrainer
          ports:
            - containerPort: 3350
              protocol: TCP
            # Target of the Service port named trainer-ssh.
            - containerPort: 22
              protocol: TCP
            # Target of the Service port named trainer-rdp.
            - containerPort: 3389
              protocol: TCP
            # Target of the Service port named trainer-tensorboard.
            - containerPort: 6006
              protocol: TCP
          resources:
            limits:
              memory: 64Gi
              # Extended-resource counts are quoted so they stay strings.
              nvidia.com/gpu: "1"
              intel.com/sriov-gpudevice: "1"
          volumeMounts:
            - mountPath: /home/workspace
              name: onetrainer-home
            - mountPath: /workspace
              name: onetrainer-workspace
            - mountPath: /workspace/models
              name: onetrainer-models
            # input/ and output/ are two sub-directories of the same tmp volume.
            - mountPath: /workspace/input
              name: onetrainer-tmp
              subPath: input
            - mountPath: /workspace/output
              name: onetrainer-tmp
              subPath: output
      restartPolicy: Always
      volumes:
        - name: onetrainer-workspace
          persistentVolumeClaim:
            claimName: onetrainer-workspace
        - name: onetrainer-home
          persistentVolumeClaim:
            claimName: onetrainer-home
        - name: onetrainer-models
          persistentVolumeClaim:
            claimName: onetrainer-models
        - name: onetrainer-tmp
          persistentVolumeClaim:
            claimName: onetrainer-tmp
17 changes: 11 additions & 6 deletions OneTrainer/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ RUN make
RUN mkdir -p /tmp/so
RUN cp src/.libs/*.so /tmp/so

WORKDIR /tmp
RUN git clone https://github.com/aristocratos/btop.git
WORKDIR /tmp/btop
RUN gmake PREFIX=/usr && gmake install && gmake setuid


FROM nvidia/cuda:12.5.0-runtime-ubuntu22.04
ENV TZ=America/Los_Angeles
ENV DEBIAN_FRONTEND noninteractive
Expand All @@ -47,6 +53,7 @@ RUN apt -y full-upgrade && apt-get install -y \
ca-certificates \
crudini \
firefox \
kitty \
less \
locales \
openssh-server \
Expand All @@ -56,6 +63,7 @@ RUN apt -y full-upgrade && apt-get install -y \
uuid-runtime \
vim \
vlc \
rsync \
wget \
xauth \
xautolock \
Expand All @@ -72,6 +80,7 @@ RUN apt -y full-upgrade && apt-get install -y \
xorgxrdp \
xprintidle \
xrdp \
nvidia-cuda-toolkit \
$ADDITIONAL_PACKAGES && \
apt remove -y light-locker xscreensaver && \
apt autoremove -y && \
Expand Down Expand Up @@ -99,14 +108,10 @@ RUN mkdir /var/run/dbus && \
RUN apt update && apt-get install fonts-droid-fallback ttf-wqy-zenhei ttf-wqy-microhei fonts-arphic-ukai fonts-arphic-uming -y
RUN apt update && apt install curl htop neofetch python3-pip python3-tk python-is-python3 libjpeg-dev p7zip-full gcc g++ fonts-noto-cjk-extra -y

WORKDIR /home/workspace
WORKDIR /workspace

#RUN curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba

ENV PYTHON=python3.10
ENV ONETRAINER_PATH=/home/workspace/
ENV PATH="$PATH:$ONETRAINER_PATH"

# python
RUN apt-get update \
&& apt-get install -y mesa-utils libgl1-mesa-dri libgtkgl2.0-dev libgtkglext1-dev git software-properties-common apt-utils \
Expand All @@ -127,7 +132,7 @@ RUN git clone https://github.com/Nerogar/OneTrainer.git

# RUN
ENV PYTHON=python3.10
ENV ONETRAINER_PATH=/home/workspace/
ENV ONETRAINER_PATH=/workspace/OneTrainer
ENV PATH="$PATH:$ONETRAINER_PATH"
WORKDIR $ONETRAINER_PATH

Expand Down
47 changes: 47 additions & 0 deletions OneTrainer/ingress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Plain-HTTP ingress for tensorboard.celestium.life on Traefik's `web`
# entrypoint, with the default-redirectscheme middleware attached.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tensor-http
  labels:
    app: onetrainer
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: web
    traefik.ingress.kubernetes.io/router.middlewares: default-redirectscheme@kubernetescrd
spec:
  rules:
    - host: tensorboard.celestium.life
      http:
        paths:
          # Every path under / goes to port 3703 of the onetrainer Service.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: onetrainer
                port:
                  number: 3703
---
# HTTPS ingress for tensorboard.celestium.life on Traefik's `websecure`
# entrypoint; the cert-manager annotation names the ca-issuer cluster issuer.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tensor-https
  labels:
    app: onetrainer
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    cert-manager.io/cluster-issuer: ca-issuer
spec:
  tls:
    # Certificate for the host below is read from the tensor-cert secret.
    - secretName: tensor-cert
      hosts:
        - tensorboard.celestium.life
  rules:
    - host: tensorboard.celestium.life
      http:
        paths:
          # Every path under / goes to port 3703 of the onetrainer Service.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: onetrainer
                port:
                  number: 3703
27 changes: 3 additions & 24 deletions OneTrainer/nfs-pv.yaml
Original file line number Diff line number Diff line change
@@ -1,28 +1,7 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: supervisor-workspace
spec:
persistentVolumeReclaimPolicy: Delete
capacity:
storage: 1000Gi
accessModes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
storageClassName: ""
nfs:
path: /volume1/Kubernetes/onetrainer/workspace # The path to your media
server: 192.168.42.8 # Your NFS server with Media
mountOptions:
- vers=4
- minorversion=1
- noac
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: supervisor-home
name: onetrainer-home
spec:
persistentVolumeReclaimPolicy: Delete
capacity:
Expand All @@ -43,7 +22,7 @@ spec:
apiVersion: v1
kind: PersistentVolume
metadata:
name: supervisor-models
name: onetrainer-models
spec:
persistentVolumeReclaimPolicy: Delete
capacity:
Expand All @@ -64,7 +43,7 @@ spec:
apiVersion: v1
kind: PersistentVolume
metadata:
name: supervisor-tmp
name: onetrainer-tmp
spec:
persistentVolumeReclaimPolicy: Delete
capacity:
Expand Down
57 changes: 57 additions & 0 deletions OneTrainer/nfs-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Claim backing the /workspace mount of the onetrainer Deployment.
# Unlike the other claims in this file it sets neither volumeName nor
# storageClassName, so it is not pinned to a specific PersistentVolume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: onetrainer-workspace
  labels:
    io.kompose.service: onetrainer-workspace
spec:
  resources:
    requests:
      storage: 400Gi
  accessModes:
    - ReadWriteOnce
---
# Claim for the home volume, pinned by name to the PersistentVolume
# onetrainer-home.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: onetrainer-home
  labels:
    io.kompose.service: onetrainer-home
spec:
  # Empty storage class plus an explicit volumeName: bind to that PV only.
  storageClassName: ""
  volumeName: onetrainer-home
  resources:
    requests:
      storage: 1000Gi
  accessModes:
    - ReadWriteOnce
---
# Claim for the models volume, pinned by name to the PersistentVolume
# onetrainer-models.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: onetrainer-models
  labels:
    io.kompose.service: onetrainer-models
spec:
  # Empty storage class plus an explicit volumeName: bind to that PV only.
  storageClassName: ""
  volumeName: onetrainer-models
  resources:
    requests:
      storage: 1000Gi
  accessModes:
    - ReadWriteOnce
---
# Claim for the tmp volume, pinned by name to the PersistentVolume
# onetrainer-tmp.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: onetrainer-tmp
  labels:
    io.kompose.service: onetrainer-tmp
spec:
  # Empty storage class plus an explicit volumeName: bind to that PV only.
  storageClassName: ""
  volumeName: onetrainer-tmp
  resources:
    requests:
      storage: 1000Gi
  accessModes:
    - ReadWriteOnce
3 changes: 2 additions & 1 deletion OneTrainer/runmefirst.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env bash
# Bring-up script: creates the onetrainer-service namespace and applies the
# manifests found in the current directory.
kubectl create namespace onetrainer-service
# Apply every manifest in the current directory into onetrainer-service.
kubectl apply -n onetrainer-service -f .
# Same command as the previous line, with the flags in a different order.
kubectl apply -f . -n onetrainer-service
# snapshot.yaml is applied on its own without -n; its metadata sets
# namespace longhorn-system rather than onetrainer-service.
kubectl apply -f snapshot.yaml
3 changes: 3 additions & 0 deletions OneTrainer/runmelast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Teardown script: removes what runmefirst.sh applied. There is no `set -e`,
# so each step runs even if an earlier one fails.

# NOTE(review): runmefirst.sh as shown installs nothing through Helm; confirm
# that a release named "ollama" is really expected in this namespace.
# The namespace previously read "onetrainer-servicee" (typo), which never matched.
helm uninstall ollama --namespace onetrainer-service

# Delete every manifest in the current directory from onetrainer-service.
kubectl delete -f . -n onetrainer-service

# snapshot.yaml declares namespace longhorn-system, so it is deleted on its own
# without -n, mirroring the separate apply in runmefirst.sh.
kubectl delete -f snapshot.yaml --ignore-not-found
41 changes: 41 additions & 0 deletions OneTrainer/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# LoadBalancer Service in front of the pods labelled
# io.kompose.service=onetrainer. Exposes SSH, RDP and TensorBoard.
apiVersion: v1
kind: Service
metadata:
  name: onetrainer
spec:
  type: LoadBalancer
  sessionAffinity: None
  selector:
    io.kompose.service: onetrainer
  ports:
    # 3701 -> container port 22, also pinned to node port 31244.
    - name: trainer-ssh
      protocol: TCP
      port: 3701
      targetPort: 22
      nodePort: 31244
    # 3702 -> container port 3389, also pinned to node port 31245.
    - name: trainer-rdp
      protocol: TCP
      port: 3702
      targetPort: 3389
      nodePort: 31245
    # 3703 -> container port 6006; no nodePort is set for this entry.
    - name: trainer-tensorboard
      protocol: TCP
      port: 3703
      targetPort: 6006
# Empty status stanza kept exactly as in the original manifest.
status:
  loadBalancer: {}
---
# Service exposing a metrics port (9100) for pods labelled
# prometheus.kubevirt.io=node-exporter.
# NOTE(review): the onetrainer Deployment's pod template does not set this
# label; confirm which pods this selector is meant to match.
apiVersion: v1
kind: Service
metadata:
  name: onetrainer-node-exporter
  labels:
    prometheus.kubevirt.io: node-exporter
spec:
  selector:
    prometheus.kubevirt.io: node-exporter
  ports:
    - name: metrics
      protocol: TCP
      port: 9100
      targetPort: 9100
15 changes: 15 additions & 0 deletions OneTrainer/snapshot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Longhorn RecurringJob that runs the `snapshot` task for the
# onetrainer-service group. It lives in longhorn-system, not in the
# onetrainer-service namespace.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: onetrainer-snapshot
  namespace: longhorn-system
spec:
  task: snapshot
  # Minute 0 of every second hour.
  cron: "0 */2 * * *"
  retain: 40
  concurrency: 2
  groups:
    - onetrainer-service
  # NOTE(review): label/1 and label/2 look like placeholder values; confirm
  # they are intended.
  labels:
    label/1: a
    label/2: b
Loading

0 comments on commit 7083cbf

Please sign in to comment.