diff --git a/OneTrainer/bin/create-users.sh b/OneTrainer/bin/create-users.sh index b22e9e2..f3dfee1 100644 --- a/OneTrainer/bin/create-users.sh +++ b/OneTrainer/bin/create-users.sh @@ -8,12 +8,12 @@ while read id username hash groups; do # Create group addgroup --gid $id $username # Create user - useradd -m -u $id -s /bin/bash -g $username $username + useradd -m -u $id -s /bin/bash -g $username -d /workspace $username # Set password echo "$username:$hash" | /usr/sbin/chpasswd -e # Add supplemental groups if [ $groups ]; then usermod -aG $groups $username fi - chown -R $username /home/workspace + chown -R $username /workspace done < /etc/users.list diff --git a/OneTrainer/bin/docker-entrypoint.sh b/OneTrainer/bin/docker-entrypoint.sh index a432100..83f2e1c 100755 --- a/OneTrainer/bin/docker-entrypoint.sh +++ b/OneTrainer/bin/docker-entrypoint.sh @@ -3,6 +3,9 @@ # Add users bash /usr/bin/create-users.sh +# Clone Static Homedir +rsync -aHx /home/workspace / + # Add the ssh config if needed if [ ! 
-f "/etc/ssh/sshd_config" ]; diff --git a/OneTrainer/deployment.yaml b/OneTrainer/deployment.yaml new file mode 100644 index 0000000..0a0728c --- /dev/null +++ b/OneTrainer/deployment.yaml @@ -0,0 +1,67 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + annotations: + kompose.cmd: kompose convert + kompose.version: 1.34.0 (HEAD) + labels: + io.kompose.service: onetrainer + name: onetrainer +spec: + replicas: 1 + selector: + matchLabels: + io.kompose.service: onetrainer + strategy: + type: Recreate + template: + metadata: + annotations: + kompose.cmd: kompose convert + kompose.version: 1.34.0 (HEAD) + labels: + io.kompose.service: onetrainer + spec: + containers: + - env: + image: ghcr.io/celesrenata/onetrainer:latest + name: onetrainer + ports: + - containerPort: 3350 + protocol: TCP + - containerPort: 22 + protocol: TCP + - containerPort: 3389 + protocol: TCP + resources: + limits: + memory: 64Gi + nvidia.com/gpu: "1" + intel.com/sriov-gpudevice: "1" + volumeMounts: + - mountPath: /home/workspace + name: onetrainer-home + - mountPath: /workspace + name: onetrainer-workspace + - mountPath: /workspace/models + name: onetrainer-models + - mountPath: /workspace/input + name: onetrainer-tmp + subPath: input + - mountPath: /workspace/output + name: onetrainer-tmp + subPath: output + restartPolicy: Always + volumes: + - name: onetrainer-workspace + persistentVolumeClaim: + claimName: onetrainer-workspace + - name: onetrainer-home + persistentVolumeClaim: + claimName: onetrainer-home + - name: onetrainer-models + persistentVolumeClaim: + claimName: onetrainer-models + - name: onetrainer-tmp + persistentVolumeClaim: + claimName: onetrainer-tmp diff --git a/OneTrainer/docker/Dockerfile b/OneTrainer/docker/Dockerfile index 451d3fb..38d826f 100644 --- a/OneTrainer/docker/Dockerfile +++ b/OneTrainer/docker/Dockerfile @@ -32,6 +32,12 @@ RUN make RUN mkdir -p /tmp/so RUN cp src/.libs/*.so /tmp/so +WORKDIR /tmp +RUN git clone https://github.com/aristocratos/btop.git +WORKDIR 
/tmp/btop +RUN gmake PREFIX=/usr && gmake install && gmake setuid + + FROM nvidia/cuda:12.5.0-runtime-ubuntu22.04 ENV TZ=America/Los_Angeles ENV DEBIAN_FRONTEND noninteractive @@ -47,6 +53,7 @@ RUN apt -y full-upgrade && apt-get install -y \ ca-certificates \ crudini \ firefox \ + kitty \ less \ locales \ openssh-server \ @@ -56,6 +63,7 @@ RUN apt -y full-upgrade && apt-get install -y \ uuid-runtime \ vim \ vlc \ + rsync \ wget \ xauth \ xautolock \ @@ -72,6 +80,7 @@ RUN apt -y full-upgrade && apt-get install -y \ xorgxrdp \ xprintidle \ xrdp \ + nvidia-cuda-toolkit \ $ADDITIONAL_PACKAGES && \ apt remove -y light-locker xscreensaver && \ apt autoremove -y && \ @@ -99,14 +108,10 @@ RUN mkdir /var/run/dbus && \ RUN apt update && apt-get install fonts-droid-fallback ttf-wqy-zenhei ttf-wqy-microhei fonts-arphic-ukai fonts-arphic-uming -y RUN apt update && apt install curl htop neofetch python3-pip python3-tk python-is-python3 libjpeg-dev p7zip-full gcc g++ fonts-noto-cjk-extra -y -WORKDIR /home/workspace +WORKDIR /workspace #RUN curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba -ENV PYTHON=python3.10 -ENV ONETRAINER_PATH=/home/workspace/ -ENV PATH="$PATH:$ONETRAINER_PATH" - # python RUN apt-get update \ && apt-get install -y mesa-utils libgl1-mesa-dri libgtkgl2.0-dev libgtkglext1-dev git software-properties-common apt-utils \ @@ -127,7 +132,7 @@ RUN git clone https://github.com/Nerogar/OneTrainer.git # RUN ENV PYTHON=python3.10 -ENV ONETRAINER_PATH=/home/workspace/ +ENV ONETRAINER_PATH=/workspace/OneTrainer ENV PATH="$PATH:$ONETRAINER_PATH" WORKDIR $ONETRAINER_PATH diff --git a/OneTrainer/ingress.yaml b/OneTrainer/ingress.yaml new file mode 100644 index 0000000..2bb6238 --- /dev/null +++ b/OneTrainer/ingress.yaml @@ -0,0 +1,47 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tensor-http + annotations: + traefik.ingress.kubernetes.io/router.entrypoints: web + traefik.ingress.kubernetes.io/router.middlewares: 
default-redirectscheme@kubernetescrd + labels: + app: onetrainer +spec: + rules: + - host: tensorboard.celestium.life + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: onetrainer + port: + number: 3703 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tensor-https + annotations: + traefik.ingress.kubernetes.io/router.entrypoints: websecure + cert-manager.io/cluster-issuer: ca-issuer + labels: + app: onetrainer +spec: + rules: + - host: tensorboard.celestium.life + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: onetrainer + port: + number: 3703 + tls: + - hosts: + - tensorboard.celestium.life + secretName: tensor-cert diff --git a/OneTrainer/nfs-pv.yaml b/OneTrainer/nfs-pv.yaml index 675fb13..1ea419a 100644 --- a/OneTrainer/nfs-pv.yaml +++ b/OneTrainer/nfs-pv.yaml @@ -1,28 +1,7 @@ apiVersion: v1 kind: PersistentVolume metadata: - name: supervisor-workspace -spec: - persistentVolumeReclaimPolicy: Delete - capacity: - storage: 1000Gi - accessModes: - - ReadWriteOnce - - ReadOnlyMany - - ReadWriteMany - storageClassName: "" - nfs: - path: /volume1/Kubernetes/onetrainer/workspace # The path to your media - server: 192.168.42.8 # Your NFS server with Media - mountOptions: - - vers=4 - - minorversion=1 - - noac ---- -apiVersion: v1 -kind: PersistentVolume -metadata: - name: supervisor-home + name: onetrainer-home spec: persistentVolumeReclaimPolicy: Delete capacity: @@ -43,7 +22,7 @@ spec: apiVersion: v1 kind: PersistentVolume metadata: - name: supervisor-models + name: onetrainer-models spec: persistentVolumeReclaimPolicy: Delete capacity: @@ -64,7 +43,7 @@ spec: apiVersion: v1 kind: PersistentVolume metadata: - name: supervisor-tmp + name: onetrainer-tmp spec: persistentVolumeReclaimPolicy: Delete capacity: diff --git a/OneTrainer/nfs-pvc.yaml b/OneTrainer/nfs-pvc.yaml new file mode 100644 index 0000000..2e4284b --- /dev/null +++ b/OneTrainer/nfs-pvc.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 
+kind: PersistentVolumeClaim +metadata: + labels: + io.kompose.service: onetrainer-workspace + name: onetrainer-workspace +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 400Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + io.kompose.service: onetrainer-home + name: onetrainer-home +spec: + volumeName: onetrainer-home + storageClassName: "" + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1000Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + io.kompose.service: onetrainer-models + name: onetrainer-models +spec: + volumeName: onetrainer-models + storageClassName: "" + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1000Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + io.kompose.service: onetrainer-tmp + name: onetrainer-tmp +spec: + volumeName: onetrainer-tmp + storageClassName: "" + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1000Gi diff --git a/OneTrainer/runmefirst.sh b/OneTrainer/runmefirst.sh index 7aadf02..0d5798c 100755 --- a/OneTrainer/runmefirst.sh +++ b/OneTrainer/runmefirst.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash kubectl create namespace onetrainer-service -kubectl apply -n onetrainer-service -f . +kubectl apply -f . -n onetrainer-service +kubectl apply -f snapshot.yaml diff --git a/OneTrainer/runmelast.sh b/OneTrainer/runmelast.sh new file mode 100755 index 0000000..44acb74 --- /dev/null +++ b/OneTrainer/runmelast.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +helm uninstall ollama --namespace onetrainer-service +kubectl delete -f .
-n onetrainer-service diff --git a/OneTrainer/service.yaml b/OneTrainer/service.yaml new file mode 100644 index 0000000..7f7e570 --- /dev/null +++ b/OneTrainer/service.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +kind: Service +metadata: + name: onetrainer +spec: + ports: + - nodePort: 31244 + name: "trainer-ssh" + port: 3701 + protocol: TCP + targetPort: 22 + - nodePort: 31245 + name: "trainer-rdp" + port: 3702 + protocol: TCP + targetPort: 3389 + - name: "trainer-tensorboard" + port: 3703 + protocol: TCP + targetPort: 6006 + selector: + io.kompose.service: onetrainer + sessionAffinity: None + type: LoadBalancer +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: onetrainer-node-exporter + labels: + prometheus.kubevirt.io: "node-exporter" +spec: + ports: + - name: metrics + port: 9100 + targetPort: 9100 + protocol: TCP + selector: + prometheus.kubevirt.io: "node-exporter" diff --git a/OneTrainer/snapshot.yaml b/OneTrainer/snapshot.yaml new file mode 100644 index 0000000..ea2ebe9 --- /dev/null +++ b/OneTrainer/snapshot.yaml @@ -0,0 +1,15 @@ +apiVersion: longhorn.io/v1beta1 +kind: RecurringJob +metadata: + name: onetrainer-snapshot + namespace: longhorn-system +spec: + cron: "0 */2 * * *" + task: "snapshot" + groups: + - onetrainer-service + retain: 40 + concurrency: 2 + labels: + label/1: a + label/2: b diff --git a/README.md b/README.md deleted file mode 100644 index db500c2..0000000 --- a/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# NixOS Intel Ultra 9 185H K3s with SR-IOV GPU Passthrough -## Featuring the Following Projects -* Cert Manager -* Cluster Plex -* ComfyUI (Nvidia) -* Dashboard -* Flame -* Grafana -* Intel SR-IOV Plugins -* Kubevirt - * Amazon Linux 2023 - * Arch Linux - * NixOS 24.05 (Intel) - * Ubuntu 24.04 Ollama (Intel IPEX) - * Windows 11 (Intel GPU broken) -* Kyverno -* Longhorn -* MariaDB -* Nvidia Container Plugins -* Nvidia Containerd Toolkit -* PHPMyAdmin -* Portainer -* Prometheus -* ReviewBoard -* Traefik -* Unifi 
Controller -* Wordpress - -## Prerequisites -* 3x Intel 14th Gen Processors with the 185H or similar. I use BeeLink 3x GTi14(s). - * Will also work on ARC dGPUs with a little fiddling. -* Seperate VLAN for your Kube Cluster. - -## Installation -* Contained in the repo are the main configurations required to build out each host. -1. [Build a ThumbDrive or PXE Boot NixOS 24.05](https://wiki.nixos.org/wiki/NixOS_Installation_Guide) - * Complete steps through SWAP setup. -3. `sudo nix-channel --update` -4. `git clone https://github.com/celesrenata/nixos-k3s-configs/` -5. `sudo nixos-generate-config --root /mnt` -6. `cp -r nixos-k3s-configs/nixos-kube-config/gremlin-1/* /mnt/etc/nixos/` -7. sudo nixos-generate-config --root /mnt - * Yes, again -8. You may now edit your hardware-configuration.nix file to your liking -9. `nixos-install --root /mnt` -10. `sudo nixos-enter` -11. `passwd celes` -12. `exit` -13. `reboot` - -### Repeat for Gremlins 2 and 3 -Login to and add your own ssh configs to your account, root, and nixremote accounts: -* Add your own authroized keys, you will need these as the fleet does not work without passwordless SSH! - -## Networking -1. Set your network to expect `10.1.1.12, 10.1.1.13, 10.1.1.14` for your Cluster - -## Configuring NFS -1. These configs are setup for my NFS server, you will have to edit all your PVC files to meet your needs -2. Leaving these details in have been way more useful than not demonstrating how to create truely persistant volumes - -## Ensuring Cluster is Happy -1. I have included automation scripts for resetting the fleet to known good states as well as scripts to deploy all the services I have figured out! -2. 
`./resetfleet.sh` - -## Edit the Cluster Deployments -Each script is controlled by a `runmefirst.sh` file in the directory of the service, and is stood up by the following automation script: -* `./runmefirst.sh` -* Edit this file to turn off deployments you do not desire for your Cluster - -## Ollama via IPEX -Ollama is controlled via the IPEX fleet within the `kubevirt` directory -If you have more than 32GB of ram per Node you can then use Ipex-LLM Ollama! -`kubevirt/ipex-1x/runmefirst.sh` - -## TODO -* Rebuild Unifi Controller -* Resolve Problem (43) in Win11 when passing SR-IOV Intel graphics to it. diff --git a/comfyui/runmelast.sh b/comfyui/runmelast.sh new file mode 100755 index 0000000..2ce4666 --- /dev/null +++ b/comfyui/runmelast.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +helm uninstall ollama --namespace comfyui-service +kubectl delete -f . -n comfyui-service diff --git a/everydream2/ingress.yaml b/everydream2/ingress.yaml new file mode 100644 index 0000000..172f7af --- /dev/null +++ b/everydream2/ingress.yaml @@ -0,0 +1,47 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: supervisor + annotations: + traefik.ingress.kubernetes.io/router.middlewares: default-comfyui-stripprefix@kubernetescrd + traefik.ingress.kubernetes.io/router.entrypoints: websecure + cert-manager.io/cluster-issuer: ca-issuer +spec: + ingressClassName: traefik + rules: + - host: ed2.celestium.life + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: supervisor + port: + number: 28888 + - host: ed2.dev.celestium.life + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: supervisor + port: + number: 28888 + tls: + - hosts: + - ed2.celestium.life + - ed2.dev.celestium.life + secretName: ed2-cert +--- +# Source: comfyui/templates/ingress.yaml +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: comfyui-stripprefix +spec: + stripPrefix: + prefixes: + - /assets + - /lab diff --git a/everydream2/nfs-pv.yaml
b/everydream2/nfs-pv.yaml new file mode 100644 index 0000000..09a5d0b --- /dev/null +++ b/everydream2/nfs-pv.yaml @@ -0,0 +1,42 @@ +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: ed2-model +spec: + persistentVolumeReclaimPolicy: Delete + capacity: + storage: 1000Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + storageClassName: "" + nfs: + path: /volume1/Kubernetes/comfyui/models # The path to your config + server: 192.168.42.8 # Your NFS server with Media + mountOptions: + - vers=4 + - minorversion=1 + - noac +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: ed2-tmp +spec: + persistentVolumeReclaimPolicy: Delete + capacity: + storage: 1000Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + storageClassName: "" + nfs: + path: /volume1/Kubernetes/comfyui/tmp # The path to your config + server: 192.168.42.8 # Your NFS server with Media + mountOptions: + - vers=4 + - minorversion=1 + - noac diff --git a/everydream2/runmefirst.sh b/everydream2/runmefirst.sh new file mode 100755 index 0000000..de7e6d8 --- /dev/null +++ b/everydream2/runmefirst.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +kubectl create namespace everydream2-service +kubectl apply -f . 
-n everydream2-service diff --git a/everydream2/supervisor-claim2-persistentvolumeclaim.yaml b/everydream2/supervisor-claim2-persistentvolumeclaim.yaml new file mode 100644 index 0000000..e98be0d --- /dev/null +++ b/everydream2/supervisor-claim2-persistentvolumeclaim.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ed2-model +spec: + volumeName: ed2-model + storageClassName: "" + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1000Gi diff --git a/everydream2/supervisor-claim3-persistentvolumeclaim.yaml b/everydream2/supervisor-claim3-persistentvolumeclaim.yaml new file mode 100644 index 0000000..b8d7668 --- /dev/null +++ b/everydream2/supervisor-claim3-persistentvolumeclaim.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ed2-tmp +spec: + volumeName: ed2-tmp + storageClassName: "" + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1000Gi diff --git a/everydream2/supervisor-deployment.yaml b/everydream2/supervisor-deployment.yaml new file mode 100644 index 0000000..6670060 --- /dev/null +++ b/everydream2/supervisor-deployment.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + annotations: + kompose.cmd: kompose convert + kompose.version: 1.34.0 (HEAD) + labels: + io.kompose.service: supervisor + name: supervisor +spec: + replicas: 1 + selector: + matchLabels: + io.kompose.service: supervisor + strategy: + type: Recreate + template: + metadata: + annotations: + kompose.cmd: kompose convert + kompose.version: 1.34.0 (HEAD) + labels: + io.kompose.service: supervisor + spec: + containers: + - env: + - name: JUPYTER_PASSWORD + value: renata + image: ghcr.io/victorchall/everydream2trainer:main + name: supervisor + ports: + - containerPort: 6006 + protocol: TCP + - containerPort: 8888 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /workspace/EveryDream2trainer/models + name: ed2-model + - 
mountPath: /workspace/EveryDream2trainer/input + name: ed2-tmp + subPath: input + - mountPath: /workspace/EveryDream2trainer/output + name: ed2-tmp + subPath: output + restartPolicy: Always + volumes: + - name: ed2-model + persistentVolumeClaim: + claimName: ed2-model + - name: ed2-tmp + persistentVolumeClaim: + claimName: ed2-tmp diff --git a/everydream2/supervisor-service.yaml b/everydream2/supervisor-service.yaml new file mode 100644 index 0000000..bab0c41 --- /dev/null +++ b/everydream2/supervisor-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: supervisor +spec: + ports: + - port: 28888 + protocol: TCP + targetPort: 8888 + selector: + io.kompose.service: supervisor + sessionAffinity: None + type: LoadBalancer +status: + loadBalancer: {} diff --git a/mongodb/default-mongodb.yaml b/mongodb/default-mongodb.yaml new file mode 100644 index 0000000..54b31c6 --- /dev/null +++ b/mongodb/default-mongodb.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: default-mongodb +spec: + persistentVolumeReclaimPolicy: Delete + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + storageClassName: "" + nfs: + path: /volume1/Kubernetes/mongodb/data # The path to your media + server: 192.168.42.8 # Your NFS server with Media + mountOptions: + - vers=4 + - minorversion=1 + - noac +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: default-mongodb-pvc +spec: + volumeName: default-mongodb + accessModes: + - ReadWriteMany + storageClassName: "" + resources: + requests: + storage: 10Gi diff --git a/mongodb/runmefirst.sh b/mongodb/runmefirst.sh new file mode 100755 index 0000000..506ba0c --- /dev/null +++ b/mongodb/runmefirst.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +helm repo add mongodb https://mongodb.github.io/helm-charts +helm upgrade -i mongodb-operator mongodb/community-operator \ + --namespace mongodb-operator --create-namespace \ + --set operator.watchNamespace="*" + +#kubectl create namespace
rainbow-mongodb +#kubectl apply -f rainbow-mongodb.yaml -n rainbow-mongodb +#helm install mongodb bitnami/mongodb \ +# --namespace rainbow-mongodb \ +# --set persistence.existingClaim=rainbow-mongodb-pvc diff --git a/mongodb/values.yaml b/mongodb/values.yaml new file mode 100644 index 0000000..ab8cc4c --- /dev/null +++ b/mongodb/values.yaml @@ -0,0 +1,126 @@ +## Reference to one or more secrets to be used when pulling images +## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] +# - name: "image-pull-secret" +## Operator +operator: + # Name that will be assigned to most of internal Kubernetes objects like + # Deployment, ServiceAccount, Role etc. + name: mongodb-kubernetes-operator + + # Name of the operator image + operatorImageName: mongodb-kubernetes-operator + + # Name of the deployment of the operator pod + deploymentName: mongodb-kubernetes-operator + + # Version of mongodb-kubernetes-operator + version: 0.11.0 + + # Uncomment this line to watch all namespaces + watchNamespace: "*" + + # Resources allocated to Operator Pod + resources: + limits: + cpu: 1100m + memory: 1Gi + requests: + cpu: 500m + memory: 200Mi + + # PriorityClass configuration for operator + # ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: '' + + # replicas deployed for the operator pod. Running 1 is optimal and suggested. 
+ replicas: 1 + + # Additional environment variables + extraEnvs: [] + # environment: + # - name: CLUSTER_DOMAIN + # value: my-cluster.domain + + podSecurityContext: + runAsNonRoot: true + runAsUser: 2000 + + securityContext: {} + +## Operator's database +database: + name: mongodb-database + # set this to the namespace where you would like + # to deploy the MongoDB database, + # Note if the database namespace is not same + # as the operator namespace, + # make sure to set "watchNamespace" to "*" + # to ensure that the operator has the + # permission to reconcile resources in other namespaces + # namespace: mongodb-database + +agent: + name: mongodb-agent-ubi + version: 107.0.7.8596-1 +versionUpgradeHook: + name: mongodb-kubernetes-operator-version-upgrade-post-start-hook + version: 1.0.9 +readinessProbe: + name: mongodb-kubernetes-readinessprobe + version: 1.0.20 +mongodb: + name: mongodb-community-server + repo: docker.io/mongodb + imageType: ubi8 + +registry: + agent: quay.io/mongodb + versionUpgradeHook: quay.io/mongodb + readinessProbe: quay.io/mongodb + operator: quay.io/mongodb + pullPolicy: Always + +# Set to false if CRDs have been installed already. The CRDs can be installed +# manually from the code repo: github.com/mongodb/mongodb-kubernetes-operator or +# using the `community-operator-crds` Helm chart. +community-operator-crds: + enabled: true + +# Deploys MongoDB with `resource` attributes. +createResource: false +resource: + name: mongodb-replica-set + version: 4.4.0 + members: 3 + tls: + enabled: false + + # Installs Cert-Manager in this cluster. 
+ useX509: false + sampleX509User: false + useCertManager: true + certificateKeySecretRef: tls-certificate + caCertificateSecretRef: tls-ca-key-pair + certManager: + certDuration: 8760h # 365 days + renewCertBefore: 720h # 30 days + + users: [] + # if using the MongoDBCommunity Resource, list any users to be added to the resource + # users: + # - name: my-user + # db: admin + # passwordSecretRef: # a reference to the secret that will be used to generate the user's password + # name: + # roles: + # - name: clusterAdmin + # db: admin + # - name: userAdminAnyDatabase + # db: admin + # - name: readWriteAnyDatabase + # db: admin + # - name: dbAdminAnyDatabase + # db: admin + # scramCredentialsSecretName: my-scram + diff --git a/nixos-kube-configs/gremlin-1/boot.nix b/nixos-kube-configs/gremlin-1/boot.nix index a15eaae..ba790e9 100755 --- a/nixos-kube-configs/gremlin-1/boot.nix +++ b/nixos-kube-configs/gremlin-1/boot.nix @@ -17,7 +17,7 @@ boot.initrd.kernelModules = [ "vmd" "md_mod" "raid0" ]; # See Kernel Overlay - boot.kernelPackages = pkgs.kernel611; + boot.kernelPackages = pkgs.kernelPXP; boot.binfmt.emulatedSystems = [ "aarch64-linux" ]; boot.kernelModules = [ "i915" ]; boot.supportedFilesystems = [ "nfs" ]; diff --git a/nixos-kube-configs/gremlin-1/configuration.nix b/nixos-kube-configs/gremlin-1/configuration.nix index 08fac14..3452190 100644 --- a/nixos-kube-configs/gremlin-1/configuration.nix +++ b/nixos-kube-configs/gremlin-1/configuration.nix @@ -25,7 +25,9 @@ nix.settings.experimental-features = [ "nix-command" "flakes" ]; nix.settings.cores = 12; nixpkgs.config.allowUnfree = true; + nixpkgs.overlays = [ + #(import ./overlays/distcc.nix) (import ./overlays/i915-sriov-dkms.nix) (import ./overlays/intel-firmware.nix) (import ./overlays/intel-gfx-sriov.nix) @@ -61,15 +63,14 @@ services.distccd = { enable = true; allowedClients = [ - "192.168.42.0/24" + "192.168.42.0/25" "10.1.1.0/24" "10.42.0.0/16" ]; - logLevel = "debug"; stats.enable = true; zeroconf = true; - 
}; - + }; + # Reset Cluster # services.etcd.enable = false; # KUBELET_PATH=$(mount | grep kubelet | cut -d' ' -f3); @@ -101,7 +102,9 @@ screen nfs-utils openiscsi + nvidia-container-toolkit nvtopPackages.intel + nvtopPackages.nvidia intel-gpu-tools nix-index gcc14 @@ -125,6 +128,6 @@ isNormalUser = true; }; - system.stateVersion = "24.05"; + system.stateVersion = "24.11"; } diff --git a/nixos-kube-configs/gremlin-1/configuration.nix.reset b/nixos-kube-configs/gremlin-1/configuration.nix.reset index c59a34f..170da21 100755 --- a/nixos-kube-configs/gremlin-1/configuration.nix.reset +++ b/nixos-kube-configs/gremlin-1/configuration.nix.reset @@ -30,7 +30,7 @@ (import ./overlays/intel-firmware.nix) (import ./overlays/intel-gfx-sriov.nix) (import ./overlays/kernel.nix) - (import ./overlays/nvidia-container-toolkit.nix) + #(import ./overlays/nvidia-container-toolkit.nix) #(import ./overlays/generic-cdi2.nix) ]; @@ -101,7 +101,9 @@ screen nfs-utils openiscsi + nvidia-container-toolkit nvtopPackages.intel + nvtopPackages.nvidia intel-gpu-tools nix-index gcc14 diff --git a/nixos-kube-configs/gremlin-1/graphics.nix b/nixos-kube-configs/gremlin-1/graphics.nix index 6378a3a..db45703 100755 --- a/nixos-kube-configs/gremlin-1/graphics.nix +++ b/nixos-kube-configs/gremlin-1/graphics.nix @@ -23,7 +23,7 @@ in rec { }; hardware.nvidia = { open = true; - package = nvidia-package; + package = config.boot.kernelPackages.nvidiaPackages.production; nvidiaSettings = true; }; hardware.cpu.intel.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware; diff --git a/nixos-kube-configs/gremlin-1/kubernetes.nix b/nixos-kube-configs/gremlin-1/kubernetes.nix index fce92ab..574ed7b 100755 --- a/nixos-kube-configs/gremlin-1/kubernetes.nix +++ b/nixos-kube-configs/gremlin-1/kubernetes.nix @@ -67,7 +67,7 @@ virtualisation = { docker = { enable = true; - package = pkgs.docker_25; + package = pkgs.docker_26; enableNvidia = true; }; }; @@ -111,4 +111,7 @@ }; }; }; + 
security.pam.loginLimits = [ + {domain = "*"; item = "memlock"; type = "-"; value = "unlimited";} + ]; } diff --git a/nixos-kube-configs/gremlin-1/overlays/i915-sriov-dkms.nix b/nixos-kube-configs/gremlin-1/overlays/i915-sriov-dkms.nix index 917703a..1b331d8 100644 --- a/nixos-kube-configs/gremlin-1/overlays/i915-sriov-dkms.nix +++ b/nixos-kube-configs/gremlin-1/overlays/i915-sriov-dkms.nix @@ -1,34 +1,34 @@ prev: final: rec { intel-gfx-sriov = prev.stdenv.mkDerivation { - name = "intel-gfx-sriov-${prev.linuxPackages_6_6.kernel.modDirVersion}"; + name = "intel-gfx-sriov-${prev.kernelPXP.kernel.modDirVersion}"; passthru.moduleName = "intel-gfx-sriov"; src = prev.fetchFromGitHub { - owner = "strongtz"; + owner = "bbaa-bbaa"; repo = "i915-sriov-dkms"; - rev = "e26ce8952e465762fc0743731aa377ec0b2889ff"; - sha256 = "sha256-O+7ZehoVOYYdCTboF9XGBR9G6I72987AdbbF1JkrsBc="; + rev = "07cc8896d28687cbe6416e64373fc21d8b383423"; + sha256 = "sha256-RVSXLx17ZFjGO1G/g/crAkdBAyGlEqM0iPFDRwnynzc="; }; hardeningDisable = [ "pic" ]; - nativeBuildInputs = final.linuxPackages_6_6.kernel.moduleBuildDependencies; + nativeBuildInputs = prev.kernelPXP.kernel.moduleBuildDependencies; makeFlags = [ - "KVERSION=${final.linuxPackages_6_6.kernel.modDirVersion}" - "KDIR=${final.linuxPackages_6_6.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build" + "KVERSION=${prev.kernelPXP.kernel.modDirVersion}" + "KDIR=${prev.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build" ]; buildFlags = [ - "KERNEL_DIR=${final.linuxPackages_6_6.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build" + "KERNEL_DIR=${prev.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build" ]; buildPhase = '' - make -j8 -C ${final.pkgs.kernel611.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build M=$(pwd) modules + make -j8 -C 
${prev.pkgs.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build M=$(pwd) modules ''; installPhase = '' - install -D i915.ko $out/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/kernel/drivers/gpu/drm/i915/i915.ko + install -D i915.ko $out/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/kernel/drivers/gpu/drm/i915/i915.ko ''; }; } diff --git a/nixos-kube-configs/gremlin-1/overlays/kernel.nix b/nixos-kube-configs/gremlin-1/overlays/kernel.nix index b24bc3e..5aafd34 100644 --- a/nixos-kube-configs/gremlin-1/overlays/kernel.nix +++ b/nixos-kube-configs/gremlin-1/overlays/kernel.nix @@ -1,9 +1,16 @@ final: prev: { - kernel611 = prev.pkgs.linuxPackagesFor (prev.pkgs.linux_6_6.override { - structuredExtraConfig = with prev.lib.kernel; { - DRM_I915_PXP = yes; - INTEL_MEI_PXP = module; + kernelPXP = prev.pkgs.linuxPackagesFor (prev.pkgs.linux.override { + argsOverride = rec { + src = prev.pkgs.fetchurl { + url = "mirror://kernel/linux/kernel/v6.x/linux-${version}.tar.xz"; + sha256 = "sha256-UkhYhS9YaanvF96LHm5/rwW8ssRivJazwk2/gu3jc88="; + }; + version = "6.10.12"; + modDirVersion = "6.10.12"; }; - ignoreConfigErrors = true; + extraConfig = '' + DRM_I915_PXP y + INTEL_MEI_PXP m + ''; }); } diff --git a/nixos-kube-configs/gremlin-1/overlays/nvidia-container-toolkit.nix b/nixos-kube-configs/gremlin-1/overlays/nvidia-container-toolkit.nix new file mode 100644 index 0000000..3b2860a --- /dev/null +++ b/nixos-kube-configs/gremlin-1/overlays/nvidia-container-toolkit.nix @@ -0,0 +1,5 @@ +final: prev: +{ + nvidia-container-toolkit = prev.nvidia-container-toolkit.overrideAttrs ({ + }); +} diff --git a/nixos-kube-configs/gremlin-2/boot.nix b/nixos-kube-configs/gremlin-2/boot.nix index a15eaae..ba790e9 100755 --- a/nixos-kube-configs/gremlin-2/boot.nix +++ b/nixos-kube-configs/gremlin-2/boot.nix @@ -17,7 +17,7 @@ boot.initrd.kernelModules = [ "vmd" "md_mod" "raid0" ]; # See Kernel Overlay - boot.kernelPackages = pkgs.kernel611; + 
boot.kernelPackages = pkgs.kernelPXP; boot.binfmt.emulatedSystems = [ "aarch64-linux" ]; boot.kernelModules = [ "i915" ]; boot.supportedFilesystems = [ "nfs" ]; diff --git a/nixos-kube-configs/gremlin-2/configuration.nix b/nixos-kube-configs/gremlin-2/configuration.nix index b88d973..03560b4 100755 --- a/nixos-kube-configs/gremlin-2/configuration.nix +++ b/nixos-kube-configs/gremlin-2/configuration.nix @@ -28,7 +28,6 @@ (import ./overlays/intel-firmware.nix) (import ./overlays/intel-gfx-sriov.nix) (import ./overlays/kernel.nix) - #(import ./overlays/libuv.nix) ]; # VMD Array @@ -64,7 +63,7 @@ ]; stats.enable = true; zeroconf = true; - }; + }; # Reset Cluster # services.etcd.enable = false; diff --git a/nixos-kube-configs/gremlin-2/hardware-configuration.nix b/nixos-kube-configs/gremlin-2/hardware-configuration.nix index 8cabd52..cc1b416 100755 --- a/nixos-kube-configs/gremlin-2/hardware-configuration.nix +++ b/nixos-kube-configs/gremlin-2/hardware-configuration.nix @@ -38,7 +38,7 @@ }; fileSystems."/boot" = - { device = "/dev/disk/by-uuid/8E7A-1854"; + { device = "/dev/disk/by-uuid/2791-2776"; fsType = "vfat"; options = [ "fmask=0022" "dmask=0022" ]; }; diff --git a/nixos-kube-configs/gremlin-2/kubernetes.nix b/nixos-kube-configs/gremlin-2/kubernetes.nix index b735433..cbb1586 100755 --- a/nixos-kube-configs/gremlin-2/kubernetes.nix +++ b/nixos-kube-configs/gremlin-2/kubernetes.nix @@ -44,4 +44,8 @@ #"--disable servicelb" #"--disable traefik" ]; + + security.pam.loginLimits = [ + {domain = "*"; item = "memlock"; type = "-"; value = "unlimited";} + ]; } diff --git a/nixos-kube-configs/gremlin-2/overlays/i915-sriov-dkms.nix b/nixos-kube-configs/gremlin-2/overlays/i915-sriov-dkms.nix index 917703a..1b331d8 100644 --- a/nixos-kube-configs/gremlin-2/overlays/i915-sriov-dkms.nix +++ b/nixos-kube-configs/gremlin-2/overlays/i915-sriov-dkms.nix @@ -1,34 +1,34 @@ prev: final: rec { intel-gfx-sriov = prev.stdenv.mkDerivation { - name = 
"intel-gfx-sriov-${prev.linuxPackages_6_6.kernel.modDirVersion}"; + name = "intel-gfx-sriov-${prev.kernelPXP.kernel.modDirVersion}"; passthru.moduleName = "intel-gfx-sriov"; src = prev.fetchFromGitHub { - owner = "strongtz"; + owner = "bbaa-bbaa"; repo = "i915-sriov-dkms"; - rev = "e26ce8952e465762fc0743731aa377ec0b2889ff"; - sha256 = "sha256-O+7ZehoVOYYdCTboF9XGBR9G6I72987AdbbF1JkrsBc="; + rev = "07cc8896d28687cbe6416e64373fc21d8b383423"; + sha256 = "sha256-RVSXLx17ZFjGO1G/g/crAkdBAyGlEqM0iPFDRwnynzc="; }; hardeningDisable = [ "pic" ]; - nativeBuildInputs = final.linuxPackages_6_6.kernel.moduleBuildDependencies; + nativeBuildInputs = prev.kernelPXP.kernel.moduleBuildDependencies; makeFlags = [ - "KVERSION=${final.linuxPackages_6_6.kernel.modDirVersion}" - "KDIR=${final.linuxPackages_6_6.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build" + "KVERSION=${prev.kernelPXP.kernel.modDirVersion}" + "KDIR=${prev.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build" ]; buildFlags = [ - "KERNEL_DIR=${final.linuxPackages_6_6.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build" + "KERNEL_DIR=${prev.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build" ]; buildPhase = '' - make -j8 -C ${final.pkgs.kernel611.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build M=$(pwd) modules + make -j8 -C ${prev.pkgs.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build M=$(pwd) modules ''; installPhase = '' - install -D i915.ko $out/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/kernel/drivers/gpu/drm/i915/i915.ko + install -D i915.ko $out/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/kernel/drivers/gpu/drm/i915/i915.ko ''; }; } diff --git a/nixos-kube-configs/gremlin-2/overlays/kernel.nix b/nixos-kube-configs/gremlin-2/overlays/kernel.nix index b24bc3e..5aafd34 100644 --- a/nixos-kube-configs/gremlin-2/overlays/kernel.nix 
+++ b/nixos-kube-configs/gremlin-2/overlays/kernel.nix @@ -1,9 +1,16 @@ final: prev: { - kernel611 = prev.pkgs.linuxPackagesFor (prev.pkgs.linux_6_6.override { - structuredExtraConfig = with prev.lib.kernel; { - DRM_I915_PXP = yes; - INTEL_MEI_PXP = module; + kernelPXP = prev.pkgs.linuxPackagesFor (prev.pkgs.linux.override { + argsOverride = rec { + src = prev.pkgs.fetchurl { + url = "mirror://kernel/linux/kernel/v6.x/linux-${version}.tar.xz"; + sha256 = "sha256-UkhYhS9YaanvF96LHm5/rwW8ssRivJazwk2/gu3jc88="; + }; + version = "6.10.12"; + modDirVersion = "6.10.12"; }; - ignoreConfigErrors = true; + extraConfig = '' + DRM_I915_PXP y + INTEL_MEI_PXP m + ''; }); } diff --git a/nixos-kube-configs/gremlin-2/remote-build.nix b/nixos-kube-configs/gremlin-2/remote-build.nix index f5cf658..fbc585e 100755 --- a/nixos-kube-configs/gremlin-2/remote-build.nix +++ b/nixos-kube-configs/gremlin-2/remote-build.nix @@ -4,6 +4,7 @@ nix.settings.system-features = [ "kvm" "big-parallel" + "i686-linux" "nixos-test" "benchmark" ]; diff --git a/nixos-kube-configs/gremlin-3/boot.nix b/nixos-kube-configs/gremlin-3/boot.nix index a15eaae..ba790e9 100755 --- a/nixos-kube-configs/gremlin-3/boot.nix +++ b/nixos-kube-configs/gremlin-3/boot.nix @@ -17,7 +17,7 @@ boot.initrd.kernelModules = [ "vmd" "md_mod" "raid0" ]; # See Kernel Overlay - boot.kernelPackages = pkgs.kernel611; + boot.kernelPackages = pkgs.kernelPXP; boot.binfmt.emulatedSystems = [ "aarch64-linux" ]; boot.kernelModules = [ "i915" ]; boot.supportedFilesystems = [ "nfs" ]; diff --git a/nixos-kube-configs/gremlin-3/configuration.nix b/nixos-kube-configs/gremlin-3/configuration.nix index a448e04..4eecac1 100755 --- a/nixos-kube-configs/gremlin-3/configuration.nix +++ b/nixos-kube-configs/gremlin-3/configuration.nix @@ -127,6 +127,6 @@ rec { isNormalUser = true; }; - system.stateVersion = "24.05"; + system.stateVersion = "24.11"; } diff --git a/nixos-kube-configs/gremlin-3/configuration.nix.reset 
b/nixos-kube-configs/gremlin-3/configuration.nix.reset index e6bcd9d..f58cb9a 100755 --- a/nixos-kube-configs/gremlin-3/configuration.nix.reset +++ b/nixos-kube-configs/gremlin-3/configuration.nix.reset @@ -2,25 +2,17 @@ # your system. Help is available in the configuration.nix(5) man page, on # https://search.nixos.org/options and in the NixOS manual (`nixos-help`). -{ config, lib, pkgs, ... }: -let - pinnedNixPkgs = import (pkgs.fetchFromGitHub { - owner = "nixos"; - repo = "nixpkgs"; - rev = "459c32e47ea9506113ae61c4a35a45f8a830dba1"; - hash = "sha256-yaU0Jcam1FXjPcGK9hlA/LRoms24JdU1XNPJ1BlM2q0="; - }) { config.allowUnfree = true; }; -in -rec { +{ config, pkgs, ... }: +{ imports = [ # Include the results of the hardware scan. ./boot.nix ./graphics.nix ./hardware-configuration.nix ./iscsi.nix - ./kubernetes.nix + #./kubernetes.nix #./monitoring.nix - #./networking.nix + ./networking.nix ./remote-build.nix ./ups.nix ./virtualisation.nix @@ -32,13 +24,14 @@ rec { nixpkgs.config.allowUnfree = true; nixpkgs.overlays = [ #(import ./overlays/distcc.nix) - (import ./overlays/intel-gfx-sriov.nix) - (import ./overlays/intel-firmware.nix) (import ./overlays/i915-sriov-dkms.nix) + (import ./overlays/intel-firmware.nix) + (import ./overlays/intel-gfx-sriov.nix) (import ./overlays/kernel.nix) + #(import ./overlays/libuv.nix) ]; -# VMD Array + # VMD Array boot.swraid = { enable = true; mdadmConf = " diff --git a/nixos-kube-configs/gremlin-3/kubernetes.nix b/nixos-kube-configs/gremlin-3/kubernetes.nix index b735433..cbb1586 100755 --- a/nixos-kube-configs/gremlin-3/kubernetes.nix +++ b/nixos-kube-configs/gremlin-3/kubernetes.nix @@ -44,4 +44,8 @@ #"--disable servicelb" #"--disable traefik" ]; + + security.pam.loginLimits = [ + {domain = "*"; item = "memlock"; type = "-"; value = "unlimited";} + ]; } diff --git a/nixos-kube-configs/gremlin-3/overlays/i915-sriov-dkms.nix b/nixos-kube-configs/gremlin-3/overlays/i915-sriov-dkms.nix index 917703a..1b331d8 100644 --- 
a/nixos-kube-configs/gremlin-3/overlays/i915-sriov-dkms.nix +++ b/nixos-kube-configs/gremlin-3/overlays/i915-sriov-dkms.nix @@ -1,34 +1,34 @@ prev: final: rec { intel-gfx-sriov = prev.stdenv.mkDerivation { - name = "intel-gfx-sriov-${prev.linuxPackages_6_6.kernel.modDirVersion}"; + name = "intel-gfx-sriov-${prev.kernelPXP.kernel.modDirVersion}"; passthru.moduleName = "intel-gfx-sriov"; src = prev.fetchFromGitHub { - owner = "strongtz"; + owner = "bbaa-bbaa"; repo = "i915-sriov-dkms"; - rev = "e26ce8952e465762fc0743731aa377ec0b2889ff"; - sha256 = "sha256-O+7ZehoVOYYdCTboF9XGBR9G6I72987AdbbF1JkrsBc="; + rev = "07cc8896d28687cbe6416e64373fc21d8b383423"; + sha256 = "sha256-RVSXLx17ZFjGO1G/g/crAkdBAyGlEqM0iPFDRwnynzc="; }; hardeningDisable = [ "pic" ]; - nativeBuildInputs = final.linuxPackages_6_6.kernel.moduleBuildDependencies; + nativeBuildInputs = prev.kernelPXP.kernel.moduleBuildDependencies; makeFlags = [ - "KVERSION=${final.linuxPackages_6_6.kernel.modDirVersion}" - "KDIR=${final.linuxPackages_6_6.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build" + "KVERSION=${prev.kernelPXP.kernel.modDirVersion}" + "KDIR=${prev.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build" ]; buildFlags = [ - "KERNEL_DIR=${final.linuxPackages_6_6.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build" + "KERNEL_DIR=${prev.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build" ]; buildPhase = '' - make -j8 -C ${final.pkgs.kernel611.kernel.dev}/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/build M=$(pwd) modules + make -j8 -C ${prev.pkgs.kernelPXP.kernel.dev}/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/build M=$(pwd) modules ''; installPhase = '' - install -D i915.ko $out/lib/modules/${final.linuxPackages_6_6.kernel.modDirVersion}/kernel/drivers/gpu/drm/i915/i915.ko + install -D i915.ko 
$out/lib/modules/${prev.kernelPXP.kernel.modDirVersion}/kernel/drivers/gpu/drm/i915/i915.ko ''; }; } diff --git a/nixos-kube-configs/gremlin-3/overlays/kernel.nix b/nixos-kube-configs/gremlin-3/overlays/kernel.nix index b24bc3e..5aafd34 100644 --- a/nixos-kube-configs/gremlin-3/overlays/kernel.nix +++ b/nixos-kube-configs/gremlin-3/overlays/kernel.nix @@ -1,9 +1,16 @@ final: prev: { - kernel611 = prev.pkgs.linuxPackagesFor (prev.pkgs.linux_6_6.override { - structuredExtraConfig = with prev.lib.kernel; { - DRM_I915_PXP = yes; - INTEL_MEI_PXP = module; + kernelPXP = prev.pkgs.linuxPackagesFor (prev.pkgs.linux.override { + argsOverride = rec { + src = prev.pkgs.fetchurl { + url = "mirror://kernel/linux/kernel/v6.x/linux-${version}.tar.xz"; + sha256 = "sha256-UkhYhS9YaanvF96LHm5/rwW8ssRivJazwk2/gu3jc88="; + }; + version = "6.10.12"; + modDirVersion = "6.10.12"; }; - ignoreConfigErrors = true; + extraConfig = '' + DRM_I915_PXP y + INTEL_MEI_PXP m + ''; }); } diff --git a/nixos-kube-configs/gremlin-3/remote-build.nix b/nixos-kube-configs/gremlin-3/remote-build.nix index f5cf658..fbc585e 100755 --- a/nixos-kube-configs/gremlin-3/remote-build.nix +++ b/nixos-kube-configs/gremlin-3/remote-build.nix @@ -4,6 +4,7 @@ nix.settings.system-features = [ "kvm" "big-parallel" + "i686-linux" "nixos-test" "benchmark" ]; diff --git a/nvidia/runmefirst.sh b/nvidia/runmefirst.sh index fe3f229..346c0d3 100755 --- a/nvidia/runmefirst.sh +++ b/nvidia/runmefirst.sh @@ -11,7 +11,7 @@ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ --create-namespace \ --set gfd.enabled=true \ --set runtimeClassName=nvidia \ - --set-file config.map.config=mps.yaml + --set-file config.map.config=time-slice.yaml helm upgrade -i dcgm-exporter gpu-helm-charts/dcgm-exporter \ --namespace dcgm-exporter \ diff --git a/nvidia/time-slice.yaml b/nvidia/time-slice.yaml new file mode 100644 index 0000000..d131d7e --- /dev/null +++ b/nvidia/time-slice.yaml @@ -0,0 +1,6 @@ +version: v1 +sharing: + timeSlicing: + resources: 
+ - name: nvidia.com/gpu + replicas: 10 diff --git a/ollama/values.yaml b/ollama/values.yaml index 74c4e45..a107eef 100644 --- a/ollama/values.yaml +++ b/ollama/values.yaml @@ -10,14 +10,15 @@ ollama: number: 1 # -- List of models to pull at container startup - models: - - starcoder2:15b - - llama3.2 - - codellama:13b - - x/llama3.2-vision - - phind-codellama:34b-python - - dolphin-llama3:8b - - phi3:14b-instruct + models: + pull: + - starcoder2:15b + - llama3.2 + - codellama:13b + - x/llama3.2-vision + - phind-codellama:34b-python + - dolphin-llama3:8b + - phi3:14b-instruct persistentVolume: enabled: true @@ -28,9 +29,9 @@ ollama: existingClaim: ollama-storage-pvc livenessProbe: - initialDelaySeconds: 1200 + initialDelaySeconds: 3000 image: repository: ollama/ollama pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "0.4.0-rc5" + tag: "latest" diff --git a/prometheus/runmefirst.sh b/prometheus/runmefirst.sh index b569363..46a443b 100755 --- a/prometheus/runmefirst.sh +++ b/prometheus/runmefirst.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update -helm install prometheus prometheus-community/kube-prometheus-stack -n prometheus-service --create-namespace -kubectl apply -f ingress.yaml -n prometheus-service + +helm upgrade -i prometheus prometheus-community/kube-prometheus-stack -n prometheus-service --create-namespace -f values.yaml +#kubectl apply -f ingress.yaml -n prometheus-service + + diff --git a/prometheus/values.yaml b/prometheus/values.yaml new file mode 100644 index 0000000..029ee7c --- /dev/null +++ b/prometheus/values.yaml @@ -0,0 +1,19 @@ +prometheus: + enabled: true + prometheusSpec: + additionalScrapeConfigs: | + - job_name: prometheus + static_configs: + - targets: + - 10.1.1.12:9000 + - 10.1.1.12:9400 + - 10.1.1.12:9199 + - 10.1.1.13:9000 + - 10.1.1.14:9000 + - 192.168.42.1:9100 + - 192.168.42.8:9100 + - 
192.168.42.254:9100 + - 192.168.42.201:9100 + - job_name: myapp + static_configs: + - targets: ["myapp-service:3000"]