diff --git a/Dockerfile b/Dockerfile
index 26fb6345..7ed8cb61 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # Build the manager binary
-FROM golang:1.20 as builder
+FROM golang:1.20 AS builder
 
 WORKDIR /workspace
diff --git a/controllers/flux/job.go b/controllers/flux/job.go
index 2ba69bce..e5cda75a 100644
--- a/controllers/flux/job.go
+++ b/controllers/flux/job.go
@@ -45,8 +45,8 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
 			Labels: cluster.Spec.JobLabels,
 		},
+		// Completions must be equal to Parallelism to allow for scaling
 		Spec: batchv1.JobSpec{
-			BackoffLimit: &backoffLimit,
 			Completions: &cluster.Spec.Size,
 			Parallelism: &cluster.Spec.Size,
@@ -69,7 +69,6 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
 					Volumes:            getVolumes(cluster),
 					ImagePullSecrets:   getImagePullSecrets(cluster),
 					ServiceAccountName: cluster.Spec.Pod.ServiceAccountName,
-					RuntimeClassName:   &cluster.Spec.Pod.RuntimeClassName,
 					AutomountServiceAccountToken: &cluster.Spec.Pod.AutomountServiceAccountToken,
 					RestartPolicy:      corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy),
 					NodeSelector:       cluster.Spec.Pod.NodeSelector,
@@ -79,6 +78,11 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
 		},
 	}
 
+	// Only add runtimeClassName if defined
+	if cluster.Spec.Pod.RuntimeClassName != "" {
+		job.Spec.Template.Spec.RuntimeClassName = &cluster.Spec.Pod.RuntimeClassName
+	}
+
 	// Add Affinity to map one pod / node only if the user hasn't disbaled it
 	if !cluster.Spec.Network.DisableAffinity {
 		job.Spec.Template.Spec.Affinity = getAffinity(cluster)
diff --git a/controllers/flux/pods.go b/controllers/flux/pods.go
index 35127e2a..c29801d7 100644
--- a/controllers/flux/pods.go
+++ b/controllers/flux/pods.go
@@ -137,12 +137,16 @@ func (r *MiniClusterReconciler) newServicePod(
 			ImagePullSecrets:   getImagePullSecrets(cluster),
 			RestartPolicy:      corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy),
 			ServiceAccountName: cluster.Spec.Pod.ServiceAccountName,
-			RuntimeClassName:   &cluster.Spec.Pod.RuntimeClassName,
 			AutomountServiceAccountToken: &cluster.Spec.Pod.AutomountServiceAccountToken,
 			NodeSelector: cluster.Spec.Pod.NodeSelector,
 		},
 	}
 
+	// Only add runtimeClassName if defined
+	if cluster.Spec.Pod.RuntimeClassName != "" {
+		pod.Spec.RuntimeClassName = &cluster.Spec.Pod.RuntimeClassName
+	}
+
 	// Assemble existing volume mounts - they are added with getContainers
 	mounts := []corev1.VolumeMount{}
diff --git a/docs/getting_started/custom-resource-definition.md b/docs/getting_started/custom-resource-definition.md
index 6466625f..e50842bf 100644
--- a/docs/getting_started/custom-resource-definition.md
+++ b/docs/getting_started/custom-resource-definition.md
@@ -188,8 +188,11 @@ When enabled, meaning that we use flux from a view within the container, these c
 - [ghcr.io/converged-computing/flux-view-rocky:tag-9](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-rocky)
 - [ghcr.io/converged-computing/flux-view-rocky:tag-8](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-rocky)
+- [ghcr.io/converged-computing/flux-view-ubuntu:tag-noble](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
+- [ghcr.io/converged-computing/flux-view-ubuntu:tag-jammy](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
 - [ghcr.io/converged-computing/flux-view-ubuntu:tag-focal](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
+
+Note that we have [arm builds](https://github.com/converged-computing/flux-views/tree/main/arm) available for each of rocky and ubuntu as well.
 
 If you don't want to use Flux from a view (and want to use the v1apha1 design of the Flux Operator
 that had the application alongside Flux) you can do that by way of disabling the flux view:
@@ -682,6 +685,34 @@ pod:
   serviceAccountName: my-service-account
 ```
 
+#### restartPolicy
+
+To customize the restartPolicy for the pod:
+
+```yaml
+pod:
+  restartPolicy: Never
+```
+
+#### runtimeClassName
+
+To add a runtime class name:
+
+```yaml
+pod:
+  runtimeClassName: nvidia
+```
+
+#### automountServiceAccountToken
+
+If you want to automatically mount a service account token:
+
+```yaml
+pod:
+  automountServiceAccountToken: true
+```
+
+
 #### nodeSelector
 
 A node selector is a set of key value pairs that helps to schedule pods to the right nodes! You can
@@ -720,10 +751,8 @@ name: rabbit
 
 #### image
 
-This is the only required attribute! You *must* provide a container base that has Flux.
-The requirements of your container are defined in the README of the [flux-hpc](https://github.com/rse-ops/flux-hpc/)
-repository. Generally speaking, you need to have Flux executables, Flux Python bindings,
-and your own executables on the path, and should be started with root with a flux user.
+You do not need to provide a container base that has Flux, but you must make sure that the view (built for a particular operating system) that will add Flux matches your container. We don't require the container to run as root, but if it
+runs with a non-root user, that user needs to have sudo available (to act as root).
 If you use the [fluxrm/flux-sched](https://hub.docker.com/r/fluxrm/flux-sched) base containers this is usually a good start.
diff --git a/pkg/flux/templates/components.sh b/pkg/flux/templates/components.sh
index c246d285..edae355c 100644
--- a/pkg/flux/templates/components.sh
+++ b/pkg/flux/templates/components.sh
@@ -21,7 +21,7 @@ url=$goshareUrl/wait-fs-{{ .Spec.Flux.Arch }}
 # This waiting script is intended to wait for the flux view, and then start running
 curl -L -O -s -o ./wait-fs -s ${url} {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }} || wget ${url} -q -O ./wait-fs {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }} || true
 chmod +x ./wait-fs || true
-mv ./wait-fs /usr/bin/goshare-wait-fs || true
+${SUDO} mv ./wait-fs /usr/bin/goshare-wait-fs || true
 
 # Ensure spack view is on the path, wherever it is mounted
 viewbase="{{ .ViewBase }}"
@@ -47,9 +47,9 @@ goshare-wait-fs -p ${viewbase}/flux-operator-done.txt {{ if .Spec.Logging.Quiet
 # Copy mount software to /opt/software
 # If /opt/software already exists, we need to copy into it
 if [[ -e "/opt/software" ]]; then
-  cp -R ${viewbase}/software/* /opt/software/ || true
+  ${SUDO} cp -R ${viewbase}/software/* /opt/software/ || true
 else
-  cp -R ${viewbase}/software /opt/software || true
+  ${SUDO} cp -R ${viewbase}/software /opt/software || true
 fi
 {{end}}
@@ -72,10 +72,10 @@ echo "Python root: $foundroot" {{ if .Spec.Logging.Quiet }} > /dev/null 2>&1{{ e
 # If we found the right python, ensure it's linked (old link does not work)
 if [[ -f "${pythonversion}" ]]; then
-    rm -rf $viewroot/bin/python3
-    rm -rf $viewroot/bin/python
-    ln -s ${pythonversion} $viewroot/lib/python || true
-    ln -s ${pythonversion} $viewroot/lib/python3 || true
+    ${SUDO} rm -rf $viewroot/bin/python3
+    ${SUDO} rm -rf $viewroot/bin/python
+    ${SUDO} ln -s ${pythonversion} $viewroot/lib/python || true
+    ${SUDO} ln -s ${pythonversion} $viewroot/lib/python3 || true
 fi
 
 # Ensure we use flux's python (TODO update this to use variable)
@@ -87,15 +87,16 @@ find $viewroot . -name libpython*.so* {{ if .Spec.Logging.Quiet }}> /dev/null 2>
 ls -l /mnt/flux/view/lib/libpython3.11.so.1.0 {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
 # Write an easy file we can source for the environment
-cat <<EOT >> ${viewbase}/flux-view.sh
+cat <<EOT >> ./flux-view.sh
 #!/bin/bash
 export PATH=$PATH
 export PYTHONPATH=$PYTHONPATH
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$viewroot/lib
 export fluxsocket=local://${viewroot}/run/flux/local
 EOT
+${SUDO} mv ./flux-view.sh ${viewbase}/flux-view.sh
 {{end}}
 
 {{define "ensure-pip"}}
-${pythonversion} -m pip --version || ${pythonversion} -m ensurepip || (wget https://bootstrap.pypa.io/get-pip.py && ${pythonversion} ./get-pip.py) {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
-${pythonversion} -m pip --upgrade pip {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
-{{end}}
+${SUDO} ${pythonversion} -m pip --version || ${SUDO} ${pythonversion} -m ensurepip || (${SUDO} wget https://bootstrap.pypa.io/get-pip.py && ${pythonversion} ./get-pip.py) {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
+${SUDO} ${pythonversion} -m pip install --upgrade pip {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
+{{end}}
\ No newline at end of file
diff --git a/pkg/flux/templates/wait.sh b/pkg/flux/templates/wait.sh
index dc489d45..3c84b1a9 100644
--- a/pkg/flux/templates/wait.sh
+++ b/pkg/flux/templates/wait.sh
@@ -6,6 +6,18 @@
 # We use the actual time command and not the wrapper, otherwise we get there is no argument -f
 {{ if .Spec.Logging.Timed }}which /usr/bin/time > /dev/null 2>&1 || (echo "/usr/bin/time is required to use logging.timed true" && exit 1);{{ end }}
+# Set the flux user and id from the getgo
+fluxuser=$(whoami)
+fluxuid=$(id -u $fluxuser)
+
+# Add fluxuser to sudoers living... dangerously!
+# A non root user container requires sudo to work
+SUDO=""
+if [[ "${fluxuser}" != "root" ]]; then
+    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+    SUDO="sudo"
+fi
+
 # If any initCommand logic is defined
 {{ .Container.Commands.Init}} {{ if .Spec.Logging.Quiet }}> /dev/null{{ end }}
 
@@ -14,10 +26,6 @@
 {{template "wait-view" .}}
 {{ if not .Spec.Flux.Container.Disable }}{{template "paths" .}}{{ end }}
 
-# Set the flux user and id from the getgo
-fluxuser=$(whoami)
-fluxuid=$(id -u $fluxuser)
-
 # Variables we can use again
 cfg="${viewroot}/etc/flux/config"
 command="{{ .Container.Command }}"
@@ -28,19 +36,21 @@ command="{{ .Container.Command }}"
 
 {{ if not .Spec.Logging.Quiet }}
 echo
 echo "Hello user ${fluxuser}"{{ end }}
-
-# Add fluxuser to sudoers living... dangerously!
-if [[ "${fluxuser}" != "root" ]]; then
-    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
-fi
 
 # Ensure the flux user owns the curve.cert
 # We need to move the curve.cert because config map volume is read only
 curvesrc=/flux_operator/curve.cert
 curvepath=$viewroot/curve/curve.cert
-mkdir -p $viewroot/curve
-cp $curvesrc $curvepath
+# run directory must be owned by this user
+# and /var/lib/flux
+if [[ "${fluxuser}" != "root" ]]; then
+    ${SUDO} chown -R ${fluxuser} ${viewroot}/run/flux ${viewroot}/var/lib/flux
+fi
+
+# Prepare curve certificate!
+${SUDO} mkdir -p $viewroot/curve
+${SUDO} cp $curvesrc $curvepath
 {{ if not .Spec.Logging.Quiet }}
 echo
 echo "🌟️ Curve Certificate"
@@ -49,9 +59,9 @@ cat ${curvepath}
 {{ end }}
 
 # Remove group and other read
-chmod o-r ${curvepath}
-chmod g-r ${curvepath}
-chown -R ${fluxuid} ${curvepath}
+${SUDO} chmod o-r ${curvepath}
+${SUDO} chmod g-r ${curvepath}
+${SUDO} chown -R ${fluxuid} ${curvepath}
 
 # If we have disabled the view, we need to use the flux here to generate resources
 {{ if .Spec.Flux.Container.Disable }}
@@ -61,7 +71,8 @@ echo
 echo "📦 Resources"
 echo "flux R encode --hosts=${hosts} --local"
 {{ end }}
-flux R encode --hosts=${hosts} --local > ${viewroot}/etc/flux/system/R
+flux R encode --hosts=${hosts} --local > /tmp/R
+${SUDO} mv /tmp/R ${viewroot}/etc/flux/system/R
 {{ if not .Spec.Logging.Quiet }}cat ${viewroot}/etc/flux/system/R{{ end }}
 {{ end }}
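
For anyone trying the new pod options together, here is a minimal sketch that combines the fields documented in the diff above (the `nvidia` runtime class and `Never` restart policy are the illustrative values from the docs; they assume your cluster actually defines such a RuntimeClass):

```yaml
pod:
  # Only set on the pod spec when non-empty; otherwise RuntimeClassName is left unset
  runtimeClassName: nvidia
  # Passed through as the pod restartPolicy
  restartPolicy: Never
  # Controls automountServiceAccountToken on the pod
  automountServiceAccountToken: true
```

This mirrors the guard added in `job.go` and `pods.go`: an empty `runtimeClassName` simply means the field is not added to the pod spec.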