
Commit

feat: allow non root user
Problem: skypilot (and likely others) do not run with a root user
Solution: allow a non-root user that has sudo
Signed-off-by: vsoch <[email protected]>
vsoch committed Jul 22, 2024
1 parent 3c0973c commit c5dc811
Showing 6 changed files with 83 additions and 34 deletions.
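The changes follow one pattern throughout: detect whether the container user is root and, if not, grant that user passwordless sudo and prefix privileged commands with a `${SUDO}` variable that stays empty for root. A minimal bash sketch of the pattern, mirroring the wait.sh template below (the final `mkdir` target is only an illustrative placeholder):

```bash
#!/bin/bash
# Detect the user the container is running as.
fluxuser=$(whoami)

# Empty for root (commands run unchanged), "sudo" otherwise.
SUDO=""
if [[ "${fluxuser}" != "root" ]]; then
    # Grant passwordless sudo; assumes sudo is installed and /etc/sudoers is writable here.
    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
    SUDO="sudo"
fi

# Privileged operations are then prefixed, for example:
${SUDO} mkdir -p /etc/example/system
```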
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
# Build the manager binary
-FROM golang:1.20 as builder
+FROM golang:1.20 AS builder

WORKDIR /workspace

8 changes: 6 additions & 2 deletions controllers/flux/job.go
@@ -45,8 +45,8 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
Labels: cluster.Spec.JobLabels,
},

-// Completions must be == to Parallelism to allow for scaling
Spec: batchv1.JobSpec{
+
BackoffLimit: &backoffLimit,
Completions: &cluster.Spec.Size,
Parallelism: &cluster.Spec.Size,
@@ -69,7 +69,6 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
Volumes: getVolumes(cluster),
ImagePullSecrets: getImagePullSecrets(cluster),
ServiceAccountName: cluster.Spec.Pod.ServiceAccountName,
-RuntimeClassName: &cluster.Spec.Pod.RuntimeClassName,
AutomountServiceAccountToken: &cluster.Spec.Pod.AutomountServiceAccountToken,
RestartPolicy: corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy),
NodeSelector: cluster.Spec.Pod.NodeSelector,
@@ -79,6 +78,11 @@ func NewMiniClusterJob(cluster *api.MiniCluster) (*batchv1.Job, error) {
},
}

+// Only add runtimeClassName if defined
+if cluster.Spec.Pod.RuntimeClassName != "" {
+    job.Spec.Template.Spec.RuntimeClassName = &cluster.Spec.Pod.RuntimeClassName
+}
+
// Add Affinity to map one pod / node only if the user hasn't disabled it
if !cluster.Spec.Network.DisableAffinity {
job.Spec.Template.Spec.Affinity = getAffinity(cluster)
6 changes: 5 additions & 1 deletion controllers/flux/pods.go
@@ -137,12 +137,16 @@ func (r *MiniClusterReconciler) newServicePod(
ImagePullSecrets: getImagePullSecrets(cluster),
RestartPolicy: corev1.RestartPolicy(cluster.Spec.Pod.RestartPolicy),
ServiceAccountName: cluster.Spec.Pod.ServiceAccountName,
-RuntimeClassName: &cluster.Spec.Pod.RuntimeClassName,
AutomountServiceAccountToken: &cluster.Spec.Pod.AutomountServiceAccountToken,
NodeSelector: cluster.Spec.Pod.NodeSelector,
},
}

+// Only add runtimeClassName if defined
+if cluster.Spec.Pod.RuntimeClassName != "" {
+    pod.Spec.RuntimeClassName = &cluster.Spec.Pod.RuntimeClassName
+}
+
// Assemble existing volume mounts - they are added with getContainers
mounts := []corev1.VolumeMount{}

37 changes: 33 additions & 4 deletions docs/getting_started/custom-resource-definition.md
@@ -188,8 +188,11 @@ When enabled, meaning that we use flux from a view within the container, these c

- [ghcr.io/converged-computing/flux-view-rocky:tag-9](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-rocky)
- [ghcr.io/converged-computing/flux-view-rocky:tag-8](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-rocky)
+- [ghcr.io/converged-computing/flux-view-ubuntu:tag-noble](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
- [ghcr.io/converged-computing/flux-view-ubuntu:tag-jammy](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)
- [ghcr.io/converged-computing/flux-view-ubuntu:tag-focal](https://github.com/converged-computing/flux-views/pkgs/container/flux-view-ubuntu)


+Note that we have [arm builds](https://github.com/converged-computing/flux-views/tree/main/arm) available for each of rocky and ubuntu as well.
If you don't want to use Flux from a view (and want to use the v1alpha1 design of the Flux Operator that had the application alongside Flux), you can do that by disabling the flux view:

@@ -682,6 +685,34 @@ pod:
serviceAccountName: my-service-account
```

+#### restartPolicy
+
+To customize the restartPolicy for the pod:
+
+```yaml
+pod:
+  restartPolicy: Never
+```
+
+#### runtimeClassName
+
+To add a runtime class name:
+
+```yaml
+pod:
+  runtimeClassName: nvidia
+```
+
+#### automountServiceAccountToken
+
+If you want to automatically mount a service account token:
+
+```yaml
+pod:
+  automountServiceAccountToken: true
+```
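As an aside, one way to confirm that these options land on the MiniCluster pods is to inspect the rendered pod spec with kubectl (a sketch; the pod name here is hypothetical):

```bash
# Print the runtime class, token automount setting, and restart policy for one pod.
kubectl get pod flux-sample-0-abcde -o \
  jsonpath='{.spec.runtimeClassName}{"\n"}{.spec.automountServiceAccountToken}{"\n"}{.spec.restartPolicy}{"\n"}'
```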


#### nodeSelector

A node selector is a set of key-value pairs that helps to schedule pods to the right nodes! You can
@@ -720,10 +751,8 @@ name: rabbit

#### image

-This is the only required attribute! You *must* provide a container base that has Flux.
-The requirements of your container are defined in the README of the [flux-hpc](https://github.com/rse-ops/flux-hpc/)
-repository. Generally speaking, you need to have Flux executables, Flux Python bindings,
-and your own executables on the path, and should be started with root with a flux user.
+You do not need to provide a container base that has Flux, but you must make sure the view (built for a particular operating system) that will add Flux matches your container. We don't require the container to start as root, but if it
+runs as a non-root user, that user needs to have sudo available (to act as root).
If you use the [fluxrm/flux-sched](https://hub.docker.com/r/fluxrm/flux-sched)
base containers this is usually a good start.
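For illustration, an image could satisfy the non-root requirement with build steps along these lines (a hypothetical sketch, not from this repository; the user name and package manager are assumptions):

```bash
# Steps a Dockerfile might RUN (as root, at build time) to prepare a non-root user.
apt-get update && apt-get install -y sudo
useradd -m -s /bin/bash flux
# Passwordless sudo lets the entrypoint templates escalate when needed.
echo "flux ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# The image would then switch to this user (USER flux in Dockerfile terms).
```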

23 changes: 12 additions & 11 deletions pkg/flux/templates/components.sh
@@ -21,7 +21,7 @@ url=$goshareUrl/wait-fs-{{ .Spec.Flux.Arch }}
# This waiting script is intended to wait for the flux view, and then start running
curl -L -O -s -o ./wait-fs -s ${url} {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }} || wget ${url} -q -O ./wait-fs {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }} || true
chmod +x ./wait-fs || true
-mv ./wait-fs /usr/bin/goshare-wait-fs || true
+${SUDO} mv ./wait-fs /usr/bin/goshare-wait-fs || true

# Ensure spack view is on the path, wherever it is mounted
viewbase="{{ .ViewBase }}"
@@ -47,9 +47,9 @@ goshare-wait-fs -p ${viewbase}/flux-operator-done.txt {{ if .Spec.Logging.Quiet
# Copy mount software to /opt/software
# If /opt/software already exists, we need to copy into it
if [[ -e "/opt/software" ]]; then
-    cp -R ${viewbase}/software/* /opt/software/ || true
+    ${SUDO} cp -R ${viewbase}/software/* /opt/software/ || true
else
-    cp -R ${viewbase}/software /opt/software || true
+    ${SUDO} cp -R ${viewbase}/software /opt/software || true
fi
{{end}}

@@ -72,10 +72,10 @@ echo "Python root: $foundroot" {{ if .Spec.Logging.Quiet }} > /dev/null 2>&1{{ e

# If we found the right python, ensure it's linked (old link does not work)
if [[ -f "${pythonversion}" ]]; then
-    rm -rf $viewroot/bin/python3
-    rm -rf $viewroot/bin/python
-    ln -s ${pythonversion} $viewroot/lib/python || true
-    ln -s ${pythonversion} $viewroot/lib/python3 || true
+    ${SUDO} rm -rf $viewroot/bin/python3
+    ${SUDO} rm -rf $viewroot/bin/python
+    ${SUDO} ln -s ${pythonversion} $viewroot/lib/python || true
+    ${SUDO} ln -s ${pythonversion} $viewroot/lib/python3 || true
fi

# Ensure we use flux's python (TODO update this to use variable)
@@ -87,15 +87,16 @@ find $viewroot . -name libpython*.so* {{ if .Spec.Logging.Quiet }}> /dev/null 2>
ls -l /mnt/flux/view/lib/libpython3.11.so.1.0 {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}

# Write an easy file we can source for the environment
-cat <<EOT >> ${viewbase}/flux-view.sh
+cat <<EOT >> ./flux-view.sh
#!/bin/bash
export PATH=$PATH
export PYTHONPATH=$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$viewroot/lib
export fluxsocket=local://${viewroot}/run/flux/local
EOT
+${SUDO} mv ./flux-view.sh ${viewbase}/flux-view.sh
{{end}}
{{define "ensure-pip"}}
-${pythonversion} -m pip --version || ${pythonversion} -m ensurepip || (wget https://bootstrap.pypa.io/get-pip.py && ${pythonversion} ./get-pip.py) {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
-${pythonversion} -m pip --upgrade pip {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
-{{end}}
+${SUDO} ${pythonversion} -m pip --version || ${SUDO} ${pythonversion} -m ensurepip || (${SUDO} wget https://bootstrap.pypa.io/get-pip.py && ${pythonversion} ./get-pip.py) {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
+${SUDO} ${pythonversion} -m pip --upgrade pip {{ if .Spec.Logging.Quiet }}> /dev/null 2>&1{{ end }}
+{{end}}
41 changes: 26 additions & 15 deletions pkg/flux/templates/wait.sh
@@ -6,6 +6,18 @@
# We use the actual time command and not the wrapper, otherwise we get there is no argument -f
{{ if .Spec.Logging.Timed }}which /usr/bin/time > /dev/null 2>&1 || (echo "/usr/bin/time is required to use logging.timed true" && exit 1);{{ end }}

+# Set the flux user and id from the getgo
+fluxuser=$(whoami)
+fluxuid=$(id -u $fluxuser)
+
+# Add fluxuser to sudoers living... dangerously!
+# A non root user container requires sudo to work
+SUDO=""
+if [[ "${fluxuser}" != "root" ]]; then
+    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+    SUDO="sudo"
+fi
+
# If any initCommand logic is defined
{{ .Container.Commands.Init}} {{ if .Spec.Logging.Quiet }}> /dev/null{{ end }}

@@ -14,10 +26,6 @@
{{template "wait-view" .}}
{{ if not .Spec.Flux.Container.Disable }}{{template "paths" .}}{{ end }}

-# Set the flux user and id from the getgo
-fluxuser=$(whoami)
-fluxuid=$(id -u $fluxuser)
-
# Variables we can use again
cfg="${viewroot}/etc/flux/config"
command="{{ .Container.Command }}"
@@ -28,19 +36,21 @@ command="{{ .Container.Command }}"
{{ if not .Spec.Logging.Quiet }}
echo
echo "Hello user ${fluxuser}"{{ end }}

-# Add fluxuser to sudoers living... dangerously!
-if [[ "${fluxuser}" != "root" ]]; then
-    echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
-fi

# Ensure the flux user owns the curve.cert
# We need to move the curve.cert because config map volume is read only
curvesrc=/flux_operator/curve.cert
curvepath=$viewroot/curve/curve.cert

-mkdir -p $viewroot/curve
-cp $curvesrc $curvepath
+# run directory must be owned by this user
+# and /var/lib/flux
+if [[ "${fluxuser}" != "root" ]]; then
+    ${SUDO} chown -R ${fluxuser} ${viewroot}/run/flux ${viewroot}/var/lib/flux
+fi
+
+# Prepare curve certificate!
+${SUDO} mkdir -p $viewroot/curve
+${SUDO} cp $curvesrc $curvepath
{{ if not .Spec.Logging.Quiet }}
echo
echo "🌟️ Curve Certificate"
@@ -49,9 +59,9 @@ cat ${curvepath}
{{ end }}

# Remove group and other read
-chmod o-r ${curvepath}
-chmod g-r ${curvepath}
-chown -R ${fluxuid} ${curvepath}
+${SUDO} chmod o-r ${curvepath}
+${SUDO} chmod g-r ${curvepath}
+${SUDO} chown -R ${fluxuid} ${curvepath}

# If we have disabled the view, we need to use the flux here to generate resources
{{ if .Spec.Flux.Container.Disable }}
@@ -61,7 +71,8 @@ echo
echo "📦 Resources"
echo "flux R encode --hosts=${hosts} --local"
{{ end }}
-flux R encode --hosts=${hosts} --local > ${viewroot}/etc/flux/system/R
+flux R encode --hosts=${hosts} --local > /tmp/R
+${SUDO} mv /tmp/R ${viewroot}/etc/flux/system/R
{{ if not .Spec.Logging.Quiet }}cat ${viewroot}/etc/flux/system/R{{ end }}
{{ end }}
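A recurring design choice in these templates: generate output in a user-writable location first, then escalate only for the final move into a root-owned path (as with wait-fs, flux-view.sh, and the R file above). A generic sketch, with a placeholder command and paths:

```bash
# Write output somewhere the current (possibly non-root) user can write...
some_command > /tmp/output
# ...then escalate only for the move into a root-owned directory.
${SUDO} mv /tmp/output /etc/example/output
```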

