From 64a722b293dbf582641a1bdfc63b8b9e2416df04 Mon Sep 17 00:00:00 2001 From: "weizhou.lan@daocloud.io" Date: Thu, 18 Jul 2024 16:06:45 +0800 Subject: [PATCH] optimize Signed-off-by: weizhou.lan@daocloud.io --- ofed-driver/Readme.md | 42 +++++++++++++++++++++- ofed-driver/chart/templates/daemonset.yaml | 12 +++++-- ofed-driver/chart/values.yaml | 16 +++++++++ rdma-tools/Readme.md | 8 ++++- 4 files changed, 74 insertions(+), 4 deletions(-) diff --git a/ofed-driver/Readme.md b/ofed-driver/Readme.md index 908a71e..09f135b 100644 --- a/ofed-driver/Readme.md +++ b/ofed-driver/Readme.md @@ -3,6 +3,9 @@ refer to nvidia network operator [ofed-driver-ds.yaml](https://github.com/Mellanox/network-operator/blob/master/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml) and [values.yaml](https://github.com/Mellanox/network-operator/blob/master/deployment/network-operator/values.yaml#L196) +the pod builds the OFED driver from the source and installs some online packages. Once the pod is ready, the OFED driver is installed + +## image tag the image tag is with a format `{driverVersion}-${OSName}${OSVer}-${Arch}` refer to [nvidia available image tag](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/mellanox/containers/doca-driver/tags) @@ -19,12 +22,49 @@ helm repo add spiderchart https://spidernet-io.github.io/charts helm repo update helm search repo ofed-driver -helm install spiderchart/ofed-driver ofed-driver -n kube-system \ +# for China users, add `--set image.registry=nvcr.m.daocloud.io` +helm install ofed-driver spiderchart/ofed-driver -n kube-system \ --set image.OSName="ubuntu" \ --set image.OSVer="22.04" \ --set image.Arch="amd64" ``` +note: the pod will run `apt-get` to install some packages online, so you could use a proxy as follows + +```shell +cat <<EOF > values.yaml +image: + OSName: "ubuntu" + OSVer: "22.04" + Arch: "amd64" + +extraEnv: + - name: HTTPS_PROXY + value: "http://" + - name: HTTP_PROXY + value: "http://" + - name: https_proxy + value: "http://" + - name: http_proxy + 
value: "http://" +EOF + +helm install ofed-driver spiderchart/ofed-driver -n kube-system -f values.yaml + +# when the pod is ready, the OFED driver is ready +kubectl get pod -n kube-system + kube-system mofed-ubuntu-24.04-ds-lsprx 0/1 Running 0 3m54s + +``` + +when the driver is ready, the mlx5_core module can be found on the node +```shell +~# lsmod | grep -i mlx5_core +mlx5_core 2068480 1 mlx5_ib +``` + +refer [nvidia doc](https://docs.nvidia.com/networking/display/kubernetes2370/network+operator#src-132465565_NetworkOperator-NetworkOperatorDeploymentinAir-gappedEnvironment) and [environment config](https://github.com/Mellanox/network-operator/blob/master/docs/mofed-container-env-vars.md) for more details + ## release ```shell diff --git a/ofed-driver/chart/templates/daemonset.yaml b/ofed-driver/chart/templates/daemonset.yaml index 92c531b..fa9d8d9 100644 --- a/ofed-driver/chart/templates/daemonset.yaml +++ b/ofed-driver/chart/templates/daemonset.yaml @@ -59,16 +59,24 @@ spec: - image: {{ include "driver.image" . | quote }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: mofed-container + {{- with .Values.resources }} + resources: + {{- toYaml . | trim | nindent 12 }} + {{- end }} securityContext: privileged: true seLinuxOptions: level: "s0" - {{- if .Values.extraEnv }} env: + - name: UNLOAD_STORAGE_MODULES + value: {{ .Values.config.unload_host_storage_modules | quote }} + - name: ENABLE_NFSRDMA + value: {{ .Values.config.enable_nfsrdma | quote }} + - name: RESTORE_DRIVER_ON_POD_TERMINATION + value: {{ .Values.config.restore_hostdriver_on_termination | quote }} {{- with .Values.extraEnv }} {{- toYaml . 
| nindent 12 }} {{- end }} {{- end }} - {{- end }} volumeMounts: - name: run-mlnx-ofed mountPath: /run/mellanox/drivers diff --git a/ofed-driver/chart/values.yaml b/ofed-driver/chart/values.yaml index cfeff34..e5d0d64 100644 --- a/ofed-driver/chart/values.yaml +++ b/ofed-driver/chart/values.yaml @@ -14,6 +14,14 @@ image: tolerations: - operator: Exists +config: + # unload host storage modules prior to loading mofed modules + unload_host_storage_modules: false + # enable loading of nfs related storage modules from mofed container + enable_nfsrdma: false + # restore host drivers when container is gracefully stopped + restore_hostdriver_on_termination: true + extraEnv: [] extraVolumes: [] @@ -24,6 +32,14 @@ extraVolumeMounts: [] # - name: test-val # mountPath: /tmp +resources: + #limits: + # cpu: 1000m + # memory: 900Mi + #requests: + # cpu: 100m + # memory: 500Mi + nodeSelector: kubernetes.io/os: linux diff --git a/rdma-tools/Readme.md b/rdma-tools/Readme.md index 5e2717b..6c7d9c6 100644 --- a/rdma-tools/Readme.md +++ b/rdma-tools/Readme.md @@ -16,6 +16,11 @@ helm search repo rdma-tools # run daemonset on worker1 and worker2 cat < values.yaml +# for China users, you could add these to use a domestic registry +#image: +# registry: ghcr.m.daocloud.io + +# just run the daemonset on nodes 'worker1' and 'worker2' affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -45,6 +50,7 @@ resources: # add: [ "IPC_LOCK" ] EOF -helm install spiderchart/rdma-tools rdma-tools -f ./values.yaml +# for China users, add `--set image.registry=ghcr.m.daocloud.io` +helm install rdma-tools spiderchart/rdma-tools -f ./values.yaml ```