From 1a51f367f98d1aa04dddaf3ed743483c8b5d2e3f Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 23 Jul 2024 09:44:34 -0700 Subject: [PATCH 1/2] Bump nvidia-container-toolkit to v1.16.1 This fixes a bug with processing errors during CDI spec generation for MIG devices. Signed-off-by: Christopher Desiniotis --- go.mod | 2 +- go.sum | 4 ++-- .../internal/platform-support/dgpu/nvml.go | 6 +++--- vendor/modules.txt | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/go.mod b/go.mod index baafb94e3..a604c53b0 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/NVIDIA/go-gpuallocator v0.5.0 github.com/NVIDIA/go-nvlib v0.6.0 github.com/NVIDIA/go-nvml v0.12.4-0 - github.com/NVIDIA/nvidia-container-toolkit v1.16.0 + github.com/NVIDIA/nvidia-container-toolkit v1.16.1 github.com/fsnotify/fsnotify v1.7.0 github.com/google/renameio v1.0.1 github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index 270344d50..6247f9af7 100644 --- a/go.sum +++ b/go.sum @@ -27,8 +27,8 @@ github.com/NVIDIA/go-nvlib v0.6.0 h1:zAMBzCYT9xeyRQo0tb7HJbStkzajD6e5joyaQqJ2OGU github.com/NVIDIA/go-nvlib v0.6.0/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY= github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg= github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= -github.com/NVIDIA/nvidia-container-toolkit v1.16.0 h1:NZyKfW0s8nfghoBSJJUth7OZB5ZzRGYbn3RaiTDYdHM= -github.com/NVIDIA/nvidia-container-toolkit v1.16.0/go.mod h1:jJXYvHEdqqpDcRXvolaiFCBsgLxvCwmJWSBZM3zQPY8= +github.com/NVIDIA/nvidia-container-toolkit v1.16.1 h1:PkY6RqYD1wIt1izCvYZ7kr7IitxK8e9+k/prO6b3vD0= +github.com/NVIDIA/nvidia-container-toolkit v1.16.1/go.mod h1:jJXYvHEdqqpDcRXvolaiFCBsgLxvCwmJWSBZM3zQPY8= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go index e4b67641d..be111102b 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go @@ -145,9 +145,9 @@ type toRequiredMigInfo struct { } func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) { - gpu, ret := d.parent.GetMinorNumber() - if ret != nvml.SUCCESS { - return 0, 0, 0, fmt.Errorf("error getting GPU minor: %v", ret) + gpu, err := d.parent.GetMinorNumber() + if err != nil { + return 0, 0, 0, fmt.Errorf("error getting GPU minor: %w", err) } gi, ret := d.GetGpuInstanceId() diff --git a/vendor/modules.txt b/vendor/modules.txt index 6b78873b7..295770019 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -43,7 +43,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids ## explicit; go 1.20 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml -# github.com/NVIDIA/nvidia-container-toolkit v1.16.0 +# github.com/NVIDIA/nvidia-container-toolkit v1.16.1 ## explicit; go 1.20 github.com/NVIDIA/nvidia-container-toolkit/internal/config/image github.com/NVIDIA/nvidia-container-toolkit/internal/discover From 5505131d13f1aedcf5ff0fd92dab6316e3a5791b Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 23 Jul 2024 11:23:49 -0700 Subject: [PATCH 2/2] Bump project version to v0.16.1 Signed-off-by: Christopher Desiniotis --- CHANGELOG.md | 3 ++ README.md | 50 +++++++++---------- .../helm/nvidia-device-plugin/Chart.yaml | 4 +- ...re-discovery-daemonset-with-mig-mixed.yaml | 4 +- ...e-discovery-daemonset-with-mig-single.yaml | 6 +-- .../gpu-feature-discovery-daemonset.yaml | 6 +-- .../gpu-feature-discovery-job.yaml.template | 6 +-- ...a-device-plugin-compat-with-cpumanager.yml | 2 +- ...plugin-privileged-with-service-account.yml | 2 +- deployments/static/nvidia-device-plugin.yml | 2 +- versions.mk | 2 +- 11 files changed, 45 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29e750504..ca07c63d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ ## Changelog +### v0.16.1 +- Bump nvidia-container-toolkit to v1.16.1 to fix a bug with CDI spec generation for MIG devices + ### v0.16.0 - Fixed logic of atomic writing of the feature file - Replaced `WithDialer` with `WithContextDialer` diff --git a/README.md b/README.md index 7ca9a0542..0697f4e6a 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ The NVIDIA device plugin for Kubernetes is a Daemonset that allows you to automa - Run GPU enabled containers in your Kubernetes cluster. This repository contains NVIDIA's official implementation of the [Kubernetes device plugin](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/). -As of v0.16.0 this repository also holds the implementation for GPU Feature Discovery labels, +As of v0.16.1 this repository also holds the implementation for GPU Feature Discovery labels, for further information on GPU Feature Discovery see [here](docs/gpu-feature-discovery/README.md). Please note that: @@ -123,7 +123,7 @@ Once you have configured the options above on all the GPU nodes in your cluster, you can enable GPU support by deploying the following Daemonset: ```shell -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.0/deployments/static/nvidia-device-plugin.yml +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/deployments/static/nvidia-device-plugin.yml ``` **Note:** This is a simple static daemonset meant to demonstrate the basic @@ -558,11 +558,11 @@ $ helm repo add nvdp https://nvidia.github.io/k8s-device-plugin $ helm repo update ``` -Then verify that the latest release (`v0.16.0`) of the plugin is available: +Then verify that the latest release (`v0.16.1`) of the plugin is available: ``` $ helm search repo nvdp --devel NAME CHART VERSION APP VERSION DESCRIPTION -nvdp/nvidia-device-plugin 0.16.0 0.16.0 A Helm chart for ... +nvdp/nvidia-device-plugin 0.16.1 0.16.1 A Helm chart for ... ``` Once this repo is updated, you can begin installing packages from it to deploy @@ -573,7 +573,7 @@ The most basic installation command without any options is then: helm upgrade -i nvdp nvdp/nvidia-device-plugin \ --namespace nvidia-device-plugin \ --create-namespace \ - --version 0.16.0 + --version 0.16.1 ``` **Note:** You only need the to pass the `--devel` flag to `helm search repo` @@ -582,7 +582,7 @@ version (e.g. `-rc.1`). Full releases will be listed without this. ### Configuring the device plugin's `helm` chart -The `helm` chart for the latest release of the plugin (`v0.16.0`) includes +The `helm` chart for the latest release of the plugin (`v0.16.1`) includes a number of customizable values. Prior to `v0.12.0` the most commonly used values were those that had direct @@ -592,7 +592,7 @@ case of the original values is then to override an option from the `ConfigMap` if desired. Both methods are discussed in more detail below. The full set of values that can be set are found here: -[here](https://github.com/NVIDIA/k8s-device-plugin/blob/v0.16.0/deployments/helm/nvidia-device-plugin/values.yaml). +[here](https://github.com/NVIDIA/k8s-device-plugin/blob/v0.16.1/deployments/helm/nvidia-device-plugin/values.yaml). #### Passing configuration to the plugin via a `ConfigMap`. @@ -631,7 +631,7 @@ EOF And deploy the device plugin via helm (pointing it at this config file and giving it a name): ``` $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set-file config.map.config=/tmp/dp-example-config0.yaml @@ -653,7 +653,7 @@ $ kubectl create cm -n nvidia-device-plugin nvidia-plugin-configs \ ``` ``` $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set config.name=nvidia-plugin-configs @@ -681,7 +681,7 @@ EOF And redeploy the device plugin via helm (pointing it at both configs with a specified default). ``` $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set config.default=config0 \ @@ -700,7 +700,7 @@ $ kubectl create cm -n nvidia-device-plugin nvidia-plugin-configs \ ``` ``` $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set config.default=config0 \ @@ -783,7 +783,7 @@ chart values that are commonly overridden are: ``` Please take a look in the -[`values.yaml`](https://github.com/NVIDIA/k8s-device-plugin/blob/v0.16.0/deployments/helm/nvidia-device-plugin/values.yaml) +[`values.yaml`](https://github.com/NVIDIA/k8s-device-plugin/blob/v0.16.1/deployments/helm/nvidia-device-plugin/values.yaml) file to see the full set of overridable parameters for the device plugin. Examples of setting these options include: @@ -792,7 +792,7 @@ Enabling compatibility with the `CPUManager` and running with a request for 100ms of CPU time and a limit of 512MB of memory. ```shell $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set compatWithCPUManager=true \ @@ -803,7 +803,7 @@ $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ Enabling compatibility with the `CPUManager` and the `mixed` `migStrategy` ```shell $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set compatWithCPUManager=true \ @@ -822,7 +822,7 @@ Discovery to perform this labeling. To enable it, simply set `gfd.enabled=true` during helm install. ``` helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --namespace nvidia-device-plugin \ --create-namespace \ --set gfd.enabled=true @@ -867,7 +867,7 @@ nvidia.com/gpu.product = A100-SXM4-40GB-MIG-1g.5gb-SHARED #### Deploying gpu-feature-discovery in standalone mode -As of v0.16.0, the device plugin's helm chart has integrated support to deploy +As of v0.16.1, the device plugin's helm chart has integrated support to deploy [`gpu-feature-discovery`](https://gitlab.com/nvidia/kubernetes/gpu-feature-discovery/-/tree/main) When gpu-feature-discovery in deploying standalone, begin by setting up the @@ -878,13 +878,13 @@ $ helm repo add nvdp https://nvidia.github.io/k8s-device-plugin $ helm repo update ``` -Then verify that the latest release (`v0.16.0`) of the plugin is available +Then verify that the latest release (`v0.16.1`) of the plugin is available (Note that this includes the GFD chart): ```shell $ helm search repo nvdp --devel NAME CHART VERSION APP VERSION DESCRIPTION -nvdp/nvidia-device-plugin 0.16.0 0.16.0 A Helm chart for ... +nvdp/nvidia-device-plugin 0.16.1 0.16.1 A Helm chart for ... ``` Once this repo is updated, you can begin installing packages from it to deploy @@ -894,7 +894,7 @@ The most basic installation command without any options is then: ``` $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version 0.16.0 \ + --version 0.16.1 \ --namespace gpu-feature-discovery \ --create-namespace \ --set devicePlugin.enabled=false @@ -905,7 +905,7 @@ the default namespace. ```shell $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \ - --version=0.16.0 \ + --version=0.16.1 \ --set allowDefaultNamespace=true \ --set nfd.enabled=false \ --set migStrategy=mixed \ @@ -928,14 +928,14 @@ Using the default values for the flags: $ helm upgrade -i nvdp \ --namespace nvidia-device-plugin \ --create-namespace \ - https://nvidia.github.io/k8s-device-plugin/stable/nvidia-device-plugin-0.16.0.tgz + https://nvidia.github.io/k8s-device-plugin/stable/nvidia-device-plugin-0.16.1.tgz ``` ## Building and Running Locally The next sections are focused on building the device plugin locally and running it. It is intended purely for development and testing, and not required by most users. -It assumes you are pinning to the latest release tag (i.e. `v0.16.0`), but can +It assumes you are pinning to the latest release tag (i.e. `v0.16.1`), but can easily be modified to work with any available tag or branch. ### With Docker @@ -943,8 +943,8 @@ easily be modified to work with any available tag or branch. #### Build Option 1, pull the prebuilt image from [Docker Hub](https://hub.docker.com/r/nvidia/k8s-device-plugin): ```shell -$ docker pull nvcr.io/nvidia/k8s-device-plugin:v0.16.0 -$ docker tag nvcr.io/nvidia/k8s-device-plugin:v0.16.0 nvcr.io/nvidia/k8s-device-plugin:devel +$ docker pull nvcr.io/nvidia/k8s-device-plugin:v0.16.1 +$ docker tag nvcr.io/nvidia/k8s-device-plugin:v0.16.1 nvcr.io/nvidia/k8s-device-plugin:devel ``` Option 2, build without cloning the repository: @@ -952,7 +952,7 @@ Option 2, build without cloning the repository: $ docker build \ -t nvcr.io/nvidia/k8s-device-plugin:devel \ -f deployments/container/Dockerfile.ubuntu \ - https://github.com/NVIDIA/k8s-device-plugin.git#v0.16.0 + https://github.com/NVIDIA/k8s-device-plugin.git#v0.16.1 ``` Option 3, if you want to modify the code: diff --git a/deployments/helm/nvidia-device-plugin/Chart.yaml b/deployments/helm/nvidia-device-plugin/Chart.yaml index a7817bd62..2528153b3 100644 --- a/deployments/helm/nvidia-device-plugin/Chart.yaml +++ b/deployments/helm/nvidia-device-plugin/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: nvidia-device-plugin type: application description: A Helm chart for the nvidia-device-plugin on Kubernetes -version: "0.16.0" -appVersion: "0.16.0" +version: "0.16.1" +appVersion: "0.16.1" kubeVersion: ">= 1.10.0-0" home: https://github.com/NVIDIA/k8s-device-plugin diff --git a/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml b/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml index 9b164f3ca..e9f8bdb76 100644 --- a/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml +++ b/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml @@ -15,11 +15,11 @@ spec: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] volumeMounts: diff --git a/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml b/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml index d13f846c3..4d30f8bd9 100644 --- a/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml +++ b/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml @@ -4,7 +4,7 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: selector: @@ -15,11 +15,11 @@ spec: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] volumeMounts: diff --git a/deployments/static/gpu-feature-discovery-daemonset.yaml b/deployments/static/gpu-feature-discovery-daemonset.yaml index 73d70fb32..ef8cc4044 100644 --- a/deployments/static/gpu-feature-discovery-daemonset.yaml +++ b/deployments/static/gpu-feature-discovery-daemonset.yaml @@ -4,7 +4,7 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: selector: @@ -15,11 +15,11 @@ spec: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] volumeMounts: diff --git a/deployments/static/gpu-feature-discovery-job.yaml.template b/deployments/static/gpu-feature-discovery-job.yaml.template index c5ff741fc..0d4416f16 100644 --- a/deployments/static/gpu-feature-discovery-job.yaml.template +++ b/deployments/static/gpu-feature-discovery-job.yaml.template @@ -4,19 +4,19 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: template: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.16.0 + app.kubernetes.io/version: 0.16.1 app.kubernetes.io/part-of: nvidia-gpu spec: nodeName: NODE_NAME containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] args: diff --git a/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml b/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml index 5f7047515..ac04cd585 100644 --- a/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml +++ b/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml @@ -38,7 +38,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR diff --git a/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml b/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml index 0d8820295..ced5d6c41 100644 --- a/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml +++ b/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml @@ -124,7 +124,7 @@ spec: - env: - name: PASS_DEVICE_SPECS value: "true" - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: nvidia-device-plugin-ctr securityContext: privileged: true diff --git a/deployments/static/nvidia-device-plugin.yml b/deployments/static/nvidia-device-plugin.yml index 681c23de2..3f8b0a368 100644 --- a/deployments/static/nvidia-device-plugin.yml +++ b/deployments/static/nvidia-device-plugin.yml @@ -38,7 +38,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR diff --git a/versions.mk b/versions.mk index 576559375..9f65eb1e5 100644 --- a/versions.mk +++ b/versions.mk @@ -17,7 +17,7 @@ MODULE := github.com/NVIDIA/$(DRIVER_NAME) REGISTRY ?= nvcr.io/nvidia -VERSION ?= v0.16.0 +VERSION ?= v0.16.1 # vVERSION represents the version with a guaranteed v-prefix vVERSION := v$(VERSION:v%=%)