diff --git a/packs/nvidia-dra-driver-25.8.1/README.md b/packs/nvidia-dra-driver-25.8.1/README.md new file mode 100644 index 00000000..9978304d --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/README.md @@ -0,0 +1,87 @@ +# NVIDIA DRA Driver for GPUs + +The [NVIDIA DRA Driver](https://github.com/NVIDIA/k8s-dra-driver-gpu) enables Dynamic Resource Allocation (DRA) for GPUs in Kubernetes 1.32+. This pack works with Palette to provide flexible GPU allocation using DeviceClass and ResourceClaim resources, replacing the traditional device plugin approach with a modern, CEL-based device selection mechanism. + + +## Prerequisites + +- Kubernetes 1.32 or newer (DRA is GA in 1.34+). +- [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) 25.3.0+ for driver management and CDI support. +- CDI enabled in the container runtime (containerd/CRI-O). +- [Node Feature Discovery](https://kubernetes-sigs.github.io/node-feature-discovery/) (NFD) for GPU detection. + + +## Parameters + +To deploy the NVIDIA DRA Driver, you can configure the following parameters in the pack's YAML. + +| **Name** | **Description** | **Type** | **Default Value** | **Required** | +|---|---|---|---|---| +| `nvidiaDriverRoot` | Path to NVIDIA driver installation. Use `/run/nvidia/driver` with GPU Operator, `/` for host-installed drivers. | String | `/run/nvidia/driver` | No | +| `resources.gpus.enabled` | Enable GPU allocation via DRA. | Boolean | `true` | No | +| `resources.computeDomains.enabled` | Enable ComputeDomains for Multi-Node NVLink (MNNVL) on GB200 systems. | Boolean | `false` | No | +| `image.tag` | DRA driver image tag. | String | `v25.8.1` | No | +| `logVerbosity` | Log verbosity level (0-7, higher = more verbose). | String | `4` | No | +| `webhook.enabled` | Enable admission webhook for advanced validation. 
| Boolean | `false` | No | + +Refer to the [NVIDIA DRA Driver Helm chart](https://github.com/NVIDIA/k8s-dra-driver-gpu) for the complete list of configurable parameters. + + +## Upgrade + +N/A - This is the initial release of the NVIDIA DRA Driver pack. + + +## Usage + +To use the NVIDIA DRA Driver pack, first create a new [add-on cluster profile](https://docs.spectrocloud.com/profiles/cluster-profiles/create-cluster-profiles/create-addon-profile/), search for the **NVIDIA DRA Driver for GPUs** pack, and configure the driver root path based on your environment: + +```yaml +charts: + nvidia-dra-driver-gpu: + nvidiaDriverRoot: /run/nvidia/driver # Use "/" if drivers installed on host +``` + +After installation, the DRA driver creates: +- A default `DeviceClass` named `gpu.nvidia.com` +- `ResourceSlice` objects representing available GPUs on each node + +To request a GPU for your workload, create a ResourceClaimTemplate and reference it in your Pod. Click on the **Add Manifest** button to create a new manifest layer with the following content: + +```yaml +apiVersion: resource.k8s.io/v1 +kind: ResourceClaimTemplate +metadata: + name: gpu-claim +spec: + spec: + devices: + requests: + - name: gpu + exactly: {deviceClassName: gpu.nvidia.com} +--- +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: cuda + image: nvidia/cuda:12.0-base + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimTemplateName: gpu-claim +``` + +Once you have configured the NVIDIA DRA Driver pack, you can add it to an existing cluster profile, as an add-on profile, or as a new add-on layer to a deployed cluster. 
+ + +## References + +- [NVIDIA DRA Driver Documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/dra-intro-install.html) +- [Kubernetes DRA Documentation](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/) +- [NVIDIA DRA Driver on GitHub](https://github.com/NVIDIA/k8s-dra-driver-gpu) +- [NVIDIA GPU Operator Documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu-25.8.1.tgz b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu-25.8.1.tgz new file mode 100644 index 00000000..dd26a0c9 Binary files /dev/null and b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu-25.8.1.tgz differ diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/.helmignore b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/Chart.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/Chart.yaml new file mode 100644 index 00000000..44ca13fe --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/Chart.yaml @@ -0,0 +1,7 @@ +apiVersion: v2 +appVersion: 25.8.1 +description: Official Helm chart for the NVIDIA DRA Driver for GPUs +kubeVersion: '>=1.32.0-0' +name: nvidia-dra-driver-gpu +type: application +version: 25.8.1 diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/LICENSE b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/LICENSE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/NOTICE b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/NOTICE new file mode 100644 index 00000000..f3acef15 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/NOTICE @@ -0,0 +1,4 @@ +Copyright 2025 NVIDIA CORPORATION + +This product includes software developed at +NVIDIA CORPORATION (https://nvidia.com). diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml new file mode 100644 index 00000000..5a28ae17 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml @@ -0,0 +1,162 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomains.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomain + listKind: ComputeDomainList + plural: computedomains + singular: computedomain + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. 
+ properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + allocationMode: + default: Single + description: |- + Allows for requesting all IMEX channels (the maximum per IMEX domain) or + precisely one. + enum: + - All + - Single + type: string + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. + properties: + name: + type: string + required: + - name + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + description: |- + Intended number of IMEX daemons (i.e., individual compute nodes) in the + ComputeDomain. Must be zero or greater. + + With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + recommended to be set to zero. Workload must implement and consult its + own source of truth for the number of workers online before trying to + share GPU memory (and hence triggering IMEX interaction). 
When non-zero, + `numNodes` is used only for automatically updating the global + ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + daemons equals `numNodes`). In this mode, a `numNodes` value greater than + zero in particular does not gate the startup of IMEX daemons: individual + IMEX daemons are started immediately without waiting for its peers, and + any workload pod gets released right after its local IMEX daemon has + started. + + With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + to the expected number of worker nodes joining the ComputeDomain. In that + mode, all workload pods are held back (with containers in state + `ContainerCreating`) until the underlying IMEX domain has been joined by + `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + join the ComputeDomain may lead to unexpected behavior. + + The `numNodes` parameter is deprecated and will be removed in the next + API version. + type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: |- + Global ComputeDomain status. Can be used to guide debugging efforts. + Workload however should not rely on inspecting this field at any point + during its lifecycle. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. + properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. 
+ type: integer + ipAddress: + type: string + name: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. + enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/_helpers.tpl b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/_helpers.tpl new file mode 100644 index 00000000..9c361043 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/_helpers.tpl @@ -0,0 +1,186 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "nvidia-dra-driver-gpu.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "nvidia-dra-driver-gpu.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Allow the release namespace to be overridden for multi-namespace deployments in combined charts +*/}} +{{- define "nvidia-dra-driver-gpu.namespace" -}} + {{- if .Values.namespaceOverride -}} + {{- .Values.namespaceOverride -}} + {{- else -}} + {{- .Release.Namespace -}} + {{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "nvidia-dra-driver-gpu.chart" -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- printf "%s-%s" $name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Standard labels: documented at +https://helm.sh/docs/chart_best_practices/labels/ +Apply this to all high-level objects (Deployment, DaemonSet, ...). +Pod template labels are included here to deliver name+instance. +*/}} +{{- define "nvidia-dra-driver-gpu.labels" -}} +helm.sh/chart: {{ include "nvidia-dra-driver-gpu.chart" . }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{ include "nvidia-dra-driver-gpu.templateLabels" . }} +{{- end }} + +{{/* +Apply this to all pod templates (a smaller set of labels compared to +the set of standard labels above, to not clutter individual pods too +much). Note that these labels cannot be used to distinguish +components within this Helm chart. +*/}} +{{- define "nvidia-dra-driver-gpu.templateLabels" -}} +app.kubernetes.io/name: {{ include "nvidia-dra-driver-gpu.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Selector label: precisely filter for just the pods of the corresponding +Deployment, DaemonSet, .... That is, this label key/value pair must be +different per-component (a component name is a required argument). This +could be many labels, but we want to use just one (with a sufficiently +unique key). Rendering aborts if `context` or `componentName` is missing. + +TODO: remove the override feature, or make the override work per-component. +*/}} +{{- define "nvidia-dra-driver-gpu.selectorLabels" -}} +{{- if and (hasKey . "componentName") (hasKey . "context") -}} +{{- if .context.Values.selectorLabelsOverride -}} +{{ toYaml .context.Values.selectorLabelsOverride }} +{{- else -}} +{{- $name := default .context.Chart.Name .context.Values.nameOverride -}} +{{ $name }}-component: {{ .componentName }} +{{- end }} +{{- else -}} +{{- fail "selectorLabels: both arguments are required: context, componentName" -}} +{{- end }} +{{- end }} + +{{/* +Full image name with tag +*/}} +{{- define "nvidia-dra-driver-gpu.fullimage" -}} +{{- $tag := printf "v%s" .Chart.AppVersion }} +{{- .Values.image.repository -}}:{{- .Values.image.tag | default $tag -}} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "nvidia-dra-driver-gpu.serviceAccountName" -}} +{{- $name := printf "%s-service-account" (include "nvidia-dra-driver-gpu.fullname" .) }} +{{- if .Values.serviceAccount.create }} +{{- default $name .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create the name of the webhook service account to use +*/}} +{{- define "nvidia-dra-driver-gpu.webhookServiceAccountName" -}} +{{- $name := printf "%s-webhook-service-account" (include "nvidia-dra-driver-gpu.fullname" .) 
}} +{{- if .Values.webhook.serviceAccount.create }} +{{- default $name .Values.webhook.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.webhook.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Check for the existence of an element in a list +*/}} +{{- define "nvidia-dra-driver-gpu.listHas" -}} + {{- $listToCheck := index . 0 }} + {{- $valueToCheck := index . 1 }} + + {{- $found := "" -}} + {{- range $listToCheck}} + {{- if eq . $valueToCheck }} + {{- $found = "true" -}} + {{- end }} + {{- end }} + {{- $found -}} +{{- end }} + +{{/* +Filter a list by a set of valid values +*/}} +{{- define "nvidia-dra-driver-gpu.filterList" -}} + {{- $listToFilter := index . 0 }} + {{- $validValues := index . 1 }} + + {{- $result := list -}} + {{- range $validValues}} + {{- if include "nvidia-dra-driver-gpu.listHas" (list $listToFilter .) }} + {{- $result = append $result . }} + {{- end }} + {{- end }} + {{- $result -}} +{{- end -}} + +{{/* +Get all namespaces (driver namespace + additional namespaces from environment variable) +*/}} +{{- define "nvidia-dra-driver-gpu.namespaces" -}} +{{- $namespaces := list (include "nvidia-dra-driver-gpu.namespace" .) 
}} +{{- if .Values.controller.containers.computeDomain.env }} +{{- range .Values.controller.containers.computeDomain.env }} +{{- if eq .name "ADDITIONAL_NAMESPACES" }} +{{- if .value }} +{{- $additionalNamespaces := splitList "," .value }} +{{- $namespaces = concat $namespaces $additionalNamespaces }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} +{{- join "," $namespaces -}} +{{- end -}} + +{{/* +Get the latest available resource.k8s.io API version +Returns the highest available version or empty string if none found +*/}} +{{- define "nvidia-dra-driver-gpu.resourceApiVersion" -}} +{{- if .Capabilities.APIVersions.Has "resource.k8s.io/v1" -}} +resource.k8s.io/v1 +{{- else if .Capabilities.APIVersions.Has "resource.k8s.io/v1beta2" -}} +resource.k8s.io/v1beta2 +{{- else if .Capabilities.APIVersions.Has "resource.k8s.io/v1beta1" -}} +resource.k8s.io/v1beta1 +{{- else -}} +{{- end -}} +{{- end -}} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/controller.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/controller.yaml new file mode 100644 index 00000000..906aeedc --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/controller.yaml @@ -0,0 +1,102 @@ +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.resources.computeDomains.enabled }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-controller + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "controller") | nindent 6 }} + template: + metadata: + {{- with .Values.controller.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "nvidia-dra-driver-gpu.templateLabels" . | nindent 8 }} + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "controller") | nindent 8 }} + spec: + {{- if .Values.controller.priorityClassName }} + priorityClassName: {{ .Values.controller.priorityClassName }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}-controller + securityContext: + {{- toYaml .Values.controller.podSecurityContext | nindent 8 }} + containers: + - name: compute-domain + securityContext: + {{- toYaml .Values.controller.containers.computeDomain.securityContext | nindent 10 }} + image: {{ include "nvidia-dra-driver-gpu.fullimage" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["compute-domain-controller", "-v", "$(LOG_VERBOSITY)"] + resources: + {{- toYaml .Values.controller.containers.computeDomain.resources | nindent 10 }} + env: + # LOG_VERBOSITY is the source of truth for this program's klog + # configuration. Currently injected via CLI argument (see above) because + # klog's verbosity for now cannot be sanely set from an env var. 
+ - name: LOG_VERBOSITY + value: "{{ .Values.logVerbosity }}" + # LOG_VERBOSITY_CD_DAEMON controls the verbosity of dynamically launched + # CD daemons (their pod spec is not rendered by Helm, but by this + # controller). + - name: LOG_VERBOSITY_CD_DAEMON + value: "{{ .Values.logVerbosity }}" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: IMAGE_NAME + value: {{ include "nvidia-dra-driver-gpu.fullimage" . }} + # Use runc: explicit "void"; otherwise we inherit "all". + - name: NVIDIA_VISIBLE_DEVICES + value: void + {{- if .Values.featureGates }} + # Feature gates (includes both project-specific and logging features) + - name: FEATURE_GATES + value: "{{ range $key, $value := .Values.featureGates }}{{ $key }}={{ $value }},{{ end }}" + {{- end }} + {{- with .Values.controller.containers.computeDomain.env }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml new file mode 100644 index 00000000..da3e2611 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml @@ -0,0 +1,11 @@ +{{- if .Values.resources.computeDomains.enabled }} +--- +apiVersion: {{ include "nvidia-dra-driver-gpu.resourceApiVersion" . 
}} +kind: DeviceClass +metadata: + name: compute-domain-daemon.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'" +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml new file mode 100644 index 00000000..1e2221ad --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml @@ -0,0 +1,11 @@ +{{- if .Values.resources.computeDomains.enabled }} +--- +apiVersion: {{ include "nvidia-dra-driver-gpu.resourceApiVersion" . }} +kind: DeviceClass +metadata: + name: compute-domain-default-channel.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0" +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-gpu.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-gpu.yaml new file mode 100644 index 00000000..f6eaa2a6 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-gpu.yaml @@ -0,0 +1,11 @@ +{{- if .Values.resources.gpus.enabled }} +--- +apiVersion: {{ include "nvidia-dra-driver-gpu.resourceApiVersion" . 
}} +kind: DeviceClass +metadata: + name: gpu.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-mig.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-mig.yaml new file mode 100644 index 00000000..8a0655ac --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/deviceclass-mig.yaml @@ -0,0 +1,11 @@ +{{- if .Values.resources.gpus.enabled }} +--- +apiVersion: {{ include "nvidia-dra-driver-gpu.resourceApiVersion" . }} +kind: DeviceClass +metadata: + name: mig.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'" +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml new file mode 100644 index 00000000..a857f6cd --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml @@ -0,0 +1,322 @@ +# Copyright 2023 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if or .Values.resources.computeDomains.enabled .Values.resources.gpus.enabled }} +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-kubelet-plugin + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "kubelet-plugin") | nindent 6 }} + {{- with .Values.kubeletPlugin.updateStrategy }} + updateStrategy: + {{- toYaml . | nindent 4 }} + {{- end }} + template: + metadata: + {{- with .Values.kubeletPlugin.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "nvidia-dra-driver-gpu.templateLabels" . | nindent 8 }} + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "kubelet-plugin") | nindent 8 }} + spec: + {{- if .Values.kubeletPlugin.priorityClassName }} + priorityClassName: {{ .Values.kubeletPlugin.priorityClassName }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}-kubeletplugin + securityContext: + {{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }} + initContainers: + - name: init-container + image: {{ include "nvidia-dra-driver-gpu.fullimage" . }} + securityContext: + privileged: true + command: [bash, /usr/bin/kubelet-plugin-prestart.sh] + env: + - name: NVIDIA_DRIVER_ROOT + value: "{{ .Values.nvidiaDriverRoot }}" + # Use runc: explicit "void"; otherwise we inherit "all". 
+ - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }} + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} + volumeMounts: + - name: driver-root-parent + mountPath: /driver-root-parent + {{- if eq "/" .Values.nvidiaDriverRoot }} + readOnly: true + {{- else }} + # In case of the operator-provided driver, another container mounts + # the driver onto the host using `mountPropagation: Bidirectional` + # (out-of-band of the lifecycle of _this_ pod here). For us to see + # that mount, `mountPropagation: HostToContainer` is required (docs: + # "if any Pod with Bidirectional mount propagation to the same volume + # mounts anything there, the container with HostToContainer mount + # propagation will see it."). + mountPropagation: HostToContainer + {{- end }} + containers: + {{- if .Values.resources.computeDomains.enabled }} + - name: compute-domains + securityContext: + {{- toYaml .Values.kubeletPlugin.containers.computeDomains.securityContext | nindent 10 }} + image: {{ include "nvidia-dra-driver-gpu.fullimage" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["bash", "-c"] + args: + - |- + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. 
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params + mount --bind root/gpu-params /proc/driver/nvidia/params + fi + compute-domain-kubelet-plugin -v $(LOG_VERBOSITY) + resources: + {{- toYaml .Values.kubeletPlugin.containers.computeDomains.resources | nindent 10 }} + {{/* + A literal "0" will allocate a random port. Don't configure the probe + with the same literal "0" since that won't match where the service is + actually running. + */}} + {{- if (gt (int .Values.kubeletPlugin.containers.computeDomains.healthcheckPort) 0) }} + startupProbe: + grpc: + port: {{ .Values.kubeletPlugin.containers.computeDomains.healthcheckPort }} + service: liveness + failureThreshold: 60 + periodSeconds: 10 + timeoutSeconds: 10 + livenessProbe: + grpc: + port: {{ .Values.kubeletPlugin.containers.computeDomains.healthcheckPort }} + service: liveness + failureThreshold: 3 + periodSeconds: 10 + timeoutSeconds: 10 + {{- end }} + env: + # LOG_VERBOSITY is the source of truth for this program's klog + # configuration. Currently injected via CLI argument (see above) because + # klog's verbosity for now cannot be sanely set from an environment + # variable. 
+ - name: LOG_VERBOSITY + value: "{{ .Values.logVerbosity }}" + - name: MASK_NVIDIA_DRIVER_PARAMS + value: "{{ .Values.maskNvidiaDriverParams }}" + - name: NVIDIA_DRIVER_ROOT + value: "{{ .Values.nvidiaDriverRoot }}" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: /var/run/cdi + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{- if .Values.nvidiaCDIHookPath }} + - name: NVIDIA_CDI_HOOK_PATH + value: "{{ .Values.nvidiaCDIHookPath }}" + {{- end }} + {{- if .Values.featureGates }} + - name: FEATURE_GATES + value: "{{ range $key, $value := .Values.featureGates }}{{ $key }}={{ $value }},{{ end }}" + {{- end }} + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }} + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} + {{- if .Values.kubeletPlugin.containers.computeDomains.healthcheckPort }} + - name: HEALTHCHECK_PORT + value: {{ .Values.kubeletPlugin.containers.computeDomains.healthcheckPort | quote }} + {{- end }} + {{- with .Values.kubeletPlugin.containers.computeDomains.env }} + {{- toYaml . | nindent 8 }} + {{- end }} + volumeMounts: + - name: plugins-registry + mountPath: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }} + - name: plugins + mountPath: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} + mountPropagation: Bidirectional + - name: cdi + mountPath: /var/run/cdi + - name: driver-root + mountPath: /driver-root + readOnly: true + mountPropagation: HostToContainer + {{- end }} + {{- if .Values.resources.gpus.enabled }} + - name: gpus + securityContext: + {{- toYaml .Values.kubeletPlugin.containers.gpus.securityContext | nindent 10 }} + image: {{ include "nvidia-dra-driver-gpu.fullimage" . 
}} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["bash", "-c"] + args: + - |- + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. + if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params + mount --bind root/gpu-params /proc/driver/nvidia/params + fi + gpu-kubelet-plugin -v $(LOG_VERBOSITY) + resources: + {{- toYaml .Values.kubeletPlugin.containers.gpus.resources | nindent 10 }} + {{/* + A literal "0" will allocate a random port. Don't configure the probe + with the same literal "0" since that won't match where the service is + actually running. + */}} + {{- if (gt (int .Values.kubeletPlugin.containers.gpus.healthcheckPort) 0) }} + startupProbe: + grpc: + port: {{ .Values.kubeletPlugin.containers.gpus.healthcheckPort }} + service: liveness + failureThreshold: 60 + periodSeconds: 10 + timeoutSeconds: 10 + livenessProbe: + grpc: + port: {{ .Values.kubeletPlugin.containers.gpus.healthcheckPort }} + service: liveness + failureThreshold: 3 + periodSeconds: 10 + timeoutSeconds: 10 + {{- end }} + env: + # LOG_VERBOSITY is the source of truth for this program's klog + # configuration. Currently injected via CLI argument (see above) because + # klog's verbosity for now cannot be sanely set from an environment + # variable. 
+ - name: LOG_VERBOSITY + value: "{{ .Values.logVerbosity }}" + - name: MASK_NVIDIA_DRIVER_PARAMS + value: "{{ .Values.maskNvidiaDriverParams }}" + - name: NVIDIA_DRIVER_ROOT + value: "{{ .Values.nvidiaDriverRoot }}" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: /var/run/cdi + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: IMAGE_NAME + value: {{ include "nvidia-dra-driver-gpu.fullimage" . }} + {{- if .Values.nvidiaCDIHookPath }} + - name: NVIDIA_CDI_HOOK_PATH + value: "{{ .Values.nvidiaCDIHookPath }}" + {{- end }} + {{- if .Values.featureGates }} + - name: FEATURE_GATES + value: "{{ range $key, $value := .Values.featureGates }}{{ $key }}={{ $value }},{{ end }}" + {{- end }} + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }} + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} + {{- if .Values.kubeletPlugin.containers.gpus.healthcheckPort }} + - name: HEALTHCHECK_PORT + value: {{ .Values.kubeletPlugin.containers.gpus.healthcheckPort | quote }} + {{- end }} + {{- with .Values.kubeletPlugin.containers.gpus.env }} + {{- toYaml . 
| nindent 8 }} + {{- end }} + volumeMounts: + - name: plugins-registry + mountPath: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }} + - name: plugins + mountPath: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} + mountPropagation: Bidirectional + - name: cdi + mountPath: /var/run/cdi + - name: driver-root + mountPath: /driver-root + readOnly: true + mountPropagation: HostToContainer + {{- end }} + volumes: + - name: plugins-registry + hostPath: + path: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }} + - name: plugins + hostPath: + path: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} + - name: cdi + hostPath: + path: /var/run/cdi + - name: driver-root-parent + hostPath: + # If nvidiaDriverRoot == "/" then its parent is itself. Otherwise, get + # its parent by removing any trailing slashes as well as the last path + # element with sprig template function `dir`. Examples: /a/b/ -> /a, + # /a/b/c -> /a/b. + {{- if eq "/" .Values.nvidiaDriverRoot }} + path: "/" + {{- else }} + path: {{ dir (trimSuffix "/" .Values.nvidiaDriverRoot) }} + {{- end }} + type: DirectoryOrCreate + - name: driver-root + hostPath: + path: {{ .Values.nvidiaDriverRoot }} + type: DirectoryOrCreate + {{- with .Values.kubeletPlugin.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.kubeletPlugin.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.kubeletPlugin.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-compute-domain-daemon.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-compute-domain-daemon.yaml new file mode 100644 index 00000000..11dff2c6 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-compute-domain-daemon.yaml @@ -0,0 +1,54 @@ +{{- $namespaces := splitList "," (include "nvidia-dra-driver-gpu.namespaces" .) -}} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-domain-daemon-role +rules: +- apiGroups: ["resource.nvidia.com"] + resources: ["computedomains", "computedomains/status"] + verbs: ["get", "list", "watch", "update", "patch"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + +{{- range $namespace := $namespaces }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: compute-domain-daemon-service-account + namespace: {{ $namespace }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-domain-daemon-role-binding +subjects: +{{- range $namespace := $namespaces }} +- kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: {{ $namespace }} +{{- end }} +roleRef: + kind: ClusterRole + name: compute-domain-daemon-role + apiGroup: rbac.authorization.k8s.io +{{- if .Capabilities.APIVersions.Has "security.openshift.io/v1/SecurityContextConstraints" }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-domain-daemon-openshift-anyuid-role-binding +subjects: +{{- range $namespace := $namespaces }} +- kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: {{ $namespace }} +{{- end }} +roleRef: + kind: ClusterRole + name: system:openshift:scc:anyuid + apiGroup: rbac.authorization.k8s.io +{{- end }} diff --git 
a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-controller.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-controller.yaml new file mode 100644 index 00000000..f793d393 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-controller.yaml @@ -0,0 +1,75 @@ +{{- if .Values.serviceAccount.create -}} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-clusterrole-controller +rules: +- apiGroups: ["resource.nvidia.com"] + resources: ["computedomains"] + verbs: ["get", "list", "watch", "update"] +- apiGroups: ["resource.nvidia.com"] + resources: ["computedomains/status"] + verbs: ["update"] +- apiGroups: ["resource.k8s.io"] + resources: ["resourceclaimtemplates"] + verbs: ["get", "list", "watch", "create", "update", "delete"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "update"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-controller + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" $ | nindent 4 }} + {{- with $.Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-clusterrole-binding-controller-{{ include "nvidia-dra-driver-gpu.namespace" . }} +subjects: + - kind: ServiceAccount + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-controller + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . 
}} +roleRef: + kind: ClusterRole + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-clusterrole-controller + apiGroup: rbac.authorization.k8s.io + +{{- range $namespace := splitList "," (include "nvidia-dra-driver-gpu.namespaces" .) }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-role-controller + namespace: {{ $namespace }} +rules: +- apiGroups: ["apps"] + resources: ["daemonsets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-role-binding-controller + namespace: {{ $namespace }} +subjects: + - kind: ServiceAccount + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-controller + namespace: {{ include "nvidia-dra-driver-gpu.namespace" $ }} +roleRef: + kind: Role + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-role-controller + apiGroup: rbac.authorization.k8s.io +{{- end }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml new file mode 100644 index 00000000..9c7a78cd --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/rbac-kubeletplugin.yaml @@ -0,0 +1,92 @@ +{{- if .Values.serviceAccount.create -}} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . 
}}-clusterrole-kubeletplugin +rules: +- apiGroups: ["resource.nvidia.com"] + resources: ["computedomains"] + verbs: ["get", "list", "watch"] +- apiGroups: ["resource.k8s.io"] + resources: ["resourceclaims"] + verbs: ["get", "list", "watch"] +- apiGroups: ["resource.k8s.io"] + resources: ["resourceslices"] + verbs: ["get", "list", "watch", "create", "update", "delete"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "update"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-kubeletplugin + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" $ | nindent 4 }} + {{- with $.Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-clusterrole-binding-kubeletplugin +subjects: + - kind: ServiceAccount + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-kubeletplugin + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} +roleRef: + kind: ClusterRole + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-clusterrole-kubeletplugin + apiGroup: rbac.authorization.k8s.io + +{{- range $namespace := splitList "," (include "nvidia-dra-driver-gpu.namespaces" .) 
}} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-role-kubeletplugin + namespace: {{ $namespace }} +rules: +{{- if (and $.Values.resources.gpus.enabled $.Values.featureGates.MPSSupport) }} +- apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-role-binding-kubeletplugin + namespace: {{ $namespace }} +subjects: + - kind: ServiceAccount + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-kubeletplugin + namespace: {{ include "nvidia-dra-driver-gpu.namespace" $ }} +roleRef: + kind: Role + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-role-kubeletplugin + apiGroup: rbac.authorization.k8s.io +{{- end }} +{{- if .Capabilities.APIVersions.Has "security.openshift.io/v1/SecurityContextConstraints" }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" $ }}-openshift-privileged-role-binding-kubeletplugin +subjects: +- kind: ServiceAccount + name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" $ }}-kubeletplugin + namespace: {{ include "nvidia-dra-driver-gpu.namespace" $ }} +roleRef: + kind: ClusterRole + name: system:openshift:scc:privileged + apiGroup: rbac.authorization.k8s.io +{{- end }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicy.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicy.yaml new file mode 100644 index 00000000..03b81318 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicy.yaml @@ -0,0 +1,36 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + 
name: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }} +spec: + failurePolicy: Fail + matchConstraints: + resourceRules: + - apiGroups: ["resource.k8s.io"] + apiVersions: ["v1", "v1beta1", "v1beta2"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["resourceslices"] + matchConditions: + - name: isRestrictedUser {{- /* Restrict only the kubelet plugin identity: its DaemonSet runs as the ServiceAccount named serviceAccountName plus the "-kubeletplugin" suffix, so the username compared below must carry that suffix or this policy never applies. */}} + expression: >- + request.userInfo.username == "system:serviceaccount:{{ include "nvidia-dra-driver-gpu.namespace" . }}:{{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}-kubeletplugin" + variables: + - name: userNodeName + expression: >- + request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') + - name: objectNodeName + expression: >- + (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("") + - name: nodeSelectorValue + expression: >- + (request.operation == "DELETE" ? oldObject : object).spec.?nodeSelector.orValue(null) + - name: allNodesValue + expression: >- + (request.operation == "DELETE" ? oldObject : object).spec.?allNodes.orValue(false) + validations: + - expression: variables.userNodeName != "" + message: >- + no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled + - expression: variables.userNodeName == variables.objectNodeName || variables.allNodesValue == true || variables.nodeSelectorValue != null + messageExpression: >- + "this user running on node '"+variables.userNodeName+"' may not modify cluster or node resourceslices" diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicybinding.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicybinding.yaml new file mode 100644 index 00000000..dcdfdf92 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicybinding.yaml @@ -0,0 +1,8 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind:
ValidatingAdmissionPolicyBinding +metadata: + name: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }} +spec: + policyName: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }} + validationActions: [Deny] + # All ResourceSlices are matched. diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingwebhookconfiguration.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingwebhookconfiguration.yaml new file mode 100644 index 00000000..c491450e --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validatingwebhookconfiguration.yaml @@ -0,0 +1,32 @@ +{{- if .Values.webhook.enabled }} +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-webhook-config + labels: + {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }} + {{- if eq .Values.webhook.tls.mode "cert-manager" }} + annotations: + cert-manager.io/inject-ca-from: "{{ include "nvidia-dra-driver-gpu.namespace" . }}/{{ include "nvidia-dra-driver-gpu.name" . }}-webhook-cert" + {{- end }} +webhooks: + - name: "gpu.nvidia.com" + rules: + - apiGroups: ["resource.k8s.io"] + apiVersions: ["v1beta1", "v1beta2", "v1"] + operations: ["CREATE", "UPDATE"] + resources: ["resourceclaims", "resourceclaimtemplates"] + scope: "Namespaced" + clientConfig: + service: + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + name: {{ include "nvidia-dra-driver-gpu.name" . 
}}-webhook + port: {{ .Values.webhook.servicePort }} + path: /validate-resource-claim-parameters + {{- if and (eq .Values.webhook.tls.mode "secret") .Values.webhook.tls.secret.caBundle }} + caBundle: {{ .Values.webhook.tls.secret.caBundle | quote }} + {{- end }} + failurePolicy: {{ default "Fail" .Values.webhook.failurePolicy | quote }} + admissionReviewVersions: ["v1"] + sideEffects: None +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validation.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validation.yaml new file mode 100644 index 00000000..63050635 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/validation.yaml @@ -0,0 +1,127 @@ +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.namespace }} +{{- $error := "" }} +{{- $error = printf "%s\nValue 'namespace' set to %s" $error .Values.namespace }} +{{- $error = printf "%s\nSetting an explicit 'namespace' in values.yaml or via --set on the command line is no longer supported." $error }} +{{- $error = printf "%s\nUse --namespace (with --create-namespace as necessary) instead." $error }} +{{- $error = printf "%s\nSee: https://helm.sh/docs/helm/helm_install/#options" $error }} +{{- fail $error }} +{{- end }} + +{{- if and (eq (include "nvidia-dra-driver-gpu.namespace" .)
"default") ( eq .Values.namespaceOverride "") (not .Values.allowDefaultNamespace) }} +{{- $error := "" }} +{{- $error = printf "%s\nRunning in the 'default' namespace is not recommended." $error }} +{{- $error = printf "%s\nSet 'allowDefaultNamespace=true' to bypass this error." $error }} +{{- $error = printf "%s\nOtherwise, use --namespace (with --create-namespace as necessary) to run in a specific namespace." $error }} +{{- $error = printf "%s\nSee: https://helm.sh/docs/helm/helm_install/#options" $error }} +{{- fail $error }} +{{- end }} + +{{- if and .Values.resources.gpus.enabled (not .Values.gpuResourcesEnabledOverride) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe default value of 'resources.gpus.enabled=true' is not yet supported." $error }} +{{- $error = printf "%s\nIt is set to true by default to future proof it as the default once support for it becomes available." $error }} +{{- $error = printf "%s\nUntil then, please explicitly set 'resources.gpus.enabled=false' when installing this chart." $error }} +{{- $error = printf "%s\nIf you truly want to force 'resources.gpus.enabled=true' to apply, you must also set 'gpuResourcesEnabledOverride=true'." $error }} +{{- fail $error }} +{{- end }} + +{{- /* nvidiaCtkPath is a removed value: abort rendering with an explanation if it is still set. */ -}}{{- if .Values.nvidiaCtkPath }} +{{- $error := "" }} +{{- $error = printf "%s\nSetting a user-defined nvidiaCtkPath is no longer supported. It can simply be removed without consequence." $error }} +{{- $error = printf "%s\nIt was previously required to point the DRA driver at the host-path to the nvidia-ctk binary." $error }} +{{- $error = printf "%s\nThis, in turn, was used to execute any CDI hooks injected into containers by the DRA driver." $error }} +{{- $error = printf "%s\nNow a different binary is used called nvidia-cdi-hook that is installed by the DRA driver itself." $error }} +{{- $error = printf "%s\nThis renders the need for passing this user-defined flag obsolete."
$error }} +{{- fail $error }} +{{- end }} + +{{- if .Values.webhook.enabled }} +{{- if not .Values.webhook.tls }} +{{- $error := "" }} +{{- $error = printf "%s\nWebhook TLS configuration is required when webhook is enabled." $error }} +{{- $error = printf "%s\nPlease configure webhook.tls.mode and related settings." $error }} +{{- fail $error }} +{{- end }} + +{{- if not .Values.webhook.tls.mode }} +{{- $error := "" }} +{{- $error = printf "%s\nWebhook TLS mode is required when webhook is enabled." $error }} +{{- $error = printf "%s\nSet webhook.tls.mode to either 'cert-manager' or 'secret'." $error }} +{{- fail $error }} +{{- end }} + +{{- if not (or (eq .Values.webhook.tls.mode "cert-manager") (eq .Values.webhook.tls.mode "secret")) }} +{{- $error := "" }} +{{- $error = printf "%s\nInvalid webhook TLS mode: %s" $error .Values.webhook.tls.mode }} +{{- $error = printf "%s\nWebhook TLS mode must be either 'cert-manager' or 'secret'." $error }} +{{- fail $error }} +{{- end }} + +{{- if eq .Values.webhook.tls.mode "cert-manager" }} +{{- if not .Values.webhook.tls.certManager }} +{{- $error := "" }} +{{- $error = printf "%s\nCert-manager configuration is required when using cert-manager mode." $error }} +{{- $error = printf "%s\nPlease configure webhook.tls.certManager settings." $error }} +{{- fail $error }} +{{- end }} + +{{- if not .Values.webhook.tls.certManager.issuerType }} +{{- $error := "" }} +{{- $error = printf "%s\nCert-manager issuer type is required when using cert-manager mode." $error }} +{{- $error = printf "%s\nSet webhook.tls.certManager.issuerType to 'selfsigned', 'clusterissuer', or 'issuer'." 
$error }} +{{- fail $error }} +{{- end }} + +{{- if not (or (eq .Values.webhook.tls.certManager.issuerType "selfsigned") (eq .Values.webhook.tls.certManager.issuerType "clusterissuer") (eq .Values.webhook.tls.certManager.issuerType "issuer")) }} +{{- $error := "" }} +{{- $error = printf "%s\nInvalid cert-manager issuer type: %s" $error .Values.webhook.tls.certManager.issuerType }} +{{- $error = printf "%s\nIssuer type must be 'selfsigned', 'clusterissuer', or 'issuer'." $error }} +{{- fail $error }} +{{- end }} + +{{- if and (ne .Values.webhook.tls.certManager.issuerType "selfsigned") (not .Values.webhook.tls.certManager.issuerName) }} +{{- $error := "" }} +{{- $error = printf "%s\nCert-manager issuer name is required when issuer type is not 'selfsigned'." $error }} +{{- $error = printf "%s\nSet webhook.tls.certManager.issuerName to the name of your issuer." $error }} +{{- fail $error }} +{{- end }} +{{- end }} + +{{- if eq .Values.webhook.tls.mode "secret" }} +{{- if not .Values.webhook.tls.secret }} +{{- $error := "" }} +{{- $error = printf "%s\nSecret configuration is required when using secret mode." $error }} +{{- $error = printf "%s\nPlease configure webhook.tls.secret settings." $error }} +{{- fail $error }} +{{- end }} + +{{- if not .Values.webhook.tls.secret.name }} +{{- $error := "" }} +{{- $error = printf "%s\nSecret name is required when using secret mode." $error }} +{{- $error = printf "%s\nSet webhook.tls.secret.name to the name of your TLS secret." $error }} +{{- fail $error }} +{{- end }} +{{- end }} +{{- end }} + +{{- if not (include "nvidia-dra-driver-gpu.resourceApiVersion" .) -}} +{{- $error := "" }} +{{- $error = printf "%s\nNo supported resource.k8s.io API version found in the cluster." $error }} +{{- $error = printf "%s\nThis chart requires one of: resource.k8s.io/v1, resource.k8s.io/v1beta1, or resource.k8s.io/v1beta2." 
$error }} +{{- $error = printf "%s\nPlease ensure your cluster supports Dynamic Resource Allocation (DRA) with one of these API versions." $error }} +{{- fail $error }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-cert-issuer.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-cert-issuer.yaml new file mode 100644 index 00000000..2c34933b --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-cert-issuer.yaml @@ -0,0 +1,11 @@ +{{- if and .Values.webhook.enabled (eq .Values.webhook.tls.mode "cert-manager") (eq .Values.webhook.tls.certManager.issuerType "selfsigned") }} +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "nvidia-dra-driver-gpu.fullname" . }}-webhook-issuer + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }} +spec: + selfSigned: {} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-cert-secret.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-cert-secret.yaml new file mode 100644 index 00000000..68336647 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-cert-secret.yaml @@ -0,0 +1,29 @@ +{{- if and .Values.webhook.enabled (eq .Values.webhook.tls.mode "cert-manager") }} +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-webhook-cert + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }} +spec: + dnsNames: + {{- $svcName := printf "%s-webhook" (include "nvidia-dra-driver-gpu.name" .) }} + {{- $svcNamespace := (include "nvidia-dra-driver-gpu.namespace" .) 
}} + - {{ $svcName }}.{{ $svcNamespace }}.svc + {{- with .Values.webhook.tls.certManager.dnsNames }} + {{- toYaml . | nindent 4 }} + {{- end }} + issuerRef: + {{- if eq .Values.webhook.tls.certManager.issuerType "clusterissuer" }} + kind: ClusterIssuer + {{- else }} + kind: Issuer + {{- end }} + {{- if eq .Values.webhook.tls.certManager.issuerType "selfsigned" }} + name: {{ include "nvidia-dra-driver-gpu.fullname" . }}-webhook-issuer + {{- else }} + name: {{ .Values.webhook.tls.certManager.issuerName }} + {{- end }} + secretName: {{ include "nvidia-dra-driver-gpu.name" . }}-webhook-cert +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-deployment.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-deployment.yaml new file mode 100644 index 00000000..780c0f2f --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-deployment.yaml @@ -0,0 +1,105 @@ +# Copyright 2025 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.webhook.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-webhook + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . 
| nindent 4 }} +spec: + replicas: {{ .Values.webhook.replicas }} + selector: + matchLabels: + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "webhook") | nindent 6 }} + template: + metadata: + {{- with .Values.webhook.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "nvidia-dra-driver-gpu.templateLabels" . | nindent 8 }} + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "webhook") | nindent 8 }} + spec: + {{- if .Values.webhook.priorityClassName }} + priorityClassName: {{ .Values.webhook.priorityClassName }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "nvidia-dra-driver-gpu.webhookServiceAccountName" . }} + securityContext: + {{- toYaml .Values.webhook.podSecurityContext | nindent 8 }} + containers: + - name: webhook + securityContext: + {{- toYaml .Values.webhook.containers.webhook.securityContext | nindent 10 }} + image: {{ include "nvidia-dra-driver-gpu.fullimage" . 
}} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["webhook"] + resources: + {{- toYaml .Values.webhook.containers.webhook.resources | nindent 10 }} + args: + - --tls-cert-file=/cert/tls.crt + - --tls-private-key-file=/cert/tls.key + - --port={{ .Values.webhook.containerPort }} + env: + {{- if .Values.featureGates }} + - name: FEATURE_GATES + value: "{{ range $key, $value := .Values.featureGates }}{{ $key }}={{ $value }},{{ end }}" + {{- end }} + ports: + - name: webhook + containerPort: {{ .Values.webhook.containerPort }} + livenessProbe: + failureThreshold: 5 + httpGet: + path: /readyz + port: webhook + scheme: HTTPS + readinessProbe: + failureThreshold: 3 + httpGet: + path: /readyz + port: webhook + scheme: HTTPS + volumeMounts: + - name: cert + mountPath: /cert + readOnly: true + volumes: + - name: cert + secret: + {{- if eq .Values.webhook.tls.mode "secret" }} + secretName: {{ .Values.webhook.tls.secret.name }} + {{- else }} + secretName: {{ include "nvidia-dra-driver-gpu.name" . }}-webhook-cert + {{- end }} + {{- with .Values.webhook.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.webhook.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.webhook.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-service.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-service.yaml new file mode 100644 index 00000000..bffd9201 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-service.yaml @@ -0,0 +1,17 @@ +{{- if .Values.webhook.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "nvidia-dra-driver-gpu.name" . }}-webhook + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . 
| nindent 4 }} +spec: + ports: + - port: {{ .Values.webhook.servicePort }} + targetPort: webhook + protocol: TCP + name: https + selector: + {{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "webhook") | nindent 4 }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-serviceaccount.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-serviceaccount.yaml new file mode 100644 index 00000000..e7fc43d1 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/templates/webhook-serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if and .Values.webhook.enabled .Values.webhook.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nvidia-dra-driver-gpu.webhookServiceAccountName" . }} + namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} + labels: + {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }} + {{- with .Values.webhook.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/values.yaml b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/values.yaml new file mode 100644 index 00000000..45319c30 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/charts/nvidia-dra-driver-gpu/values.yaml @@ -0,0 +1,247 @@ +# Copyright 2023 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Default values for k8s-dra-driver-gpu. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# Specify the driver root on the host. +# If the NVIDIA GPU driver is managed using the NVIDIA GPU Driver Container, +# this is typically /run/nvidia/driver. +# For driver installed directly on a host, a value of `/` is used. +nvidiaDriverRoot: / + +# Optional path to the nvidia-cdi-hook executable. +# If not specified, the default path inferred from the nvidia-container-toolkit library version will be used. +nvidiaCDIHookPath: "" + +nameOverride: "" +fullnameOverride: "" +namespaceOverride: "" +selectorLabelsOverride: {} +gpuResourcesEnabledOverride: false + +allowDefaultNamespace: false + +imagePullSecrets: [] +image: + repository: nvcr.io/nvidia/k8s-dra-driver-gpu + pullPolicy: IfNotPresent + # Note: an empty string is translated to the `appVersion` string from + # the Helm chart YAML (effectively implementing the default value to be + # the current version). Also note that a "v" is prefixed to the + # `appVersion` value. + tag: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "" + +resources: + gpus: + enabled: true + computeDomains: + enabled: true + +# Feature gates configuration following Kubernetes patterns +# Configure feature gates as key-value pairs (feature_name: true/false) +# Examples: +# featureGates: +# ExampleFeature: false # Project-specific alpha feature +# ContextualLogging: true # Kubernetes logging feature (enabled by default) +# LoggingAlphaOptions: false # Kubernetes logging alpha features +# LoggingBetaOptions: true # Kubernetes logging beta features +featureGates: {} + +# Log verbosity for all components. Zero or greater, higher number means higher +# verbosity. Regardless of this setting, messages of type Error, Warning, and +# Info(level 0) are always logged. Can also be set for individual components via +# environment variable (that takes precedence), see +# https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Troubleshooting#controlling-log-verbosity +# +# An (incomplete) representation of which types of messages to expect with +# increasing verbosity level: +# +# Level 0: +# - Configuration detail (during process startup) +# - Kubelet plugins: +# - Permanent errors during device Prepare() and Unprepare() +# +# Level 1: +# - CD controller: +# - Confirm cleanup of stale objects +# - k8s client-go: feature gates +# - Kubelet plugins: +# - Device (un)prepare confirmation, with resource claim UID +# - Workqueue reconciliation failures (noisy: mainly expected, retryable +# errors) +# - CD daemon: +# - explicit 'wait for nodes update' +# +# Level 2: +# - reflector.go informer state: "Caches populated" +# - Kubelet plugins: +# - Acknowledge when Unprepare is a noop +# - CD controller: +# - Added/updated API object callback confirmation +# +# Level 3: +# - reflector.go informer state: "Listing and watching" +# +# Level 6: +# - round_trippers.go output (API server request/response detail) +# - Kubelet plugins: +# - GRPC 
request/response detail +# - Checkpoint file update confirmation +# - CD daemon: +# - explicit 'IP set did not change' +# +# Level 7: +# - Kubelet plugins: +# - Health check +logVerbosity: "4" + +# Webhook configuration +webhook: + enabled: false + replicas: 1 + servicePort: 443 + containerPort: 443 + priorityClassName: "system-cluster-critical" + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: "100%" + podAnnotations: {} + podSecurityContext: {} + nodeSelector: {} + tolerations: [] + affinity: {} + containers: + webhook: + securityContext: + privileged: false + resources: {} + serviceAccount: + # Specifies whether a service account should be created + create: true + # The name of the service account to use (an optional `annotations` map may also be set under serviceAccount) + name: "" + # failurePolicy defines how the API server should handle requests if the webhook call fails. + # Options: + # - Fail : reject the request if the webhook call fails either due to cert errors, timeout or if the service is unreachable. + # - Ignore : allow the request to continue if the webhook call fails. + failurePolicy: Fail + # TLS certificate configuration + tls: + # Certificate management mode: "cert-manager" or "secret" + # - "cert-manager": Use cert-manager to automatically generate and manage certificates + # - "secret": Use a user-provided secret containing tls.crt and tls.key + mode: "cert-manager" + certManager: + # Issuer type: "selfsigned", "clusterissuer", or "issuer" + issuerType: "selfsigned" + # Issuer name (required when issuerType is "clusterissuer" or "issuer") + issuerName: "" + # Additional DNS names for the certificate + dnsNames: [] + secret: + # Name of the secret containing tls.crt and tls.key + name: "" + # Base64-encoded CA certificate bundle for validating the webhook's TLS certificate + # Required when using secret mode.
+ # Note: Only include intermediate CA certificates, not root CA certificates + caBundle: "" + +controller: + priorityClassName: "system-node-critical" + podAnnotations: {} + podSecurityContext: {} + nodeSelector: {} + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + containers: + computeDomain: + securityContext: {} + env: [] + resources: {} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + +kubeletPlugin: + priorityClassName: "system-node-critical" + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: "100%" + podAnnotations: {} + podSecurityContext: {} + nodeSelector: {} + tolerations: [] + kubeletRegistrarDirectoryPath: /var/lib/kubelet/plugins_registry + kubeletPluginsDirectoryPath: /var/lib/kubelet/plugins + containers: + init: + securityContext: {} + resources: {} + computeDomains: + env: [] + securityContext: + privileged: true + resources: {} + # Port running a gRPC health service checked by a livenessProbe. + # Set to a negative value to disable the service and the probe. + healthcheckPort: 51515 + gpus: + env: [] + securityContext: + privileged: true + resources: {} + # Port running a gRPC health service checked by a livenessProbe. + # Set to a negative value to disable the service and the probe. 
+ healthcheckPort: 51516 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - "true" + - matchExpressions: + # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA + - key: feature.node.kubernetes.io/cpu-model.vendor_id + operator: In + values: + - "NVIDIA" + - matchExpressions: + # We allow a GPU deployment to be forced by setting the following label to "true" + - key: "nvidia.com/gpu.present" + operator: In + values: + - "true" diff --git a/packs/nvidia-dra-driver-25.8.1/logo.png b/packs/nvidia-dra-driver-25.8.1/logo.png new file mode 100644 index 00000000..e12f5535 Binary files /dev/null and b/packs/nvidia-dra-driver-25.8.1/logo.png differ diff --git a/packs/nvidia-dra-driver-25.8.1/pack.json b/packs/nvidia-dra-driver-25.8.1/pack.json new file mode 100644 index 00000000..86e39e43 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/pack.json @@ -0,0 +1,24 @@ +{ + "addonType":"ai", + "annotations": { + "source": "community", + "contributor": "isc" + }, + "constraints": { + "dependencies": [ + { + "packName": "kubernetes", + "layer": "k8s", + "minVersion": "1.34", + "maxVersion": "" + }, + { + "packName": "nvidia-gpu-operator", + "layer": "addon", + "minVersion": "25.10.1", + "maxVersion": "", + "type": "optional" + } + ] + } +} diff --git a/packs/nvidia-dra-driver-25.8.1/values.yaml b/packs/nvidia-dra-driver-25.8.1/values.yaml new file mode 100644 index 00000000..fa294e97 --- /dev/null +++ b/packs/nvidia-dra-driver-25.8.1/values.yaml @@ -0,0 +1,86 @@ +# NVIDIA DRA Driver for GPUs - Palette Pack Configuration +# Enables Dynamic Resource Allocation for NVIDIA GPUs in Kubernetes 1.32+ + +pack: + namespace: "nvidia-dra-driver-gpu" + namespaceLabels: + "nvidia-dra-driver-gpu":
"pod-security.kubernetes.io/enforce=privileged,pod-security.kubernetes.io/enforce-version=latest" + content: + images: + - image: nvcr.io/nvidia/k8s-dra-driver-gpu:v25.8.1 + spectrocloud.com/install-priority: "25" + +charts: + nvidia-dra-driver-gpu: + # Driver root path - use /run/nvidia/driver when GPU Operator manages drivers + # Use "/" when drivers are installed directly on the host + nvidiaDriverRoot: /run/nvidia/driver + gpuResourcesEnabledOverride: true + + # Enable GPU allocation via DRA + resources: + gpus: + enabled: true + # ComputeDomains for Multi-Node NVLink (MNNVL) - disable if not using GB200/similar + computeDomains: + enabled: false + + # Image configuration + image: + repository: nvcr.io/nvidia/k8s-dra-driver-gpu + pullPolicy: IfNotPresent + tag: "v25.8.1" + + # Service account + serviceAccount: + create: true + annotations: {} + name: "" + + # Log verbosity (0-7, higher = more verbose) + logVerbosity: "4" + + # Webhook disabled by default - enable for advanced validation + webhook: + enabled: false + + # Controller configuration (runs on control plane nodes) + controller: + priorityClassName: "system-node-critical" + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + + # Kubelet plugin configuration (runs on GPU nodes) + kubeletPlugin: + priorityClassName: "system-node-critical" + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: "100%" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - "true" + - matchExpressions: + - key: "nvidia.com/gpu.present" + 
operator: In + values: + - "true"