kubernetes-sigs · shivamerla · Apr 11, 2026
diff --git a/.golangci.yaml b/.golangci.yaml
@@ -17,6 +17,19 @@ linters:
       - common-false-positives
       - legacy
       - std-error-handling
+    rules:
+      # SA1019: conversion code legitimately uses deprecated API types (e.g. v1beta1).
+      # Match by convention so new files do not need config edits:
+      #   - *_conversion.go and *_conversion_test.go under api/, cmd/, or pkg/
+      #   - any *.go under a conversion/ directory at least one level below api/, cmd/, or pkg/ (e.g. pkg/foo/conversion/)
+      - path: (api|cmd|pkg)/.*_conversion(_test)?\.go$
+        linters:
+          - staticcheck
+        text: "SA1019"
+      - path: (api|cmd|pkg)/.*/conversion/.*\.go$
+        linters:
+          - staticcheck
+        text: "SA1019"
     paths:
       - third_party$
       - builtin$

diff --git a/Makefile b/Makefile
@@ -112,18 +112,20 @@ coverage: test
 
 generate: generate-crds generate-informers fmt
 
+# Copy only the ComputeDomainClique CRD: the ComputeDomain CRD is kept as a Helm
+# template (with its webhook configuration) under templates/ instead of crds/.
+# TODO: automate re-templating the ComputeDomain CRD every time its API changes.
 generate-crds: generate-deepcopy .remove-crds
 	for dir in $(CLIENT_SOURCES); do \
 		controller-gen crd:crdVersions=v1 \
 			paths=$(CURDIR)/$${dir} \
 			output:crd:dir=$(CURDIR)/deployments/helm/tmp_crds; \
 	done
 	mkdir -p $(CURDIR)/deployments/helm/$(DRIVER_NAME)/crds
-	cp -R $(CURDIR)/deployments/helm/tmp_crds/* \
+	cp -R $(CURDIR)/deployments/helm/tmp_crds/resource.nvidia.com_computedomaincliques.yaml \
 		$(CURDIR)/deployments/helm/$(DRIVER_NAME)/crds
 	rm -rf $(CURDIR)/deployments/helm/tmp_crds
 
-
 # Regenerate everything and fail if the tree is dirty (used by `make check`).
 check-generate: generate
 	git diff --exit-code HEAD

diff --git a/api/nvidia.com/resource/v1beta1/computedomain.go b/api/nvidia.com/resource/v1beta1/computedomain.go
@@ -33,9 +33,13 @@ const (
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
 // +k8s:openapi-gen=true
 // +kubebuilder:resource:scope=Namespaced
+// +kubebuilder:deprecatedversion
 // +kubebuilder:subresource:status
 
 // ComputeDomain prepares a set of nodes to run a multi-node workload in.
+//
+// Deprecated: use resource.nvidia.com/v1beta2 ComputeDomain. This version is
+// retained for compatibility.
 type ComputeDomain struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
@@ -59,6 +63,10 @@ type ComputeDomainList struct {
 
 // +kubebuilder:validation:XValidation:rule="self == oldSelf", message="A computeDomain.spec is immutable"
 
+// AnnotationComputeDomainNumNodes stores the v1beta1-only numNodes field on the hub)
+// object so it survives conversion. It is not part of the v1beta2 API.
+const AnnotationComputeDomainNumNodes = "resource.nvidia.com/computedomain-num-nodes"
+
 // ComputeDomainSpec provides the spec for a ComputeDomain.
 type ComputeDomainSpec struct {
 	// Intended number of IMEX daemons (i.e., individual compute nodes) in the
@@ -83,9 +91,14 @@ type ComputeDomainSpec struct {
 	// `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to
 	// join the ComputeDomain may lead to unexpected behavior.
 	//
-	// The `numNodes` parameter is deprecated and will be removed in the next
-	// API version.
-	NumNodes int                       `json:"numNodes"`
+	// The `numNodes` field exists only on this deprecated API version; it is
+	// not present on resource.nvidia.com/v1beta2 and is round-tripped via
+	// metadata.annotations["resource.nvidia.com/computedomain-num-nodes"] on the hub.
+	//
+	// +kubebuilder:default:=0
+	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:validation:Optional
+	NumNodes int                       `json:"numNodes,omitempty"`
 	Channel  *ComputeDomainChannelSpec `json:"channel"`
 }
 

diff --git a/api/nvidia.com/resource/v1beta1/computedomain_conversion.go b/api/nvidia.com/resource/v1beta1/computedomain_conversion.go
@@ -0,0 +1,164 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1beta1
+
+import (
+	"fmt"
+	"strconv"
+
+	v1beta2 "sigs.k8s.io/dra-driver-nvidia-gpu/api/nvidia.com/resource/v1beta2"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// ConvertTo populates dst, the v1beta2 hub object, from this deprecated v1beta1
+// ComputeDomain. Metadata is deep-copied, the v1beta1-only spec.numNodes value
+// is handed to syncNumNodesAnnotationOnHub, and the channel spec and status are
+// copied field by field. It returns an error if src or dst is nil.
+func (src *ComputeDomain) ConvertTo(dst *v1beta2.ComputeDomain) error {
+	if dst == nil || src == nil {
+		return fmt.Errorf("ConvertTo: nil ComputeDomain")
+	}
+
+	dst.TypeMeta = metav1.TypeMeta{
+		Kind:       "ComputeDomain",
+		APIVersion: v1beta2.SchemeGroupVersion.String(),
+	}
+	// The annotation sync must follow the metadata copy, because the copy
+	// replaces dst.Annotations wholesale.
+	dst.ObjectMeta = *src.ObjectMeta.DeepCopy()
+	syncNumNodesAnnotationOnHub(dst, src.Spec.NumNodes)
+
+	var hubSpec v1beta2.ComputeDomainSpec
+	if ch := src.Spec.Channel; ch != nil {
+		hubChannel := &v1beta2.ComputeDomainChannelSpec{AllocationMode: ch.AllocationMode}
+		hubChannel.ResourceClaimTemplate.Name = ch.ResourceClaimTemplate.Name
+		hubSpec.Channel = hubChannel
+	}
+	dst.Spec = hubSpec
+	dst.Status = v1beta1StatusToV1beta2(&src.Status)
+	return nil
+}
+
+// ConvertFrom populates dst, the deprecated v1beta1 view, from the hub (v1beta2)
+// object src. spec.numNodes is restored from the hub's carry annotation, and
+// that annotation is removed from the v1beta1 metadata. It returns an error if
+// src or dst is nil.
+func (dst *ComputeDomain) ConvertFrom(src *v1beta2.ComputeDomain) error {
+	if src == nil || dst == nil {
+		return fmt.Errorf("ConvertFrom: nil ComputeDomain")
+	}
+	dst.TypeMeta = metav1.TypeMeta{
+		APIVersion: SchemeGroupVersion.String(),
+		Kind:       "ComputeDomain",
+	}
+	dst.ObjectMeta = *src.ObjectMeta.DeepCopy()
+	dst.Spec = ComputeDomainSpec{
+		NumNodes: numNodesFromAnnotationsMap(src.Annotations),
+	}
+	// Hide the hub-only storage key from the deprecated API surface. When that
+	// key was the only annotation, drop the map entirely so that an object
+	// without annotations round-trips to nil rather than to an empty map.
+	// dst.Annotations is a deep copy, so src is never modified.
+	if _, carried := dst.Annotations[AnnotationComputeDomainNumNodes]; carried {
+		delete(dst.Annotations, AnnotationComputeDomainNumNodes)
+		if len(dst.Annotations) == 0 {
+			dst.Annotations = nil
+		}
+	}
+	if src.Spec.Channel != nil {
+		dst.Spec.Channel = &ComputeDomainChannelSpec{
+			ResourceClaimTemplate: ComputeDomainResourceClaimTemplate{
+				Name: src.Spec.Channel.ResourceClaimTemplate.Name,
+			},
+			AllocationMode: src.Spec.Channel.AllocationMode,
+		}
+	}
+	dst.Status = v1beta2StatusToV1beta1(&src.Status)
+	return nil
+}
+
+// syncNumNodesAnnotationOnHub records the v1beta1 numNodes value n on the hub
+// (v1beta2) object cd under AnnotationComputeDomainNumNodes. A value of 0 is
+// represented by the absence of the annotation, so any existing entry is
+// removed instead.
+func syncNumNodesAnnotationOnHub(cd *v1beta2.ComputeDomain, n int) {
+	if n == 0 {
+		// delete is a no-op on a nil map, so nothing is allocated here and a
+		// hub object without annotations keeps a nil Annotations map.
+		delete(cd.Annotations, AnnotationComputeDomainNumNodes)
+		return
+	}
+	if cd.Annotations == nil {
+		cd.Annotations = map[string]string{}
+	}
+	cd.Annotations[AnnotationComputeDomainNumNodes] = strconv.Itoa(n)
+}
+
+// numNodesFromAnnotationsMap returns the numNodes value stored under
+// AnnotationComputeDomainNumNodes. It returns 0 when the annotation is absent,
+// empty, not a valid integer, or negative. Annotations are free-form metadata,
+// so an invalid value is treated as "not set" rather than surfaced through the
+// v1beta1 numNodes field, which is declared with a minimum of 0.
+func numNodesFromAnnotationsMap(annotations map[string]string) int {
+	// Indexing a nil map yields the zero value, and Atoi rejects the empty
+	// string, so the missing-key and nil-map cases both end up at 0.
+	n, err := strconv.Atoi(annotations[AnnotationComputeDomainNumNodes])
+	if err != nil || n < 0 {
+		return 0
+	}
+	return n
+}
+
+// NumNodesFromAnnotation reads the v1beta1 numNodes value carried in the
+// annotations of the given object metadata. A nil meta yields 0.
+func NumNodesFromAnnotation(meta *metav1.ObjectMeta) int {
+	if meta != nil {
+		return numNodesFromAnnotationsMap(meta.Annotations)
+	}
+	return 0
+}
+
+// v1beta1StatusToV1beta2 copies a v1beta1 ComputeDomainStatus into its v1beta2
+// counterpart. A nil input yields the zero status, an empty node list stays
+// nil, and nil node entries are kept as nil at the same index.
+func v1beta1StatusToV1beta2(in *ComputeDomainStatus) v1beta2.ComputeDomainStatus {
+	var hub v1beta2.ComputeDomainStatus
+	if in == nil {
+		return hub
+	}
+	hub.Status = in.Status
+	if len(in.Nodes) == 0 {
+		return hub
+	}
+	hub.Nodes = make([]*v1beta2.ComputeDomainNode, len(in.Nodes))
+	for idx, node := range in.Nodes {
+		if node != nil {
+			hub.Nodes[idx] = &v1beta2.ComputeDomainNode{
+				Name:      node.Name,
+				IPAddress: node.IPAddress,
+				CliqueID:  node.CliqueID,
+				Index:     node.Index,
+				Status:    node.Status,
+			}
+		}
+	}
+	return hub
+}
+
+// v1beta2StatusToV1beta1 copies a hub (v1beta2) ComputeDomainStatus into its
+// v1beta1 counterpart. A nil input yields the zero status, an empty node list
+// stays nil, and nil node entries are kept as nil at the same index.
+func v1beta2StatusToV1beta1(in *v1beta2.ComputeDomainStatus) ComputeDomainStatus {
+	var spoke ComputeDomainStatus
+	if in == nil {
+		return spoke
+	}
+	spoke.Status = in.Status
+	if len(in.Nodes) == 0 {
+		return spoke
+	}
+	spoke.Nodes = make([]*ComputeDomainNode, len(in.Nodes))
+	for idx, node := range in.Nodes {
+		if node != nil {
+			spoke.Nodes[idx] = &ComputeDomainNode{
+				Name:      node.Name,
+				IPAddress: node.IPAddress,
+				CliqueID:  node.CliqueID,
+				Index:     node.Index,
+				Status:    node.Status,
+			}
+		}
+	}
+	return spoke
+}
diff --git a/api/nvidia.com/resource/v1beta2/computedomain.go b/api/nvidia.com/resource/v1beta2/computedomain.go
@@ -0,0 +1,122 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1beta2
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const (
+	// Values for the `status` fields of ComputeDomainStatus and
+	// ComputeDomainNode. Ready and NotReady match the Enum markers on those
+	// fields; ComputeDomainStatusNone is the empty string (presumably meaning
+	// "no status reported yet" — confirm against the controller).
+	ComputeDomainStatusNone     = ""
+	ComputeDomainStatusReady    = "Ready"
+	ComputeDomainStatusNotReady = "NotReady"
+
+	// Values for ComputeDomainChannelSpec.AllocationMode, matching that
+	// field's Enum marker (All;Single).
+	ComputeDomainChannelAllocationModeSingle = "Single"
+	ComputeDomainChannelAllocationModeAll    = "All"
+)
+
+// +genclient
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
+// +k8s:openapi-gen=true
+// +kubebuilder:resource:scope=Namespaced
+// +kubebuilder:storageversion
+// +kubebuilder:subresource:status
+
+// ComputeDomain prepares a set of nodes to run a multi-node workload in.
+//
+// This version (v1beta2) is the storage version and the conversion hub for the
+// ComputeDomain API versions.
+type ComputeDomain struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Spec holds the ComputeDomain's spec; see ComputeDomainSpec.
+	Spec ComputeDomainSpec `json:"spec,omitempty"`
+	// Global ComputeDomain status. Can be used to guide debugging efforts.
+	// Workload however should not rely on inspecting this field at any point
+	// during its lifecycle.
+	Status ComputeDomainStatus `json:"status,omitempty"`
+}
+
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
+
+// ComputeDomainList provides a list of ComputeDomains.
+type ComputeDomainList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+
+	// Items holds the ComputeDomain objects in the list.
+	Items []ComputeDomain `json:"items"`
+}
+
+// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="A computeDomain.spec is immutable"
+
+// ComputeDomainSpec provides the spec for a ComputeDomain.
+//
+// The deprecated resource.nvidia.com/v1beta1 API carries `spec.numNodes`; that
+// value is not part of this version and is preserved on the stored object via
+// metadata.annotations["resource.nvidia.com/computedomain-num-nodes"].
+type ComputeDomainSpec struct {
+	// Channel provides the spec for the channel used to run a workload inside
+	// the ComputeDomain.
+	Channel *ComputeDomainChannelSpec `json:"channel"`
+}
+
+// ComputeDomainChannelSpec provides the spec for a channel used to run a workload inside a ComputeDomain.
+type ComputeDomainChannelSpec struct {
+	// ResourceClaimTemplate provides the details of the ResourceClaimTemplate
+	// to generate.
+	ResourceClaimTemplate ComputeDomainResourceClaimTemplate `json:"resourceClaimTemplate"`
+	// Allows for requesting all IMEX channels (the maximum per IMEX domain) or
+	// precisely one.
+	// +kubebuilder:validation:Enum=All;Single
+	// +kubebuilder:default:=Single
+	// +kubebuilder:validation:Optional
+	AllocationMode string `json:"allocationMode,omitempty"`
+}
+
+// ComputeDomainResourceClaimTemplate provides the details of the ResourceClaimTemplate to generate.
+type ComputeDomainResourceClaimTemplate struct {
+	// Name is the name of the ResourceClaimTemplate to generate.
+	Name string `json:"name"`
+}
+
+// ComputeDomainStatus provides the status for a ComputeDomain.
+type ComputeDomainStatus struct {
+	// Status is restricted to Ready or NotReady and defaults to NotReady.
+	// +kubebuilder:validation:Enum=Ready;NotReady
+	// +kubebuilder:default=NotReady
+	Status string `json:"status"`
+	// Nodes is a list map keyed by node name.
+	// +listType=map
+	// +listMapKey=name
+	Nodes []*ComputeDomainNode `json:"nodes,omitempty"`
+}
+
+// ComputeDomainNode provides information about each node added to a ComputeDomain.
+//
+// Name is the key of the ComputeDomainStatus.Nodes list map, and CliqueID names
+// the NVLink partition (clique) that the Index field is scoped to.
+type ComputeDomainNode struct {
+	Name      string `json:"name"`
+	IPAddress string `json:"ipAddress"`
+	CliqueID  string `json:"cliqueID"`
+	// The Index field is used to ensure a consistent IP-to-DNS name
+	// mapping across all machines within an IMEX domain. Each node's index
+	// directly determines its DNS name within a given NVLink partition
+	// (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will
+	// always be unique. This field is marked as optional (but not
+	// omitempty) in order to support downgrades and avoid an API bump.
+	// +kubebuilder:validation:Optional
+	Index int `json:"index"`
+	// The Status field tracks the readiness of the IMEX daemon running on
+	// this node. It gets switched to Ready whenever the IMEX daemon is
+	// ready to broker GPU memory exchanges and switches to NotReady when
+	// it is not. It is marked as optional in order to support downgrades
+	// and avoid an API bump.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:validation:Enum=Ready;NotReady
+	// +kubebuilder:default:=NotReady
+	Status string `json:"status,omitempty"`
+}
diff --git a/api/nvidia.com/resource/v1beta2/doc.go b/api/nvidia.com/resource/v1beta2/doc.go
@@ -0,0 +1,20 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// +k8s:deepcopy-gen=package
+// +groupName=resource.nvidia.com
+
+package v1beta2