Skip to content

Commit 1b511d3

Browse files
committed
NVIDIA-472: csr approval
1 parent 10bac92 commit 1b511d3

13 files changed

Lines changed: 1783 additions & 0 deletions

File tree

api/v1alpha1/dpfhcpprovisioner_types.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ const (
178178
// HostedClusterCleanup indicates the status of HostedCluster deletion during finalizer cleanup.
179179
HostedClusterCleanup string = "HostedClusterCleanup"
180180

181+
// CSRAutoApprovalActive indicates whether CSR auto-approval is active and watching for CSRs
182+
CSRAutoApprovalActive string = "CSRAutoApprovalActive"
183+
181184
// Validation conditions.
182185

183186
// SecretsValid indicates whether required secrets (pull secret, SSH key) are valid.
@@ -224,6 +227,19 @@ const (
224227
ReasonKubeConfigInjectionFailed string = "InjectionFailed"
225228
)
226229

230+
// Condition reasons for the DPFHCPProvisioner CSRAutoApprovalActive status condition.
231+
// Each constant below is a value for the Reason field of the CSRAutoApprovalActive condition.
232+
const (
233+
// ReasonCSRApprovalActive indicates CSR auto-approval is actively processing CSRs.
234+
ReasonCSRApprovalActive string = "Active"
235+
236+
// ReasonKubeconfigNotAvailable indicates the kubeconfig is not available.
237+
ReasonKubeconfigNotAvailable string = "KubeconfigNotAvailable"
238+
239+
// ReasonHostedClusterNotReachable indicates the hosted cluster is not reachable.
240+
ReasonHostedClusterNotReachable string = "HostedClusterNotReachable"
241+
)
242+
227243
// DPFHCPProvisionerStatus defines the observed state of DPFHCPProvisioner
228244
type DPFHCPProvisionerStatus struct {
229245
// Phase represents the current lifecycle phase

cmd/main.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
provisioningv1alpha1 "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/api/v1alpha1"
4343
"github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller"
4444
"github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/bluefield"
45+
"github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/csrapproval"
4546
"github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/dpucluster"
4647
"github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/finalizer"
4748
"github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/hostedcluster"
@@ -233,6 +234,9 @@ func main() {
233234
// Initialize Kubeconfig Injector
234235
kubeconfigInjector := kubeconfiginjection.NewKubeconfigInjector(mgr.GetClient(), mgr.GetEventRecorderFor("dpfhcpprovisioner-controller"))
235236

237+
// Initialize CSR Approver
238+
csrApprover := csrapproval.NewCSRApprover(mgr.GetClient(), mgr.GetEventRecorderFor("dpfhcpprovisioner-controller"))
239+
236240
// Initialize Finalizer Manager with pluggable cleanup handlers
237241
// Handlers are executed in registration order
238242
finalizerManager := finalizer.NewManager(mgr.GetClient(), mgr.GetEventRecorderFor("dpfhcpprovisioner-controller"))
@@ -259,6 +263,7 @@ func main() {
259263
FinalizerManager: finalizerManager,
260264
StatusSyncer: statusSyncer,
261265
KubeconfigInjector: kubeconfigInjector,
266+
CSRApprover: csrApprover,
262267
}).SetupWithManager(mgr); err != nil {
263268
setupLog.Error(err, "unable to create controller", "controller", "DPFHCPProvisioner")
264269
os.Exit(1)

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,11 @@ rules:
109109
- patch
110110
- update
111111
- watch
112+
- apiGroups:
113+
- provisioning.dpu.nvidia.com
114+
resources:
115+
- dpus
116+
verbs:
117+
- get
118+
- list
119+
- watch

helm/dpf-hcp-provisioner-operator/templates/clusterrole.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,16 @@ rules:
113113
- update
114114
- watch
115115

116+
# DPU permissions (for CSR validation against DPU objects)
117+
- apiGroups:
118+
- provisioning.dpu.nvidia.com
119+
resources:
120+
- dpus
121+
verbs:
122+
- get
123+
- list
124+
- watch
125+
116126
# HyperShift HostedCluster and NodePool permissions
117127
- apiGroups:
118128
- hypershift.openshift.io
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package csrapproval
18+
19+
import (
20+
"context"
21+
"fmt"
22+
23+
corev1 "k8s.io/api/core/v1"
24+
"k8s.io/apimachinery/pkg/types"
25+
"k8s.io/client-go/kubernetes"
26+
"k8s.io/client-go/tools/clientcmd"
27+
"sigs.k8s.io/controller-runtime/pkg/client"
28+
)
29+
30+
// ClientManager manages hosted cluster client lifecycle
31+
type ClientManager struct {
32+
mgmtClient client.Client
33+
// hcClients caches Kubernetes clientsets for hosted clusters to avoid recreating them on every reconciliation.
34+
// Each DPFHCPProvisioner creates a hosted cluster with its own API server. This map stores one clientset
35+
// per hosted cluster (keyed by "namespace/name") to reuse connections and avoid expensive client creation
36+
// (parsing kubeconfig, establishing TCP connections) every 30 seconds during CSR polling.
37+
// Without this cache, we would create 120+ clients per hour per hosted cluster.
38+
hcClients map[string]*kubernetes.Clientset
39+
}
40+
41+
// NewClientManager creates a new client manager
42+
func NewClientManager(mgmtClient client.Client) *ClientManager {
43+
return &ClientManager{
44+
mgmtClient: mgmtClient,
45+
hcClients: make(map[string]*kubernetes.Clientset),
46+
}
47+
}
48+
49+
// GetHostedClusterClient retrieves or creates a client for the hosted cluster
50+
func (cm *ClientManager) GetHostedClusterClient(ctx context.Context, namespace, name string) (*kubernetes.Clientset, error) {
51+
key := namespace + "/" + name
52+
53+
// Return cached client if it exists
54+
if clientset, ok := cm.hcClients[key]; ok {
55+
return clientset, nil
56+
}
57+
58+
// Create new client
59+
clientset, err := cm.createHostedClusterClient(ctx, namespace, name)
60+
if err != nil {
61+
return nil, err
62+
}
63+
64+
// Cache the client
65+
cm.hcClients[key] = clientset
66+
67+
return clientset, nil
68+
}
69+
70+
// InvalidateClient removes a cached client (useful when kubeconfig rotates)
71+
func (cm *ClientManager) InvalidateClient(namespace, name string) {
72+
key := namespace + "/" + name
73+
delete(cm.hcClients, key)
74+
}
75+
76+
// createHostedClusterClient creates a Kubernetes client for the hosted cluster
77+
func (cm *ClientManager) createHostedClusterClient(ctx context.Context, namespace, name string) (*kubernetes.Clientset, error) {
78+
// Fetch kubeconfig secret
79+
kubeconfigData, err := cm.getKubeconfigData(ctx, namespace, name)
80+
if err != nil {
81+
return nil, fmt.Errorf("failed to get kubeconfig: %w", err)
82+
}
83+
84+
// Create REST config from kubeconfig
85+
config, err := clientcmd.RESTConfigFromKubeConfig(kubeconfigData)
86+
if err != nil {
87+
return nil, fmt.Errorf("failed to create rest config from kubeconfig: %w", err)
88+
}
89+
90+
// Set reasonable timeouts
91+
config.Timeout = 0 // No timeout for long-lived connections (watches)
92+
config.QPS = 5
93+
config.Burst = 10
94+
95+
// Create clientset
96+
clientset, err := kubernetes.NewForConfig(config)
97+
if err != nil {
98+
return nil, fmt.Errorf("failed to create clientset: %w", err)
99+
}
100+
101+
return clientset, nil
102+
}
103+
104+
// getKubeconfigData retrieves the kubeconfig data from the admin secret
105+
func (cm *ClientManager) getKubeconfigData(ctx context.Context, namespace, name string) ([]byte, error) {
106+
// The kubeconfig secret name follows HyperShift convention: <hostedcluster-name>-admin-kubeconfig
107+
secretName := name + "-admin-kubeconfig"
108+
109+
secret := &corev1.Secret{}
110+
secretKey := types.NamespacedName{
111+
Namespace: namespace,
112+
Name: secretName,
113+
}
114+
115+
if err := cm.mgmtClient.Get(ctx, secretKey, secret); err != nil {
116+
return nil, fmt.Errorf("failed to get kubeconfig secret %s: %w", secretKey, err)
117+
}
118+
119+
kubeconfigData, ok := secret.Data["kubeconfig"]
120+
if !ok {
121+
return nil, fmt.Errorf("kubeconfig key not found in secret %s", secretKey)
122+
}
123+
124+
if len(kubeconfigData) == 0 {
125+
return nil, fmt.Errorf("kubeconfig data is empty in secret %s", secretKey)
126+
}
127+
128+
return kubeconfigData, nil
129+
}
130+
131+
// TestConnection verifies the hosted cluster client can connect to the API server
132+
func TestConnection(ctx context.Context, clientset *kubernetes.Clientset) error {
133+
_, err := clientset.Discovery().ServerVersion()
134+
if err != nil {
135+
return fmt.Errorf("failed to connect to hosted cluster API server: %w", err)
136+
}
137+
return nil
138+
}

0 commit comments

Comments
 (0)