diff --git a/api/v1alpha1/dpfhcpprovisioner_types.go b/api/v1alpha1/dpfhcpprovisioner_types.go index 7508e79f..6f366cc9 100644 --- a/api/v1alpha1/dpfhcpprovisioner_types.go +++ b/api/v1alpha1/dpfhcpprovisioner_types.go @@ -190,6 +190,9 @@ const ( // HostedClusterCleanup indicates the status of HostedCluster deletion during finalizer cleanup. HostedClusterCleanup string = "HostedClusterCleanup" + // CSRAutoApprovalActive indicates whether CSR auto-approval is active and watching for CSRs + CSRAutoApprovalActive string = "CSRAutoApprovalActive" + // Validation conditions. // SecretsValid indicates whether required secrets (pull secret, SSH key) are valid. @@ -240,6 +243,19 @@ const ( ReasonKubeConfigInjectionFailed string = "InjectionFailed" ) +// Condition reasons for DPFHCPProvisioner CSRAutoApprovalActive status. +// These are used as the Reason field in the CSRAutoApprovalActive condition. +const ( + // ReasonCSRApprovalActive indicates CSR auto-approval is actively processing CSRs + ReasonCSRApprovalActive string = "Active" + + // ReasonKubeconfigNotAvailable indicates the kubeconfig is not available + ReasonKubeconfigNotAvailable string = "KubeconfigNotAvailable" + + // ReasonHostedClusterNotReachable indicates the hosted cluster is not reachable + ReasonHostedClusterNotReachable string = "HostedClusterNotReachable" +) + // DPFHCPProvisionerStatus defines the observed state of DPFHCPProvisioner type DPFHCPProvisionerStatus struct { // Phase represents the current lifecycle phase diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index aa4f6383..54022e11 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -98,6 +98,11 @@ func (in *DPFHCPProvisionerSpec) DeepCopyInto(out *DPFHCPProvisionerSpec) { (*out)[key] = val } } + if in.FlannelEnabled != nil { + in, out := &in.FlannelEnabled, &out.FlannelEnabled + *out = new(bool) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DPFHCPProvisionerSpec. diff --git a/cmd/main.go b/cmd/main.go index d181773e..bb7c9b88 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -43,6 +43,7 @@ import ( "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/common" "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller" "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/bluefield" + "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/csrapproval" "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/dpucluster" "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/finalizer" "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/internal/controller/hostedcluster" @@ -217,17 +218,20 @@ func main() { } client := mgr.GetClient() - recorder := mgr.GetEventRecorderFor(common.ControllerName) scheme := mgr.GetScheme() + // Create event recorders for each controller/component + provisionerRecorder := mgr.GetEventRecorderFor(common.ProvisionerControllerName) + csrApprovalRecorder := mgr.GetEventRecorderFor(common.CSRApprovalControllerName) + // Initialize BlueField Image Resolver - imageResolver := bluefield.NewImageResolver(client, recorder) + imageResolver := bluefield.NewImageResolver(client, provisionerRecorder) // Initialize DPUCluster Validator - dpuClusterValidator := dpucluster.NewValidator(client, recorder) + dpuClusterValidator := dpucluster.NewValidator(client, provisionerRecorder) // Initialize Secrets Validator - secretsValidator := secrets.NewValidator(client, recorder) + secretsValidator := secrets.NewValidator(client, provisionerRecorder) // Initialize Secret Manager for HostedCluster lifecycle secretManager := hostedcluster.NewSecretManager(client, scheme) @@ -239,32 +243,36 @@ func main() { nodePoolManager := hostedcluster.NewNodePoolManager(client, scheme) // Initialize Kubeconfig Injector - kubeconfigInjector := kubeconfiginjection.NewKubeconfigInjector(client, recorder) + kubeconfigInjector := kubeconfiginjection.NewKubeconfigInjector(client, provisionerRecorder) // Initialize MetalLB Manager - metalLBManager := metallb.NewMetalLBManager(client, recorder) + metalLBManager := metallb.NewMetalLBManager(client, provisionerRecorder) + + // Initialize CSR Approver + csrApprover := csrapproval.NewCSRApprover(client, csrApprovalRecorder) // Initialize Finalizer Manager with pluggable cleanup handlers // Handlers are executed in registration order - finalizerManager := finalizer.NewManager(client, recorder) + finalizerManager := finalizer.NewManager(client, provisionerRecorder) // Register cleanup handlers in order (dependent resources first, dependencies last) // 1. Kubeconfig injection cleanup (removes kubeconfig from DPUCluster namespace) - finalizerManager.RegisterHandler(kubeconfiginjection.NewCleanupHandler(client, recorder)) + finalizerManager.RegisterHandler(kubeconfiginjection.NewCleanupHandler(client, provisionerRecorder)) // 2. HostedCluster cleanup (removes HostedCluster, NodePool, services, and secrets) // Must run before MetalLB cleanup because LoadBalancer services depend on IPAddressPool - finalizerManager.RegisterHandler(hostedcluster.NewCleanupHandler(client, recorder)) + finalizerManager.RegisterHandler(hostedcluster.NewCleanupHandler(client, provisionerRecorder)) // 3. MetalLB cleanup (removes IPAddressPool and L2Advertisement) // Must run after HostedCluster cleanup to avoid deleting IPs while services still exist - finalizerManager.RegisterHandler(metallb.NewCleanupHandler(client, recorder)) + finalizerManager.RegisterHandler(metallb.NewCleanupHandler(client, provisionerRecorder)) // Initialize Status Syncer for HostedCluster status mirroring statusSyncer := hostedcluster.NewStatusSyncer(client) + // Setup main DPFHCPProvisioner controller if err := (&controller.DPFHCPProvisionerReconciler{ Client: client, Scheme: scheme, - Recorder: recorder, + Recorder: provisionerRecorder, ImageResolver: imageResolver, DPUClusterValidator: dpuClusterValidator, SecretsValidator: secretsValidator, @@ -279,6 +287,16 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "DPFHCPProvisioner") os.Exit(1) } + + // Setup CSR Approval controller (separate from main controller) + if err := (&csrapproval.CSRApprovalReconciler{ + Client: client, + Scheme: scheme, + Approver: csrApprover, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "CSRApproval") + os.Exit(1) + } // +kubebuilder:scaffold:builder if metricsCertWatcher != nil { diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 14e88c89..71361583 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -122,3 +122,11 @@ rules: - patch - update - watch +- apiGroups: + - provisioning.dpu.nvidia.com + resources: + - dpus + verbs: + - get + - list + - watch diff --git a/helm/dpf-hcp-provisioner-operator/templates/clusterrole.yaml b/helm/dpf-hcp-provisioner-operator/templates/clusterrole.yaml index d273af63..513858a9 100644 --- a/helm/dpf-hcp-provisioner-operator/templates/clusterrole.yaml +++ b/helm/dpf-hcp-provisioner-operator/templates/clusterrole.yaml @@ -113,6 +113,16 @@ rules: - update - watch +# DPU permissions (for CSR validation against DPU objects) +- apiGroups: + - provisioning.dpu.nvidia.com + resources: + - dpus + verbs: + - get + - list + - watch + # HyperShift HostedCluster and NodePool permissions - apiGroups: - hypershift.openshift.io diff --git a/internal/common/constants.go b/internal/common/constants.go index 2cab5769..23d185ad 100644 --- a/internal/common/constants.go +++ b/internal/common/constants.go @@ -22,8 +22,11 @@ const ( // If the CR is renamed, update this constant once and it propagates everywhere. DPFHCPProvisionerName = "dpfhcpprovisioner" - // ControllerName is the name used for event recorders - ControllerName = "dpfhcpprovisioner-controller" + // ProvisionerControllerName is the name for the main provisioner controller event recorder + ProvisionerControllerName = "dpfhcpprovisioner-controller" + + // CSRApprovalControllerName is the name for the CSR approval controller event recorder + CSRApprovalControllerName = "csr-approval-controller" ) // Label keys for cross-namespace resource ownership tracking diff --git a/internal/controller/csrapproval/client.go b/internal/controller/csrapproval/client.go new file mode 100644 index 00000000..fbe22b67 --- /dev/null +++ b/internal/controller/csrapproval/client.go @@ -0,0 +1,214 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "context" + "fmt" + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + clientcmdapi "k8s.io/client-go/tools/clientcmd/api" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ClientManager manages hosted cluster client lifecycle +type ClientManager struct { + mgmtClient client.Client + // mu protects concurrent access to hcClients map + // Multiple reconciliations can run concurrently, so we need to protect map access + mu sync.RWMutex + // hcClients caches Kubernetes clientsets for hosted clusters to avoid recreating them on every reconciliation. + // Each DPFHCPProvisioner creates a hosted cluster with its own API server. This map stores one clientset + // per hosted cluster (keyed by "namespace/name") to reuse connections and avoid expensive client creation + // (parsing kubeconfig, establishing TCP connections) every 30 seconds during CSR polling. + // Without this cache, we would create 120+ clients per hour per hosted cluster. + hcClients map[string]*kubernetes.Clientset +} + +// NewClientManager creates a new client manager +func NewClientManager(mgmtClient client.Client) *ClientManager { + return &ClientManager{ + mgmtClient: mgmtClient, + hcClients: make(map[string]*kubernetes.Clientset), + } +} + +// GetHostedClusterClient retrieves or creates a client for the hosted cluster +func (cm *ClientManager) GetHostedClusterClient(ctx context.Context, namespace, name string) (*kubernetes.Clientset, error) { + key := namespace + "/" + name + + // Check cache with read lock + cm.mu.RLock() + if clientset, ok := cm.hcClients[key]; ok { + cm.mu.RUnlock() + return clientset, nil + } + cm.mu.RUnlock() + + // Create new client (outside lock to avoid holding lock during slow operation) + clientset, err := cm.createHostedClusterClient(ctx, namespace, name) + if err != nil { + return nil, err + } + + // Cache the client with write lock + cm.mu.Lock() + cm.hcClients[key] = clientset + cm.mu.Unlock() + + return clientset, nil +} + +// InvalidateClient removes a cached client (useful when kubeconfig rotates) +func (cm *ClientManager) InvalidateClient(namespace, name string) { + key := namespace + "/" + name + cm.mu.Lock() + delete(cm.hcClients, key) + cm.mu.Unlock() +} + +// createHostedClusterClient creates a Kubernetes client for the hosted cluster +func (cm *ClientManager) createHostedClusterClient(ctx context.Context, namespace, name string) (*kubernetes.Clientset, error) { + // Fetch kubeconfig secret + kubeconfigData, err := cm.getKubeconfigData(ctx, namespace, name) + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + + // Parse kubeconfig + kubeconfig, err := clientcmd.Load(kubeconfigData) + if err != nil { + return nil, fmt.Errorf("failed to parse kubeconfig: %w", err) + } + + // Replace external endpoint with internal service DNS name + // The HyperShift admin-kubeconfig uses external endpoints (LoadBalancer IP or NodePort) + // which are not accessible from inside the operator pod's network. + // We need to use the internal service endpoint for in-cluster access. + if err := replaceServerWithInternalEndpoint(kubeconfig, namespace, name); err != nil { + return nil, fmt.Errorf("failed to replace server endpoint: %w", err) + } + + // Create REST config from modified kubeconfig + config, err := clientcmd.NewDefaultClientConfig(*kubeconfig, &clientcmd.ConfigOverrides{}).ClientConfig() + if err != nil { + return nil, fmt.Errorf("failed to create rest config from kubeconfig: %w", err) + } + + // Set reasonable timeouts for CSR API operations + // We use List/Get/UpdateApproval operations (not watches), so a 30s timeout is appropriate + config.Timeout = 30 * time.Second + config.QPS = 5 + config.Burst = 10 + + // Create clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + return clientset, nil +} + +// getKubeconfigData retrieves the kubeconfig data from the admin secret +func (cm *ClientManager) getKubeconfigData(ctx context.Context, namespace, name string) ([]byte, error) { + // The kubeconfig secret name follows HyperShift convention: -admin-kubeconfig + secretName := name + "-admin-kubeconfig" + + secret := &corev1.Secret{} + secretKey := types.NamespacedName{ + Namespace: namespace, + Name: secretName, + } + + if err := cm.mgmtClient.Get(ctx, secretKey, secret); err != nil { + return nil, fmt.Errorf("failed to get kubeconfig secret %s: %w", secretKey, err) + } + + kubeconfigData, ok := secret.Data["kubeconfig"] + if !ok { + return nil, fmt.Errorf("kubeconfig key not found in secret %s", secretKey) + } + + if len(kubeconfigData) == 0 { + return nil, fmt.Errorf("kubeconfig data is empty in secret %s", secretKey) + } + + return kubeconfigData, nil +} + +// replaceServerWithInternalEndpoint modifies the kubeconfig to use internal service DNS name +// instead of the external LoadBalancer IP or NodePort. This allows the operator pod (running inside the cluster) +// to reach the hosted cluster API server via the internal Kubernetes service. +// +// HyperShift creates admin-kubeconfig with external endpoints: +// - LoadBalancer: https://10.6.135.42:6443 (example external IP, not accessible from operator pod) +// - NodePort: https://:31039 (example NodePort, dynamically allocated per cluster) +// +// This function replaces it with the internal service DNS name: +// https://kube-apiserver.-.svc.cluster.local:6443 +// +// Port 6443 is hardcoded to match HyperShift's implementation. HyperShift itself hardcodes +// the kube-apiserver port as a constant (KASSVCPort = 6443) in their codebase. +func replaceServerWithInternalEndpoint(kubeconfig *clientcmdapi.Config, hostedClusterNamespace, hostedClusterName string) error { + if kubeconfig == nil { + return fmt.Errorf("kubeconfig is nil") + } + + // Find the current context + currentContext := kubeconfig.CurrentContext + if currentContext == "" { + return fmt.Errorf("kubeconfig has no current context") + } + + ctxConfig, ok := kubeconfig.Contexts[currentContext] + if !ok { + return fmt.Errorf("context %s not found in kubeconfig", currentContext) + } + + // Find the cluster referenced by the context + clusterName := ctxConfig.Cluster + cluster, ok := kubeconfig.Clusters[clusterName] + if !ok { + return fmt.Errorf("cluster %s not found in kubeconfig", clusterName) + } + + // Construct the service namespace following HyperShift convention + serviceNamespace := fmt.Sprintf("%s-%s", hostedClusterNamespace, hostedClusterName) + + // Construct internal service DNS name with hardcoded port 6443 (matching HyperShift's approach) + internalServer := fmt.Sprintf("https://kube-apiserver.%s.svc.cluster.local:6443", serviceNamespace) + + // Replace the server URL + cluster.Server = internalServer + + return nil +} + +// TestConnection verifies the hosted cluster client can connect to the API server +func TestConnection(ctx context.Context, clientset *kubernetes.Clientset) error { + _, err := clientset.Discovery().ServerVersion() + if err != nil { + return fmt.Errorf("failed to connect to hosted cluster API server: %w", err) + } + return nil +} diff --git a/internal/controller/csrapproval/controller.go b/internal/controller/csrapproval/controller.go new file mode 100644 index 00000000..fc075108 --- /dev/null +++ b/internal/controller/csrapproval/controller.go @@ -0,0 +1,107 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + provisioningv1alpha1 "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/api/v1alpha1" +) + +// CSRApprovalReconciler is a dedicated controller for CSR auto-approval +// This controller runs independently from the main DPFHCPProvisioner controller +// and continuously polls the hosted cluster for pending CSRs +type CSRApprovalReconciler struct { + client.Client + Scheme *runtime.Scheme + Approver *CSRApprover +} + +// +kubebuilder:rbac:groups=provisioning.dpu.hcp.io,resources=dpfhcpprovisioners,verbs=get;list;watch +// +kubebuilder:rbac:groups=provisioning.dpu.hcp.io,resources=dpfhcpprovisioners/status,verbs=get;update;patch +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch +// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch +// +kubebuilder:rbac:groups=provisioning.dpu.nvidia.com,resources=dpus,verbs=get;list;watch + +// Reconcile handles CSR auto-approval for a DPFHCPProvisioner +// This controller is solely responsible for: +// 1. Connecting to the hosted cluster +// 2. Polling for pending CSRs +// 3. Validating and approving CSRs for DPU worker nodes +// 4. Updating the CSRAutoApprovalActive condition +func (r *CSRApprovalReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logf.FromContext(ctx) + log.Info("CSR Approval Controller reconciling", "namespace", req.Namespace, "name", req.Name) + + // Fetch the DPFHCPProvisioner CR + var cr provisioningv1alpha1.DPFHCPProvisioner + if err := r.Get(ctx, req.NamespacedName, &cr); err != nil { + // CR not found - likely deleted + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Handle deletion - stop CSR watch + if !cr.DeletionTimestamp.IsZero() { + log.Info("DPFHCPProvisioner is being deleted, stopping CSR watch") + if r.Approver != nil { + r.Approver.StopCSRWatch(ctx, &cr) + } + return ctrl.Result{}, nil + } + + // Only process if kubeconfig is available + // CSR approval requires kubeconfig to connect to hosted cluster + if cr.Status.KubeConfigSecretRef == nil { + log.V(1).Info("Kubeconfig not available yet, skipping CSR approval") + return ctrl.Result{}, nil + } + + // Process CSRs - this is the ONLY thing this controller does + // The CSRApprover will: + // 1. Connect to hosted cluster using kubeconfig + // 2. List and filter pending CSRs + // 3. Validate CSRs against DPU objects + // 4. Approve valid CSRs + // 5. Update CSRAutoApprovalActive condition + if r.Approver == nil { + log.V(1).Info("Approver not initialized, skipping CSR approval") + return ctrl.Result{}, nil + } + result, err := r.Approver.ProcessCSRs(ctx, &cr) + if err != nil { + log.Error(err, "Failed to process CSRs") + // Continue polling even on error - CSR approval is best-effort + // The next reconciliation will retry + return ctrl.Result{RequeueAfter: CSRPollingInterval}, nil + } + + log.V(1).Info("CSR approval reconciliation complete", "requeueAfter", result.RequeueAfter) + return result, nil +} + +// SetupWithManager sets up the controller with the Manager +func (r *CSRApprovalReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&provisioningv1alpha1.DPFHCPProvisioner{}). + Named("csrapproval"). + Complete(r) +} diff --git a/internal/controller/csrapproval/csr_content_validation.go b/internal/controller/csrapproval/csr_content_validation.go new file mode 100644 index 00000000..ac3e5039 --- /dev/null +++ b/internal/controller/csrapproval/csr_content_validation.go @@ -0,0 +1,303 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "crypto/x509" + "encoding/pem" + "fmt" + "net" + + certv1 "k8s.io/api/certificates/v1" + "k8s.io/client-go/util/cert" +) + +const ( + // BootstrapperUsername is the expected username for bootstrap CSRs + BootstrapperUsername = "system:serviceaccount:openshift-machine-config-operator:node-bootstrapper" + + // Expected groups for bootstrap CSRs (from service account) + groupServiceAccounts = "system:serviceaccounts" + groupServiceAccountsMachineConfig = "system:serviceaccounts:openshift-machine-config-operator" + + // Expected groups for serving CSRs (from authenticated nodes) + groupNodes = "system:nodes" + + // Common group required for both bootstrap and serving CSRs + groupAuthenticated = "system:authenticated" + + // Expected organization field in certificate request (both bootstrap and serving) + expectedOrganization = "system:nodes" +) + +// ValidateBootstrapCSR performs comprehensive validation of a bootstrap CSR +func ValidateBootstrapCSR(csr *certv1.CertificateSigningRequest, hostname string) error { + // 1. Validate username and groups (must come from node-bootstrapper) + if err := validateBootstrapperIdentity(csr); err != nil { + return fmt.Errorf("bootstrapper identity validation failed: %w", err) + } + + // 2. Validate key usages + if err := validateBootstrapUsages(csr); err != nil { + return fmt.Errorf("key usage validation failed: %w", err) + } + + // 3. Parse and validate certificate request + certReq, err := parseCSRRequest(csr) + if err != nil { + return fmt.Errorf("failed to parse CSR: %w", err) + } + + // 4. Validate organization field + if err := validateOrganization(certReq); err != nil { + return fmt.Errorf("organization validation failed: %w", err) + } + + // 5. Validate CN matches hostname + if err := validateCN(certReq, hostname); err != nil { + return fmt.Errorf("CN validation failed: %w", err) + } + + // 6. Bootstrap CSRs must not contain any SANs (client-auth certificates) + if len(certReq.DNSNames) > 0 || len(certReq.IPAddresses) > 0 || + len(certReq.EmailAddresses) > 0 || len(certReq.URIs) > 0 { + return fmt.Errorf("bootstrap CSR must not contain SANs (DNS=%v, IP=%v, email=%v, URI=%v)", + certReq.DNSNames, certReq.IPAddresses, certReq.EmailAddresses, certReq.URIs) + } + + return nil +} + +// ValidateServingCSR performs comprehensive validation of a serving CSR +func ValidateServingCSR(csr *certv1.CertificateSigningRequest, hostname string) error { + // 1. Validate username and groups (must come from the node itself) + if err := validateNodeIdentity(csr, hostname); err != nil { + return fmt.Errorf("node identity validation failed: %w", err) + } + + // 2. Validate key usages + if err := validateServingUsages(csr); err != nil { + return fmt.Errorf("key usage validation failed: %w", err) + } + + // 3. Parse and validate certificate request + certReq, err := parseCSRRequest(csr) + if err != nil { + return fmt.Errorf("failed to parse CSR: %w", err) + } + + // 4. Validate organization field + if err := validateOrganization(certReq); err != nil { + return fmt.Errorf("organization validation failed: %w", err) + } + + // 5. Validate CN matches hostname + if err := validateCN(certReq, hostname); err != nil { + return fmt.Errorf("CN validation failed: %w", err) + } + + // 6. Validate DNS names and IP addresses + if err := validateServingIdentities(certReq, hostname); err != nil { + return fmt.Errorf("serving identities validation failed: %w", err) + } + + return nil +} + +// validateBootstrapperIdentity validates that bootstrap CSR comes from node-bootstrapper service account +func validateBootstrapperIdentity(csr *certv1.CertificateSigningRequest) error { + // Check username + if csr.Spec.Username != BootstrapperUsername { + return fmt.Errorf("invalid username: expected %s, got %s", BootstrapperUsername, csr.Spec.Username) + } + + // Check groups + requiredGroups := map[string]bool{ + groupServiceAccounts: false, + groupServiceAccountsMachineConfig: false, + groupAuthenticated: false, + } + + for _, group := range csr.Spec.Groups { + if _, ok := requiredGroups[group]; ok { + requiredGroups[group] = true + } + } + + // Verify all required groups are present + for group, found := range requiredGroups { + if !found { + return fmt.Errorf("missing required group: %s", group) + } + } + + return nil +} + +// validateNodeIdentity validates that serving CSR comes from the node itself +func validateNodeIdentity(csr *certv1.CertificateSigningRequest, hostname string) error { + // Check username matches expected format + expectedUsername := SystemNodePrefix + hostname + if csr.Spec.Username != expectedUsername { + return fmt.Errorf("invalid username: expected %s, got %s", expectedUsername, csr.Spec.Username) + } + + // Check groups include system:nodes + hasNodesGroup := false + hasAuthenticatedGroup := false + for _, group := range csr.Spec.Groups { + if group == groupNodes { + hasNodesGroup = true + } + if group == groupAuthenticated { + hasAuthenticatedGroup = true + } + } + + if !hasNodesGroup { + return fmt.Errorf("missing required group: %s", groupNodes) + } + if !hasAuthenticatedGroup { + return fmt.Errorf("missing required group: %s", groupAuthenticated) + } + + return nil +} + +// validateUsages validates key usages against an allowed set +// Requires exact match - no extra usages allowed +// The allowedUsages map should have all required usages with value false +func validateUsages(csr *certv1.CertificateSigningRequest, allowedUsages map[certv1.KeyUsage]bool) error { + // Check each usage in CSR + for _, usage := range csr.Spec.Usages { + if _, ok := allowedUsages[usage]; !ok { + // Disallowed usage present + return fmt.Errorf("disallowed usage present: %s", usage) + } + allowedUsages[usage] = true + } + + // Verify all required usages are present + for usage, found := range allowedUsages { + if !found { + return fmt.Errorf("missing required usage: %s", usage) + } + } + + return nil +} + +// validateBootstrapUsages validates key usages for bootstrap CSRs +// Based on actual environment: digital signature + client auth +// Requires exact match - no extra usages allowed +func validateBootstrapUsages(csr *certv1.CertificateSigningRequest) error { + allowedUsages := map[certv1.KeyUsage]bool{ + certv1.UsageDigitalSignature: false, + certv1.UsageClientAuth: false, + } + return validateUsages(csr, allowedUsages) +} + +// validateServingUsages validates key usages for serving CSRs +// Based on actual environment: digital signature + server auth +// Requires exact match - no extra usages allowed +func validateServingUsages(csr *certv1.CertificateSigningRequest) error { + allowedUsages := map[certv1.KeyUsage]bool{ + certv1.UsageDigitalSignature: false, + certv1.UsageServerAuth: false, + } + return validateUsages(csr, allowedUsages) +} + +// parseCSRRequest parses the X509 certificate request from a CSR +func parseCSRRequest(csr *certv1.CertificateSigningRequest) (*x509.CertificateRequest, error) { + // csr.Spec.Request is already a []byte containing PEM-encoded certificate request + // The Kubernetes Go client handles base64 decoding from JSON/YAML for us + // So we can use it directly without additional base64 decoding + + // Parse PEM block + pemBlock, _ := pem.Decode(csr.Spec.Request) + if pemBlock == nil { + return nil, fmt.Errorf("failed to decode PEM block from CSR request") + } + if pemBlock.Type != cert.CertificateRequestBlockType { + return nil, fmt.Errorf("unexpected PEM block type %q, expected %s", pemBlock.Type, cert.CertificateRequestBlockType) + } + + // Parse X509 certificate request + certReq, err := x509.ParseCertificateRequest(pemBlock.Bytes) + if err != nil { + return nil, fmt.Errorf("failed to parse X509 certificate request: %w", err) + } + + return certReq, nil +} + +// validateOrganization validates that the CSR has the correct organization field +func validateOrganization(certReq *x509.CertificateRequest) error { + if len(certReq.Subject.Organization) != 1 { + return fmt.Errorf("expected exactly one organization, got %d", len(certReq.Subject.Organization)) + } + + if certReq.Subject.Organization[0] != expectedOrganization { + return fmt.Errorf("invalid organization: expected %s, got %s", expectedOrganization, certReq.Subject.Organization[0]) + } + + return nil +} + +// validateCN validates that the CN matches the expected hostname format +func validateCN(certReq *x509.CertificateRequest, hostname string) error { + expectedCN := SystemNodePrefix + hostname + if certReq.Subject.CommonName != expectedCN { + return fmt.Errorf("invalid CN: expected %s, got %s", expectedCN, certReq.Subject.CommonName) + } + + return nil +} + +// validateServingIdentities validates DNS names and IP addresses in serving CSR +// Serving CSRs must carry exactly one DNS SAN equal to the node hostname +func validateServingIdentities(certReq *x509.CertificateRequest, hostname string) error { + // Serving CSRs must contain exactly one DNS SAN + if len(certReq.DNSNames) != 1 { + return fmt.Errorf("CSR must contain exactly one DNS SAN, got %d: %v", len(certReq.DNSNames), certReq.DNSNames) + } + + // The DNS SAN must match the hostname + if certReq.DNSNames[0] != hostname { + return fmt.Errorf("CSR DNS SAN %q does not match hostname %q", certReq.DNSNames[0], hostname) + } + + // Validate that IP addresses are valid (if any) + for _, ip := range certReq.IPAddresses { + if !hasValidIPAddress(ip) { + return fmt.Errorf("CSR contains invalid IP address: %v", ip) + } + } + + return nil +} + +// hasValidIPAddress checks if an IP address is valid for a node +func hasValidIPAddress(ip net.IP) bool { + if ip == nil { + return false + } + // Reject unspecified, loopback, and multicast addresses + return !ip.IsUnspecified() && !ip.IsLoopback() && !ip.IsMulticast() +} diff --git a/internal/controller/csrapproval/csr_content_validation_test.go b/internal/controller/csrapproval/csr_content_validation_test.go new file mode 100644 index 00000000..c329cab5 --- /dev/null +++ b/internal/controller/csrapproval/csr_content_validation_test.go @@ -0,0 +1,328 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + certv1 "k8s.io/api/certificates/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var _ = Describe("CSR Validation", func() { + Describe("ValidateBootstrapCSR", func() { + Context("When CSR has valid bootstrap configuration", func() { + It("should pass validation", func() { + hostname := testHostname + csrBytes := createTestCSRWithOrganization("system:node:"+hostname, "system:nodes") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bootstrap-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Username: BootstrapperUsername, + Groups: []string{ + groupServiceAccounts, + groupServiceAccountsMachineConfig, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageClientAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateBootstrapCSR(csr, hostname) + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Context("When CSR has invalid username", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createTestCSRWithOrganization("system:node:"+hostname, "system:nodes") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-username", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Username: "wrong-username", + Groups: []string{ + groupServiceAccounts, + groupServiceAccountsMachineConfig, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageClientAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateBootstrapCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid username")) + }) + }) + + Context("When CSR is missing required groups", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createTestCSRWithOrganization("system:node:"+hostname, "system:nodes") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-missing-groups", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Username: BootstrapperUsername, + Groups: []string{ + groupAuthenticated, // Missing other groups + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageClientAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateBootstrapCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("missing required group")) + }) + }) + + Context("When CSR is missing required usages", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createTestCSRWithOrganization("system:node:"+hostname, "system:nodes") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-missing-usages", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Username: BootstrapperUsername, + Groups: []string{ + groupServiceAccounts, + groupServiceAccountsMachineConfig, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, // Missing client auth + }, + Request: csrBytes, + }, + } + + err := ValidateBootstrapCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("missing required usage")) + }) + }) + + Context("When CSR has invalid organization", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createTestCSRWithOrganization("system:node:"+hostname, "wrong-org") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-org", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Username: BootstrapperUsername, + Groups: []string{ + groupServiceAccounts, + groupServiceAccountsMachineConfig, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageClientAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateBootstrapCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid organization")) + }) + }) + }) + + Describe("ValidateServingCSR", func() { + Context("When CSR has valid serving configuration", func() { + It("should pass validation", func() { + hostname := testHostname + csrBytes := createServingCSRWithDNS("system:node:"+hostname, hostname) + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-serving-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "system:node:" + hostname, + Groups: []string{ + groupNodes, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageServerAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateServingCSR(csr, hostname) + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Context("When CSR has invalid username", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createServingCSRWithDNS("system:node:"+hostname, hostname) + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-username", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "wrong-username", + Groups: []string{ + groupNodes, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageServerAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateServingCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid username")) + }) + }) + + Context("When CSR is missing system:nodes group", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createServingCSRWithDNS("system:node:"+hostname, hostname) + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-missing-nodes-group", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "system:node:" + hostname, + Groups: []string{ + groupAuthenticated, // Missing system:nodes + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageServerAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateServingCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("missing required group: system:nodes")) + }) + }) + + Context("When CSR is missing required usages", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createServingCSRWithDNS("system:node:"+hostname, hostname) + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-missing-usages", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "system:node:" + hostname, + Groups: []string{ + groupNodes, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, // Missing server auth + }, + Request: csrBytes, + }, + } + + err := ValidateServingCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("missing required usage")) + }) + }) + + Context("When CSR has wrong hostname in DNS SAN", func() { + It("should fail validation", func() { + hostname := testHostname + csrBytes := createServingCSRWithDNS("system:node:"+hostname, "wrong-hostname") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-wrong-dns-name", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "system:node:" + hostname, + Groups: []string{ + groupNodes, + groupAuthenticated, + }, + Usages: []certv1.KeyUsage{ + certv1.UsageDigitalSignature, + certv1.UsageServerAuth, + }, + Request: csrBytes, + }, + } + + err := ValidateServingCSR(csr, hostname) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("does not match hostname")) + }) + }) + }) +}) diff --git a/internal/controller/csrapproval/csr_owner_validation.go b/internal/controller/csrapproval/csr_owner_validation.go new file mode 100644 index 00000000..df1169f3 --- /dev/null +++ b/internal/controller/csrapproval/csr_owner_validation.go @@ -0,0 +1,128 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/client" + + dpuprovisioningv1alpha1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1" +) + +// ValidationResult contains the result of CSR validation +type ValidationResult struct { + Valid bool + Reason string +} + +// Validator validates CSRs against DPU and Node state +type Validator struct { + mgmtClient client.Client // Client for management cluster (where DPU objects live) + hostedClient kubernetes.Interface // Client for hosted cluster (where CSRs and Nodes live) + dpuNamespace string // Namespace where DPU objects are located +} + +// NewValidator creates a new CSR validator +func NewValidator(mgmtClient client.Client, hostedClient kubernetes.Interface, dpuNamespace string) *Validator { + return &Validator{ + mgmtClient: mgmtClient, + hostedClient: hostedClient, + dpuNamespace: dpuNamespace, + } +} + +// ValidateCSROwner validates CSR ownership by checking: +// 1. DPU object with matching hostname exists in management cluster (ownership check) +// 2. For bootstrap CSRs: Allow regardless of Node existence (supports certificate recovery) +// 3. For serving CSRs: Node MUST exist (already joined via bootstrap) +func (v *Validator) ValidateCSROwner(ctx context.Context, hostname string, isBootstrapCSR bool) (*ValidationResult, error) { + // Check if DPU exists in management cluster (ownership validation) + dpuExists, err := v.dpuExists(ctx, hostname) + if err != nil { + return nil, fmt.Errorf("failed to check DPU existence: %w", err) + } + + if !dpuExists { + return &ValidationResult{ + Valid: false, + Reason: fmt.Sprintf("no matching DPU found for hostname %s in namespace %s", hostname, v.dpuNamespace), + }, nil + } + + if isBootstrapCSR { + // Bootstrap CSR: Allow if DPU exists (regardless of Node state) + // This supports both initial join and certificate recovery scenarios + return &ValidationResult{ + Valid: true, + Reason: "DPU exists, bootstrap CSR approved", + }, nil + } + + // Serving CSR: Verify Node exists (should have joined via bootstrap CSR) + nodeExists, err := v.nodeExists(ctx, hostname) + if err != nil { + return nil, fmt.Errorf("failed to check Node existence: %w", err) + } + + if !nodeExists { + return &ValidationResult{ + Valid: false, + Reason: fmt.Sprintf("node %s does not exist yet in hosted cluster", hostname), + }, nil + } + + return &ValidationResult{ + Valid: true, + Reason: "DPU exists and node already joined", + }, nil +} + +// dpuExists checks if a DPU with the given hostname exists in the management cluster +func (v *Validator) dpuExists(ctx context.Context, hostname string) (bool, error) { + dpuList := &dpuprovisioningv1alpha1.DPUList{} + if err := v.mgmtClient.List(ctx, dpuList, client.InNamespace(v.dpuNamespace)); err != nil { + return false, err + } + + for _, dpu := range dpuList.Items { + if dpu.Name == hostname { + return true, nil + } + } + + return false, nil +} + +// nodeExists checks if a Node with the given hostname exists in the hosted cluster +func (v *Validator) nodeExists(ctx context.Context, hostname string) (bool, error) { + nodeList, err := v.hostedClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, err + } + + for _, node := range nodeList.Items { + if node.Name == hostname { + return true, nil + } + } + + return false, nil +} diff --git a/internal/controller/csrapproval/csr_owner_validation_test.go b/internal/controller/csrapproval/csr_owner_validation_test.go new file mode 100644 index 00000000..98e61864 --- /dev/null +++ b/internal/controller/csrapproval/csr_owner_validation_test.go @@ -0,0 +1,192 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "context" + + dpuprovisioningv1alpha1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("CSR Owner Validation - DPU Existence Check", func() { + var ( + ctx context.Context + mgmtClient client.Client + dpuNamespace string + testHostname string + ) + + BeforeEach(func() { + ctx = context.Background() + mgmtClient = k8sClient + dpuNamespace = "test-dpu-namespace" + testHostname = "test-dpu-node" + }) + + Describe("DPU Existence Check", func() { + Context("when DPU exists with matching name", func() { + It("should find DPU with exact name match", func() { + // Create DPU with name matching hostname + // Note: Implementation matches by DPU object name, not Spec fields + dpu := &dpuprovisioningv1alpha1.DPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: testHostname, + Namespace: dpuNamespace, + }, + Spec: dpuprovisioningv1alpha1.DPUSpec{ + DPUNodeName: "node-1", + DPUDeviceName: "device-1", + BFB: "bfb-1", + SerialNumber: "SN123456", + }, + } + Expect(mgmtClient.Create(ctx, dpu)).To(Succeed()) + DeferCleanup(mgmtClient.Delete, ctx, dpu) + + // Create validator (using nil for hcClient since we're only testing DPU lookup) + validator := NewValidator(mgmtClient, nil, dpuNamespace) + + // Check if DPU exists + exists, err := validator.dpuExists(ctx, testHostname) + Expect(err).NotTo(HaveOccurred()) + Expect(exists).To(BeTrue()) + }) + }) + + Context("when DPU does not exist", func() { + It("should not find DPU when none exists", func() { + // Don't create any DPU + + validator := NewValidator(mgmtClient, nil, dpuNamespace) + + // Check if DPU exists + exists, err := validator.dpuExists(ctx, testHostname) + Expect(err).NotTo(HaveOccurred()) + Expect(exists).To(BeFalse()) + }) + + It("should not find DPU in different namespace", func() { + // Create DPU in different namespace + dpu := &dpuprovisioningv1alpha1.DPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: testHostname, + Namespace: "wrong-namespace", + }, + Spec: dpuprovisioningv1alpha1.DPUSpec{ + DPUNodeName: "node-1", + DPUDeviceName: "device-1", + BFB: "bfb-1", + SerialNumber: "SN123456", + }, + } + Expect(mgmtClient.Create(ctx, dpu)).To(Succeed()) + DeferCleanup(mgmtClient.Delete, ctx, dpu) + + validator := NewValidator(mgmtClient, nil, dpuNamespace) + + // Check if DPU exists in test namespace (should not find it) + exists, err := validator.dpuExists(ctx, testHostname) + Expect(err).NotTo(HaveOccurred()) + Expect(exists).To(BeFalse()) + }) + + It("should not find DPU with different name", func() { + // Create DPU with different object name + dpu := &dpuprovisioningv1alpha1.DPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: "dpu-123", + Namespace: dpuNamespace, + }, + Spec: dpuprovisioningv1alpha1.DPUSpec{ + DPUNodeName: "node-1", + DPUDeviceName: "device-1", + BFB: "bfb-1", + SerialNumber: "SN123456", + }, + } + Expect(mgmtClient.Create(ctx, dpu)).To(Succeed()) + DeferCleanup(mgmtClient.Delete, ctx, dpu) + + validator := NewValidator(mgmtClient, nil, dpuNamespace) + + // Check if DPU exists with test hostname (should not match) + exists, err := validator.dpuExists(ctx, testHostname) + Expect(err).NotTo(HaveOccurred()) + Expect(exists).To(BeFalse()) + }) + }) + + Context("when multiple DPUs exist", func() { + It("should find the correct DPU among multiple", func() { + // Create multiple DPUs + dpu1 := &dpuprovisioningv1alpha1.DPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: "dpu-1", + Namespace: dpuNamespace, + }, + Spec: dpuprovisioningv1alpha1.DPUSpec{ + DPUNodeName: "node-1", + DPUDeviceName: "device-1", + BFB: "bfb-1", + SerialNumber: "SN111111", + }, + } + dpu2 := &dpuprovisioningv1alpha1.DPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: testHostname, // This one matches our search + Namespace: dpuNamespace, + }, + Spec: dpuprovisioningv1alpha1.DPUSpec{ + DPUNodeName: "node-2", + DPUDeviceName: "device-2", + BFB: "bfb-2", + SerialNumber: "SN222222", + }, + } + dpu3 := &dpuprovisioningv1alpha1.DPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: "dpu-3", + Namespace: dpuNamespace, + }, + Spec: dpuprovisioningv1alpha1.DPUSpec{ + DPUNodeName: "node-3", + DPUDeviceName: "device-3", + BFB: "bfb-3", + SerialNumber: "SN333333", + }, + } + Expect(mgmtClient.Create(ctx, dpu1)).To(Succeed()) + DeferCleanup(mgmtClient.Delete, ctx, dpu1) + Expect(mgmtClient.Create(ctx, dpu2)).To(Succeed()) + DeferCleanup(mgmtClient.Delete, ctx, dpu2) + Expect(mgmtClient.Create(ctx, dpu3)).To(Succeed()) + DeferCleanup(mgmtClient.Delete, ctx, dpu3) + + validator := NewValidator(mgmtClient, nil, dpuNamespace) + + // Check if DPU exists (should find dpu-2 by matching object name) + exists, err := validator.dpuExists(ctx, testHostname) + Expect(err).NotTo(HaveOccurred()) + Expect(exists).To(BeTrue()) + }) + }) + }) +}) diff --git a/internal/controller/csrapproval/csrapproval.go b/internal/controller/csrapproval/csrapproval.go new file mode 100644 index 00000000..dd1a0ecb --- /dev/null +++ b/internal/controller/csrapproval/csrapproval.go @@ -0,0 +1,320 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "context" + "fmt" + "time" + + certv1 "k8s.io/api/certificates/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + provisioningv1alpha1 "github.com/rh-ecosystem-edge/dpf-hcp-provisioner-operator/api/v1alpha1" +) + +const ( + // CSRPollingInterval is how often to check for pending CSRs in the hosted cluster + // Polls continuously to handle bootstrap CSRs, serving CSRs, and edge cases like node recreation + CSRPollingInterval = 30 * time.Second +) + +// CSRApprover manages CSR auto-approval for a DPFHCPProvisioner +type CSRApprover struct { + mgmtClient client.Client // Client for management cluster (where operator runs) + recorder record.EventRecorder + clientManager *ClientManager // Manages cached clients to hosted clusters +} + +// NewCSRApprover creates a new CSR approver +func NewCSRApprover(mgmtClient client.Client, recorder record.EventRecorder) *CSRApprover { + return &CSRApprover{ + mgmtClient: mgmtClient, + recorder: recorder, + clientManager: NewClientManager(mgmtClient), + } +} + +// ProcessCSRs processes and approves pending CSRs for a DPFHCPProvisioner +// Called during each reconciliation loop to handle CSRs from worker nodes +func (a *CSRApprover) ProcessCSRs(ctx context.Context, dpfhcp *provisioningv1alpha1.DPFHCPProvisioner) (ctrl.Result, error) { + log := logf.FromContext(ctx) + + // Step 1: Verify prerequisites (kubeconfig secret exists and is populated) + if !a.isKubeconfigAvailable(ctx, dpfhcp) { + log.V(1).Info("Kubeconfig secret not available, CSR auto-approval cannot start") + if err := a.setCondition(ctx, dpfhcp, metav1.ConditionFalse, provisioningv1alpha1.ReasonKubeconfigNotAvailable, "Kubeconfig secret not available for hosted cluster"); err != nil { + return ctrl.Result{}, err + } + // Don't requeue - reconciliation will be triggered when main controller updates DPFHCPProvisioner status + // (main controller watches kubeconfig secrets and updates DPFHCPProvisioner.Status.KubeConfigSecretRef) + return ctrl.Result{}, nil + } + + // Step 2: Get or create hosted cluster client + hcClient, err := a.clientManager.GetHostedClusterClient(ctx, dpfhcp.Namespace, dpfhcp.Name) + if err != nil { + log.Error(err, "Failed to create hosted cluster client") + if err := a.setCondition(ctx, dpfhcp, metav1.ConditionFalse, provisioningv1alpha1.ReasonHostedClusterNotReachable, fmt.Sprintf("Cannot connect to hosted cluster: %v", err)); err != nil { + return ctrl.Result{}, err + } + a.recorder.Event(dpfhcp, corev1.EventTypeWarning, "HostedClusterUnreachable", fmt.Sprintf("Cannot connect to hosted cluster: %v", err)) + // Don't requeue - reconciliation will be triggered when main controller updates DPFHCPProvisioner status + // (main controller watches HostedCluster and updates DPFHCPProvisioner.Status.HostedClusterAvailable) + return ctrl.Result{}, nil + } + + // Step 3: Test connection + if err := TestConnection(ctx, hcClient); err != nil { + log.Error(err, "Hosted cluster not reachable") + // Invalidate cached client so next reconciliation creates a fresh one + // This handles cases like expired credentials, kubeconfig rotation, etc. + a.clientManager.InvalidateClient(dpfhcp.Namespace, dpfhcp.Name) + if err := a.setCondition(ctx, dpfhcp, metav1.ConditionFalse, provisioningv1alpha1.ReasonHostedClusterNotReachable, fmt.Sprintf("Cannot connect to hosted cluster: %v", err)); err != nil { + return ctrl.Result{}, err + } + a.recorder.Event(dpfhcp, corev1.EventTypeWarning, "HostedClusterUnreachable", fmt.Sprintf("Cannot connect to hosted cluster: %v", err)) + // Don't requeue - reconciliation will be triggered when main controller updates DPFHCPProvisioner status + // (main controller watches HostedCluster and updates DPFHCPProvisioner.Status.HostedClusterAvailable) + return ctrl.Result{}, nil + } + + // Step 4: Process any pending CSRs + if err := a.processPendingCSRs(ctx, dpfhcp, hcClient); err != nil { + log.Error(err, "Failed to process pending CSRs") + // Transient error during normal operation - don't fail the reconciliation + // Continue polling to retry CSR processing + if condErr := a.setCondition(ctx, dpfhcp, metav1.ConditionTrue, provisioningv1alpha1.ReasonCSRApprovalActive, + fmt.Sprintf("CSR processing encountered transient error: %v", err)); condErr != nil { + return ctrl.Result{}, condErr + } + // Retry after polling interval (assume CSRs might be pending) + return ctrl.Result{RequeueAfter: CSRPollingInterval}, nil + } + + // Step 5: Update condition and schedule next poll + if err := a.setCondition(ctx, dpfhcp, metav1.ConditionTrue, provisioningv1alpha1.ReasonCSRApprovalActive, + "CSR auto-approval active for worker nodes"); err != nil { + return ctrl.Result{}, err + } + + // Always poll every 30 seconds to check for new CSRs + // This handles all cases: initial bootstrap CSRs, serving CSRs after bootstrap, node recreation, etc. + return ctrl.Result{RequeueAfter: CSRPollingInterval}, nil +} + +// StopCSRWatch cleans up resources for a DPFHCPProvisioner +// Called during finalizer cleanup +func (a *CSRApprover) StopCSRWatch(ctx context.Context, dpfhcp *provisioningv1alpha1.DPFHCPProvisioner) { + if a == nil || a.clientManager == nil || dpfhcp == nil { + // Approver not initialized or CR is nil, nothing to stop + return + } + + log := logf.FromContext(ctx) + log.Info("Cleaning up CSR approver resources") + + // Invalidate cached client + a.clientManager.InvalidateClient(dpfhcp.Namespace, dpfhcp.Name) +} + +// isKubeconfigAvailable checks if the kubeconfig secret exists and is populated +func (a *CSRApprover) isKubeconfigAvailable(ctx context.Context, dpfhcp *provisioningv1alpha1.DPFHCPProvisioner) bool { + _, err := a.clientManager.getKubeconfigData(ctx, dpfhcp.Namespace, dpfhcp.Name) + return err == nil +} + +// processPendingCSRs processes all pending CSRs in the hosted cluster +func (a *CSRApprover) processPendingCSRs(ctx context.Context, dpfhcp *provisioningv1alpha1.DPFHCPProvisioner, hcClient *kubernetes.Clientset) error { + log := logf.FromContext(ctx) + + // List all CSRs + csrList, err := hcClient.CertificatesV1().CertificateSigningRequests().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list CSRs: %w", err) + } + + // Filter to pending CSRs + pendingCSRs := make([]certv1.CertificateSigningRequest, 0) + for _, csr := range csrList.Items { + if IsPending(&csr) { + pendingCSRs = append(pendingCSRs, csr) + } + } + + if len(pendingCSRs) == 0 { + log.V(1).Info("No pending CSRs found") + return nil + } + + log.Info("Found pending CSRs", "count", len(pendingCSRs)) + + // Filter to bootstrap and serving CSRs only + bootstrapCSRs := make([]certv1.CertificateSigningRequest, 0) + servingCSRs := make([]certv1.CertificateSigningRequest, 0) + otherCSRs := make([]certv1.CertificateSigningRequest, 0) + for _, csr := range pendingCSRs { + switch csr.Spec.SignerName { + case SignerBootstrap: + bootstrapCSRs = append(bootstrapCSRs, csr) + case SignerServing: + servingCSRs = append(servingCSRs, csr) + default: + otherCSRs = append(otherCSRs, csr) + } + } + + log.Info("Filtered CSRs by signer", + "bootstrapCount", len(bootstrapCSRs), + "servingCount", len(servingCSRs), + "otherCount", len(otherCSRs)) + + // Log other signers for debugging + for _, csr := range otherCSRs { + log.Info("Skipping CSR with unsupported signer", + "csrName", csr.Name, + "signerName", csr.Spec.SignerName) + } + + // Process bootstrap CSRs + for _, csr := range bootstrapCSRs { + err := a.processCSR(ctx, dpfhcp, hcClient, &csr, "bootstrap") + if err != nil { + // Actual error - log and continue + log.Error(err, "Failed to process bootstrap CSR", "csrName", csr.Name) + continue + } + } + + // Process serving CSRs + for _, csr := range servingCSRs { + err := a.processCSR(ctx, dpfhcp, hcClient, &csr, "serving") + if err != nil { + // Actual error - log and continue + log.Error(err, "Failed to process serving CSR", "csrName", csr.Name) + continue + } + } + + return nil +} + +// processCSR processes a single CSR (validate and approve if valid) +// Returns: +// - nil: CSR was successfully approved or skipped (validation failed but not an error condition) +// - error: Actual error occurred during processing +func (a *CSRApprover) processCSR(ctx context.Context, dpfhcp *provisioningv1alpha1.DPFHCPProvisioner, hcClient *kubernetes.Clientset, csr *certv1.CertificateSigningRequest, csrType string) error { + log := logf.FromContext(ctx) + + // Extract hostname from CSR + hostname, err := ExtractHostname(csr) + if err != nil { + log.Info("Skipping CSR: cannot extract hostname", "csrName", csr.Name, "error", err.Error()) + return nil + } + + log.Info("Processing CSR", "csrName", csr.Name, "hostname", hostname, "type", csrType) + + // Perform comprehensive CSR validation based on type + switch csr.Spec.SignerName { + case SignerBootstrap: + if err := ValidateBootstrapCSR(csr, hostname); err != nil { + log.Info("Skipping CSR: bootstrap validation failed", "csrName", csr.Name, "hostname", hostname, "error", err.Error()) + return nil + } + case SignerServing: + if err := ValidateServingCSR(csr, hostname); err != nil { + log.Info("Skipping CSR: serving validation failed", "csrName", csr.Name, "hostname", hostname, "error", err.Error()) + return nil + } + } + + // Validate CSR ownership (check if DPU exists and node state matches CSR type) + validator := NewValidator(a.mgmtClient, hcClient, dpfhcp.Spec.DPUClusterRef.Namespace) + isBootstrapCSR := csr.Spec.SignerName == SignerBootstrap + result, err := validator.ValidateCSROwner(ctx, hostname, isBootstrapCSR) + if err != nil { + return fmt.Errorf("failed to validate CSR ownership: %w", err) + } + if !result.Valid { + log.Info("Skipping CSR: owner validation failed", "csrName", csr.Name, "hostname", hostname, "reason", result.Reason) + return nil + } + + // Approve the CSR + if err := a.approveCSR(ctx, hcClient, csr, hostname, csrType); err != nil { + return fmt.Errorf("failed to approve CSR: %w", err) + } + + // Emit event for successful approval + eventMsg := fmt.Sprintf("Approved %s CSR for DPU worker node %s", csrType, hostname) + a.recorder.Event(dpfhcp, corev1.EventTypeNormal, "CSRApproved", eventMsg) + + log.Info("CSR approved", "csrName", csr.Name, "hostname", hostname, "type", csrType) + return nil +} + +// approveCSR approves a CSR by updating its status +func (a *CSRApprover) approveCSR(ctx context.Context, hcClient *kubernetes.Clientset, csr *certv1.CertificateSigningRequest, hostname, csrType string) error { + // Create approval condition + csr.Status.Conditions = append(csr.Status.Conditions, certv1.CertificateSigningRequestCondition{ + Type: certv1.CertificateApproved, + Status: corev1.ConditionTrue, + Reason: "AutoApproved", + Message: fmt.Sprintf("Auto-approved %s CSR for DPU worker node %s", csrType, hostname), + LastUpdateTime: metav1.Now(), + }) + + // Update CSR status + _, err := hcClient.CertificatesV1().CertificateSigningRequests().UpdateApproval(ctx, csr.Name, csr, metav1.UpdateOptions{}) + return err +} + +// setCondition updates the CSRAutoApprovalActive condition +func (a *CSRApprover) setCondition(ctx context.Context, dpfhcp *provisioningv1alpha1.DPFHCPProvisioner, status metav1.ConditionStatus, reason, message string) error { + condition := metav1.Condition{ + Type: provisioningv1alpha1.CSRAutoApprovalActive, + Status: status, + Reason: reason, + Message: message, + ObservedGeneration: dpfhcp.Generation, + } + + // This handles finding/updating/appending the condition correctly + if changed := meta.SetStatusCondition(&dpfhcp.Status.Conditions, condition); changed { + // Persist status update first; only emit event on success + if err := a.mgmtClient.Status().Update(ctx, dpfhcp); err != nil { + return err + } + + // Emit event only when condition changed and persisted successfully (avoid spam and ghost events) + eventType := corev1.EventTypeNormal + if status == metav1.ConditionFalse { + eventType = corev1.EventTypeWarning + } + a.recorder.Event(dpfhcp, eventType, reason, message) + } + + return nil +} diff --git a/internal/controller/csrapproval/hostname.go b/internal/controller/csrapproval/hostname.go new file mode 100644 index 00000000..227c2d77 --- /dev/null +++ b/internal/controller/csrapproval/hostname.go @@ -0,0 +1,122 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "crypto/x509" + "encoding/pem" + "fmt" + "strings" + + certv1 "k8s.io/api/certificates/v1" +) + +const ( + // SignerBootstrap is the signer name for bootstrap CSRs + SignerBootstrap = "kubernetes.io/kube-apiserver-client-kubelet" + + // SignerServing is the signer name for serving CSRs + SignerServing = "kubernetes.io/kubelet-serving" + + // SystemNodePrefix is the prefix used in CSR CN and username + SystemNodePrefix = "system:node:" +) + +// ExtractHostname extracts the hostname from a CSR based on its signer type. +// For bootstrap CSRs: extracts from CN in the certificate request +// For serving CSRs: extracts from username field +func ExtractHostname(csr *certv1.CertificateSigningRequest) (string, error) { + if csr == nil { + return "", fmt.Errorf("CSR is nil") + } + + switch csr.Spec.SignerName { + case SignerBootstrap: + return extractHostnameFromBootstrapCSR(csr) + case SignerServing: + return extractHostnameFromServingCSR(csr) + default: + return "", fmt.Errorf("unsupported signer name: %s", csr.Spec.SignerName) + } +} + +// extractHostnameFromBootstrapCSR extracts hostname from bootstrap CSR's certificate request CN +func extractHostnameFromBootstrapCSR(csr *certv1.CertificateSigningRequest) (string, error) { + // csr.Spec.Request is already a []byte containing PEM-encoded certificate request + + // Parse PEM block + pemBlock, _ := pem.Decode(csr.Spec.Request) + if pemBlock == nil { + return "", fmt.Errorf("failed to decode PEM block from CSR request") + } + + // Parse X509 certificate request + certReq, err := x509.ParseCertificateRequest(pemBlock.Bytes) + if err != nil { + return "", fmt.Errorf("failed to parse X509 certificate request: %w", err) + } + + // Extract hostname from CN (format: "system:node:") + cn := certReq.Subject.CommonName + hostname := strings.TrimPrefix(cn, SystemNodePrefix) + + if hostname == "" || hostname == cn { + return "", fmt.Errorf("invalid CN format, expected 'system:node:', got: %s", cn) + } + + return hostname, nil +} + +// extractHostnameFromServingCSR extracts hostname from serving CSR's username field +func extractHostnameFromServingCSR(csr *certv1.CertificateSigningRequest) (string, error) { + // Username format: "system:node:" + username := csr.Spec.Username + hostname := strings.TrimPrefix(username, SystemNodePrefix) + + if hostname == "" || hostname == username { + return "", fmt.Errorf("invalid username format, expected 'system:node:', got: %s", username) + } + + return hostname, nil +} + +// IsPending checks if a CSR is in pending state (no Approved or Denied condition) +func IsPending(csr *certv1.CertificateSigningRequest) bool { + if csr == nil { + return false + } + + for _, condition := range csr.Status.Conditions { + if condition.Type == certv1.CertificateApproved || condition.Type == certv1.CertificateDenied { + return false + } + } + return true +} + +// FilterBySignerName filters CSRs to only include bootstrap and serving CSR types +func FilterBySignerName(csrs []certv1.CertificateSigningRequest) []certv1.CertificateSigningRequest { + filtered := make([]certv1.CertificateSigningRequest, 0, len(csrs)) + + for _, csr := range csrs { + if csr.Spec.SignerName == SignerBootstrap || csr.Spec.SignerName == SignerServing { + filtered = append(filtered, csr) + } + } + + return filtered +} diff --git a/internal/controller/csrapproval/hostname_test.go b/internal/controller/csrapproval/hostname_test.go new file mode 100644 index 00000000..2de7870a --- /dev/null +++ b/internal/controller/csrapproval/hostname_test.go @@ -0,0 +1,394 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + certv1 "k8s.io/api/certificates/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + testHostname = "dpu-worker-01" +) + +var _ = Describe("Hostname Extraction", func() { + Describe("ExtractHostname from Bootstrap CSR", func() { + Context("When CSR has valid CN with system:node: prefix", func() { + It("should extract hostname correctly", func() { + hostname := testHostname + csrBytes := createTestCSR("system:node:" + hostname) + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bootstrap-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Request: csrBytes, + }, + } + + extractedHostname, err := ExtractHostname(csr) + Expect(err).NotTo(HaveOccurred()) + Expect(extractedHostname).To(Equal(hostname)) + }) + }) + + Context("When CSR has invalid CN format", func() { + It("should return error", func() { + csrBytes := createTestCSR("invalid-format") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Request: csrBytes, + }, + } + + _, err := ExtractHostname(csr) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid CN format")) + }) + }) + + Context("When CSR has empty CN after prefix removal", func() { + It("should return error", func() { + csrBytes := createTestCSR("system:node:") + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-empty-hostname-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerBootstrap, + Request: csrBytes, + }, + } + + _, err := ExtractHostname(csr) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid CN format")) + }) + }) + }) + + Describe("ExtractHostname from Serving CSR", func() { + Context("When CSR has valid username with system:node: prefix", func() { + It("should extract hostname correctly", func() { + hostname := testHostname + + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-serving-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "system:node:" + hostname, + }, + } + + extractedHostname, err := ExtractHostname(csr) + Expect(err).NotTo(HaveOccurred()) + Expect(extractedHostname).To(Equal(hostname)) + }) + }) + + Context("When CSR has invalid username format", func() { + It("should return error", func() { + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-invalid-serving-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "invalid-format", + }, + } + + _, err := ExtractHostname(csr) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid username format")) + }) + }) + + Context("When CSR has empty username after prefix removal", func() { + It("should return error", func() { + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-empty-username-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: SignerServing, + Username: "system:node:", + }, + } + + _, err := ExtractHostname(csr) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid username format")) + }) + }) + }) + + Describe("ExtractHostname with unsupported signer", func() { + Context("When CSR has unsupported signer name", func() { + It("should return error", func() { + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-unsupported-csr", + }, + Spec: certv1.CertificateSigningRequestSpec{ + SignerName: "unsupported.signer.io/test", + Username: "system:node:test-host", + }, + } + + _, err := ExtractHostname(csr) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("unsupported signer name")) + }) + }) + + Context("When CSR is nil", func() { + It("should return error", func() { + _, err := ExtractHostname(nil) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("CSR is nil")) + }) + }) + }) + + Describe("IsPending", func() { + Context("When CSR has no conditions", func() { + It("should return true", func() { + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pending-csr", + }, + Status: certv1.CertificateSigningRequestStatus{ + Conditions: []certv1.CertificateSigningRequestCondition{}, + }, + } + + Expect(IsPending(csr)).To(BeTrue()) + }) + }) + + Context("When CSR has Approved condition", func() { + It("should return false", func() { + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-approved-csr", + }, + Status: certv1.CertificateSigningRequestStatus{ + Conditions: []certv1.CertificateSigningRequestCondition{ + { + Type: certv1.CertificateApproved, + Status: corev1.ConditionTrue, + }, + }, + }, + } + + Expect(IsPending(csr)).To(BeFalse()) + }) + }) + + Context("When CSR has Denied condition", func() { + It("should return false", func() { + csr := &certv1.CertificateSigningRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-denied-csr", + }, + Status: certv1.CertificateSigningRequestStatus{ + Conditions: []certv1.CertificateSigningRequestCondition{ + { + Type: certv1.CertificateDenied, + Status: corev1.ConditionTrue, + }, + }, + }, + } + + Expect(IsPending(csr)).To(BeFalse()) + }) + }) + + Context("When CSR is nil", func() { + It("should return false", func() { + Expect(IsPending(nil)).To(BeFalse()) + }) + }) + }) + + Describe("FilterBySignerName", func() { + Context("When list contains mixed signer types", func() { + It("should filter to only bootstrap and serving CSRs", func() { + csrs := []certv1.CertificateSigningRequest{ + { + ObjectMeta: metav1.ObjectMeta{Name: "bootstrap-csr"}, + Spec: certv1.CertificateSigningRequestSpec{SignerName: SignerBootstrap}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "serving-csr"}, + Spec: certv1.CertificateSigningRequestSpec{SignerName: SignerServing}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "other-csr"}, + Spec: certv1.CertificateSigningRequestSpec{SignerName: "other.signer.io/test"}, + }, + } + + filtered := FilterBySignerName(csrs) + Expect(filtered).To(HaveLen(2)) + Expect(filtered[0].Name).To(Equal("bootstrap-csr")) + Expect(filtered[1].Name).To(Equal("serving-csr")) + }) + }) + + Context("When list contains no matching signers", func() { + It("should return empty list", func() { + csrs := []certv1.CertificateSigningRequest{ + { + ObjectMeta: metav1.ObjectMeta{Name: "other-csr"}, + Spec: certv1.CertificateSigningRequestSpec{SignerName: "other.signer.io/test"}, + }, + } + + filtered := FilterBySignerName(csrs) + Expect(filtered).To(BeEmpty()) + }) + }) + + Context("When list is empty", func() { + It("should return empty list", func() { + csrs := []certv1.CertificateSigningRequest{} + + filtered := FilterBySignerName(csrs) + Expect(filtered).To(BeEmpty()) + }) + }) + }) +}) + +// createTestCSR creates a test CSR with the given CN +func createTestCSR(cn string) []byte { + // Generate a private key + privateKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + panic(err) + } + + // Create CSR template + template := x509.CertificateRequest{ + Subject: pkix.Name{ + CommonName: cn, + }, + } + + // Create CSR + csrBytes, err := x509.CreateCertificateRequest(rand.Reader, &template, privateKey) + if err != nil { + panic(err) + } + + // Encode to PEM + pemBlock := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE REQUEST", + Bytes: csrBytes, + }) + + // Return PEM-encoded bytes directly (Kubernetes Go client handles base64 decoding from JSON/YAML) + return pemBlock +} + +// createTestCSRWithOrganization creates a test CSR with the given CN and organization +func createTestCSRWithOrganization(cn, org string) []byte { + // Generate a private key + privateKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + panic(err) + } + + // Create CSR template with organization + template := x509.CertificateRequest{ + Subject: pkix.Name{ + CommonName: cn, + Organization: []string{org}, + }, + } + + // Create CSR + csrBytes, err := x509.CreateCertificateRequest(rand.Reader, &template, privateKey) + if err != nil { + panic(err) + } + + // Encode to PEM + pemBlock := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE REQUEST", + Bytes: csrBytes, + }) + + // Return PEM-encoded bytes directly (Kubernetes Go client handles base64 decoding from JSON/YAML) + return pemBlock +} + +// createServingCSRWithDNS creates a test CSR with the given CN and DNS names +func createServingCSRWithDNS(cn, dnsName string) []byte { + // Generate a private key + privateKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + panic(err) + } + + // Create CSR template with organization and DNS names + template := x509.CertificateRequest{ + Subject: pkix.Name{ + CommonName: cn, + Organization: []string{"system:nodes"}, + }, + DNSNames: []string{dnsName}, + } + + // Create CSR + csrBytes, err := x509.CreateCertificateRequest(rand.Reader, &template, privateKey) + if err != nil { + panic(err) + } + + // Encode to PEM + pemBlock := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE REQUEST", + Bytes: csrBytes, + }) + + // Return PEM-encoded bytes directly (Kubernetes Go client handles base64 decoding from JSON/YAML) + return pemBlock +} diff --git a/internal/controller/csrapproval/suite_test.go b/internal/controller/csrapproval/suite_test.go new file mode 100644 index 00000000..aebac58c --- /dev/null +++ b/internal/controller/csrapproval/suite_test.go @@ -0,0 +1,48 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csrapproval + +import ( + "testing" + + dpuprovisioningv1alpha1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +var ( + k8sClient client.Client + scheme *runtime.Scheme +) + +func TestCSRApproval(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "CSR Approval Suite") +} + +var _ = BeforeSuite(func() { + // Create scheme and register DPU types + scheme = runtime.NewScheme() + err := dpuprovisioningv1alpha1.AddToScheme(scheme) + Expect(err).NotTo(HaveOccurred()) + + // Create fake client for tests + k8sClient = fake.NewClientBuilder().WithScheme(scheme).Build() +}) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 14719a76..4b37c5a7 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -122,18 +122,18 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) By("setting up DPFHCPProvisioner controller") - kubeconfigInjector := kubeconfiginjection.NewKubeconfigInjector(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ControllerName)) + kubeconfigInjector := kubeconfiginjection.NewKubeconfigInjector(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ProvisionerControllerName)) // Initialize Finalizer Manager with pluggable cleanup handlers - finalizerManager := finalizer.NewManager(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ControllerName)) + finalizerManager := finalizer.NewManager(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ProvisionerControllerName)) // Register cleanup handlers in order (dependent resources first) - finalizerManager.RegisterHandler(kubeconfiginjection.NewCleanupHandler(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ControllerName))) - finalizerManager.RegisterHandler(hostedcluster.NewCleanupHandler(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ControllerName))) + finalizerManager.RegisterHandler(kubeconfiginjection.NewCleanupHandler(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ProvisionerControllerName))) + finalizerManager.RegisterHandler(hostedcluster.NewCleanupHandler(k8sManager.GetClient(), k8sManager.GetEventRecorderFor(common.ProvisionerControllerName))) reconciler := &DPFHCPProvisionerReconciler{ Client: k8sManager.GetClient(), Scheme: k8sManager.GetScheme(), - Recorder: k8sManager.GetEventRecorderFor(common.ControllerName), + Recorder: k8sManager.GetEventRecorderFor(common.ProvisionerControllerName), ImageResolver: bluefield.NewImageResolver(k8sManager.GetClient(), k8sManager.GetEventRecorderFor("bluefield-image-resolver")), DPUClusterValidator: dpucluster.NewValidator(k8sManager.GetClient(), k8sManager.GetEventRecorderFor("dpucluster-validator")), SecretsValidator: secrets.NewValidator(k8sManager.GetClient(), k8sManager.GetEventRecorderFor("secrets-validator")),