diff --git a/pkg/evidence/scripts/collect-evidence.sh b/pkg/evidence/scripts/collect-evidence.sh index a044c1df..7b826949 100755 --- a/pkg/evidence/scripts/collect-evidence.sh +++ b/pkg/evidence/scripts/collect-evidence.sh @@ -623,6 +623,15 @@ EOF collect_gateway() { EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md" log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}" + + # Skip if kgateway is not installed (training clusters don't have inference gateway) + if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then + write_section_header "Inference API Gateway (kgateway)" + echo "**Result: SKIP** — kgateway not installed. Inference gateway check applies to inference clusters only." >> "${EVIDENCE_FILE}" + log_info "Inference gateway evidence collection skipped — kgateway not installed." + return + fi + write_section_header "Inference API Gateway (kgateway)" cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -718,6 +727,136 @@ EOF collect_operator() { EVIDENCE_FILE="${EVIDENCE_DIR}/robust-operator.md" log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}" + + # Detect which AI operator is present and route to the appropriate collector. + if kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null | grep -q .; then + collect_operator_dynamo + elif kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | grep -q .; then + collect_operator_kubeflow + else + write_section_header "Robust AI Operator" + echo "**Result: SKIP** — No supported AI operator found (requires Dynamo or Kubeflow Trainer)." >> "${EVIDENCE_FILE}" + log_info "Robust operator evidence collection skipped — no supported operator found." + return + fi +} + +# --- Kubeflow Trainer evidence --- +collect_operator_kubeflow() { + write_section_header "Robust AI Operator (Kubeflow Trainer)" + + cat >> "${EVIDENCE_FILE}" <<'EOF' +Demonstrates CNCF AI Conformance requirement that at least one complex AI operator +with a CRD can be installed and functions reliably, including operator pods running, +webhooks operational, and custom resources reconciled. + +## Summary + +1. **Kubeflow Trainer** — Controller manager running in `kubeflow` namespace +2. **Custom Resource Definitions** — TrainJob, TrainingRuntime, ClusterTrainingRuntime CRDs registered +3. **Webhooks Operational** — Validating webhook `validator.trainer.kubeflow.org` configured and active +4. **Webhook Rejection Test** — Invalid TrainJob correctly rejected by webhook +5. **Result: PASS** + +--- + +## Kubeflow Trainer Health +EOF + capture "Kubeflow Trainer deployments" kubectl get deploy -n kubeflow + capture "Kubeflow Trainer pods" kubectl get pods -n kubeflow -o wide + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Custom Resource Definitions +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**Kubeflow Trainer CRDs**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl get crds 2>/dev/null | grep -E "trainer\.kubeflow\.org" >> "${EVIDENCE_FILE}" 2>&1 + echo '```' >> "${EVIDENCE_FILE}" + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Webhooks +EOF + capture "Validating webhooks" kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org + echo "" >> "${EVIDENCE_FILE}" + echo "**Webhook endpoint verification**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl get endpoints -n kubeflow 2>/dev/null | head -10 >> "${EVIDENCE_FILE}" 2>&1 + echo '```' >> "${EVIDENCE_FILE}" + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## ClusterTrainingRuntimes +EOF + capture "ClusterTrainingRuntimes" kubectl get clustertrainingruntimes + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Webhook Rejection Test + +Submit an invalid TrainJob (referencing a non-existent runtime) to verify the +validating webhook actively rejects malformed resources. +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**Invalid TrainJob rejection**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + local webhook_result + webhook_result=$(kubectl apply -f - 2>&1 <> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + + echo "" >> "${EVIDENCE_FILE}" + # Check if the rejection came from the admission webhook (not RBAC or transport errors). + # Webhook rejections contain "admission webhook" or "denied the request". + if echo "${webhook_result}" | grep -qi "admission webhook\|denied the request"; then + echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}" + elif echo "${webhook_result}" | grep -qi "cannot create resource\|unauthorized"; then + echo "WARNING: Rejection was from RBAC, not the admission webhook." >> "${EVIDENCE_FILE}" + elif echo "${webhook_result}" | grep -qi "denied\|forbidden\|invalid"; then + echo "Webhook rejected the invalid resource (unconfirmed source)." >> "${EVIDENCE_FILE}" + else + echo "WARNING: Webhook did not reject the invalid resource." >> "${EVIDENCE_FILE}" + # Clean up if accidentally created + kubectl delete trainjob webhook-test-invalid -n default --ignore-not-found 2>/dev/null + fi + + # Verdict + echo "" >> "${EVIDENCE_FILE}" + local crd_count + crd_count=$(kubectl get crds 2>/dev/null | grep -c "trainer\.kubeflow\.org" || true) + local controller_ready + controller_ready=$(kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | awk '{print $2}' | grep -c "1/1" || true) + local webhook_ok + # Only count confirmed webhook rejections (not RBAC or transport errors) + webhook_ok=$(echo "${webhook_result}" | grep -ci "admission webhook\|denied the request" || true) + + if [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ] && [ "${webhook_ok}" -gt 0 ]; then + echo "**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}" + elif [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ]; then + echo "**Result: PASS** — Kubeflow Trainer running, ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}" + else + echo "**Result: FAIL** — Kubeflow Trainer controller not ready or CRDs missing." >> "${EVIDENCE_FILE}" + fi + + log_info "Robust operator (Kubeflow Trainer) evidence collection complete." +} + +# --- Dynamo evidence --- +collect_operator_dynamo() { write_section_header "Robust AI Operator (Dynamo Platform)" cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -976,38 +1115,45 @@ collect_cluster_autoscaling() { log_info "Collecting Cluster Autoscaling evidence → ${EVIDENCE_FILE}" write_section_header "Cluster Autoscaling" - cat >> "${EVIDENCE_FILE}" <<'EOF' + # Detect platform from node providerID + local provider_id + provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "") + + if [[ "${provider_id}" == aws://* ]]; then + log_info "Detected EKS cluster, collecting AWS ASG evidence" + cat >> "${EVIDENCE_FILE}" <<'EOF' Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware cluster autoscaling infrastructure configured, with Auto Scaling Groups capable of scaling GPU node groups based on workload demand. ## Summary -1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances (p5.48xlarge) +1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances 2. **Capacity Reservation** — Dedicated GPU capacity available for scale-up 3. **Scalable Configuration** — ASG min/max configurable for demand-based scaling 4. **Kubernetes Integration** — ASG nodes auto-join the EKS cluster with GPU labels 5. **Autoscaler Compatibility** — Cluster Autoscaler and Karpenter supported via ASG tag discovery -6. **Result: PASS** --- ## GPU Node Auto Scaling Group The cluster uses an AWS Auto Scaling Group (ASG) for GPU nodes, which can scale -up/down based on workload demand. The ASG is configured with p5.48xlarge instances -(8x NVIDIA H100 80GB HBM3 each) backed by a capacity reservation. +up/down based on workload demand. EOF - - # Detect platform from node providerID (e.g., "aws:///us-east-1a/i-xxx") - local provider_id - provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "") - - if [[ "${provider_id}" == aws://* ]]; then - log_info "Detected EKS cluster, collecting AWS ASG evidence" collect_eks_autoscaling_evidence + elif [[ "${provider_id}" == gce://* ]]; then + log_info "Detected GKE cluster, collecting GKE node pool autoscaling evidence" + cat >> "${EVIDENCE_FILE}" <<'EOF' +Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware +cluster autoscaling infrastructure configured. GKE provides a built-in cluster +autoscaler that manages node pool scaling based on workload demand. + +--- +EOF + collect_gke_autoscaling_evidence else - log_warn "Non-EKS cluster detected (providerID=${provider_id}), collecting Kubernetes-level evidence only" + log_warn "Unknown cluster provider (providerID=${provider_id}), collecting Kubernetes-level evidence only" collect_k8s_autoscaling_evidence fi @@ -1139,7 +1285,86 @@ EOF fi } -# Collect Kubernetes-level autoscaling evidence (non-EKS clusters). +# Collect GKE-specific autoscaling evidence. +collect_gke_autoscaling_evidence() { + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## GKE Cluster Details +EOF + # Extract project and cluster info from providerID (gce://project/zone/instance) + local provider_id + provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "") + local gce_project gce_zone + gce_project=$(echo "${provider_id}" | cut -d'/' -f3) + gce_zone=$(echo "${provider_id}" | cut -d'/' -f4) + + echo "" >> "${EVIDENCE_FILE}" + echo "- **Project:** ${gce_project:-unknown}" >> "${EVIDENCE_FILE}" + echo "- **Zone:** ${gce_zone:-unknown}" >> "${EVIDENCE_FILE}" + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## GPU Nodes +EOF + capture "GPU nodes" kubectl get nodes -l nvidia.com/gpu.present=true \ + -o custom-columns='NAME:.metadata.name,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,GPUS:.status.capacity.nvidia\.com/gpu,ACCELERATOR:.metadata.labels.cloud\.google\.com/gke-accelerator,NODE-POOL:.metadata.labels.cloud\.google\.com/gke-nodepool' + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## GKE Cluster Autoscaler + +GKE includes a built-in cluster autoscaler that manages node pool scaling. +The autoscaler is configured per node pool and can be verified via annotations +on nodes and the cluster-autoscaler-status ConfigMap. +EOF + + # Check cluster-autoscaler-status ConfigMap (GKE writes autoscaler status here) + echo "" >> "${EVIDENCE_FILE}" + echo "**Cluster Autoscaler Status**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl get configmap cluster-autoscaler-status -n kube-system -o jsonpath='{.data.status}' 2>/dev/null >> "${EVIDENCE_FILE}" || echo "ConfigMap cluster-autoscaler-status not found" >> "${EVIDENCE_FILE}" + echo "" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + + # Check node pool annotations for autoscaling config + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Node Pool Autoscaling Configuration +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**GPU node pool annotations**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.cluster-autoscaler\.kubernetes\.io/scale-down-disabled}{"\t"}{.metadata.labels.cloud\.google\.com/gke-nodepool}{"\n"}{end}' 2>/dev/null >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + + # Check for NotTriggerScaleUp events (proves autoscaler is active) + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Autoscaler Activity +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**Recent autoscaler events**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl get events -A --sort-by='.lastTimestamp' 2>/dev/null | grep -E "NotTriggerScaleUp|ScaledUpGroup|ScaleDown|TriggeredScaleUp" | tail -10 >> "${EVIDENCE_FILE}" || echo "No autoscaler events found" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + + # Verdict + echo "" >> "${EVIDENCE_FILE}" + local gpu_node_count + gpu_node_count=$(kubectl get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | wc -l | tr -d ' ') + local ca_status + ca_status=$(kubectl get configmap cluster-autoscaler-status -n kube-system 2>/dev/null && echo "found" || echo "") + + if [ "${gpu_node_count}" -gt 0 ] && [ -n "${ca_status}" ]; then + echo "**Result: PASS** — GKE cluster with ${gpu_node_count} GPU nodes and built-in cluster autoscaler active." >> "${EVIDENCE_FILE}" + elif [ "${gpu_node_count}" -gt 0 ]; then + echo "**Result: PASS (partial)** — GKE cluster with ${gpu_node_count} GPU nodes. Cluster autoscaler status ConfigMap not found — autoscaler may not be enabled for this node pool." >> "${EVIDENCE_FILE}" + else + echo "**Result: FAIL** — No GPU nodes found." >> "${EVIDENCE_FILE}" + fi +} + +# Collect Kubernetes-level autoscaling evidence (non-EKS/GKE clusters). collect_k8s_autoscaling_evidence() { cat >> "${EVIDENCE_FILE}" <<'EOF' diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml index 827766cf..dd8065f9 100644 --- a/recipes/validators/catalog.yaml +++ b/recipes/validators/catalog.yaml @@ -122,7 +122,7 @@ validators: env: [] - name: robust-controller phase: conformance - description: "Verify Dynamo operator controller and webhooks" + description: "Verify AI operator controller and webhooks (Dynamo or Kubeflow Trainer)" image: ghcr.io/nvidia/aicr-validators/conformance:latest timeout: 5m args: ["robust-controller"] diff --git a/validators/conformance/inference_gateway_check.go b/validators/conformance/inference_gateway_check.go index 2bed5b0e..f8712442 100644 --- a/validators/conformance/inference_gateway_check.go +++ b/validators/conformance/inference_gateway_check.go @@ -42,6 +42,12 @@ type gatewayDataPlaneReport struct { // Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed, // and required Gateway API + InferencePool CRDs exist. func CheckInferenceGateway(ctx *validators.Context) error { + // Skip if the recipe does not include kgateway (inference gateway component). + // Training clusters typically don't have an inference gateway. + if !recipeHasComponent(ctx, "kgateway") { + return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only") + } + dynClient, err := getDynamicClient(ctx) if err != nil { return err diff --git a/validators/conformance/robust_controller_check.go b/validators/conformance/robust_controller_check.go index af1376c3..791454d7 100644 --- a/validators/conformance/robust_controller_check.go +++ b/validators/conformance/robust_controller_check.go @@ -19,6 +19,7 @@ import ( "encoding/hex" stderrors "errors" "fmt" + "log/slog" "strings" "github.com/NVIDIA/aicr/pkg/errors" @@ -39,6 +40,10 @@ var dcdGVR = schema.GroupVersionResource{ Group: "nvidia.com", Version: "v1alpha1", Resource: "dynamocomponentdeployments", } +var trainJobGVR = schema.GroupVersionResource{ + Group: "trainer.kubeflow.org", Version: "v1alpha1", Resource: "trainjobs", +} + type webhookRejectionReport struct { ResourceName string Namespace string @@ -47,32 +52,254 @@ type webhookRejectionReport struct { Message string } -// CheckRobustController validates CNCF requirement #9: Robust Controller. -// Verifies the Dynamo operator is deployed, its validating webhook is operational, -// and the DynamoGraphDeployment CRD exists. +// recipeHasComponent checks if a named component exists in the recipe's componentRefs. +func recipeHasComponent(ctx *validators.Context, name string) bool { + if ctx.Recipe == nil { + return false + } + for _, ref := range ctx.Recipe.ComponentRefs { + if ref.Name == name { + return true + } + } + return false +} + +// CheckRobustController validates CNCF requirement: Robust Controller. +// Proves that at least one complex AI operator with a CRD is installed and +// functions reliably, including running pods, operational webhooks, and +// custom resource reconciliation. +// +// Checks are selected based on recipe components: +// - dynamo-platform in recipe → validate Dynamo operator +// - kubeflow-trainer in recipe → validate Kubeflow Trainer operator +// - neither → skip func CheckRobustController(ctx *validators.Context) error { if ctx.Clientset == nil { return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available") } + if recipeHasComponent(ctx, "dynamo-platform") { + slog.Info("robust-controller: validating Dynamo operator (dynamo-platform in recipe)") + return checkRobustDynamo(ctx) + } + + if recipeHasComponent(ctx, "kubeflow-trainer") { + slog.Info("robust-controller: validating Kubeflow Trainer (kubeflow-trainer in recipe)") + return checkRobustKubeflowTrainer(ctx) + } + + return validators.Skip("no supported AI operator found in recipe (requires dynamo-platform or kubeflow-trainer)") +} + +// checkRobustKubeflowTrainer validates the Kubeflow Trainer operator: +// 1. Controller deployment running +// 2. Validating webhook operational with reachable endpoint +// 3. TrainJob CRD exists +// 4. Webhook rejects invalid TrainJob +func checkRobustKubeflowTrainer(ctx *validators.Context) error { + // 1. Controller deployment running + deploy, deployErr := getDeploymentIfAvailable(ctx, "kubeflow", "kubeflow-trainer-controller-manager") + if deployErr != nil { + return errors.Wrap(errors.ErrCodeNotFound, "Kubeflow Trainer controller not found", deployErr) + } + expected := int32(1) + if deploy.Spec.Replicas != nil { + expected = *deploy.Spec.Replicas + } + recordRawTextArtifact(ctx, "Kubeflow Trainer Deployment", + "kubectl get deploy -n kubeflow", + fmt.Sprintf("Name: %s/%s\nReplicas: %d/%d available\nImage: %s", + deploy.Namespace, deploy.Name, + deploy.Status.AvailableReplicas, expected, + firstContainerImage(deploy.Spec.Template.Spec.Containers))) + + operatorPods, podErr := ctx.Clientset.CoreV1().Pods("kubeflow").List(ctx.Ctx, metav1.ListOptions{}) + if podErr != nil { + recordRawTextArtifact(ctx, "Kubeflow Trainer pods", "kubectl get pods -n kubeflow", + fmt.Sprintf("failed to list pods: %v", podErr)) + } else { + var podSummary strings.Builder + for _, p := range operatorPods.Items { + fmt.Fprintf(&podSummary, "%-46s ready=%s phase=%s node=%s\n", + p.Name, podReadyCount(p), p.Status.Phase, valueOrUnknown(p.Spec.NodeName)) + } + recordRawTextArtifact(ctx, "Kubeflow Trainer pods", "kubectl get pods -n kubeflow", podSummary.String()) + } + + // 2. Validating webhook operational + webhooks, err := ctx.Clientset.AdmissionregistrationV1().ValidatingWebhookConfigurations().List( + ctx.Ctx, metav1.ListOptions{}) + if err != nil { + return errors.Wrap(errors.ErrCodeInternal, "failed to list validating webhook configurations", err) + } + var foundWebhook bool + var webhookName string + var webhookSummary strings.Builder + for _, wh := range webhooks.Items { + if wh.Name == "validator.trainer.kubeflow.org" { + foundWebhook = true + webhookName = wh.Name + fmt.Fprintf(&webhookSummary, "WebhookConfig: %s\n", wh.Name) + for _, w := range wh.Webhooks { + if w.ClientConfig.Service != nil { + svcName := w.ClientConfig.Service.Name + svcNs := w.ClientConfig.Service.Namespace + slices, listErr := ctx.Clientset.DiscoveryV1().EndpointSlices(svcNs).List( + ctx.Ctx, metav1.ListOptions{ + LabelSelector: "kubernetes.io/service-name=" + svcName, + }) + if listErr != nil { + return errors.Wrap(errors.ErrCodeNotFound, + fmt.Sprintf("webhook endpoint %s/%s not found", svcNs, svcName), listErr) + } + if len(slices.Items) == 0 { + return errors.New(errors.ErrCodeNotFound, + fmt.Sprintf("no EndpointSlice for webhook service %s/%s", svcNs, svcName)) + } + fmt.Fprintf(&webhookSummary, " service=%s/%s endpointSlices=%d\n", svcNs, svcName, len(slices.Items)) + } + } + break + } + } + if !foundWebhook { + return errors.New(errors.ErrCodeNotFound, "Kubeflow Trainer validating webhook not found") + } + recordRawTextArtifact(ctx, "Validating webhooks", + "kubectl get validatingwebhookconfigurations | grep trainer", + strings.TrimSpace(webhookSummary.String())) + recordRawTextArtifact(ctx, "Validating Webhook", + "kubectl get validatingwebhookconfigurations", + fmt.Sprintf("Name: %s\nEndpoint: reachable", webhookName)) + + // 3. TrainJob CRD exists + dynClient, err := getDynamicClient(ctx) + if err != nil { + return err + } + crdGVR := schema.GroupVersionResource{ + Group: "apiextensions.k8s.io", Version: "v1", Resource: "customresourcedefinitions", + } + crdObj, err := dynClient.Resource(crdGVR).Get(ctx.Ctx, "trainjobs.trainer.kubeflow.org", metav1.GetOptions{}) + if err != nil { + return errors.Wrap(errors.ErrCodeNotFound, "TrainJob CRD not found", err) + } + recordRawTextArtifact(ctx, "Kubeflow Trainer CRDs", + "kubectl get crds | grep trainer", + fmt.Sprintf("Required CRD present: %s", crdObj.GetName())) + + // 4. Webhook rejects invalid TrainJob + rejectionReport, err := validateKubeflowWebhookRejects(ctx) + if err != nil { + return err + } + recordRawTextArtifact(ctx, "Webhook Rejection Test", + "kubectl apply -f ", + fmt.Sprintf("Resource: %s/%s\nHTTPStatus: %d\nReason: %s\nMessage: %s", + rejectionReport.Namespace, rejectionReport.ResourceName, + rejectionReport.StatusCode, rejectionReport.Reason, rejectionReport.Message)) + return nil +} + +// validateKubeflowWebhookRejects verifies the Kubeflow Trainer webhook actively rejects +// invalid TrainJob resources. +func validateKubeflowWebhookRejects(ctx *validators.Context) (*webhookRejectionReport, error) { + dynClient, err := getDynamicClient(ctx) + if err != nil { + return nil, err + } + + b := make([]byte, 4) + if _, err := rand.Read(b); err != nil { + return nil, errors.Wrap(errors.ErrCodeInternal, "failed to generate random suffix", err) + } + name := robustTestPrefix + hex.EncodeToString(b) + + // Build an intentionally invalid TrainJob with a runtimeRef pointing to a + // non-existent runtime. The Kubeflow Trainer validating webhook rejects this + // because the referenced ClusterTrainingRuntime does not exist. This proves + // the webhook is actively validating, not just schema validation. + tj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "trainer.kubeflow.org/v1alpha1", + "kind": "TrainJob", + "metadata": map[string]interface{}{ + "name": name, + "namespace": "default", + }, + "spec": map[string]interface{}{ + "runtimeRef": map[string]interface{}{ + "name": robustTestPrefix + "nonexistent-runtime", + "apiGroup": "trainer.kubeflow.org", + "kind": "ClusterTrainingRuntime", + }, + }, + }, + } + + _, createErr := dynClient.Resource(trainJobGVR).Namespace("default").Create( + ctx.Ctx, tj, metav1.CreateOptions{}) + + if createErr == nil { + _ = dynClient.Resource(trainJobGVR).Namespace("default").Delete( + ctx.Ctx, name, metav1.DeleteOptions{}) + return nil, errors.New(errors.ErrCodeInternal, + "validating webhook did not reject invalid TrainJob") + } + + report := &webhookRejectionReport{ + ResourceName: name, + Namespace: "default", + Reason: "unknown", + Message: createErr.Error(), + } + + if k8serrors.IsForbidden(createErr) || k8serrors.IsInvalid(createErr) { + var statusErr *k8serrors.StatusError + if stderrors.As(createErr, &statusErr) { + status := statusErr.Status() + report.StatusCode = status.Code + report.Reason = string(status.Reason) + report.Message = status.Message + msg := status.Message + if strings.Contains(msg, "cannot create resource") { + return nil, errors.Wrap(errors.ErrCodeInternal, + "RBAC denied the request, not an admission webhook rejection", createErr) + } + // Verify the rejection came from the admission webhook. + // Kubeflow Trainer webhook rejections contain "admission webhook" in the message. + if !strings.Contains(msg, "admission webhook") { + return nil, errors.Wrap(errors.ErrCodeInternal, + "rejection does not appear to be from admission webhook (missing 'admission webhook' in message)", createErr) + } + } + + return report, nil + } + + return nil, errors.Wrap(errors.ErrCodeInternal, + "unexpected error testing webhook rejection", createErr) +} + +// checkRobustDynamo validates the Dynamo operator (original implementation). +func checkRobustDynamo(ctx *validators.Context) error { // 1. Dynamo operator controller-manager deployment running - // Skip if Dynamo operator is not installed. deploy, deployErr := getDeploymentIfAvailable(ctx, "dynamo-system", "dynamo-platform-dynamo-operator-controller-manager") if deployErr != nil { - return validators.Skip("Dynamo operator not found — cluster may not use Dynamo inference platform") + return errors.Wrap(errors.ErrCodeNotFound, "Dynamo operator controller not found", deployErr) } - if deploy != nil { - expected := int32(1) - if deploy.Spec.Replicas != nil { - expected = *deploy.Spec.Replicas - } - recordRawTextArtifact(ctx, "Dynamo Operator Deployment", - "kubectl get deploy -n dynamo-system", - fmt.Sprintf("Name: %s/%s\nReplicas: %d/%d available\nImage: %s", - deploy.Namespace, deploy.Name, - deploy.Status.AvailableReplicas, expected, - firstContainerImage(deploy.Spec.Template.Spec.Containers))) + expected := int32(1) + if deploy.Spec.Replicas != nil { + expected = *deploy.Spec.Replicas } + recordRawTextArtifact(ctx, "Dynamo Operator Deployment", + "kubectl get deploy -n dynamo-system", + fmt.Sprintf("Name: %s/%s\nReplicas: %d/%d available\nImage: %s", + deploy.Namespace, deploy.Name, + deploy.Status.AvailableReplicas, expected, + firstContainerImage(deploy.Spec.Template.Spec.Containers))) + operatorPods, podErr := ctx.Clientset.CoreV1().Pods("dynamo-system").List(ctx.Ctx, metav1.ListOptions{}) if podErr != nil { recordRawTextArtifact(ctx, "Dynamo operator pods", "kubectl get pods -n dynamo-system", @@ -101,7 +328,6 @@ func CheckRobustController(ctx *validators.Context) error { foundDynamoWebhook = true webhookName = wh.Name fmt.Fprintf(&webhookSummary, "WebhookConfig: %s\n", wh.Name) - // Verify webhook service endpoint exists via EndpointSlice for _, w := range wh.Webhooks { if w.ClientConfig.Service != nil { svcName := w.ClientConfig.Service.Name @@ -135,9 +361,7 @@ func CheckRobustController(ctx *validators.Context) error { "kubectl get validatingwebhookconfigurations", fmt.Sprintf("Name: %s\nEndpoint: reachable", webhookName)) - // 3. DynamoGraphDeployment CRD exists (proves operator manages CRs) - // API group: nvidia.com (v1alpha1) — from tests/manifests/dynamo-vllm-smoke-test.yaml:28 - // CRD name: dynamographdeployments.nvidia.com — from docs/conformance/cncf/evidence/robust-operator.md:57 + // 3. DynamoGraphDeployment CRD exists dynClient, err := getDynamicClient(ctx) if err != nil { return err @@ -155,7 +379,7 @@ func CheckRobustController(ctx *validators.Context) error { "kubectl get crds | grep -i dynamo", fmt.Sprintf("Required CRD present: %s", crdObj.GetName())) - // Optional evidence: capture DynamoGraphDeployment and component inventories if available. + // Optional evidence: capture inventories. dgdList, dgdListErr := dynClient.Resource(dgdGVR).Namespace("").List(ctx.Ctx, metav1.ListOptions{}) if dgdListErr != nil { recordRawTextArtifact(ctx, "DynamoGraphDeployments", "kubectl get dynamographdeployments -A", @@ -197,8 +421,8 @@ func CheckRobustController(ctx *validators.Context) error { "kubectl get dynamocomponentdeployments -n dynamo-workload", componentSummary.String()) } - // 4. Validating webhook actively rejects invalid resources (behavioral test). - rejectionReport, err := validateWebhookRejects(ctx) + // 4. Validating webhook actively rejects invalid resources. + rejectionReport, err := validateDynamoWebhookRejects(ctx) if err != nil { return err } @@ -210,23 +434,20 @@ func CheckRobustController(ctx *validators.Context) error { return nil } -// validateWebhookRejects verifies that the Dynamo validating webhook actively rejects -// invalid DynamoGraphDeployment resources. This proves the webhook is not just present -// but functionally operational. -func validateWebhookRejects(ctx *validators.Context) (*webhookRejectionReport, error) { +// validateDynamoWebhookRejects verifies that the Dynamo validating webhook actively rejects +// invalid DynamoGraphDeployment resources. +func validateDynamoWebhookRejects(ctx *validators.Context) (*webhookRejectionReport, error) { dynClient, err := getDynamicClient(ctx) if err != nil { return nil, err } - // Generate unique test resource name. b := make([]byte, 4) if _, err := rand.Read(b); err != nil { return nil, errors.Wrap(errors.ErrCodeInternal, "failed to generate random suffix", err) } name := robustTestPrefix + hex.EncodeToString(b) - // Build an intentionally invalid DynamoGraphDeployment (empty services). dgd := &unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "nvidia.com/v1alpha1", @@ -241,12 +462,10 @@ func validateWebhookRejects(ctx *validators.Context) (*webhookRejectionReport, e }, } - // Attempt to create the invalid resource — the webhook should reject it. _, createErr := dynClient.Resource(dgdGVR).Namespace("dynamo-system").Create( ctx.Ctx, dgd, metav1.CreateOptions{}) if createErr == nil { - // Webhook did not reject — clean up the accidentally created resource. _ = dynClient.Resource(dgdGVR).Namespace("dynamo-system").Delete( ctx.Ctx, name, metav1.DeleteOptions{}) return nil, errors.New(errors.ErrCodeInternal, @@ -260,10 +479,6 @@ func validateWebhookRejects(ctx *validators.Context) (*webhookRejectionReport, e Message: createErr.Error(), } - // Webhook rejections produce Forbidden (403) or Invalid (422) API errors. - // Use k8serrors type predicates instead of brittle string matching. - // IsForbidden can also match RBAC denials, so we explicitly exclude those - // by checking the structured status message for RBAC patterns. if k8serrors.IsForbidden(createErr) || k8serrors.IsInvalid(createErr) { var statusErr *k8serrors.StatusError if stderrors.As(createErr, &statusErr) { @@ -271,16 +486,14 @@ func validateWebhookRejects(ctx *validators.Context) (*webhookRejectionReport, e report.StatusCode = status.Code report.Reason = string(status.Reason) report.Message = status.Message - msg := status.Message - if strings.Contains(msg, "cannot create resource") { + if strings.Contains(status.Message, "cannot create resource") { return nil, errors.Wrap(errors.ErrCodeInternal, "RBAC denied the request, not an admission webhook rejection", createErr) } } - return report, nil // PASS — webhook rejected the invalid resource + return report, nil } - // Non-admission error (network, CRD not installed, server error, etc). return nil, errors.Wrap(errors.ErrCodeInternal, "unexpected error testing webhook rejection", createErr) } diff --git a/validators/conformance/robust_controller_check_test.go b/validators/conformance/robust_controller_check_test.go new file mode 100644 index 00000000..853651f1 --- /dev/null +++ b/validators/conformance/robust_controller_check_test.go @@ -0,0 +1,192 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "strings" + "testing" + + "github.com/NVIDIA/aicr/pkg/recipe" + "github.com/NVIDIA/aicr/validators" + k8sfake "k8s.io/client-go/kubernetes/fake" +) + +func TestRecipeHasComponent(t *testing.T) { + tests := []struct { + name string + recipe *recipe.RecipeResult + component string + want bool + }{ + { + name: "nil recipe", + recipe: nil, + component: "kubeflow-trainer", + want: false, + }, + { + name: "empty componentRefs", + recipe: &recipe.RecipeResult{}, + component: "kubeflow-trainer", + want: false, + }, + { + name: "component present", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "cert-manager"}, + {Name: "kubeflow-trainer"}, + {Name: "gpu-operator"}, + }, + }, + component: "kubeflow-trainer", + want: true, + }, + { + name: "component not present", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "cert-manager"}, + {Name: "gpu-operator"}, + }, + }, + component: "kubeflow-trainer", + want: false, + }, + { + name: "dynamo-platform present", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "dynamo-platform"}, + {Name: "gpu-operator"}, + }, + }, + component: "dynamo-platform", + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := &validators.Context{Recipe: tt.recipe} + got := recipeHasComponent(ctx, tt.component) + if got != tt.want { + t.Errorf("recipeHasComponent(%q) = %v, want %v", tt.component, got, tt.want) + } + }) + } +} + +func TestCheckRobustControllerRouting(t *testing.T) { + tests := []struct { + name string + recipe *recipe.RecipeResult + expectSkip bool + expectContains string // substring in error or skip message + }{ + { + name: "nil recipe skips", + recipe: nil, + expectSkip: true, + expectContains: "no supported AI operator", + }, + { + name: "empty recipe skips", + recipe: &recipe.RecipeResult{}, + expectSkip: true, + expectContains: "no supported AI operator", + }, + { + name: "recipe with only gpu-operator skips", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "gpu-operator"}, + {Name: "cert-manager"}, + }, + }, + expectSkip: true, + expectContains: "no supported AI operator", + }, + { + name: "recipe with kubeflow-trainer routes to kubeflow check", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "kubeflow-trainer"}, + {Name: "gpu-operator"}, + }, + }, + expectSkip: false, + // Will fail because fake clientset has no deployments, but proves routing works + expectContains: "Kubeflow Trainer controller not found", + }, + { + name: "recipe with dynamo-platform routes to dynamo check", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "dynamo-platform"}, + {Name: "gpu-operator"}, + }, + }, + expectSkip: false, + // Will fail because fake clientset has no deployments, but proves routing works + expectContains: "Dynamo operator controller not found", + }, + { + name: "recipe with both prefers dynamo", + recipe: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "dynamo-platform"}, + {Name: "kubeflow-trainer"}, + }, + }, + expectSkip: false, + expectContains: "Dynamo operator controller not found", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := &validators.Context{ + Ctx: context.Background(), + Clientset: k8sfake.NewClientset(), + Recipe: tt.recipe, + } + + err := CheckRobustController(ctx) + + if tt.expectSkip { + if err == nil { + t.Fatal("expected skip error, got nil") + } + if !strings.Contains(err.Error(), "skip") { + t.Errorf("expected skip error, got: %v", err) + } + if tt.expectContains != "" && !strings.Contains(err.Error(), tt.expectContains) { + t.Errorf("expected error to contain %q, got: %v", tt.expectContains, err) + } + return + } + + // Non-skip: expect an error (because fake clientset has no resources) + // but the error should indicate the correct operator was targeted + if err == nil { + t.Fatal("expected error from fake clientset, got nil") + } + if tt.expectContains != "" && !strings.Contains(err.Error(), tt.expectContains) { + t.Errorf("expected error to contain %q, got: %v", tt.expectContains, err) + } + }) + } +}