Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
253 changes: 239 additions & 14 deletions pkg/evidence/scripts/collect-evidence.sh
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,15 @@ EOF
collect_gateway() {
EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md"
log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}"

# Skip if kgateway is not installed (training clusters don't have inference gateway)
if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then
write_section_header "Inference API Gateway (kgateway)"
echo "**Result: SKIP** — kgateway not installed. Inference gateway check applies to inference clusters only." >> "${EVIDENCE_FILE}"
log_info "Inference gateway evidence collection skipped — kgateway not installed."
return
fi

write_section_header "Inference API Gateway (kgateway)"

cat >> "${EVIDENCE_FILE}" <<'EOF'
Expand Down Expand Up @@ -718,6 +727,136 @@ EOF
# Route Robust AI Operator evidence collection to whichever supported
# operator is installed on the cluster (Dynamo first, then Kubeflow
# Trainer), or record a SKIP verdict when neither is present.
collect_operator() {
  EVIDENCE_FILE="${EVIDENCE_DIR}/robust-operator.md"
  log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}"

  # Probe for each supported operator's controller deployment; the grep -q .
  # treats any output row as "installed".
  local detected=""
  if kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null | grep -q .; then
    detected="dynamo"
  elif kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | grep -q .; then
    detected="kubeflow"
  fi

  case "${detected}" in
    dynamo)
      collect_operator_dynamo
      ;;
    kubeflow)
      collect_operator_kubeflow
      ;;
    *)
      write_section_header "Robust AI Operator"
      echo "**Result: SKIP** — No supported AI operator found (requires Dynamo or Kubeflow Trainer)." >> "${EVIDENCE_FILE}"
      log_info "Robust operator evidence collection skipped — no supported operator found."
      return
      ;;
  esac
}

# --- Kubeflow Trainer evidence ---
# Collect evidence that the Kubeflow Trainer operator is installed and
# functioning: controller deployment health, registered CRDs, validating
# webhook configuration, and a live webhook-rejection test. Appends markdown
# sections to ${EVIDENCE_FILE} (set by collect_operator) and finishes with a
# PASS/FAIL verdict line.
collect_operator_kubeflow() {
  write_section_header "Robust AI Operator (Kubeflow Trainer)"

  cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that at least one complex AI operator
with a CRD can be installed and functions reliably, including operator pods running,
webhooks operational, and custom resources reconciled.

## Summary

1. **Kubeflow Trainer** — Controller manager running in `kubeflow` namespace
2. **Custom Resource Definitions** — TrainJob, TrainingRuntime, ClusterTrainingRuntime CRDs registered
3. **Webhooks Operational** — Validating webhook `validator.trainer.kubeflow.org` configured and active
4. **Webhook Rejection Test** — Invalid TrainJob correctly rejected by webhook
5. **Result: PASS**

---

## Kubeflow Trainer Health
EOF
  capture "Kubeflow Trainer deployments" kubectl get deploy -n kubeflow
  capture "Kubeflow Trainer pods" kubectl get pods -n kubeflow -o wide

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Custom Resource Definitions
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Kubeflow Trainer CRDs**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  # `|| true`: grep exits non-zero when no CRD matches; without it this
  # pipeline would abort the whole script under `set -e`/`pipefail`.
  kubectl get crds 2>/dev/null | grep -E "trainer\.kubeflow\.org" >> "${EVIDENCE_FILE}" 2>&1 || true
  echo '```' >> "${EVIDENCE_FILE}"

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhooks
EOF
  capture "Validating webhooks" kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Webhook endpoint verification**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  kubectl get endpoints -n kubeflow 2>/dev/null | head -10 >> "${EVIDENCE_FILE}" 2>&1 || true
  echo '```' >> "${EVIDENCE_FILE}"

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## ClusterTrainingRuntimes
EOF
  capture "ClusterTrainingRuntimes" kubectl get clustertrainingruntimes

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhook Rejection Test

Submit an invalid TrainJob (referencing a non-existent runtime) to verify the
validating webhook actively rejects malformed resources.
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Invalid TrainJob rejection**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  local webhook_result
  # `|| true` keeps the expected apply failure from tripping `set -e`.
  webhook_result=$(kubectl apply -f - 2>&1 <<INVALID_CR || true
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: webhook-test-invalid
  namespace: default
spec:
  runtimeRef:
    name: nonexistent-runtime
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
INVALID_CR
)
  echo "${webhook_result}" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"

  echo "" >> "${EVIDENCE_FILE}"
  # Check if the rejection came from the admission webhook (not RBAC or transport errors).
  # Webhook rejections contain "admission webhook" or "denied the request".
  if echo "${webhook_result}" | grep -qi "admission webhook\|denied the request"; then
    echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}"
  elif echo "${webhook_result}" | grep -qi "cannot create resource\|unauthorized"; then
    echo "WARNING: Rejection was from RBAC, not the admission webhook." >> "${EVIDENCE_FILE}"
  elif echo "${webhook_result}" | grep -qi "denied\|forbidden\|invalid"; then
    echo "Webhook rejected the invalid resource (unconfirmed source)." >> "${EVIDENCE_FILE}"
  else
    echo "WARNING: Webhook did not reject the invalid resource." >> "${EVIDENCE_FILE}"
  fi
  # Always attempt cleanup: whenever the rejection was not confirmed to come
  # from the webhook, the invalid resource may have been admitted.
  # --ignore-not-found makes this a no-op when it was rejected.
  kubectl delete trainjob webhook-test-invalid -n default --ignore-not-found 2>/dev/null || true

  # Verdict
  echo "" >> "${EVIDENCE_FILE}"
  local crd_count
  crd_count=$(kubectl get crds 2>/dev/null | grep -c "trainer\.kubeflow\.org" || true)
  local controller_ready
  # Count the deployment as ready when READY shows all desired replicas up
  # (e.g. "1/1" or "2/2") — not only the hard-coded single-replica case.
  controller_ready=$(kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null \
    | awk '{split($2, r, "/"); if (r[1] + 0 > 0 && r[1] == r[2]) n++} END {print n + 0}')
  local webhook_ok
  # Only count confirmed webhook rejections (not RBAC or transport errors)
  webhook_ok=$(echo "${webhook_result}" | grep -ci "admission webhook\|denied the request" || true)

  if [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ] && [ "${webhook_ok}" -gt 0 ]; then
    echo "**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
  elif [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ]; then
    echo "**Result: PASS** — Kubeflow Trainer running, ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
  else
    echo "**Result: FAIL** — Kubeflow Trainer controller not ready or CRDs missing." >> "${EVIDENCE_FILE}"
  fi

  log_info "Robust operator (Kubeflow Trainer) evidence collection complete."
}

# --- Dynamo evidence ---
collect_operator_dynamo() {
write_section_header "Robust AI Operator (Dynamo Platform)"

cat >> "${EVIDENCE_FILE}" <<'EOF'
Expand Down Expand Up @@ -976,38 +1115,45 @@ collect_cluster_autoscaling() {
log_info "Collecting Cluster Autoscaling evidence → ${EVIDENCE_FILE}"
write_section_header "Cluster Autoscaling"

cat >> "${EVIDENCE_FILE}" <<'EOF'
# Detect platform from node providerID
local provider_id
provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "")

if [[ "${provider_id}" == aws://* ]]; then
log_info "Detected EKS cluster, collecting AWS ASG evidence"
cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware
cluster autoscaling infrastructure configured, with Auto Scaling Groups capable
of scaling GPU node groups based on workload demand.

## Summary

1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances (p5.48xlarge)
1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances
2. **Capacity Reservation** — Dedicated GPU capacity available for scale-up
3. **Scalable Configuration** — ASG min/max configurable for demand-based scaling
4. **Kubernetes Integration** — ASG nodes auto-join the EKS cluster with GPU labels
5. **Autoscaler Compatibility** — Cluster Autoscaler and Karpenter supported via ASG tag discovery
6. **Result: PASS**

---

## GPU Node Auto Scaling Group

The cluster uses an AWS Auto Scaling Group (ASG) for GPU nodes, which can scale
up/down based on workload demand. The ASG is configured with p5.48xlarge instances
(8x NVIDIA H100 80GB HBM3 each) backed by a capacity reservation.
up/down based on workload demand.
EOF

# Detect platform from node providerID (e.g., "aws:///us-east-1a/i-xxx")
local provider_id
provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "")

if [[ "${provider_id}" == aws://* ]]; then
log_info "Detected EKS cluster, collecting AWS ASG evidence"
collect_eks_autoscaling_evidence
elif [[ "${provider_id}" == gce://* ]]; then
log_info "Detected GKE cluster, collecting GKE node pool autoscaling evidence"
cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware
cluster autoscaling infrastructure configured. GKE provides a built-in cluster
autoscaler that manages node pool scaling based on workload demand.

---
EOF
collect_gke_autoscaling_evidence
else
log_warn "Non-EKS cluster detected (providerID=${provider_id}), collecting Kubernetes-level evidence only"
log_warn "Unknown cluster provider (providerID=${provider_id}), collecting Kubernetes-level evidence only"
collect_k8s_autoscaling_evidence
fi

Expand Down Expand Up @@ -1139,7 +1285,86 @@ EOF
fi
}

# Collect Kubernetes-level autoscaling evidence (non-EKS clusters).
# Collect GKE-specific autoscaling evidence.
# Appends GKE cluster details, GPU node inventory, built-in cluster
# autoscaler status, node-pool autoscaling annotations, and recent autoscaler
# events to ${EVIDENCE_FILE}, ending with a PASS / PASS (partial) / FAIL
# verdict. Assumes GPU nodes carry the nvidia.com/gpu.present=true label.
collect_gke_autoscaling_evidence() {
  cat >> "${EVIDENCE_FILE}" <<'EOF'

## GKE Cluster Details
EOF
  # Extract project and zone from the first node's providerID,
  # which GKE formats as gce://<project>/<zone>/<instance>.
  local provider_id
  provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "")
  local gce_project gce_zone
  gce_project=$(echo "${provider_id}" | cut -d'/' -f3)
  gce_zone=$(echo "${provider_id}" | cut -d'/' -f4)

  echo "" >> "${EVIDENCE_FILE}"
  echo "- **Project:** ${gce_project:-unknown}" >> "${EVIDENCE_FILE}"
  echo "- **Zone:** ${gce_zone:-unknown}" >> "${EVIDENCE_FILE}"

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## GPU Nodes
EOF
  capture "GPU nodes" kubectl get nodes -l nvidia.com/gpu.present=true \
    -o custom-columns='NAME:.metadata.name,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,GPUS:.status.capacity.nvidia\.com/gpu,ACCELERATOR:.metadata.labels.cloud\.google\.com/gke-accelerator,NODE-POOL:.metadata.labels.cloud\.google\.com/gke-nodepool'

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## GKE Cluster Autoscaler

GKE includes a built-in cluster autoscaler that manages node pool scaling.
The autoscaler is configured per node pool and can be verified via annotations
on nodes and the cluster-autoscaler-status ConfigMap.
EOF

  # Check cluster-autoscaler-status ConfigMap (GKE writes autoscaler status here)
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Cluster Autoscaler Status**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  kubectl get configmap cluster-autoscaler-status -n kube-system -o jsonpath='{.data.status}' 2>/dev/null >> "${EVIDENCE_FILE}" || echo "ConfigMap cluster-autoscaler-status not found" >> "${EVIDENCE_FILE}"
  echo "" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"

  # Check node pool annotations for autoscaling config
  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Node Pool Autoscaling Configuration
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**GPU node pool annotations**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  # `|| true`: a kubectl failure here must not abort evidence collection
  # under `set -e`.
  kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.cluster-autoscaler\.kubernetes\.io/scale-down-disabled}{"\t"}{.metadata.labels.cloud\.google\.com/gke-nodepool}{"\n"}{end}' 2>/dev/null >> "${EVIDENCE_FILE}" || true
  echo '```' >> "${EVIDENCE_FILE}"

  # Check for NotTriggerScaleUp events (proves autoscaler is active)
  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Autoscaler Activity
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Recent autoscaler events**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  # Capture the filtered events first: `tail` exits 0 even when grep matched
  # nothing, so an `||` fallback on the pipeline would never fire on empty
  # output — test the captured string instead.
  local autoscaler_events
  autoscaler_events=$(kubectl get events -A --sort-by='.lastTimestamp' 2>/dev/null | grep -E "NotTriggerScaleUp|ScaledUpGroup|ScaleDown|TriggeredScaleUp" | tail -10 || true)
  if [ -n "${autoscaler_events}" ]; then
    echo "${autoscaler_events}" >> "${EVIDENCE_FILE}"
  else
    echo "No autoscaler events found" >> "${EVIDENCE_FILE}"
  fi
  echo '```' >> "${EVIDENCE_FILE}"

  # Verdict
  echo "" >> "${EVIDENCE_FILE}"
  local gpu_node_count
  gpu_node_count=$(kubectl get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | wc -l | tr -d ' ')
  local ca_status
  # `-o name` yields just "configmap/cluster-autoscaler-status" (or nothing),
  # instead of capturing a whole table plus a sentinel word.
  ca_status=$(kubectl get configmap cluster-autoscaler-status -n kube-system -o name 2>/dev/null || true)

  if [ "${gpu_node_count}" -gt 0 ] && [ -n "${ca_status}" ]; then
    echo "**Result: PASS** — GKE cluster with ${gpu_node_count} GPU nodes and built-in cluster autoscaler active." >> "${EVIDENCE_FILE}"
  elif [ "${gpu_node_count}" -gt 0 ]; then
    echo "**Result: PASS (partial)** — GKE cluster with ${gpu_node_count} GPU nodes. Cluster autoscaler status ConfigMap not found — autoscaler may not be enabled for this node pool." >> "${EVIDENCE_FILE}"
  else
    echo "**Result: FAIL** — No GPU nodes found." >> "${EVIDENCE_FILE}"
  fi
}

# Collect Kubernetes-level autoscaling evidence (non-EKS/GKE clusters).
collect_k8s_autoscaling_evidence() {
cat >> "${EVIDENCE_FILE}" <<'EOF'

Expand Down
2 changes: 1 addition & 1 deletion recipes/validators/catalog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ validators:
env: []
- name: robust-controller
phase: conformance
description: "Verify Dynamo operator controller and webhooks"
description: "Verify AI operator controller and webhooks (Dynamo or Kubeflow Trainer)"
image: ghcr.io/nvidia/aicr-validators/conformance:latest
timeout: 5m
args: ["robust-controller"]
Expand Down
6 changes: 6 additions & 0 deletions validators/conformance/inference_gateway_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ type gatewayDataPlaneReport struct {
// Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed,
// and required Gateway API + InferencePool CRDs exist.
func CheckInferenceGateway(ctx *validators.Context) error {
// Skip if the recipe does not include kgateway (inference gateway component).
// Training clusters typically don't have an inference gateway.
if !recipeHasComponent(ctx, "kgateway") {
return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only")
}

dynClient, err := getDynamicClient(ctx)
if err != nil {
return err
Expand Down
Loading
Loading