NVIDIA
diff --git a/pkg/evidence/scripts/collect-evidence.sh b/pkg/evidence/scripts/collect-evidence.sh
Lines changed: 132 additions & 0 deletions
diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/validators/conformance/inference_gateway_check.go b/validators/conformance/inference_gateway_check.go
Lines changed: 6 additions & 0 deletions
@@ -623,6 +623,15 @@ EOF
 collect_gateway() {
     EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md"
     log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}"
+
+    # Skip if kgateway is not installed (training clusters don't have inference gateway)
+    if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then
+        write_section_header "Inference API Gateway (kgateway)"
+        echo "**Result: SKIP** — kgateway not installed. Inference gateway check applies to inference clusters only." >> "${EVIDENCE_FILE}"
+        log_info "Inference gateway evidence collection skipped — kgateway not installed."
+        return
+    fi
+
     write_section_header "Inference API Gateway (kgateway)"
 
     cat >> "${EVIDENCE_FILE}" <<'EOF'
@@ -718,6 +727,129 @@ EOF
 collect_operator() {
     EVIDENCE_FILE="${EVIDENCE_DIR}/robust-operator.md"
     log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}"
+
+    # Dispatch to the collector for whichever supported operator deployment
+    # exists. Dynamo is probed first, so it wins when both are installed.
+    if kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null | grep -q .; then
+        collect_operator_dynamo
+        return
+    fi
+
+    if kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | grep -q .; then
+        collect_operator_kubeflow
+        return
+    fi
+
+    # Neither deployment was found: record a SKIP result in the evidence file.
+    write_section_header "Robust AI Operator"
+    echo "**Result: SKIP** — No supported AI operator found (requires Dynamo or Kubeflow Trainer)." >> "${EVIDENCE_FILE}"
+    log_info "Robust operator evidence collection skipped — no supported operator found."
+}
+
+# --- Kubeflow Trainer evidence ---
+collect_operator_kubeflow() {
+    write_section_header "Robust AI Operator (Kubeflow Trainer)"
+
+    cat >> "${EVIDENCE_FILE}" <<'EOF'
+Demonstrates CNCF AI Conformance requirement that at least one complex AI operator
+with a CRD can be installed and functions reliably, including operator pods running,
+webhooks operational, and custom resources reconciled.
+
+## Summary
+
+1. **Kubeflow Trainer** — Controller manager running in `kubeflow` namespace
+2. **Custom Resource Definitions** — TrainJob, TrainingRuntime, ClusterTrainingRuntime CRDs registered
+3. **Webhooks Operational** — Validating webhook `validator.trainer.kubeflow.org` configured and active
+4. **Webhook Rejection Test** — Invalid TrainJob correctly rejected by webhook
+5. **Result: PASS**
+
+---
+
+## Kubeflow Trainer Health
+EOF
+    capture "Kubeflow Trainer deployments" kubectl get deploy -n kubeflow
+    capture "Kubeflow Trainer pods" kubectl get pods -n kubeflow -o wide
+
+    cat >> "${EVIDENCE_FILE}" <<'EOF'
+
+## Custom Resource Definitions
+EOF
+    echo "" >> "${EVIDENCE_FILE}"
+    echo "**Kubeflow Trainer CRDs**" >> "${EVIDENCE_FILE}"
+    echo '```' >> "${EVIDENCE_FILE}"
+    kubectl get crds 2>/dev/null | grep -E "trainer\.kubeflow\.org" >> "${EVIDENCE_FILE}" 2>&1
+    echo '```' >> "${EVIDENCE_FILE}"
+
+    cat >> "${EVIDENCE_FILE}" <<'EOF'
+
+## Webhooks
+EOF
+    capture "Validating webhooks" kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org
+    echo "" >> "${EVIDENCE_FILE}"
+    echo "**Webhook endpoint verification**" >> "${EVIDENCE_FILE}"
+    echo '```' >> "${EVIDENCE_FILE}"
+    kubectl get endpoints -n kubeflow 2>/dev/null | head -10 >> "${EVIDENCE_FILE}" 2>&1
+    echo '```' >> "${EVIDENCE_FILE}"
+
+    cat >> "${EVIDENCE_FILE}" <<'EOF'
+
+## ClusterTrainingRuntimes
+EOF
+    capture "ClusterTrainingRuntimes" kubectl get clustertrainingruntimes
+
+    cat >> "${EVIDENCE_FILE}" <<'EOF'
+
+## Webhook Rejection Test
+
+Submit an invalid TrainJob (referencing a non-existent runtime) to verify the
+validating webhook actively rejects malformed resources.
+EOF
+    echo "" >> "${EVIDENCE_FILE}"
+    echo "**Invalid TrainJob rejection**" >> "${EVIDENCE_FILE}"
+    echo '```' >> "${EVIDENCE_FILE}"
+    local webhook_result
+    webhook_result=$(kubectl apply -f - 2>&1 <<INVALID_CR || true
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainJob
+metadata:
+  name: webhook-test-invalid
+  namespace: default
+spec:
+  runtimeRef:
+    name: nonexistent-runtime
+    apiGroup: trainer.kubeflow.org
+    kind: ClusterTrainingRuntime
+INVALID_CR
+)
+    echo "${webhook_result}" >> "${EVIDENCE_FILE}"
+    echo '```' >> "${EVIDENCE_FILE}"
+
+    echo "" >> "${EVIDENCE_FILE}"
+    if echo "${webhook_result}" | grep -qi "denied\|forbidden\|invalid\|error\|not found"; then
+        echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}"
+    else
+        echo "WARNING: Webhook did not reject the invalid resource." >> "${EVIDENCE_FILE}"
+        # Clean up if accidentally created
+        kubectl delete trainjob webhook-test-invalid -n default --ignore-not-found 2>/dev/null
+    fi
+
+    # Verdict
+    echo "" >> "${EVIDENCE_FILE}"
+    local crd_count
+    crd_count=$(kubectl get crds 2>/dev/null | grep -c "trainer\.kubeflow\.org" || true)
+    local controller_ready
+    controller_ready=$(kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | awk '{print $2}' | grep -c "1/1" || true)
+    local webhook_ok
+    webhook_ok=$(echo "${webhook_result}" | grep -ci "denied\|forbidden\|invalid\|error\|not found" || true)
+
+    if [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ] && [ "${webhook_ok}" -gt 0 ]; then
+        echo "**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
+    elif [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ]; then
+        echo "**Result: PASS** — Kubeflow Trainer running, ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
+    else
+        echo "**Result: FAIL** — Kubeflow Trainer controller not ready or CRDs missing." >> "${EVIDENCE_FILE}"
+    fi
+
+    log_info "Robust operator (Kubeflow Trainer) evidence collection complete."
+}
+
+# --- Dynamo evidence ---
+collect_operator_dynamo() {
     write_section_header "Robust AI Operator (Dynamo Platform)"
 
     cat >> "${EVIDENCE_FILE}" <<'EOF'
 
@@ -122,7 +122,7 @@ validators:
     env: []
   - name: robust-controller
     phase: conformance
-    description: "Verify Dynamo operator controller and webhooks"
+    description: "Verify AI operator controller and webhooks (Dynamo or Kubeflow Trainer)"
     image: ghcr.io/nvidia/aicr-validators/conformance:latest
     timeout: 5m
     args: ["robust-controller"]
 
@@ -42,6 +42,12 @@ type gatewayDataPlaneReport struct {
 // Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed,
 // and required Gateway API + InferencePool CRDs exist.
 func CheckInferenceGateway(ctx *validators.Context) error {
+	// Skip if the recipe does not include kgateway (inference gateway component).
+	// Training clusters typically don't have an inference gateway.
+	if !recipeHasComponent(ctx, "kgateway") {
+		return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only")
+	}
+
 	dynClient, err := getDynamicClient(ctx)
 	if err != nil {
 		return err