Skip to content

Commit d147adf

Browse files
committed
feat(validator): add Kubeflow Trainer support to robust-controller check
The robust-controller conformance check previously validated only the Dynamo operator, so it was skipped on all training clusters. This commit adds Kubeflow Trainer as an alternative target, selected based on which component is present in the recipe:

- dynamo-platform in recipe → validate Dynamo operator
- kubeflow-trainer in recipe → validate Kubeflow Trainer
- neither → skip

Kubeflow Trainer validation checks:

1. Controller deployment running (kubeflow-trainer-controller-manager)
2. Validating webhook operational with reachable endpoint
3. TrainJob CRD exists (trainjobs.trainer.kubeflow.org)
4. Webhook rejects invalid TrainJob (behavioral test)

Refactored the original Dynamo validation into checkRobustDynamo() and renamed validateWebhookRejects to validateDynamoWebhookRejects for clarity.
1 parent eab65d9 commit d147adf

File tree

5 files changed

+585
-40
lines changed

5 files changed

+585
-40
lines changed

pkg/evidence/scripts/collect-evidence.sh

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,15 @@ EOF
623623
collect_gateway() {
624624
EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md"
625625
log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}"
626+
627+
# Skip if kgateway is not installed (training clusters don't have inference gateway)
628+
if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then
629+
write_section_header "Inference API Gateway (kgateway)"
630+
echo "**Result: SKIP** — kgateway not installed. Inference gateway check applies to inference clusters only." >> "${EVIDENCE_FILE}"
631+
log_info "Inference gateway evidence collection skipped — kgateway not installed."
632+
return
633+
fi
634+
626635
write_section_header "Inference API Gateway (kgateway)"
627636

628637
cat >> "${EVIDENCE_FILE}" <<'EOF'
@@ -718,6 +727,129 @@ EOF
718727
#######################################
# Entry point for "Robust AI Operator" evidence collection.
#
# Points EVIDENCE_FILE at robust-operator.md, then hands off to the collector
# for the first supported operator deployment that kubectl can list. The Dynamo
# operator is probed before Kubeflow Trainer. When neither deployment is
# listed, a SKIP result is written to the evidence file instead.
#
# Globals:   EVIDENCE_DIR (read), EVIDENCE_FILE (written)
# Arguments: none
#######################################
collect_operator() {
    EVIDENCE_FILE="${EVIDENCE_DIR}/robust-operator.md"
    log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}"

    # A non-empty listing means the deployment exists; kubectl errors are
    # silenced so a missing namespace simply falls through to the next probe.
    if kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null \
        | grep -q .; then
        collect_operator_dynamo
        return
    fi

    if kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null \
        | grep -q .; then
        collect_operator_kubeflow
        return
    fi

    # Neither operator is installed: record the skip and stop.
    write_section_header "Robust AI Operator"
    echo "**Result: SKIP** — No supported AI operator found (requires Dynamo or Kubeflow Trainer)." >> "${EVIDENCE_FILE}"
    log_info "Robust operator evidence collection skipped — no supported operator found."
}
743+
744+
# --- Kubeflow Trainer evidence ---
745+
#######################################
# Collect "Robust AI Operator" evidence for Kubeflow Trainer.
#
# Appends to ${EVIDENCE_FILE}: controller deployments and pods, the trainer
# CRDs, the validating webhook configuration and namespace endpoints, the
# ClusterTrainingRuntimes, the outcome of submitting a deliberately invalid
# TrainJob, and a final PASS/FAIL verdict line.
#
# Globals:   EVIDENCE_FILE (read; appended to — set by the caller)
# Arguments: none
# Requires:  kubectl, write_section_header, capture, log_info
#######################################
collect_operator_kubeflow() {
    write_section_header "Robust AI Operator (Kubeflow Trainer)"

    # NOTE(review): this summary is static text written before any check runs,
    # so it states PASS even when the verdict at the end of the file is FAIL.
    cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that at least one complex AI operator
with a CRD can be installed and functions reliably, including operator pods running,
webhooks operational, and custom resources reconciled.

## Summary

1. **Kubeflow Trainer** — Controller manager running in `kubeflow` namespace
2. **Custom Resource Definitions** — TrainJob, TrainingRuntime, ClusterTrainingRuntime CRDs registered
3. **Webhooks Operational** — Validating webhook `validator.trainer.kubeflow.org` configured and active
4. **Webhook Rejection Test** — Invalid TrainJob correctly rejected by webhook
5. **Result: PASS**

---

## Kubeflow Trainer Health
EOF
    capture "Kubeflow Trainer deployments" kubectl get deploy -n kubeflow
    capture "Kubeflow Trainer pods" kubectl get pods -n kubeflow -o wide

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## Custom Resource Definitions
EOF
    echo "" >> "${EVIDENCE_FILE}"
    echo "**Kubeflow Trainer CRDs**" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"
    kubectl get crds 2>/dev/null | grep -E "trainer\.kubeflow\.org" >> "${EVIDENCE_FILE}" 2>&1
    echo '```' >> "${EVIDENCE_FILE}"

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhooks
EOF
    capture "Validating webhooks" kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org
    echo "" >> "${EVIDENCE_FILE}"
    echo "**Webhook endpoint verification**" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"
    kubectl get endpoints -n kubeflow 2>/dev/null | head -10 >> "${EVIDENCE_FILE}" 2>&1
    echo '```' >> "${EVIDENCE_FILE}"

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## ClusterTrainingRuntimes
EOF
    capture "ClusterTrainingRuntimes" kubectl get clustertrainingruntimes

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhook Rejection Test

Submit an invalid TrainJob (referencing a non-existent runtime) to verify the
validating webhook actively rejects malformed resources.
EOF
    echo "" >> "${EVIDENCE_FILE}"
    echo "**Invalid TrainJob rejection**" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"
    local webhook_result
    local apply_status=0
    # Keep kubectl's exit status: a successful apply means the invalid object
    # was admitted, whatever words happen to appear in the output.
    webhook_result=$(kubectl apply -f - 2>&1 <<'INVALID_CR'
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: webhook-test-invalid
  namespace: default
spec:
  runtimeRef:
    name: nonexistent-runtime
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
INVALID_CR
    ) || apply_status=$?
    printf '%s\n' "${webhook_result}" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"

    # Count a rejection only when the apply failed AND the API server reported
    # an admission-webhook denial. Matching loose words such as "invalid" or
    # "error" is wrong twice over: the resource name "webhook-test-invalid"
    # appears in the "created" success message, and RBAC, missing-CRD or
    # connectivity failures are not evidence that the webhook works.
    local webhook_rejected=0
    if [ "${apply_status}" -ne 0 ] \
        && grep -qi 'admission webhook.*denied the request' <<<"${webhook_result}"; then
        webhook_rejected=1
    fi

    echo "" >> "${EVIDENCE_FILE}"
    if [ "${webhook_rejected}" -eq 1 ]; then
        echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}"
    else
        echo "WARNING: Webhook did not reject the invalid resource." >> "${EVIDENCE_FILE}"
        # Clean up if accidentally created; best-effort, must not abort collection.
        kubectl delete trainjob webhook-test-invalid -n default --ignore-not-found 2>/dev/null || true
    fi

    # Verdict
    echo "" >> "${EVIDENCE_FILE}"
    local crd_count
    crd_count=$(kubectl get crds 2>/dev/null | grep -c "trainer\.kubeflow\.org" || true)
    crd_count=${crd_count:-0}

    # Parse the READY column ("ready/desired") numerically so that multi-replica
    # deployments (e.g. 2/2) count as ready and partial rollouts (e.g. 1/10) do not.
    local controller_ready
    controller_ready=$(
        kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null \
            | awk '{
                n = split($2, r, "/")
                if (n == 2 && r[2] + 0 > 0 && r[1] + 0 == r[2] + 0) ok = 1
              }
              END { print ok + 0 }'
    ) || true
    controller_ready=${controller_ready:-0}

    if [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ] && [ "${webhook_rejected}" -gt 0 ]; then
        echo "**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
    elif [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ]; then
        # NOTE(review): kept from the original — a missing webhook rejection
        # does not fail the verdict; confirm that this is intended.
        echo "**Result: PASS** — Kubeflow Trainer running, ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
    else
        echo "**Result: FAIL** — Kubeflow Trainer controller not ready or CRDs missing." >> "${EVIDENCE_FILE}"
    fi

    log_info "Robust operator (Kubeflow Trainer) evidence collection complete."
}
850+
851+
# --- Dynamo evidence ---
852+
collect_operator_dynamo() {
721853
write_section_header "Robust AI Operator (Dynamo Platform)"
722854

723855
cat >> "${EVIDENCE_FILE}" <<'EOF'

recipes/validators/catalog.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ validators:
122122
env: []
123123
- name: robust-controller
124124
phase: conformance
125-
description: "Verify Dynamo operator controller and webhooks"
125+
description: "Verify AI operator controller and webhooks (Dynamo or Kubeflow Trainer)"
126126
image: ghcr.io/nvidia/aicr-validators/conformance:latest
127127
timeout: 5m
128128
args: ["robust-controller"]

validators/conformance/inference_gateway_check.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ type gatewayDataPlaneReport struct {
4242
// Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed,
4343
// and required Gateway API + InferencePool CRDs exist.
4444
func CheckInferenceGateway(ctx *validators.Context) error {
45+
// Skip if the recipe does not include kgateway (inference gateway component).
46+
// Training clusters typically don't have an inference gateway.
47+
if !recipeHasComponent(ctx, "kgateway") {
48+
return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only")
49+
}
50+
4551
dynClient, err := getDynamicClient(ctx)
4652
if err != nil {
4753
return err

0 commit comments

Comments
 (0)