|
623 | 623 | collect_gateway() { |
624 | 624 | EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md" |
625 | 625 | log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}" |
| 626 | + |
| 627 | + # Skip if kgateway is not installed (training clusters don't have inference gateway) |
| 628 | + if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then |
| 629 | + write_section_header "Inference API Gateway (kgateway)" |
| 630 | + echo "**Result: SKIP** — kgateway not installed. Inference gateway check applies to inference clusters only." >> "${EVIDENCE_FILE}" |
| 631 | + log_info "Inference gateway evidence collection skipped — kgateway not installed." |
| 632 | + return |
| 633 | + fi |
| 634 | + |
626 | 635 | write_section_header "Inference API Gateway (kgateway)" |
627 | 636 |
|
628 | 637 | cat >> "${EVIDENCE_FILE}" <<'EOF' |
@@ -718,6 +727,129 @@ EOF |
# Collect "Robust AI Operator" evidence by delegating to the collector that
# matches the operator installed on the cluster.
#
# Sets the global EVIDENCE_FILE to ${EVIDENCE_DIR}/robust-operator.md, then:
#   - Dynamo operator deployment found   -> collect_operator_dynamo
#   - Kubeflow Trainer deployment found  -> collect_operator_kubeflow
#   - neither                            -> append a SKIP result and return
#
# Dynamo is probed first, so it wins when both operators are installed; the
# Kubeflow probe is only issued when the Dynamo probe comes back empty.
collect_operator() {
    EVIDENCE_FILE="${EVIDENCE_DIR}/robust-operator.md"
    log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}"

    # A deployment counts as present when kubectl prints anything for it;
    # lookup errors are silenced and simply look like "not installed".
    if [ -n "$(kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null)" ]; then
        collect_operator_dynamo
        return
    fi

    if [ -n "$(kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null)" ]; then
        collect_operator_kubeflow
        return
    fi

    # No supported operator: record the skip in the evidence file and the log.
    write_section_header "Robust AI Operator"
    echo "**Result: SKIP** — No supported AI operator found (requires Dynamo or Kubeflow Trainer)." >> "${EVIDENCE_FILE}"
    log_info "Robust operator evidence collection skipped — no supported operator found."
}
| 743 | + |
| 744 | +# --- Kubeflow Trainer evidence --- |
# Collect "Robust AI Operator" evidence for Kubeflow Trainer.
#
# Appends to ${EVIDENCE_FILE} (set by collect_operator before it calls this
# function): a static description, controller deployment/pod listings, the
# registered trainer.kubeflow.org CRDs, the validating webhook configuration,
# the ClusterTrainingRuntimes, the outcome of submitting a deliberately
# invalid TrainJob, and a final PASS/FAIL verdict line.
#
# Globals:      EVIDENCE_FILE (read; the file is appended to)
# Helpers used: write_section_header, capture, log_info (defined elsewhere;
#               presumably they also write to EVIDENCE_FILE - confirm)
# Side effects: tries to create TrainJob "webhook-test-invalid" in namespace
#               "default" and deletes it again if the rejection could not be
#               confirmed.
collect_operator_kubeflow() {
    write_section_header "Robust AI Operator (Kubeflow Trainer)"

    cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that at least one complex AI operator
with a CRD can be installed and functions reliably, including operator pods running,
webhooks operational, and custom resources reconciled.

## Summary

1. **Kubeflow Trainer** — Controller manager running in `kubeflow` namespace
2. **Custom Resource Definitions** — TrainJob, TrainingRuntime, ClusterTrainingRuntime CRDs registered
3. **Webhooks Operational** — Validating webhook `validator.trainer.kubeflow.org` configured and active
4. **Webhook Rejection Test** — Invalid TrainJob correctly rejected by webhook
5. **Result: PASS**

---

## Kubeflow Trainer Health
EOF
    capture "Kubeflow Trainer deployments" kubectl get deploy -n kubeflow
    capture "Kubeflow Trainer pods" kubectl get pods -n kubeflow -o wide

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## Custom Resource Definitions
EOF
    echo "" >> "${EVIDENCE_FILE}"
    echo "**Kubeflow Trainer CRDs**" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"
    # Best-effort listing: grep exits 1 when nothing matches, which must not
    # abort the run if errexit/pipefail are enabled.
    kubectl get crds 2>/dev/null | grep -E "trainer\.kubeflow\.org" >> "${EVIDENCE_FILE}" 2>&1 || true
    echo '```' >> "${EVIDENCE_FILE}"

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhooks
EOF
    capture "Validating webhooks" kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org
    echo "" >> "${EVIDENCE_FILE}"
    echo "**Webhook endpoint verification**" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"
    kubectl get endpoints -n kubeflow 2>/dev/null | head -10 >> "${EVIDENCE_FILE}" 2>&1 || true
    echo '```' >> "${EVIDENCE_FILE}"

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## ClusterTrainingRuntimes
EOF
    capture "ClusterTrainingRuntimes" kubectl get clustertrainingruntimes

    cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhook Rejection Test

Submit an invalid TrainJob (referencing a non-existent runtime) to verify the
validating webhook actively rejects malformed resources.
EOF
    echo "" >> "${EVIDENCE_FILE}"
    echo "**Invalid TrainJob rejection**" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"
    local webhook_result
    local apply_status=0
    # Keep kubectl's exit status: it is the only reliable signal that the
    # object was NOT created.
    webhook_result=$(kubectl apply -f - 2>&1 <<'INVALID_CR'
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: webhook-test-invalid
  namespace: default
spec:
  runtimeRef:
    name: nonexistent-runtime
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
INVALID_CR
) || apply_status=$?
    echo "${webhook_result}" >> "${EVIDENCE_FILE}"
    echo '```' >> "${EVIDENCE_FILE}"

    # The rejection only counts when kubectl apply actually failed. Matching
    # the output text alone is not enough: the object is named
    # "webhook-test-invalid", so a successful "... created" message would
    # itself match the "invalid" keyword and be misreported as a rejection.
    # NOTE(review): any failed apply whose output contains "error" still
    # counts as a rejection (e.g. an unreachable API server) - confirm this
    # is acceptable.
    local webhook_ok=0
    if [ "${apply_status}" -ne 0 ] \
        && echo "${webhook_result}" | grep -Eqi "denied|forbidden|invalid|error|not found"; then
        webhook_ok=1
    fi

    echo "" >> "${EVIDENCE_FILE}"
    if [ "${webhook_ok}" -gt 0 ]; then
        echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}"
    else
        echo "WARNING: Webhook did not reject the invalid resource." >> "${EVIDENCE_FILE}"
        # Clean up if accidentally created; failure to delete is not fatal.
        kubectl delete trainjob webhook-test-invalid -n default --ignore-not-found 2>/dev/null || true
    fi

    # Verdict
    echo "" >> "${EVIDENCE_FILE}"
    local crd_count
    crd_count=$(kubectl get crds 2>/dev/null | grep -c "trainer\.kubeflow\.org" || true)
    crd_count=${crd_count:-0}
    local controller_ready
    # Column 2 of `kubectl get deploy --no-headers` is READY ("ready/desired").
    # Compare the two numbers instead of substring-matching "1/1", which would
    # accept "1/10" or "11/11" and reject a healthy "2/2". Prints 1 or 0.
    controller_ready=$(kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null \
        | awk '{ split($2, r, "/"); if (r[1] + 0 > 0 && r[1] + 0 == r[2] + 0) ok = 1 } END { print ok + 0 }' || true)
    controller_ready=${controller_ready:-0}

    if [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ] && [ "${webhook_ok}" -gt 0 ]; then
        echo "**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
    elif [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ]; then
        echo "**Result: PASS** — Kubeflow Trainer running, ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
    else
        echo "**Result: FAIL** — Kubeflow Trainer controller not ready or CRDs missing." >> "${EVIDENCE_FILE}"
    fi

    log_info "Robust operator (Kubeflow Trainer) evidence collection complete."
}
| 850 | + |
| 851 | +# --- Dynamo evidence --- |
| 852 | +collect_operator_dynamo() { |
721 | 853 | write_section_header "Robust AI Operator (Dynamo Platform)" |
722 | 854 |
|
723 | 855 | cat >> "${EVIDENCE_FILE}" <<'EOF' |
|
0 commit comments