Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
253 changes: 239 additions & 14 deletions pkg/evidence/scripts/collect-evidence.sh
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,15 @@ EOF
collect_gateway() {
EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md"
log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}"

# Skip if kgateway is not installed (training clusters don't have inference gateway)
if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then
write_section_header "Inference API Gateway (kgateway)"
echo "**Result: SKIP** — kgateway not installed. Inference gateway check applies to inference clusters only." >> "${EVIDENCE_FILE}"
log_info "Inference gateway evidence collection skipped — kgateway not installed."
return
fi

write_section_header "Inference API Gateway (kgateway)"

cat >> "${EVIDENCE_FILE}" <<'EOF'
Expand Down Expand Up @@ -718,6 +727,136 @@ EOF
# Route Robust AI Operator evidence collection to whichever supported
# operator is installed on the cluster (Dynamo first, then Kubeflow
# Trainer), or record a SKIP verdict when neither is present.
collect_operator() {
  EVIDENCE_FILE="${EVIDENCE_DIR}/robust-operator.md"
  log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}"

  # Probe for each supported operator's controller deployment; the grep -q .
  # treats any output row as "installed".
  local detected=""
  if kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null | grep -q .; then
    detected="dynamo"
  elif kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | grep -q .; then
    detected="kubeflow"
  fi

  case "${detected}" in
    dynamo)
      collect_operator_dynamo
      ;;
    kubeflow)
      collect_operator_kubeflow
      ;;
    *)
      write_section_header "Robust AI Operator"
      echo "**Result: SKIP** — No supported AI operator found (requires Dynamo or Kubeflow Trainer)." >> "${EVIDENCE_FILE}"
      log_info "Robust operator evidence collection skipped — no supported operator found."
      return
      ;;
  esac
}

# --- Kubeflow Trainer evidence ---
# Collect evidence that the Kubeflow Trainer operator is installed and
# functioning: controller deployment health, registered CRDs, validating
# webhook configuration, and a live webhook-rejection test. Appends markdown
# sections to ${EVIDENCE_FILE} (set by collect_operator) and finishes with a
# PASS/FAIL verdict line.
collect_operator_kubeflow() {
  write_section_header "Robust AI Operator (Kubeflow Trainer)"

  cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that at least one complex AI operator
with a CRD can be installed and functions reliably, including operator pods running,
webhooks operational, and custom resources reconciled.

## Summary

1. **Kubeflow Trainer** — Controller manager running in `kubeflow` namespace
2. **Custom Resource Definitions** — TrainJob, TrainingRuntime, ClusterTrainingRuntime CRDs registered
3. **Webhooks Operational** — Validating webhook `validator.trainer.kubeflow.org` configured and active
4. **Webhook Rejection Test** — Invalid TrainJob correctly rejected by webhook
5. **Result: PASS**

---

## Kubeflow Trainer Health
EOF
  capture "Kubeflow Trainer deployments" kubectl get deploy -n kubeflow
  capture "Kubeflow Trainer pods" kubectl get pods -n kubeflow -o wide

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Custom Resource Definitions
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Kubeflow Trainer CRDs**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  # `|| true`: grep exits non-zero when no CRD matches; without it this
  # pipeline would abort the whole script under `set -e`/`pipefail`.
  kubectl get crds 2>/dev/null | grep -E "trainer\.kubeflow\.org" >> "${EVIDENCE_FILE}" 2>&1 || true
  echo '```' >> "${EVIDENCE_FILE}"

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhooks
EOF
  capture "Validating webhooks" kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Webhook endpoint verification**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  kubectl get endpoints -n kubeflow 2>/dev/null | head -10 >> "${EVIDENCE_FILE}" 2>&1 || true
  echo '```' >> "${EVIDENCE_FILE}"

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## ClusterTrainingRuntimes
EOF
  capture "ClusterTrainingRuntimes" kubectl get clustertrainingruntimes

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Webhook Rejection Test

Submit an invalid TrainJob (referencing a non-existent runtime) to verify the
validating webhook actively rejects malformed resources.
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Invalid TrainJob rejection**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  local webhook_result
  # `|| true` keeps the expected apply failure from tripping `set -e`.
  webhook_result=$(kubectl apply -f - 2>&1 <<INVALID_CR || true
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: webhook-test-invalid
  namespace: default
spec:
  runtimeRef:
    name: nonexistent-runtime
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
INVALID_CR
)
  echo "${webhook_result}" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"

  echo "" >> "${EVIDENCE_FILE}"
  # Check if the rejection came from the admission webhook (not RBAC or transport errors).
  # Webhook rejections contain "admission webhook" or "denied the request".
  if echo "${webhook_result}" | grep -qi "admission webhook\|denied the request"; then
    echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}"
  elif echo "${webhook_result}" | grep -qi "cannot create resource\|unauthorized"; then
    echo "WARNING: Rejection was from RBAC, not the admission webhook." >> "${EVIDENCE_FILE}"
  elif echo "${webhook_result}" | grep -qi "denied\|forbidden\|invalid"; then
    echo "Webhook rejected the invalid resource (unconfirmed source)." >> "${EVIDENCE_FILE}"
  else
    echo "WARNING: Webhook did not reject the invalid resource." >> "${EVIDENCE_FILE}"
  fi
  # Always attempt cleanup: whenever the rejection was not confirmed to come
  # from the webhook, the invalid resource may have been admitted.
  # --ignore-not-found makes this a no-op when it was rejected.
  kubectl delete trainjob webhook-test-invalid -n default --ignore-not-found 2>/dev/null || true

  # Verdict
  echo "" >> "${EVIDENCE_FILE}"
  local crd_count
  crd_count=$(kubectl get crds 2>/dev/null | grep -c "trainer\.kubeflow\.org" || true)
  local controller_ready
  # Count the deployment as ready when READY shows all desired replicas up
  # (e.g. "1/1" or "2/2") — not only the hard-coded single-replica case.
  controller_ready=$(kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null \
    | awk '{split($2, r, "/"); if (r[1] + 0 > 0 && r[1] == r[2]) n++} END {print n + 0}')
  local webhook_ok
  # Only count confirmed webhook rejections (not RBAC or transport errors)
  webhook_ok=$(echo "${webhook_result}" | grep -ci "admission webhook\|denied the request" || true)

  if [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ] && [ "${webhook_ok}" -gt 0 ]; then
    echo "**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
  elif [ "${crd_count}" -gt 0 ] && [ "${controller_ready}" -gt 0 ]; then
    echo "**Result: PASS** — Kubeflow Trainer running, ${crd_count} CRDs registered." >> "${EVIDENCE_FILE}"
  else
    echo "**Result: FAIL** — Kubeflow Trainer controller not ready or CRDs missing." >> "${EVIDENCE_FILE}"
  fi

  log_info "Robust operator (Kubeflow Trainer) evidence collection complete."
}

# --- Dynamo evidence ---
collect_operator_dynamo() {
write_section_header "Robust AI Operator (Dynamo Platform)"

cat >> "${EVIDENCE_FILE}" <<'EOF'
Expand Down Expand Up @@ -976,38 +1115,45 @@ collect_cluster_autoscaling() {
log_info "Collecting Cluster Autoscaling evidence → ${EVIDENCE_FILE}"
write_section_header "Cluster Autoscaling"

cat >> "${EVIDENCE_FILE}" <<'EOF'
# Detect platform from node providerID
local provider_id
provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "")

if [[ "${provider_id}" == aws://* ]]; then
log_info "Detected EKS cluster, collecting AWS ASG evidence"
cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware
cluster autoscaling infrastructure configured, with Auto Scaling Groups capable
of scaling GPU node groups based on workload demand.

## Summary

1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances (p5.48xlarge)
1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances
2. **Capacity Reservation** — Dedicated GPU capacity available for scale-up
3. **Scalable Configuration** — ASG min/max configurable for demand-based scaling
4. **Kubernetes Integration** — ASG nodes auto-join the EKS cluster with GPU labels
5. **Autoscaler Compatibility** — Cluster Autoscaler and Karpenter supported via ASG tag discovery
6. **Result: PASS**

---

## GPU Node Auto Scaling Group

The cluster uses an AWS Auto Scaling Group (ASG) for GPU nodes, which can scale
up/down based on workload demand. The ASG is configured with p5.48xlarge instances
(8x NVIDIA H100 80GB HBM3 each) backed by a capacity reservation.
up/down based on workload demand.
EOF

# Detect platform from node providerID (e.g., "aws:///us-east-1a/i-xxx")
local provider_id
provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "")

if [[ "${provider_id}" == aws://* ]]; then
log_info "Detected EKS cluster, collecting AWS ASG evidence"
collect_eks_autoscaling_evidence
elif [[ "${provider_id}" == gce://* ]]; then
log_info "Detected GKE cluster, collecting GKE node pool autoscaling evidence"
cat >> "${EVIDENCE_FILE}" <<'EOF'
Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware
cluster autoscaling infrastructure configured. GKE provides a built-in cluster
autoscaler that manages node pool scaling based on workload demand.

---
EOF
collect_gke_autoscaling_evidence
else
log_warn "Non-EKS cluster detected (providerID=${provider_id}), collecting Kubernetes-level evidence only"
log_warn "Unknown cluster provider (providerID=${provider_id}), collecting Kubernetes-level evidence only"
collect_k8s_autoscaling_evidence
fi

Expand Down Expand Up @@ -1139,7 +1285,86 @@ EOF
fi
}

# Collect Kubernetes-level autoscaling evidence (non-EKS clusters).
# Collect GKE-specific autoscaling evidence.
# Appends GKE cluster details, GPU node inventory, built-in cluster
# autoscaler status, node-pool autoscaling annotations, and recent autoscaler
# events to ${EVIDENCE_FILE}, ending with a PASS / PASS (partial) / FAIL
# verdict. Assumes GPU nodes carry the nvidia.com/gpu.present=true label.
collect_gke_autoscaling_evidence() {
  cat >> "${EVIDENCE_FILE}" <<'EOF'

## GKE Cluster Details
EOF
  # Extract project and zone from the first node's providerID,
  # which GKE formats as gce://<project>/<zone>/<instance>.
  local provider_id
  provider_id=$(kubectl get nodes -o jsonpath='{.items[0].spec.providerID}' 2>/dev/null || echo "")
  local gce_project gce_zone
  gce_project=$(echo "${provider_id}" | cut -d'/' -f3)
  gce_zone=$(echo "${provider_id}" | cut -d'/' -f4)

  echo "" >> "${EVIDENCE_FILE}"
  echo "- **Project:** ${gce_project:-unknown}" >> "${EVIDENCE_FILE}"
  echo "- **Zone:** ${gce_zone:-unknown}" >> "${EVIDENCE_FILE}"

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## GPU Nodes
EOF
  capture "GPU nodes" kubectl get nodes -l nvidia.com/gpu.present=true \
    -o custom-columns='NAME:.metadata.name,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,GPUS:.status.capacity.nvidia\.com/gpu,ACCELERATOR:.metadata.labels.cloud\.google\.com/gke-accelerator,NODE-POOL:.metadata.labels.cloud\.google\.com/gke-nodepool'

  cat >> "${EVIDENCE_FILE}" <<'EOF'

## GKE Cluster Autoscaler

GKE includes a built-in cluster autoscaler that manages node pool scaling.
The autoscaler is configured per node pool and can be verified via annotations
on nodes and the cluster-autoscaler-status ConfigMap.
EOF

  # Check cluster-autoscaler-status ConfigMap (GKE writes autoscaler status here)
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Cluster Autoscaler Status**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  kubectl get configmap cluster-autoscaler-status -n kube-system -o jsonpath='{.data.status}' 2>/dev/null >> "${EVIDENCE_FILE}" || echo "ConfigMap cluster-autoscaler-status not found" >> "${EVIDENCE_FILE}"
  echo "" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"

  # Check node pool annotations for autoscaling config
  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Node Pool Autoscaling Configuration
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**GPU node pool annotations**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  # `|| true`: a kubectl failure here must not abort evidence collection
  # under `set -e`.
  kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.cluster-autoscaler\.kubernetes\.io/scale-down-disabled}{"\t"}{.metadata.labels.cloud\.google\.com/gke-nodepool}{"\n"}{end}' 2>/dev/null >> "${EVIDENCE_FILE}" || true
  echo '```' >> "${EVIDENCE_FILE}"

  # Check for NotTriggerScaleUp events (proves autoscaler is active)
  cat >> "${EVIDENCE_FILE}" <<'EOF'

## Autoscaler Activity
EOF
  echo "" >> "${EVIDENCE_FILE}"
  echo "**Recent autoscaler events**" >> "${EVIDENCE_FILE}"
  echo '```' >> "${EVIDENCE_FILE}"
  # Capture the filtered events first: `tail` exits 0 even when grep matched
  # nothing, so an `||` fallback on the pipeline would never fire on empty
  # output — test the captured string instead.
  local autoscaler_events
  autoscaler_events=$(kubectl get events -A --sort-by='.lastTimestamp' 2>/dev/null | grep -E "NotTriggerScaleUp|ScaledUpGroup|ScaleDown|TriggeredScaleUp" | tail -10 || true)
  if [ -n "${autoscaler_events}" ]; then
    echo "${autoscaler_events}" >> "${EVIDENCE_FILE}"
  else
    echo "No autoscaler events found" >> "${EVIDENCE_FILE}"
  fi
  echo '```' >> "${EVIDENCE_FILE}"

  # Verdict
  echo "" >> "${EVIDENCE_FILE}"
  local gpu_node_count
  gpu_node_count=$(kubectl get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | wc -l | tr -d ' ')
  local ca_status
  # `-o name` yields just "configmap/cluster-autoscaler-status" (or nothing),
  # instead of capturing a whole table plus a sentinel word.
  ca_status=$(kubectl get configmap cluster-autoscaler-status -n kube-system -o name 2>/dev/null || true)

  if [ "${gpu_node_count}" -gt 0 ] && [ -n "${ca_status}" ]; then
    echo "**Result: PASS** — GKE cluster with ${gpu_node_count} GPU nodes and built-in cluster autoscaler active." >> "${EVIDENCE_FILE}"
  elif [ "${gpu_node_count}" -gt 0 ]; then
    echo "**Result: PASS (partial)** — GKE cluster with ${gpu_node_count} GPU nodes. Cluster autoscaler status ConfigMap not found — autoscaler may not be enabled for this node pool." >> "${EVIDENCE_FILE}"
  else
    echo "**Result: FAIL** — No GPU nodes found." >> "${EVIDENCE_FILE}"
  fi
}

# Collect Kubernetes-level autoscaling evidence (non-EKS/GKE clusters).
collect_k8s_autoscaling_evidence() {
cat >> "${EVIDENCE_FILE}" <<'EOF'

Expand Down
2 changes: 1 addition & 1 deletion recipes/validators/catalog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ validators:
env: []
- name: robust-controller
phase: conformance
description: "Verify Dynamo operator controller and webhooks"
description: "Verify AI operator controller and webhooks (Dynamo or Kubeflow Trainer)"
image: ghcr.io/nvidia/aicr-validators/conformance:latest
timeout: 5m
args: ["robust-controller"]
Expand Down
6 changes: 6 additions & 0 deletions validators/conformance/inference_gateway_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ type gatewayDataPlaneReport struct {
// Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed,
// and required Gateway API + InferencePool CRDs exist.
func CheckInferenceGateway(ctx *validators.Context) error {
// Skip if the recipe does not include kgateway (inference gateway component).
// Training clusters typically don't have an inference gateway.
if !recipeHasComponent(ctx, "kgateway") {
return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only")
}

dynClient, err := getDynamicClient(ctx)
if err != nil {
return err
Expand Down
Loading
Loading