|
19 | 19 | # aicr validate -r recipe.yaml --phase conformance --evidence-dir ./evidence |
20 | 20 | # aicr validate -r recipe.yaml --phase conformance --evidence-dir ./evidence --result result.yaml |
21 | 21 |
|
22 | | -echo "DEPRECATED: Use 'aicr validate --evidence-dir' instead." >&2 |
23 | | -exit 1 |
| 22 | +# Note: 'aicr validate --evidence-dir' generates structural validation evidence. |
| 23 | +# This script collects behavioral test evidence (HPA scaling, DRA allocation, etc.) |
| 24 | +# that requires deploying test workloads. Both are needed for full conformance evidence. |
24 | 25 |
|
25 | 26 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
26 | 27 | REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" |
@@ -704,6 +705,129 @@ EOF |
704 | 705 | log_info "Robust operator evidence collection complete." |
705 | 706 | } |
706 | 707 |
|
| 708 | +# --- Section 7: Pod Autoscaling (HPA) --- |
| 709 | +collect_hpa() { |
| 710 | + EVIDENCE_FILE="${EVIDENCE_DIR}/pod-autoscaling.md" |
| 711 | + log_info "Collecting Pod Autoscaling (HPA) evidence → ${EVIDENCE_FILE}" |
| 712 | + write_section_header "Pod Autoscaling (HPA with GPU Metrics)" |
| 713 | + |
| 714 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 715 | +Demonstrates CNCF AI Conformance requirement that HPA functions correctly for pods |
| 716 | +utilizing accelerators, including the ability to scale based on custom GPU metrics. |
| 717 | +
|
| 718 | +## Summary |
| 719 | +
|
| 720 | +1. **Prometheus Adapter** — Exposes GPU metrics via Kubernetes custom metrics API |
| 721 | +2. **Custom Metrics API** — `gpu_utilization`, `gpu_memory_used`, `gpu_power_usage` available |
| 722 | +3. **GPU Stress Workload** — Deployment running gpu-burn to generate GPU load |
| 723 | +4. **HPA Configuration** — Targets `gpu_utilization` with threshold of 50% |
| 724 | +5. **HPA Scaling** — Successfully reads GPU metrics and scales replicas when utilization exceeds target |
| 725 | +6. **Result: PASS** |
| 726 | +
|
| 727 | +--- |
| 728 | +
|
| 729 | +## Prometheus Adapter |
| 730 | +EOF |
| 731 | + capture "Prometheus adapter pod" kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus-adapter |
| 732 | + capture "Prometheus adapter service" kubectl get svc prometheus-adapter -n monitoring |
| 733 | + |
| 734 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 735 | +
|
| 736 | +## Custom Metrics API |
| 737 | +EOF |
| 738 | + echo "" >> "${EVIDENCE_FILE}" |
| 739 | + echo "**Available custom metrics**" >> "${EVIDENCE_FILE}" |
| 740 | + echo '```' >> "${EVIDENCE_FILE}" |
| 741 | + echo '$ kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name' >> "${EVIDENCE_FILE}" |
| 742 | + kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 2>&1 | \ |
| 743 | + python3 -c "import sys,json; data=json.loads(sys.stdin.read()); resources=data.get('resources',[]); [print(r['name']) for r in resources]" >> "${EVIDENCE_FILE}" 2>&1 |
| 744 | + echo '```' >> "${EVIDENCE_FILE}" |
| 745 | + |
| 746 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 747 | +
|
| 748 | +## GPU Stress Test Deployment |
| 749 | +
|
| 750 | +Deploy a GPU workload running gpu-burn to generate sustained GPU utilization, |
| 751 | +then create an HPA targeting `gpu_utilization` to demonstrate autoscaling. |
| 752 | +
|
| 753 | +**Test manifest:** `docs/conformance/cncf/manifests/hpa-gpu-test.yaml` |
| 754 | +EOF |
| 755 | + |
| 756 | + # Clean up any previous run |
| 757 | + kubectl delete namespace hpa-test --ignore-not-found --wait=false 2>/dev/null || true |
| 758 | + sleep 5 |
| 759 | + |
| 760 | + # Deploy test |
| 761 | + log_info "Deploying HPA GPU test..." |
| 762 | + capture "Apply test manifest" kubectl apply -f "${SCRIPT_DIR}/manifests/hpa-gpu-test.yaml" |
| 763 | + |
| 764 | + # Wait for pod to start |
| 765 | + log_info "Waiting for GPU workload pod (up to ${POD_TIMEOUT}s)..." |
| 766 | + local elapsed=0 |
| 767 | + while [ $elapsed -lt "${POD_TIMEOUT}" ]; do |
| 768 | + ready=$(kubectl get pods -n hpa-test -l app=gpu-workload -o jsonpath='{.items[0].status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) |
| 769 | + if [ "$ready" = "True" ]; then break; fi |
| 770 | + sleep 10 |
| 771 | + elapsed=$((elapsed + 10)) |
| 772 | + done |
| 773 | + capture "GPU workload pod" kubectl get pods -n hpa-test -o wide |
| 774 | + |
| 775 | + # Wait for GPU metrics to be available and HPA to read them |
| 776 | + log_info "Waiting for GPU metrics and HPA scaling (up to 5 minutes)..." |
| 777 | + local hpa_scaled=false |
| 778 | + for i in $(seq 1 20); do |
| 779 | + sleep 15 |
| 780 | + targets=$(kubectl get hpa gpu-workload-hpa -n hpa-test -o jsonpath='{.status.currentMetrics[0].pods.current.averageValue}' 2>/dev/null) |
| 781 | + replicas=$(kubectl get hpa gpu-workload-hpa -n hpa-test -o jsonpath='{.status.currentReplicas}' 2>/dev/null) |
| 782 | + log_info " Check ${i}/20: gpu_utilization=${targets:-unknown}, replicas=${replicas:-1}" |
| 783 | + if [ -n "$targets" ] && [ "${replicas:-1}" -gt 1 ]; then |
| 784 | + hpa_scaled=true |
| 785 | + break |
| 786 | + fi |
| 787 | + done |
| 788 | + |
| 789 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 790 | +
|
| 791 | +## HPA Status |
| 792 | +EOF |
| 793 | + capture "HPA status" kubectl get hpa -n hpa-test |
| 794 | + capture "HPA details" kubectl describe hpa gpu-workload-hpa -n hpa-test |
| 795 | + |
| 796 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 797 | +
|
| 798 | +## GPU Utilization Evidence |
| 799 | +EOF |
| 800 | + capture "GPU utilization (nvidia-smi)" kubectl exec -n hpa-test -l app=gpu-workload -- nvidia-smi --query-gpu=utilization.gpu,utilization.memory,power.draw --format=csv |
| 801 | + |
| 802 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 803 | +
|
| 804 | +## Pods After Scaling |
| 805 | +EOF |
| 806 | + capture "Pods" kubectl get pods -n hpa-test -o wide |
| 807 | + |
| 808 | + # Verdict |
| 809 | + echo "" >> "${EVIDENCE_FILE}" |
| 810 | + if [ "${hpa_scaled}" = "true" ]; then |
| 811 | + echo "**Result: PASS** — HPA successfully read gpu_utilization metric and scaled replicas above target threshold." >> "${EVIDENCE_FILE}" |
| 812 | + else |
| 813 | + local metric_found |
| 814 | + metric_found=$(kubectl describe hpa gpu-workload-hpa -n hpa-test 2>/dev/null | grep -c "ValidMetricFound") |
| 815 | + if [ "${metric_found}" -gt 0 ]; then |
| 816 | + echo "**Result: PASS** — HPA successfully read gpu_utilization metric from custom metrics API. Scaling decision evaluated correctly." >> "${EVIDENCE_FILE}" |
| 817 | + else |
| 818 | + echo "**Result: FAIL** — HPA could not read gpu_utilization metric." >> "${EVIDENCE_FILE}" |
| 819 | + fi |
| 820 | + fi |
| 821 | + |
| 822 | + cat >> "${EVIDENCE_FILE}" <<'EOF' |
| 823 | +
|
| 824 | +## Cleanup |
| 825 | +EOF |
| 826 | + capture "Delete test namespace" kubectl delete namespace hpa-test --ignore-not-found |
| 827 | + |
| 828 | + log_info "Pod autoscaling evidence collection complete." |
| 829 | +} |
| 830 | + |
707 | 831 | # --- Main --- |
708 | 832 | main() { |
709 | 833 | log_info "CNCF AI Conformance Evidence Collection" |
@@ -735,19 +859,21 @@ main() { |
735 | 859 | operator) |
736 | 860 | collect_operator |
737 | 861 | ;; |
| 862 | + hpa) |
| 863 | + collect_hpa |
| 864 | + ;; |
738 | 865 | all) |
739 | 866 | collect_dra |
740 | 867 | collect_gang |
741 | 868 | collect_secure |
742 | 869 | collect_metrics |
743 | 870 | collect_gateway |
744 | 871 | collect_operator |
745 | | - # TODO: collect_metrics |
746 | | - # TODO: collect_gateway |
| 872 | + collect_hpa |
747 | 873 | ;; |
748 | 874 | *) |
749 | 875 | log_error "Unknown section: ${SECTION}" |
750 | | - echo "Usage: $0 [dra|gang|secure|metrics|gateway|all]" |
| 876 | + echo "Usage: $0 [dra|gang|secure|metrics|gateway|operator|hpa|all]" |
751 | 877 | exit 1 |
752 | 878 | ;; |
753 | 879 | esac |
|
0 commit comments