diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index 36bf00ee4..d539a9548 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -22,7 +22,7 @@ inputs: artifact_name_prefix: description: 'Prefix for the uploaded artifact name' required: false - default: 'gpu-smoke-test-debug' + default: 'gpu-test-debug' runs: using: 'composite' @@ -39,7 +39,7 @@ runs: kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" describe pod/gpu-smoke-test > /tmp/debug-artifacts/gpu-pod-describe.txt || true + kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true - name: Export kind logs if: failure() diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index ac47fbf2c..2677eb303 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -16,7 +16,7 @@ name: GPU Inference Test (nvkind + H100) on: schedule: - - cron: '0 6,18 * * *' # Every 12 hours (2x daily) + - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test push: branches: - "pull-request/[0-9]+" @@ -284,3 +284,4 @@ jobs: uses: ./.github/actions/gpu-test-cleanup with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + artifact_name_prefix: gpu-inference-test-debug diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 74b5af20c..ed132196b 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -16,7 +16,7 @@ name: GPU Training Test (nvkind + H100 x2) on: schedule: - - cron: '0 6,18 * * *' # Every 12 hours (2x daily) + - cron: '30 6,18 * * *' # Every 12 hours (2x daily), offset from inference test push: branches: - "pull-request/[0-9]+" @@ -71,6 +71,47 @@ jobs: accelerator: h100 intent: training + # --- Snapshot and validation --- + + - name: Run eidos snapshot + run: | + ./eidos snapshot --deploy-agent \ + --kubeconfig="${HOME}/.kube/config" \ + --namespace=default \ + --image=ko.local:smoke-test \ + --require-gpu \ + --output=snapshot.yaml + echo "--- Snapshot output ---" + cat snapshot.yaml + + - name: Validate snapshot detected GPU + run: | + GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu.model"]' snapshot.yaml) + GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu-count"]' snapshot.yaml) + echo "GPU model: ${GPU_MODEL}" + echo "GPU count: ${GPU_COUNT}" + if [[ "${GPU_MODEL}" != *"H100"* ]]; then + echo "::error::Expected H100 GPU in snapshot, got: ${GPU_MODEL}" + exit 1 + fi + if [[ "${GPU_COUNT}" -lt 2 ]]; then + echo "::error::Expected gpu-count >= 2 for training, got: ${GPU_COUNT}" + exit 1 + fi + echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" + + - name: Validate cluster + run: | + ./eidos validate \ + --recipe recipe.yaml \ + --phase readiness \ + --phase deployment \ + --phase conformance \ + --namespace gpu-operator \ + --kubeconfig="${HOME}/.kube/config" \ + --require-gpu \ + --image=ko.local:smoke-test + # --- Health checks --- - name: Install chainsaw @@ -131,6 +172,22 @@ jobs: kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \ logs gang-worker-1 2>/dev/null || true + # --- Evidence collection --- + + - name: Collect AI conformance evidence + if: always() + run: | + go run ./tests/chainsaw/ai-conformance/ \ + --dir tests/chainsaw/ai-conformance/kind-training \ + --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ + --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ + --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ + --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ + --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ + --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \ + --kubeconfig="${HOME}/.kube/config" \ + --debug + # --- Debug diagnostics (before cleanup so resources still exist) --- - name: Debug diagnostics @@ -180,3 +237,4 @@ jobs: uses: ./.github/actions/gpu-test-cleanup with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + artifact_name_prefix: gpu-training-test-debug diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index cf173c657..8d2aef570 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -19,7 +19,7 @@ on: - cron: '0 6,18 * * *' # Every 12 hours (2x daily) push: branches: - - "pull-request/*" + - "pull-request/[0-9]+" paths: - '.github/workflows/gpu-smoke-test.yaml' - '.github/actions/gpu-cluster-setup/**'