Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/actions/gpu-test-cleanup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ inputs:
artifact_name_prefix:
description: 'Prefix for the uploaded artifact name'
required: false
default: 'gpu-smoke-test-debug'
default: 'gpu-test-debug'

runs:
using: 'composite'
Expand All @@ -39,7 +39,7 @@ runs:
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" describe pod/gpu-smoke-test > /tmp/debug-artifacts/gpu-pod-describe.txt || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true

- name: Export kind logs
if: failure()
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/gpu-h100-inference-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ name: GPU Inference Test (nvkind + H100)

on:
schedule:
- cron: '0 6,18 * * *' # Every 12 hours (2x daily)
- cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test
push:
branches:
- "pull-request/[0-9]+"
Expand Down Expand Up @@ -284,3 +284,4 @@ jobs:
uses: ./.github/actions/gpu-test-cleanup
with:
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
artifact_name_prefix: gpu-inference-test-debug
60 changes: 59 additions & 1 deletion .github/workflows/gpu-h100-training-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ name: GPU Training Test (nvkind + H100 x2)

on:
schedule:
- cron: '0 6,18 * * *' # Every 12 hours (2x daily)
- cron: '30 6,18 * * *' # Every 12 hours (2x daily), offset from inference test
push:
branches:
- "pull-request/[0-9]+"
Expand Down Expand Up @@ -71,6 +71,47 @@ jobs:
accelerator: h100
intent: training

# --- Snapshot and validation ---

# Runs `eidos snapshot` with --deploy-agent against the cluster referenced by
# ${HOME}/.kube/config, writes the result to snapshot.yaml, and prints that
# file to the job log.
- name: Run eidos snapshot
  run: |
    # Flags are collected in an array so each one sits on its own line without
    # backslash continuations; the resulting argv is unchanged.
    snapshot_args=(
      --deploy-agent
      --kubeconfig="${HOME}/.kube/config"
      --namespace=default
      --image=ko.local:smoke-test
      --require-gpu
      --output=snapshot.yaml
    )
    ./eidos snapshot "${snapshot_args[@]}"
    echo "--- Snapshot output ---"
    cat snapshot.yaml

# Asserts that snapshot.yaml reports a GPU whose model contains "H100" and a
# gpu-count of at least 2. Emits a workflow error annotation and fails the step
# otherwise.
- name: Validate snapshot detected GPU
  run: |
    # Both values are read from the first subtype of the measurement whose
    # type is "GPU".
    GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu.model"]' snapshot.yaml)
    GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu-count"]' snapshot.yaml)
    echo "GPU model: ${GPU_MODEL}"
    echo "GPU count: ${GPU_COUNT}"
    if [[ "${GPU_MODEL}" != *"H100"* ]]; then
      echo "::error::Expected H100 GPU in snapshot, got: ${GPU_MODEL}"
      exit 1
    fi
    # A non-numeric count (free text, multi-line yq output, ...) makes
    # `[[ ... -lt 2 ]]` fail with an expression error; `if` treats that as
    # "condition false", so the check would silently pass. Require a plain
    # decimal integer first. The 10# prefix keeps values with leading zeros
    # from being parsed as octal.
    if ! [[ "${GPU_COUNT}" =~ ^[0-9]+$ ]] || (( 10#${GPU_COUNT} < 2 )); then
      echo "::error::Expected gpu-count >= 2 for training, got: ${GPU_COUNT}"
      exit 1
    fi
    echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"

# Runs `eidos validate` against recipe.yaml for the readiness, deployment and
# conformance phases in the gpu-operator namespace.
- name: Validate cluster
  run: |
    # Build the repeated "--phase <name>" pairs in the order they are passed.
    phase_flags=()
    for phase in readiness deployment conformance; do
      phase_flags+=(--phase "${phase}")
    done
    ./eidos validate \
      --recipe recipe.yaml \
      "${phase_flags[@]}" \
      --namespace gpu-operator \
      --kubeconfig="${HOME}/.kube/config" \
      --require-gpu \
      --image=ko.local:smoke-test

# --- Health checks ---

- name: Install chainsaw
Expand Down Expand Up @@ -131,6 +172,22 @@ jobs:
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
logs gang-worker-1 2>/dev/null || true

# --- Evidence collection ---

# Runs the Go program under tests/chainsaw/ai-conformance/ against the
# kind-training directory plus the listed cluster assertion files.
# `if: always()` makes the step run even when earlier steps failed.
- name: Collect AI conformance evidence
  if: always()
  run: |
    # Assertion files, passed as repeated --file flags in this order.
    assertion_files=(
      tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml
      tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml
      tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml
      tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml
      tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml
      tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml
    )
    file_flags=()
    for assertion_file in "${assertion_files[@]}"; do
      file_flags+=(--file "${assertion_file}")
    done
    go run ./tests/chainsaw/ai-conformance/ \
      --dir tests/chainsaw/ai-conformance/kind-training \
      "${file_flags[@]}" \
      --kubeconfig="${HOME}/.kube/config" \
      --debug

# --- Debug diagnostics (before cleanup so resources still exist) ---

- name: Debug diagnostics
Expand Down Expand Up @@ -180,3 +237,4 @@ jobs:
uses: ./.github/actions/gpu-test-cleanup
with:
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
artifact_name_prefix: gpu-training-test-debug
2 changes: 1 addition & 1 deletion .github/workflows/gpu-smoke-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ on:
- cron: '0 6,18 * * *' # Every 12 hours (2x daily)
push:
branches:
- "pull-request/*"
- "pull-request/[0-9]+"
paths:
- '.github/workflows/gpu-smoke-test.yaml'
- '.github/actions/gpu-cluster-setup/**'
Expand Down
Loading