Skip to content

Commit 23d3fa0

Browse files
committed
feat(validator): self-contained gang scheduling conformance check
Make the gang-scheduling conformance check fully self-contained by programmatically creating test resources instead of relying on pre-deployed manifests. The check now:
1. Verifies KAI scheduler deployments and CRDs (unchanged).
2. Pre-flight: counts free GPUs via ResourceSlices/ResourceClaims and fails fast if fewer than 2 are available.
3. Creates a PodGroup with 2 GPU test pods using DRA ResourceClaims.
4. Waits for all pods to reach a terminal state.
5. Validates gang scheduling patterns (kai-scheduler, PodGroup labels, DRA resource claims, pod success).
6. Cleans up all test resources.

Add a countAvailableGPUs() helper to conformance/helpers.go for reuse by other GPU-dependent checks. Remove the redundant "Deploy gang scheduling test" and cleanup steps from the GPU training CI workflow, since the conformance check now handles this end-to-end.
1 parent 3c1bd69 commit 23d3fa0

File tree

6 files changed

+522
-86
lines changed

6 files changed

+522
-86
lines changed

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 2 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ on:
2929
- 'pkg/validator/checks/conformance/**'
3030
- '.github/actions/gpu-test-cleanup/**'
3131
- '.github/actions/load-versions/**'
32-
- 'docs/conformance/cncf/manifests/gang-scheduling-test.yaml'
3332
- 'tests/chainsaw/ai-conformance/kind-training/**'
3433
- 'recipes/components/dynamo-platform/**'
3534
- 'recipes/overlays/kind.yaml'
@@ -128,6 +127,8 @@ jobs:
128127
# --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
129128
# Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
130129
# has had time to bootstrap (pod-autoscaling check needs live metric data).
130+
# Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
131+
# gang-scheduling conformance check — no separate deploy step needed.
131132

132133
- name: Validate cluster
133134
run: |
@@ -141,52 +142,6 @@ jobs:
141142
--require-gpu \
142143
--image=ko.local:smoke-test
143144
144-
# --- Gang scheduling test with PodGroup + KAI scheduler ---
145-
146-
- name: Deploy gang scheduling test
147-
run: |
148-
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
149-
-f docs/conformance/cncf/manifests/gang-scheduling-test.yaml
150-
151-
echo "Waiting for gang scheduling pods to complete..."
152-
for i in $(seq 1 60); do
153-
PHASE_0=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
154-
-n gang-scheduling-test get pod gang-worker-0 \
155-
-o jsonpath='{.status.phase}' 2>/dev/null)
156-
PHASE_1=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
157-
-n gang-scheduling-test get pod gang-worker-1 \
158-
-o jsonpath='{.status.phase}' 2>/dev/null)
159-
160-
if [[ "${PHASE_0}" == "Succeeded" && "${PHASE_1}" == "Succeeded" ]]; then
161-
echo "Both gang scheduling pods succeeded!"
162-
break
163-
fi
164-
165-
if [[ "${PHASE_0}" == "Failed" || "${PHASE_1}" == "Failed" ]]; then
166-
echo "::error::Gang scheduling pod failed (worker-0=${PHASE_0}, worker-1=${PHASE_1})"
167-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
168-
describe pods 2>/dev/null || true
169-
exit 1
170-
fi
171-
172-
echo "Waiting for pod completion... worker-0=${PHASE_0}, worker-1=${PHASE_1} (${i}/60)"
173-
sleep 10
174-
done
175-
176-
if [[ "${PHASE_0}" != "Succeeded" || "${PHASE_1}" != "Succeeded" ]]; then
177-
echo "::error::Gang scheduling pods did not complete within 10 minutes"
178-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
179-
describe pods 2>/dev/null || true
180-
exit 1
181-
fi
182-
183-
echo "=== gang-worker-0 logs ==="
184-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
185-
logs gang-worker-0 2>/dev/null || true
186-
echo "=== gang-worker-1 logs ==="
187-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
188-
logs gang-worker-1 2>/dev/null || true
189-
190145
# --- Cluster Autoscaling validation ---
191146

192147
- name: Cluster Autoscaling (Karpenter + KWOK)
@@ -222,36 +177,15 @@ jobs:
222177
kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
223178
echo "=== KAI scheduler podgroups ==="
224179
kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
225-
echo "=== Gang scheduling test pods ==="
226-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
227-
describe pods 2>/dev/null || true
228-
echo "=== gang-worker-0 logs ==="
229-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
230-
logs gang-worker-0 --tail=50 2>/dev/null || true
231-
echo "=== gang-worker-1 logs ==="
232-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
233-
logs gang-worker-1 --tail=50 2>/dev/null || true
234-
echo "=== ResourceClaims ==="
235-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
236-
get resourceclaims -o yaml 2>/dev/null || true
237180
echo "=== Non-running pods (all namespaces) ==="
238181
kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
239182
--field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
240-
echo "=== Recent events (gang-scheduling-test) ==="
241-
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
242-
get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
243183
echo "=== GPU Operator pods ==="
244184
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
245185
echo "=== Node resources ==="
246186
kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
247187
grep -A 20 "Allocated resources" || true
248188
249-
- name: Gang scheduling test cleanup
250-
if: always()
251-
run: |
252-
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
253-
-f docs/conformance/cncf/manifests/gang-scheduling-test.yaml --ignore-not-found 2>/dev/null || true
254-
255189
- name: GPU Test Cleanup
256190
if: always()
257191
uses: ./.github/actions/gpu-test-cleanup

pkg/defaults/timeouts.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,15 @@ const (
137137
ComponentRenderTimeout = 60 * time.Second
138138
)
139139

140-
// DRA test timeouts for conformance validation.
140+
// Conformance test timeouts for DRA and gang scheduling validation.
141141
const (
142142
// DRATestPodTimeout is the timeout for the DRA test pod to complete.
143143
// The pod runs a simple CUDA device check but may need time for image pull.
144144
DRATestPodTimeout = 5 * time.Minute
145+
146+
// GangTestPodTimeout is the timeout for gang scheduling test pods to complete.
147+
// Two pods must be co-scheduled, each pulling a CUDA image and running nvidia-smi.
148+
GangTestPodTimeout = 5 * time.Minute
145149
)
146150

147151
// Pod operation timeouts for validation and agent operations.

pkg/validator/agent/rbac.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,12 @@ func (d *Deployer) ensureClusterRole(ctx context.Context) error {
165165
Resources: []string{"queues", "podgroups"},
166166
Verbs: []string{"get", "list"},
167167
},
168+
// Conformance: gang-scheduling — PodGroup lifecycle for functional test
169+
{
170+
APIGroups: []string{"scheduling.run.ai"},
171+
Resources: []string{"podgroups"},
172+
Verbs: []string{"create", "delete"},
173+
},
168174
// Conformance: Cluster autoscaling (Karpenter)
169175
{
170176
APIGroups: []string{"karpenter.sh"},

0 commit comments

Comments
 (0)