2929 - ' pkg/validator/checks/conformance/**'
3030 - ' .github/actions/gpu-test-cleanup/**'
3131 - ' .github/actions/load-versions/**'
32- - ' docs/conformance/cncf/manifests/gang-scheduling-test.yaml'
3332 - ' tests/chainsaw/ai-conformance/kind-training/**'
3433 - ' recipes/components/dynamo-platform/**'
3534 - ' recipes/overlays/kind.yaml'
@@ -128,6 +127,8 @@ jobs:
128127 # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
129128 # Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
130129 # has had time to bootstrap (pod-autoscaling check needs live metric data).
130+ # Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
131+ # gang-scheduling conformance check — no separate deploy step needed.
131132
132133 - name : Validate cluster
133134 run : |
@@ -141,52 +142,6 @@ jobs:
141142 --require-gpu \
142143 --image=ko.local:smoke-test
143144
144- # --- Gang scheduling test with PodGroup + KAI scheduler ---
145-
146- - name : Deploy gang scheduling test
147- run : |
148- kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
149- -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml
150-
151- echo "Waiting for gang scheduling pods to complete..."
152- for i in $(seq 1 60); do
153- PHASE_0=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
154- -n gang-scheduling-test get pod gang-worker-0 \
155- -o jsonpath='{.status.phase}' 2>/dev/null)
156- PHASE_1=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
157- -n gang-scheduling-test get pod gang-worker-1 \
158- -o jsonpath='{.status.phase}' 2>/dev/null)
159-
160- if [[ "${PHASE_0}" == "Succeeded" && "${PHASE_1}" == "Succeeded" ]]; then
161- echo "Both gang scheduling pods succeeded!"
162- break
163- fi
164-
165- if [[ "${PHASE_0}" == "Failed" || "${PHASE_1}" == "Failed" ]]; then
166- echo "::error::Gang scheduling pod failed (worker-0=${PHASE_0}, worker-1=${PHASE_1})"
167- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
168- describe pods 2>/dev/null || true
169- exit 1
170- fi
171-
172- echo "Waiting for pod completion... worker-0=${PHASE_0}, worker-1=${PHASE_1} (${i}/60)"
173- sleep 10
174- done
175-
176- if [[ "${PHASE_0}" != "Succeeded" || "${PHASE_1}" != "Succeeded" ]]; then
177- echo "::error::Gang scheduling pods did not complete within 10 minutes"
178- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
179- describe pods 2>/dev/null || true
180- exit 1
181- fi
182-
183- echo "=== gang-worker-0 logs ==="
184- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
185- logs gang-worker-0 2>/dev/null || true
186- echo "=== gang-worker-1 logs ==="
187- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
188- logs gang-worker-1 2>/dev/null || true
189-
190145 # --- Cluster Autoscaling validation ---
191146
192147 - name : Cluster Autoscaling (Karpenter + KWOK)
@@ -222,36 +177,15 @@ jobs:
222177 kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
223178 echo "=== KAI scheduler podgroups ==="
224179 kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
225- echo "=== Gang scheduling test pods ==="
226- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
227- describe pods 2>/dev/null || true
228- echo "=== gang-worker-0 logs ==="
229- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
230- logs gang-worker-0 --tail=50 2>/dev/null || true
231- echo "=== gang-worker-1 logs ==="
232- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
233- logs gang-worker-1 --tail=50 2>/dev/null || true
234- echo "=== ResourceClaims ==="
235- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
236- get resourceclaims -o yaml 2>/dev/null || true
237180 echo "=== Non-running pods (all namespaces) ==="
238181 kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
239182 --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
240- echo "=== Recent events (gang-scheduling-test) ==="
241- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
242- get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
243183 echo "=== GPU Operator pods ==="
244184 kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
245185 echo "=== Node resources ==="
246186 kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
247187 grep -A 20 "Allocated resources" || true
248188
249- - name : Gang scheduling test cleanup
250- if : always()
251- run : |
252- kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
253- -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml --ignore-not-found 2>/dev/null || true
254-
255189 - name : GPU Test Cleanup
256190 if : always()
257191 uses : ./.github/actions/gpu-test-cleanup
0 commit comments