NVIDIA
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
Lines changed: 2 additions & 68 deletions
diff --git a/pkg/defaults/timeouts.go b/pkg/defaults/timeouts.go
Lines changed: 5 additions & 1 deletion
diff --git a/pkg/validator/agent/rbac.go b/pkg/validator/agent/rbac.go
Lines changed: 6 additions & 0 deletions
@@ -29,7 +29,6 @@ on:
       - 'pkg/validator/checks/conformance/**'
       - '.github/actions/gpu-test-cleanup/**'
       - '.github/actions/load-versions/**'
-      - 'docs/conformance/cncf/manifests/gang-scheduling-test.yaml'
       - 'tests/chainsaw/ai-conformance/kind-training/**'
       - 'recipes/components/dynamo-platform/**'
       - 'recipes/overlays/kind.yaml'
@@ -128,6 +127,8 @@ jobs:
       # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
       # Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
       # has had time to bootstrap (pod-autoscaling check needs live metric data).
+      # Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
+      # gang-scheduling conformance check — no separate deploy step needed.
 
       - name: Validate cluster
         run: |
@@ -141,52 +142,6 @@ jobs:
             --require-gpu \
             --image=ko.local:smoke-test
 
-      # --- Gang scheduling test with PodGroup + KAI scheduler ---
-
-      - name: Deploy gang scheduling test
-        run: |
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
-            -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml
-
-          echo "Waiting for gang scheduling pods to complete..."
-          for i in $(seq 1 60); do
-            PHASE_0=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-              -n gang-scheduling-test get pod gang-worker-0 \
-              -o jsonpath='{.status.phase}' 2>/dev/null)
-            PHASE_1=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-              -n gang-scheduling-test get pod gang-worker-1 \
-              -o jsonpath='{.status.phase}' 2>/dev/null)
-
-            if [[ "${PHASE_0}" == "Succeeded" && "${PHASE_1}" == "Succeeded" ]]; then
-              echo "Both gang scheduling pods succeeded!"
-              break
-            fi
-
-            if [[ "${PHASE_0}" == "Failed" || "${PHASE_1}" == "Failed" ]]; then
-              echo "::error::Gang scheduling pod failed (worker-0=${PHASE_0}, worker-1=${PHASE_1})"
-              kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-                describe pods 2>/dev/null || true
-              exit 1
-            fi
-
-            echo "Waiting for pod completion... worker-0=${PHASE_0}, worker-1=${PHASE_1} (${i}/60)"
-            sleep 10
-          done
-
-          if [[ "${PHASE_0}" != "Succeeded" || "${PHASE_1}" != "Succeeded" ]]; then
-            echo "::error::Gang scheduling pods did not complete within 10 minutes"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-              describe pods 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== gang-worker-0 logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            logs gang-worker-0 2>/dev/null || true
-          echo "=== gang-worker-1 logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            logs gang-worker-1 2>/dev/null || true
-
       # --- Cluster Autoscaling validation ---
 
       - name: Cluster Autoscaling (Karpenter + KWOK)
@@ -222,36 +177,15 @@ jobs:
           kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
           echo "=== KAI scheduler podgroups ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
-          echo "=== Gang scheduling test pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            describe pods 2>/dev/null || true
-          echo "=== gang-worker-0 logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            logs gang-worker-0 --tail=50 2>/dev/null || true
-          echo "=== gang-worker-1 logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            logs gang-worker-1 --tail=50 2>/dev/null || true
-          echo "=== ResourceClaims ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            get resourceclaims -o yaml 2>/dev/null || true
           echo "=== Non-running pods (all namespaces) ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
             --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gang-scheduling-test) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
-            get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
           echo "=== GPU Operator pods ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
           echo "=== Node resources ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
             grep -A 20 "Allocated resources" || true
 
-      - name: Gang scheduling test cleanup
-        if: always()
-        run: |
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
-            -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml --ignore-not-found 2>/dev/null || true
-
       - name: GPU Test Cleanup
         if: always()
         uses: ./.github/actions/gpu-test-cleanup
 
@@ -137,11 +137,15 @@ const (
 	ComponentRenderTimeout = 60 * time.Second
 )
 
-// DRA test timeouts for conformance validation.
+// Conformance test timeouts for DRA and gang scheduling validation.
 const (
 	// DRATestPodTimeout is the timeout for the DRA test pod to complete.
 	// The pod runs a simple CUDA device check but may need time for image pull.
 	DRATestPodTimeout = 5 * time.Minute
+
+	// GangTestPodTimeout is the timeout for gang scheduling test pods to complete.
+	// Two pods must be co-scheduled, each pulling a CUDA image and running nvidia-smi. NOTE(review): the removed CI workflow step polled for up to 10 minutes (60 x 10s) for the same two pods — confirm 5 minutes is sufficient.
+	GangTestPodTimeout = 5 * time.Minute
 )
 
 // Pod operation timeouts for validation and agent operations.
 
@@ -165,6 +165,12 @@ func (d *Deployer) ensureClusterRole(ctx context.Context) error {
 				Resources: []string{"queues", "podgroups"},
 				Verbs:     []string{"get", "list"},
 			},
+			// Conformance: gang-scheduling — PodGroup lifecycle for functional test
+			{
+				APIGroups: []string{"scheduling.run.ai"},
+				Resources: []string{"podgroups"},
+				Verbs:     []string{"create", "delete"},
+			},
 			// Conformance: Cluster autoscaling (Karpenter)
 			{
 				APIGroups: []string{"karpenter.sh"},