Simplify E2E workflow following KAI Scheduler pattern

danbar2 · danbar2 · commit dd3d3238f5a1 · 2026-01-27T11:12:50.000+02:00
- Single job with sequential cluster setup and test execution
- Use helm/kind-action for cluster creation (30 workers)
- Deploy local registry in cluster with NodePort
- Build and push Grove operator to local registry
- Install KAI Scheduler from its Helm chart
- Run all E2E tests sequentially (no parallelism)
- Clean up the cluster in the same job with 'if: always()'

This follows the proven pattern from KAI Scheduler's E2E workflow
and runs everything in one job on one runner.
diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml
@@ -14,13 +14,6 @@
 # limitations under the License.
 # */
 
-# NOTE: This workflow splits cluster creation from E2E test execution.
-# REQUIREMENTS for self-hosted runners:
-# - All runners must be on the same network and able to reach each other via IP
-# - The k3d cluster API server (port 6550) must be accessible across runners
-# - If runners are on completely isolated machines, consider using runner labels
-#   to ensure all jobs run on the same runner instance
-
 name: E2E Tests
 
 on:
@@ -35,14 +28,13 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  setup-cluster:
+  e2e:
     # Run on non-draft PRs or draft PRs with 'run-e2e' label
     if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
     runs-on: cpu-amd-m5-2xlarge
-    timeout-minutes: 30
-    name: Setup E2E Cluster
+    timeout-minutes: 90
+    name: E2E Tests
     steps:
-      # print runner specs so we have a record incase of failures
       - name: Print runner specs
         run: |
           echo "CPUs: $(nproc)"
@@ -58,47 +50,19 @@ jobs:
           sudo apt install build-essential -y
 
       - name: Set up Go
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
         with:
           go-version: "1.24.5"
 
-      - name: Install kind
-        run: |
-          # Install kind
-          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
-          chmod +x ./kind
-          sudo mv ./kind /usr/local/bin/kind
-          kind version
-
-      - name: Install kubectl
-        run: |
-          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
-          chmod +x kubectl
-          sudo mv kubectl /usr/local/bin/
-          kubectl version --client
-
-      - name: Install Helm
-        run: |
-          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-          helm version
-
-      - name: Get runner IP address
-        id: get-ip
-        run: |
-          # Get the primary IP address of this runner
-          RUNNER_IP=$(hostname -I | awk '{print $1}')
-          echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_ENV
-          echo "runner_ip=$RUNNER_IP" >> $GITHUB_OUTPUT
-          echo "Runner IP: $RUNNER_IP"
-
       - name: Create kind cluster configuration
         run: |
           cat <<EOF > /tmp/kind-config.yaml
           kind: Cluster
           apiVersion: kind.x-k8s.io/v1alpha4
-          networking:
-            apiServerAddress: "$RUNNER_IP"
-            apiServerPort: 6443
           nodes:
           - role: control-plane
+            # Publish the registry NodePort on the runner so that docker push to localhost:30100 can reach it.
+            extraPortMappings:
+            - containerPort: 30100
+              hostPort: 30100
           - role: worker
@@ -131,52 +91,88 @@ jobs:
           - role: worker
           - role: worker
           - role: worker
+          containerdConfigPatches:
+          - |-
+            [plugins."io.containerd.grpc.v1.cri".registry.mirrors."localhost:30100"]
+              endpoint = ["http://registry:5000"]
           EOF
-          cat /tmp/kind-config.yaml
 
-      - name: Create kind cluster
+      - name: Create k8s Kind Cluster
+        uses: helm/kind-action@v1.13.0
+        with:
+          cluster_name: grove-e2e
+          version: v0.30.0
+          config: /tmp/kind-config.yaml
+
+      - name: Deploy image registry
         run: |
-          kind create cluster --name e2e-test-cluster --config /tmp/kind-config.yaml --wait 5m
+          kubectl create namespace kube-registry || true
+          kubectl apply -f - <<EOF
+          apiVersion: v1
+          kind: Service
+          metadata:
+            name: registry
+            namespace: kube-registry
+          spec:
+            type: NodePort
+            ports:
+            - port: 5000
+              nodePort: 30100
+              protocol: TCP
+            selector:
+              app: registry
+          ---
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: registry
+            namespace: kube-registry
+          spec:
+            replicas: 1
+            selector:
+              matchLabels:
+                app: registry
+            template:
+              metadata:
+                labels:
+                  app: registry
+              spec:
+                containers:
+                - name: registry
+                  image: registry:2
+                  ports:
+                  - containerPort: 5000
+          EOF
+          kubectl wait --for=condition=available --timeout=120s deployment/registry -n kube-registry
 
-      - name: Install Kai Scheduler
+      - name: Prepare charts
         run: |
           cd operator
-          echo "> Preparing charts (copying CRDs)..."
           ./hack/prepare-charts.sh
-          echo "> Installing Kai Scheduler and Grove..."
-          # Install Kai scheduler via helm
-          helm repo add nvidia https://nvidia.github.io/k8s-device-plugin
+
+      - name: Build and push Grove operator image
+        run: |
+          cd operator
+          make docker-build IMG=localhost:30100/grove-operator:e2e
+          docker push localhost:30100/grove-operator:e2e  # 30100 is the NodePort of the in-cluster registry Service
+
+      - name: Install Kai Scheduler
+        run: |
           helm repo add kai https://nvidia.github.io/kai-scheduler
           helm repo update
-          # Install with tolerations for control-plane
           helm install kai-scheduler kai/kai-scheduler \
             --namespace kai-system --create-namespace \
-            --set scheduler.tolerations[0].key=node-role.kubernetes.io/control-plane \
-            --set scheduler.tolerations[0].operator=Exists \
-            --set scheduler.tolerations[0].effect=NoSchedule \
+            --set global.registry=localhost:30100 \
             --wait --timeout 10m
-          
+
       - name: Deploy Grove operator
         run: |
           cd operator
-          # Build and load operator image into kind
-          make docker-build IMG=grove-operator:e2e
-          kind load docker-image grove-operator:e2e --name e2e-test-cluster
-          # Deploy operator
-          make deploy IMG=grove-operator:e2e
-          # Wait for operator to be ready
+          make deploy IMG=localhost:30100/grove-operator:e2e
           kubectl wait --for=condition=available --timeout=5m deployment/grove-controller-manager -n grove-system
-          kubectl wait --for=condition=ready --timeout=5m pod -l control-plane=controller-manager -n grove-system
-
-      - name: Wait for Kai Scheduler to be ready
-        run: |
-          kubectl wait --for=condition=ready --timeout=5m pod -l app=kai-scheduler -n kai-system
 
       - name: Create default Kai queues
         run: |
-          cd operator/e2e
-          go run -tags=e2e ./cmd/create-kai-queues/main.go || echo "Using inline queue creation..."
-          # Fallback: create queues directly
           kubectl apply -f - <<EOF
           apiVersion: scheduling.x-k8s.io/v1alpha1
           kind: Queue
@@ -189,112 +185,30 @@ jobs:
 
       - name: Verify Grove webhook is ready
         run: |
-          # Test webhook by doing a dry-run create
-          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server || true
-          sleep 5
-          # Try again to ensure webhook is responding
-          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server
+          # Retry a server-side dry-run create until it is accepted; fail the step if it never is.
+          for i in {1..30}; do
+            if kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server 2>&1; then
+              echo "Grove webhook is ready"; exit 0
+            fi
+            echo "Waiting for webhook... (attempt $i/30)"; sleep 5
+          done
+          echo "::error::Grove webhook not ready after 30 attempts"; exit 1
 
-      - name: Export kubeconfig
+      - name: Run E2E tests
         run: |
-          # Export kubeconfig from kind
-          kind get kubeconfig --name e2e-test-cluster > /tmp/kubeconfig
-          
-          # Verify cluster is accessible
-          kubectl --kubeconfig=/tmp/kubeconfig cluster-info
-          kubectl --kubeconfig=/tmp/kubeconfig get nodes
-          
-          echo "Kubeconfig server:"
-          grep "server:" /tmp/kubeconfig
-
-      - name: Save cluster configuration
-        run: |
-          # Save any additional environment configuration needed for tests
-          echo "CLUSTER_NAME=e2e-test-cluster" > /tmp/cluster-config.env
-          echo "CLUSTER_TYPE=kind" >> /tmp/cluster-config.env
-          echo "RUNNER_IP=$RUNNER_IP" >> /tmp/cluster-config.env
-
-      - name: Upload cluster artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: cluster-artifacts
-          path: |
-            /tmp/kubeconfig
-            /tmp/cluster-config.env
-          retention-days: 1
-
-  e2e:
-    needs: setup-cluster
-    runs-on: cpu-amd-m5-2xlarge
-    timeout-minutes: 60
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - test_name: gang_scheduling
-            test_pattern: "^Test_GS"
-          - test_name: rolling_updates
-            test_pattern: "^Test_RU"
-          - test_name: startup_ordering
-            test_pattern: "^Test_SO"
-          - test_name: Topology_Aware_Scheduling
-            test_pattern: "^Test_TAS"
-    name: E2E - ${{ matrix.test_name }}
-    steps:
-      - name: Print runner specs
-        run: |
-          echo "CPUs: $(nproc)"
-          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
-
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Go
-        uses: actions/setup-go@v4
-        with:
-          go-version: "1.24.5"
-
-      - name: Download cluster artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: cluster-artifacts
-          path: /tmp/
-
-      - name: Load cluster configuration
-        run: |
-          # Load environment variables from cluster setup
-          cat /tmp/cluster-config.env >> $GITHUB_ENV
-          export KUBECONFIG=/tmp/kubeconfig
-
-      - name: Run e2e tests - ${{ matrix.test_name }}
-        env:
-          KUBECONFIG: /tmp/kubeconfig
-          E2E_USE_EXISTING_CLUSTER: "true"
-        run: |
-          cd operator/e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run '${{ matrix.test_pattern }}'
+          cd operator/e2e
+          go test -tags=e2e ./tests/... -v -timeout 60m
 
       - name: Upload test logs on failure
         if: failure()
         uses: actions/upload-artifact@v4
         with:
-          name: e2e-test-logs-${{ matrix.test_name }}
+          name: e2e-test-logs
           path: /tmp/e2e-*.log
           if-no-files-found: ignore
           retention-days: 7
 
-  cleanup-cluster:
-    needs: e2e
-    if: always()
-    runs-on: cpu-amd-m5-2xlarge
-    timeout-minutes: 10
-    name: Cleanup E2E Cluster
-    steps:
-      - name: Install kind (for cleanup)
-        run: |
-          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
-          chmod +x ./kind
-          sudo mv ./kind /usr/local/bin/kind || true
-
       - name: Cleanup kind cluster
+        if: always()
         run: |
-          kind delete cluster --name e2e-test-cluster || true
+          kind delete cluster --name grove-e2e || true