Split e2e cluster creation #357
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /*
# Copyright 2025 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */
# NOTE: This workflow splits cluster creation from E2E test execution.
# REQUIREMENTS for self-hosted runners:
# - All runners must be on the same network and able to reach each other via IP
# - The kind cluster API server (port 6443) must be accessible across runners
# - If runners are on completely isolated machines, consider using runner labels
#   to ensure all jobs run on the same runner instance
name: E2E Tests

on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - labeled
      - ready_for_review
    branches:
      - main

# Only one run per pull request stays active: a newly triggered run cancels
# the one still in flight, so stale E2E runs stop consuming runner resources.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true
jobs:
  # Creates a multi-node kind cluster on the runner, installs Kai Scheduler and
  # the Grove operator into it, and uploads the kubeconfig plus a small env
  # file as the `cluster-artifacts` artifact for the jobs that follow.
  setup-cluster:
    # Run on non-draft PRs or draft PRs with 'run-e2e' label
    if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 30
    name: Setup E2E Cluster
    defaults:
      run:
        # Explicit bash runs with `-eo pipefail`, so a failed download in a
        # `curl ... | bash` pipeline fails the step instead of being ignored.
        shell: bash
    env:
      CLUSTER_NAME: e2e-test-cluster
      # Number of kind worker nodes (in addition to the single control-plane).
      KIND_WORKER_COUNT: "30"
    steps:
      # Print runner specs so we have a record in case of failures
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"

      - name: Checkout code
        uses: actions/checkout@v4

      # NVIDIA self-hosted runners don't have make installed by default
      - name: Install build-essential for make
        run: |
          sudo apt-get update
          # apt-get (not apt) is the interface intended for scripts.
          sudo apt-get install -y build-essential

      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"

      - name: Install kind
        run: |
          # Install kind
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind
          kind version

      - name: Install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x kubectl
          sudo mv kubectl /usr/local/bin/
          kubectl version --client

      - name: Install Helm
        run: |
          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
          helm version

      - name: Get runner IP address
        id: get-ip
        run: |
          # Get the primary IP address of this runner
          RUNNER_IP=$(hostname -I | awk '{print $1}')
          # An empty address would silently yield an unusable kind config.
          if [ -z "$RUNNER_IP" ]; then
            echo "::error::Could not determine the runner IP address"
            exit 1
          fi
          echo "RUNNER_IP=$RUNNER_IP" >> "$GITHUB_ENV"
          echo "runner_ip=$RUNNER_IP" >> "$GITHUB_OUTPUT"
          echo "Runner IP: $RUNNER_IP"

      - name: Create kind cluster configuration
        run: |
          # The API server is bound to the runner IP (not localhost) so that
          # the exported kubeconfig is usable from other runners.
          cat <<EOF > /tmp/kind-config.yaml
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            apiServerAddress: "$RUNNER_IP"
            apiServerPort: 6443
          nodes:
            - role: control-plane
          EOF
          # Append one worker entry per requested node.
          for _ in $(seq 1 "$KIND_WORKER_COUNT"); do
            echo "  - role: worker" >> /tmp/kind-config.yaml
          done
          cat /tmp/kind-config.yaml

      - name: Create kind cluster
        run: |
          kind create cluster --name "$CLUSTER_NAME" --config /tmp/kind-config.yaml --wait 5m

      - name: Install Kai Scheduler
        run: |
          cd operator
          echo "> Preparing charts (copying CRDs)..."
          ./hack/prepare-charts.sh
          echo "> Installing Kai Scheduler and Grove..."
          # Install Kai scheduler via helm
          helm repo add nvidia https://nvidia.github.io/k8s-device-plugin
          helm repo add kai https://nvidia.github.io/kai-scheduler
          helm repo update
          # Install with tolerations for control-plane.
          # The --set values are quoted so the shell never treats `[0]` as a
          # glob pattern.
          helm install kai-scheduler kai/kai-scheduler \
            --namespace kai-system --create-namespace \
            --set 'scheduler.tolerations[0].key=node-role.kubernetes.io/control-plane' \
            --set 'scheduler.tolerations[0].operator=Exists' \
            --set 'scheduler.tolerations[0].effect=NoSchedule' \
            --wait --timeout 10m

      - name: Deploy Grove operator
        run: |
          cd operator
          # Build and load operator image into kind
          make docker-build IMG=grove-operator:e2e
          kind load docker-image grove-operator:e2e --name "$CLUSTER_NAME"
          # Deploy operator
          make deploy IMG=grove-operator:e2e
          # Wait for operator to be ready
          kubectl wait --for=condition=available --timeout=5m deployment/grove-controller-manager -n grove-system
          kubectl wait --for=condition=ready --timeout=5m pod -l control-plane=controller-manager -n grove-system

      - name: Wait for Kai Scheduler to be ready
        run: |
          kubectl wait --for=condition=ready --timeout=5m pod -l app=kai-scheduler -n kai-system

      - name: Create default Kai queues
        run: |
          cd operator/e2e
          # Prefer the repository's Go helper; apply the inline manifest only
          # when the helper fails, so it really is a fallback.
          # NOTE(review): confirm the Queue apiVersion/scope below matches the
          # CRD shipped by the installed Kai Scheduler chart.
          if ! go run -tags=e2e ./cmd/create-kai-queues/main.go; then
            echo "Using inline queue creation..."
            # Fallback: create queues directly
            kubectl apply -f - <<EOF
          apiVersion: scheduling.x-k8s.io/v1alpha1
          kind: Queue
          metadata:
            name: default
            namespace: default
          spec:
            weight: 1
          EOF
          fi

      - name: Verify Grove webhook is ready
        run: |
          # Test webhook by doing a dry-run create; the first attempt is
          # allowed to fail while the webhook is still coming up.
          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server || true
          sleep 5
          # Try again to ensure webhook is responding
          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server

      - name: Export kubeconfig
        run: |
          # Export kubeconfig from kind
          kind get kubeconfig --name "$CLUSTER_NAME" > /tmp/kubeconfig
          # Verify cluster is accessible
          kubectl --kubeconfig=/tmp/kubeconfig cluster-info
          kubectl --kubeconfig=/tmp/kubeconfig get nodes
          echo "Kubeconfig server:"
          grep "server:" /tmp/kubeconfig

      - name: Save cluster configuration
        run: |
          # Save any additional environment configuration needed for tests
          {
            echo "CLUSTER_NAME=$CLUSTER_NAME"
            echo "CLUSTER_TYPE=kind"
            echo "RUNNER_IP=$RUNNER_IP"
          } > /tmp/cluster-config.env

      # NOTE(review): the kubeconfig carries credentials for the test cluster;
      # confirm that publishing it as a workflow artifact is acceptable for
      # this repository's visibility. Retention is kept to one day.
      - name: Upload cluster artifacts
        uses: actions/upload-artifact@v4
        with:
          name: cluster-artifacts
          path: |
            /tmp/kubeconfig
            /tmp/cluster-config.env
          retention-days: 1
# Job: e2e - runs one Go test suite per matrix entry against the cluster whose
# kubeconfig was published by setup-cluster.
  e2e:
    needs: setup-cluster
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 60
    strategy:
      # Let every suite finish even if another one fails.
      fail-fast: false
      matrix:
        include:
          - test_name: gang_scheduling
            test_pattern: "^Test_GS"
          - test_name: rolling_updates
            test_pattern: "^Test_RU"
          - test_name: startup_ordering
            test_pattern: "^Test_SO"
          - test_name: Topology_Aware_Scheduling
            test_pattern: "^Test_TAS"
    name: E2E - ${{ matrix.test_name }}
    steps:
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"

      - name: Download cluster artifacts
        uses: actions/download-artifact@v4
        with:
          name: cluster-artifacts
          path: /tmp/

      - name: Load cluster configuration
        run: |
          # Load environment variables from cluster setup
          cat /tmp/cluster-config.env >> "$GITHUB_ENV"
          # A plain `export` only lasts for this step's shell; writing to
          # GITHUB_ENV makes KUBECONFIG visible to every later step.
          echo "KUBECONFIG=/tmp/kubeconfig" >> "$GITHUB_ENV"

      - name: Run e2e tests - ${{ matrix.test_name }}
        env:
          KUBECONFIG: /tmp/kubeconfig
          # Presumably tells the test harness to reuse the cluster instead of
          # creating its own - verify against operator/e2e.
          E2E_USE_EXISTING_CLUSTER: "true"
          # Passed through the environment rather than interpolated into the
          # script, so the pattern is never re-parsed by the shell.
          TEST_PATTERN: ${{ matrix.test_pattern }}
        run: |
          cd operator/e2e
          go test -tags=e2e ./tests/... -v -timeout 45m -run "$TEST_PATTERN"

      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-logs-${{ matrix.test_name }}
          path: /tmp/e2e-*.log
          if-no-files-found: ignore
          retention-days: 7
# Job: cleanup-cluster - best-effort removal of the kind cluster once the e2e
# matrix has finished, whether it passed, failed, or was cancelled.
  cleanup-cluster:
    # setup-cluster is listed so its result can be inspected in `if:` below.
    needs: [setup-cluster, e2e]
    # always(): run even when earlier jobs failed or were cancelled.
    # Skip only when setup-cluster itself was skipped (e.g. a draft PR without
    # the 'run-e2e' label), because then no cluster was ever created.
    if: always() && needs.setup-cluster.result != 'skipped'
    # NOTE(review): `kind delete` only affects clusters on the machine it runs
    # on; if this label can resolve to a different runner instance than
    # setup-cluster used, the cluster is left behind - confirm runner pinning.
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 10
    name: Cleanup E2E Cluster
    steps:
      - name: Install kind (for cleanup)
        run: |
          # Reuse an existing kind binary when the runner already has one.
          if command -v kind >/dev/null 2>&1; then
            kind version
            exit 0
          fi
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
          chmod +x ./kind
          # Still best-effort, but surface the problem instead of hiding it.
          sudo mv ./kind /usr/local/bin/kind \
            || echo "::warning::Could not install kind into /usr/local/bin"

      - name: Cleanup kind cluster
        run: |
          # Never fail the workflow on cleanup, but leave a visible warning.
          kind delete cluster --name e2e-test-cluster \
            || echo "::warning::Failed to delete kind cluster e2e-test-cluster"