Split e2e cluster creation #357

Workflow file for this run

# /*
# Copyright 2025 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */
# NOTE: This workflow splits cluster creation from E2E test execution.
# REQUIREMENTS for self-hosted runners:
# - All runners must be on the same network and able to reach each other via IP
# - The kind cluster API server (port 6443) must be accessible across runners
# - If runners are on completely isolated machines, consider using runner labels
#   to ensure all jobs run on the same runner instance (sketched below)
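#
# A minimal sketch of that label-based pinning (the 'e2e-shared' label is
# illustrative, not one defined for this repository):
#
#   runs-on: [self-hosted, e2e-shared]
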
name: E2E Tests

on:
  pull_request:
    types: [opened, synchronize, reopened, labeled, ready_for_review]
    branches: ["main"]

# Cancel in-progress runs when a new run is triggered for the same PR.
# This prevents stale E2E test runs from consuming resources.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  setup-cluster:
    # Run on non-draft PRs, or on draft PRs carrying the 'run-e2e' label
    if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 30
    name: Setup E2E Cluster
    steps:
      # Print runner specs so we have a record in case of failures
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
      - name: Checkout code
        uses: actions/checkout@v4
      # NVIDIA self-hosted runners don't have make installed by default
      - name: Install build-essential for make
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential
      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"
      - name: Install kind
        run: |
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind
          kind version
      - name: Install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x kubectl
          sudo mv kubectl /usr/local/bin/
          kubectl version --client
      - name: Install Helm
        run: |
          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
          helm version
      - name: Get runner IP address
        id: get-ip
        run: |
          # Get the primary IP address of this runner
          RUNNER_IP=$(hostname -I | awk '{print $1}')
          echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_ENV
          echo "runner_ip=$RUNNER_IP" >> $GITHUB_OUTPUT
          echo "Runner IP: $RUNNER_IP"
      - name: Create kind cluster configuration
        run: |
          cat <<EOF > /tmp/kind-config.yaml
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            apiServerAddress: "$RUNNER_IP"
            apiServerPort: 6443
          nodes:
            - role: control-plane
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
          EOF
          cat /tmp/kind-config.yaml
      - name: Create kind cluster
        run: |
          kind create cluster --name e2e-test-cluster --config /tmp/kind-config.yaml --wait 5m
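      # --wait 5m blocks until the control-plane node reports Ready, so the
      # install steps below can assume a responsive API server.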
      - name: Install Kai Scheduler
        run: |
          cd operator
          echo "> Preparing charts (copying CRDs)..."
          ./hack/prepare-charts.sh
          echo "> Installing Kai Scheduler..."
          # Install Kai scheduler via helm
          helm repo add nvidia https://nvidia.github.io/k8s-device-plugin
          helm repo add kai https://nvidia.github.io/kai-scheduler
          helm repo update
          # Tolerate the control-plane taint so the scheduler can land on any node
          helm install kai-scheduler kai/kai-scheduler \
            --namespace kai-system --create-namespace \
            --set scheduler.tolerations[0].key=node-role.kubernetes.io/control-plane \
            --set scheduler.tolerations[0].operator=Exists \
            --set scheduler.tolerations[0].effect=NoSchedule \
            --wait --timeout 10m
      - name: Deploy Grove operator
        run: |
          cd operator
          # Build and load the operator image into kind
          make docker-build IMG=grove-operator:e2e
          kind load docker-image grove-operator:e2e --name e2e-test-cluster
          # Deploy the operator
          make deploy IMG=grove-operator:e2e
          # Wait for the operator to be ready
          kubectl wait --for=condition=available --timeout=5m deployment/grove-controller-manager -n grove-system
          kubectl wait --for=condition=ready --timeout=5m pod -l control-plane=controller-manager -n grove-system
      - name: Wait for Kai Scheduler to be ready
        run: |
          kubectl wait --for=condition=ready --timeout=5m pod -l app=kai-scheduler -n kai-system
      - name: Create default Kai queues
        run: |
          cd operator/e2e
          # Fall back to creating the queue inline only if the helper fails
          if ! go run -tags=e2e ./cmd/create-kai-queues/main.go; then
            echo "Using inline queue creation..."
            kubectl apply -f - <<EOF
          apiVersion: scheduling.x-k8s.io/v1alpha1
          kind: Queue
          metadata:
            name: default
            namespace: default
          spec:
            weight: 1
          EOF
          fi
      - name: Verify Grove webhook is ready
        run: |
          # Probe the webhook with a server-side dry-run create
          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server || true
          sleep 5
          # Try again to ensure the webhook is responding
          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server
      - name: Export kubeconfig
        run: |
          # Export the kubeconfig from kind
          kind get kubeconfig --name e2e-test-cluster > /tmp/kubeconfig
          # Verify the cluster is accessible
          kubectl --kubeconfig=/tmp/kubeconfig cluster-info
          kubectl --kubeconfig=/tmp/kubeconfig get nodes
          echo "Kubeconfig server:"
          grep "server:" /tmp/kubeconfig
      - name: Save cluster configuration
        run: |
          # Save any additional environment configuration needed for tests
          echo "CLUSTER_NAME=e2e-test-cluster" > /tmp/cluster-config.env
          echo "CLUSTER_TYPE=kind" >> /tmp/cluster-config.env
          echo "RUNNER_IP=$RUNNER_IP" >> /tmp/cluster-config.env
      - name: Upload cluster artifacts
        uses: actions/upload-artifact@v4
        with:
          name: cluster-artifacts
          path: |
            /tmp/kubeconfig
            /tmp/cluster-config.env
          retention-days: 1
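
  # Each matrix entry below runs as its own job (potentially on a different
  # runner) and talks to the shared cluster via the kubeconfig artifact
  # produced by setup-cluster.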
  e2e:
    needs: setup-cluster
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        include:
          - test_name: gang_scheduling
            test_pattern: "^Test_GS"
          - test_name: rolling_updates
            test_pattern: "^Test_RU"
          - test_name: startup_ordering
            test_pattern: "^Test_SO"
          - test_name: topology_aware_scheduling
            test_pattern: "^Test_TAS"
    name: E2E - ${{ matrix.test_name }}
    steps:
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"
      - name: Download cluster artifacts
        uses: actions/download-artifact@v4
        with:
          name: cluster-artifacts
          path: /tmp/
      - name: Load cluster configuration
        run: |
          # Load environment variables from the cluster setup job
          cat /tmp/cluster-config.env >> $GITHUB_ENV
          # A plain 'export' would not survive past this step, so persist
          # KUBECONFIG for later steps via GITHUB_ENV instead
          echo "KUBECONFIG=/tmp/kubeconfig" >> $GITHUB_ENV
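      # E2E_USE_EXISTING_CLUSTER signals the test harness to target the
      # cluster from KUBECONFIG rather than provisioning its own.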
      - name: Run e2e tests - ${{ matrix.test_name }}
        env:
          KUBECONFIG: /tmp/kubeconfig
          E2E_USE_EXISTING_CLUSTER: "true"
        run: |
          cd operator/e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run '${{ matrix.test_pattern }}'
      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-logs-${{ matrix.test_name }}
          path: /tmp/e2e-*.log
          if-no-files-found: ignore
          retention-days: 7
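
  # if: always() runs cleanup even when the e2e matrix fails or is cancelled.
  # Note that kind clusters are local Docker containers, so the delete below
  # only takes effect if this job lands on the runner that created the
  # cluster (see the runner-label NOTE at the top of this file).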
  cleanup-cluster:
    needs: e2e
    if: always()
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 10
    name: Cleanup E2E Cluster
    steps:
      - name: Install kind (for cleanup)
        run: |
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind || true
      - name: Cleanup kind cluster
        run: |
          kind delete cluster --name e2e-test-cluster || true