Split e2e cluster creation #357

Workflow file for this run

# /*
# Copyright 2025 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */
# NOTE: This workflow splits cluster creation from E2E test execution.
# REQUIREMENTS for self-hosted runners:
# - All runners must be on the same network and able to reach each other via IP
# - The kind cluster API server (port 6443) must be accessible across runners
# - If runners are on completely isolated machines, consider using runner labels
#   to ensure all jobs run on the same runner instance (sketched below)
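#
# A minimal sketch of that label-based pinning (the 'e2e-shared' label is
# illustrative, not one defined for this repository):
#
#   runs-on: [self-hosted, e2e-shared]
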
name: E2E Tests

on:
  pull_request:
    types: [opened, synchronize, reopened, labeled, ready_for_review]
    branches: ["main"]

# Cancel in-progress runs when a new run is triggered for the same PR.
# This prevents stale E2E test runs from consuming resources.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  setup-cluster:
    # Run on non-draft PRs, or on draft PRs carrying the 'run-e2e' label
    if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 30
    name: Setup E2E Cluster
    steps:
      # Print runner specs so we have a record in case of failures
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
      - name: Checkout code
        uses: actions/checkout@v4
      # NVIDIA self-hosted runners don't have make installed by default
      - name: Install build-essential for make
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential
      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"
      - name: Install kind
        run: |
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind
          kind version
      - name: Install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x kubectl
          sudo mv kubectl /usr/local/bin/
          kubectl version --client
      - name: Install Helm
        run: |
          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
          helm version
      - name: Get runner IP address
        id: get-ip
        run: |
          # Get the primary IP address of this runner
          RUNNER_IP=$(hostname -I | awk '{print $1}')
          echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_ENV
          echo "runner_ip=$RUNNER_IP" >> $GITHUB_OUTPUT
          echo "Runner IP: $RUNNER_IP"
      - name: Create kind cluster configuration
        run: |
          cat <<EOF > /tmp/kind-config.yaml
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            apiServerAddress: "$RUNNER_IP"
            apiServerPort: 6443
          nodes:
            - role: control-plane
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
            - role: worker
          EOF
          cat /tmp/kind-config.yaml
      - name: Create kind cluster
        run: |
          kind create cluster --name e2e-test-cluster --config /tmp/kind-config.yaml --wait 5m
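      # --wait 5m blocks until the control-plane node reports Ready, so the
      # install steps below can assume a responsive API server.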
      - name: Install Kai Scheduler
        run: |
          cd operator
          echo "> Preparing charts (copying CRDs)..."
          ./hack/prepare-charts.sh
          echo "> Installing Kai Scheduler..."
          # Install Kai scheduler via helm
          helm repo add nvidia https://nvidia.github.io/k8s-device-plugin
          helm repo add kai https://nvidia.github.io/kai-scheduler
          helm repo update
          # Tolerate the control-plane taint so the scheduler can land on any node
          helm install kai-scheduler kai/kai-scheduler \
            --namespace kai-system --create-namespace \
            --set scheduler.tolerations[0].key=node-role.kubernetes.io/control-plane \
            --set scheduler.tolerations[0].operator=Exists \
            --set scheduler.tolerations[0].effect=NoSchedule \
            --wait --timeout 10m
      - name: Deploy Grove operator
        run: |
          cd operator
          # Build and load the operator image into kind
          make docker-build IMG=grove-operator:e2e
          kind load docker-image grove-operator:e2e --name e2e-test-cluster
          # Deploy the operator
          make deploy IMG=grove-operator:e2e
          # Wait for the operator to be ready
          kubectl wait --for=condition=available --timeout=5m deployment/grove-controller-manager -n grove-system
          kubectl wait --for=condition=ready --timeout=5m pod -l control-plane=controller-manager -n grove-system
      - name: Wait for Kai Scheduler to be ready
        run: |
          kubectl wait --for=condition=ready --timeout=5m pod -l app=kai-scheduler -n kai-system
      - name: Create default Kai queues
        run: |
          cd operator/e2e
          # Fall back to creating the queue inline only if the helper fails
          if ! go run -tags=e2e ./cmd/create-kai-queues/main.go; then
            echo "Using inline queue creation..."
            kubectl apply -f - <<EOF
          apiVersion: scheduling.x-k8s.io/v1alpha1
          kind: Queue
          metadata:
            name: default
            namespace: default
          spec:
            weight: 1
          EOF
          fi
      - name: Verify Grove webhook is ready
        run: |
          # Probe the webhook with a server-side dry-run create
          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server || true
          sleep 5
          # Try again to ensure the webhook is responding
          kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server
      - name: Export kubeconfig
        run: |
          # Export the kubeconfig from kind
          kind get kubeconfig --name e2e-test-cluster > /tmp/kubeconfig
          # Verify the cluster is accessible
          kubectl --kubeconfig=/tmp/kubeconfig cluster-info
          kubectl --kubeconfig=/tmp/kubeconfig get nodes
          echo "Kubeconfig server:"
          grep "server:" /tmp/kubeconfig
      - name: Save cluster configuration
        run: |
          # Save any additional environment configuration needed for tests
          echo "CLUSTER_NAME=e2e-test-cluster" > /tmp/cluster-config.env
          echo "CLUSTER_TYPE=kind" >> /tmp/cluster-config.env
          echo "RUNNER_IP=$RUNNER_IP" >> /tmp/cluster-config.env
      - name: Upload cluster artifacts
        uses: actions/upload-artifact@v4
        with:
          name: cluster-artifacts
          path: |
            /tmp/kubeconfig
            /tmp/cluster-config.env
          retention-days: 1
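
  # Each matrix entry below runs as its own job (potentially on a different
  # runner) and talks to the shared cluster via the kubeconfig artifact
  # produced by setup-cluster.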
  e2e:
    needs: setup-cluster
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        include:
          - test_name: gang_scheduling
            test_pattern: "^Test_GS"
          - test_name: rolling_updates
            test_pattern: "^Test_RU"
          - test_name: startup_ordering
            test_pattern: "^Test_SO"
          - test_name: topology_aware_scheduling
            test_pattern: "^Test_TAS"
    name: E2E - ${{ matrix.test_name }}
    steps:
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"
      - name: Download cluster artifacts
        uses: actions/download-artifact@v4
        with:
          name: cluster-artifacts
          path: /tmp/
      - name: Load cluster configuration
        run: |
          # Load environment variables from the cluster setup job
          cat /tmp/cluster-config.env >> $GITHUB_ENV
          # A plain 'export' would not survive past this step, so persist
          # KUBECONFIG for later steps via GITHUB_ENV instead
          echo "KUBECONFIG=/tmp/kubeconfig" >> $GITHUB_ENV
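      # E2E_USE_EXISTING_CLUSTER signals the test harness to target the
      # cluster from KUBECONFIG rather than provisioning its own.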
      - name: Run e2e tests - ${{ matrix.test_name }}
        env:
          KUBECONFIG: /tmp/kubeconfig
          E2E_USE_EXISTING_CLUSTER: "true"
        run: |
          cd operator/e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run '${{ matrix.test_pattern }}'
      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-logs-${{ matrix.test_name }}
          path: /tmp/e2e-*.log
          if-no-files-found: ignore
          retention-days: 7
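
  # if: always() runs cleanup even when the e2e matrix fails or is cancelled.
  # Note that kind clusters are local Docker containers, so the delete below
  # only takes effect if this job lands on the runner that created the
  # cluster (see the runner-label NOTE at the top of this file).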
  cleanup-cluster:
    needs: e2e
    if: always()
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 10
    name: Cleanup E2E Cluster
    steps:
      - name: Install kind (for cleanup)
        run: |
          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind || true
      - name: Cleanup kind cluster
        run: |
          kind delete cluster --name e2e-test-cluster || true