Skip to content

Commit dd3d323

Browse files
committed
Simplify E2E workflow following KAI Scheduler pattern
- Single job with sequential cluster setup and test execution
- Use helm/kind-action for cluster creation (30 workers)
- Deploy local registry in cluster with NodePort
- Build and push Grove operator to local registry
- Install Kai Scheduler from helm chart
- Run all E2E tests sequentially (no parallelism)
- Clean up cluster in same job with 'if: always()'

This follows the proven pattern from KAI Scheduler's E2E workflow and runs everything in one job on one runner.
1 parent 456c33b commit dd3d323

File tree

1 file changed

+81
-167
lines changed

1 file changed

+81
-167
lines changed

.github/workflows/e2e-test.yaml

Lines changed: 81 additions & 167 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,6 @@
1414
# limitations under the License.
1515
# */
1616

17-
# NOTE: This workflow splits cluster creation from E2E test execution.
18-
# REQUIREMENTS for self-hosted runners:
19-
# - All runners must be on the same network and able to reach each other via IP
20-
# - The k3d cluster API server (port 6550) must be accessible across runners
21-
# - If runners are on completely isolated machines, consider using runner labels
22-
# to ensure all jobs run on the same runner instance
23-
2417
name: E2E Tests
2518

2619
on:
@@ -35,14 +28,13 @@ concurrency:
3528
cancel-in-progress: true
3629

3730
jobs:
38-
setup-cluster:
31+
e2e:
3932
# Run on non-draft PRs or draft PRs with 'run-e2e' label
4033
if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
4134
runs-on: cpu-amd-m5-2xlarge
42-
timeout-minutes: 30
43-
name: Setup E2E Cluster
35+
timeout-minutes: 90
36+
name: E2E Tests
4437
steps:
45-
# print runner specs so we have a record in case of failures
4638
- name: Print runner specs
4739
run: |
4840
echo "CPUs: $(nproc)"
@@ -58,47 +50,15 @@ jobs:
5850
sudo apt install build-essential -y
5951
6052
- name: Set up Go
61-
uses: actions/setup-go@v4
53+
uses: actions/setup-go@v5
6254
with:
6355
go-version: "1.24.5"
6456

65-
- name: Install kind
66-
run: |
67-
# Install kind
68-
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
69-
chmod +x ./kind
70-
sudo mv ./kind /usr/local/bin/kind
71-
kind version
72-
73-
- name: Install kubectl
74-
run: |
75-
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
76-
chmod +x kubectl
77-
sudo mv kubectl /usr/local/bin/
78-
kubectl version --client
79-
80-
- name: Install Helm
81-
run: |
82-
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
83-
helm version
84-
85-
- name: Get runner IP address
86-
id: get-ip
87-
run: |
88-
# Get the primary IP address of this runner
89-
RUNNER_IP=$(hostname -I | awk '{print $1}')
90-
echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_ENV
91-
echo "runner_ip=$RUNNER_IP" >> $GITHUB_OUTPUT
92-
echo "Runner IP: $RUNNER_IP"
93-
9457
- name: Create kind cluster configuration
9558
run: |
9659
cat <<EOF > /tmp/kind-config.yaml
9760
kind: Cluster
9861
apiVersion: kind.x-k8s.io/v1alpha4
99-
networking:
100-
apiServerAddress: "$RUNNER_IP"
101-
apiServerPort: 6443
10262
nodes:
10363
- role: control-plane
10464
- role: worker
@@ -131,52 +91,88 @@ jobs:
13191
- role: worker
13292
- role: worker
13393
- role: worker
94+
containerdConfigPatches:
95+
- |-
96+
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."localhost:30100"]
97+
endpoint = ["http://registry:5000"]
13498
EOF
135-
cat /tmp/kind-config.yaml
13699
137-
- name: Create kind cluster
100+
- name: Create k8s Kind Cluster
101+
uses: helm/kind-action@v1.13.0
102+
with:
103+
cluster_name: grove-e2e
104+
version: v0.30.0
105+
config: /tmp/kind-config.yaml
106+
107+
- name: Deploy image registry
138108
run: |
139-
kind create cluster --name e2e-test-cluster --config /tmp/kind-config.yaml --wait 5m
109+
kubectl create namespace kube-registry || true
110+
kubectl apply -f - <<EOF
111+
apiVersion: v1
112+
kind: Service
113+
metadata:
114+
name: registry
115+
namespace: kube-registry
116+
spec:
117+
type: NodePort
118+
ports:
119+
- port: 5000
120+
nodePort: 30100
121+
protocol: TCP
122+
selector:
123+
app: registry
124+
---
125+
apiVersion: apps/v1
126+
kind: Deployment
127+
metadata:
128+
name: registry
129+
namespace: kube-registry
130+
spec:
131+
replicas: 1
132+
selector:
133+
matchLabels:
134+
app: registry
135+
template:
136+
metadata:
137+
labels:
138+
app: registry
139+
spec:
140+
containers:
141+
- name: registry
142+
image: registry:2
143+
ports:
144+
- containerPort: 5000
145+
EOF
146+
kubectl wait --for=condition=available --timeout=120s deployment/registry -n kube-registry
140147
141-
- name: Install Kai Scheduler
148+
- name: Prepare charts
142149
run: |
143150
cd operator
144-
echo "> Preparing charts (copying CRDs)..."
145151
./hack/prepare-charts.sh
146-
echo "> Installing Kai Scheduler and Grove..."
147-
# Install Kai scheduler via helm
148-
helm repo add nvidia https://nvidia.github.io/k8s-device-plugin
152+
153+
- name: Build and load Grove operator image
154+
run: |
155+
cd operator
156+
make docker-build IMG=localhost:30100/grove-operator:e2e
157+
docker push localhost:30100/grove-operator:e2e
158+
159+
- name: Install Kai Scheduler
160+
run: |
149161
helm repo add kai https://nvidia.github.io/kai-scheduler
150162
helm repo update
151-
# Install with tolerations for control-plane
152163
helm install kai-scheduler kai/kai-scheduler \
153164
--namespace kai-system --create-namespace \
154-
--set scheduler.tolerations[0].key=node-role.kubernetes.io/control-plane \
155-
--set scheduler.tolerations[0].operator=Exists \
156-
--set scheduler.tolerations[0].effect=NoSchedule \
165+
--set global.registry=localhost:30100 \
157166
--wait --timeout 10m
158-
167+
159168
- name: Deploy Grove operator
160169
run: |
161170
cd operator
162-
# Build and load operator image into kind
163-
make docker-build IMG=grove-operator:e2e
164-
kind load docker-image grove-operator:e2e --name e2e-test-cluster
165-
# Deploy operator
166-
make deploy IMG=grove-operator:e2e
167-
# Wait for operator to be ready
171+
make deploy IMG=localhost:30100/grove-operator:e2e
168172
kubectl wait --for=condition=available --timeout=5m deployment/grove-controller-manager -n grove-system
169-
kubectl wait --for=condition=ready --timeout=5m pod -l control-plane=controller-manager -n grove-system
170-
171-
- name: Wait for Kai Scheduler to be ready
172-
run: |
173-
kubectl wait --for=condition=ready --timeout=5m pod -l app=kai-scheduler -n kai-system
174173
175174
- name: Create default Kai queues
176175
run: |
177-
cd operator/e2e
178-
go run -tags=e2e ./cmd/create-kai-queues/main.go || echo "Using inline queue creation..."
179-
# Fallback: create queues directly
180176
kubectl apply -f - <<EOF
181177
apiVersion: scheduling.x-k8s.io/v1alpha1
182178
kind: Queue
@@ -189,112 +185,30 @@ jobs:
189185
190186
- name: Verify Grove webhook is ready
191187
run: |
192-
# Test webhook by doing a dry-run create
193-
kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server || true
194-
sleep 5
195-
# Try again to ensure webhook is responding
196-
kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server
188+
for i in {1..30}; do
189+
if kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server 2>&1; then
190+
echo "Grove webhook is ready"
191+
break
192+
fi
193+
echo "Waiting for webhook... (attempt $i/30)"
194+
sleep 5
195+
done
197196
198-
- name: Export kubeconfig
197+
- name: Run E2E tests
199198
run: |
200-
# Export kubeconfig from kind
201-
kind get kubeconfig --name e2e-test-cluster > /tmp/kubeconfig
202-
203-
# Verify cluster is accessible
204-
kubectl --kubeconfig=/tmp/kubeconfig cluster-info
205-
kubectl --kubeconfig=/tmp/kubeconfig get nodes
206-
207-
echo "Kubeconfig server:"
208-
grep "server:" /tmp/kubeconfig
209-
210-
- name: Save cluster configuration
211-
run: |
212-
# Save any additional environment configuration needed for tests
213-
echo "CLUSTER_NAME=e2e-test-cluster" > /tmp/cluster-config.env
214-
echo "CLUSTER_TYPE=kind" >> /tmp/cluster-config.env
215-
echo "RUNNER_IP=$RUNNER_IP" >> /tmp/cluster-config.env
216-
217-
- name: Upload cluster artifacts
218-
uses: actions/upload-artifact@v4
219-
with:
220-
name: cluster-artifacts
221-
path: |
222-
/tmp/kubeconfig
223-
/tmp/cluster-config.env
224-
retention-days: 1
225-
226-
e2e:
227-
needs: setup-cluster
228-
runs-on: cpu-amd-m5-2xlarge
229-
timeout-minutes: 60
230-
strategy:
231-
fail-fast: false
232-
matrix:
233-
include:
234-
- test_name: gang_scheduling
235-
test_pattern: "^Test_GS"
236-
- test_name: rolling_updates
237-
test_pattern: "^Test_RU"
238-
- test_name: startup_ordering
239-
test_pattern: "^Test_SO"
240-
- test_name: Topology_Aware_Scheduling
241-
test_pattern: "^Test_TAS"
242-
name: E2E - ${{ matrix.test_name }}
243-
steps:
244-
- name: Print runner specs
245-
run: |
246-
echo "CPUs: $(nproc)"
247-
echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
248-
249-
- name: Checkout code
250-
uses: actions/checkout@v4
251-
252-
- name: Set up Go
253-
uses: actions/setup-go@v4
254-
with:
255-
go-version: "1.24.5"
256-
257-
- name: Download cluster artifacts
258-
uses: actions/download-artifact@v4
259-
with:
260-
name: cluster-artifacts
261-
path: /tmp/
262-
263-
- name: Load cluster configuration
264-
run: |
265-
# Load environment variables from cluster setup
266-
cat /tmp/cluster-config.env >> $GITHUB_ENV
267-
export KUBECONFIG=/tmp/kubeconfig
268-
269-
- name: Run e2e tests - ${{ matrix.test_name }}
270-
env:
271-
KUBECONFIG: /tmp/kubeconfig
272-
E2E_USE_EXISTING_CLUSTER: "true"
273-
run: |
274-
cd operator/e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run '${{ matrix.test_pattern }}'
199+
cd operator/e2e
200+
go test -tags=e2e ./tests/... -v -timeout 60m
275201
276202
- name: Upload test logs on failure
277203
if: failure()
278204
uses: actions/upload-artifact@v4
279205
with:
280-
name: e2e-test-logs-${{ matrix.test_name }}
206+
name: e2e-test-logs
281207
path: /tmp/e2e-*.log
282208
if-no-files-found: ignore
283209
retention-days: 7
284210

285-
cleanup-cluster:
286-
needs: e2e
287-
if: always()
288-
runs-on: cpu-amd-m5-2xlarge
289-
timeout-minutes: 10
290-
name: Cleanup E2E Cluster
291-
steps:
292-
- name: Install kind (for cleanup)
293-
run: |
294-
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
295-
chmod +x ./kind
296-
sudo mv ./kind /usr/local/bin/kind || true
297-
298211
- name: Cleanup kind cluster
212+
if: always()
299213
run: |
300-
kind delete cluster --name e2e-test-cluster || true
214+
kind delete cluster --name grove-e2e || true

0 commit comments

Comments
 (0)