1414# limitations under the License.
1515# */
1616
17- # NOTE: This workflow splits cluster creation from E2E test execution.
18- # REQUIREMENTS for self-hosted runners:
19- # - All runners must be on the same network and able to reach each other via IP
20- # - The k3d cluster API server (port 6550) must be accessible across runners
21- # - If runners are on completely isolated machines, consider using runner labels
22- # to ensure all jobs run on the same runner instance
23-
2417name : E2E Tests
2518
2619on :
@@ -35,14 +28,13 @@ concurrency:
3528 cancel-in-progress : true
3629
3730jobs :
38- setup-cluster :
31+ e2e :
3932 # Run on non-draft PRs or draft PRs with 'run-e2e' label
4033 if : github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
4134 runs-on : cpu-amd-m5-2xlarge
42- timeout-minutes : 30
43- name : Setup E2E Cluster
35+ timeout-minutes : 90
36+ name : E2E Tests
4437 steps :
45- # print runner specs so we have a record incase of failures
4638 - name : Print runner specs
4739 run : |
4840 echo "CPUs: $(nproc)"
@@ -58,47 +50,15 @@ jobs:
5850 sudo apt install build-essential -y
5951
# Install the Go toolchain for the steps that follow (the e2e suite runs via `go test`).
- name: Set up Go
  uses: actions/setup-go@v5
  with:
    # Quoted so the version is always read as a string, never a number.
    go-version: '1.24.5'
6456
65- - name : Install kind
66- run : |
67- # Install kind
68- curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
69- chmod +x ./kind
70- sudo mv ./kind /usr/local/bin/kind
71- kind version
72-
73- - name : Install kubectl
74- run : |
75- curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
76- chmod +x kubectl
77- sudo mv kubectl /usr/local/bin/
78- kubectl version --client
79-
80- - name : Install Helm
81- run : |
82- curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
83- helm version
84-
85- - name : Get runner IP address
86- id : get-ip
87- run : |
88- # Get the primary IP address of this runner
89- RUNNER_IP=$(hostname -I | awk '{print $1}')
90- echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_ENV
91- echo "runner_ip=$RUNNER_IP" >> $GITHUB_OUTPUT
92- echo "Runner IP: $RUNNER_IP"
93-
9457 - name : Create kind cluster configuration
9558 run : |
9659 cat <<EOF > /tmp/kind-config.yaml
9760 kind: Cluster
9861 apiVersion: kind.x-k8s.io/v1alpha4
99- networking:
100- apiServerAddress: "$RUNNER_IP"
101- apiServerPort: 6443
10262 nodes:
10363 - role: control-plane
10464 - role: worker
@@ -131,52 +91,88 @@ jobs:
13191 - role: worker
13292 - role: worker
13393 - role: worker
94+ containerdConfigPatches:
95+ - |-
96+ [plugins."io.containerd.grpc.v1.cri".registry.mirrors."localhost:30100"]
97+ endpoint = ["http://registry:5000"]
13498 EOF
135- cat /tmp/kind-config.yaml
13699
# Provision the kind cluster from the config file written to /tmp/kind-config.yaml.
- name: Create k8s Kind Cluster
  uses: helm/kind-action@v1.13.0
  with:
    config: /tmp/kind-config.yaml
    # The cleanup step deletes the cluster by this name; keep the two in sync.
    cluster_name: grove-e2e
    version: v0.30.0
106+
# Run a registry inside the cluster and expose it on NodePort 30100 so locally
# built images can be pushed as localhost:30100/<image>.
# NOTE(review): pushing from the runner assumes NodePort 30100 is reachable on
# the host's localhost (e.g. via a kind port mapping) - confirm in the kind config.
- name: Deploy image registry
  run: |
    # The Namespace is part of the applied manifest, so creation is idempotent
    # and real API errors fail the step instead of being hidden by `|| true`.
    # The heredoc delimiter is quoted so the shell never expands the manifest.
    kubectl apply -f - <<'EOF'
    apiVersion: v1
    kind: Namespace
    metadata:
      name: kube-registry
    ---
    apiVersion: v1
    kind: Service
    metadata:
      name: registry
      namespace: kube-registry
    spec:
      type: NodePort
      ports:
        - port: 5000
          nodePort: 30100
          protocol: TCP
      selector:
        app: registry
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: registry
      namespace: kube-registry
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: registry
      template:
        metadata:
          labels:
            app: registry
        spec:
          containers:
            - name: registry
              image: registry:2
              ports:
                - containerPort: 5000
    EOF
    # Block until the registry is available so the image push that follows cannot race it.
    kubectl wait --for=condition=available --timeout=120s deployment/registry -n kube-registry
140147
# Run the chart preparation script from the operator directory.
- name: Prepare charts
  working-directory: operator
  run: ./hack/prepare-charts.sh
152+
# Build the operator image and push it to the registry exposed on localhost:30100.
- name: Build and load Grove operator image
  working-directory: operator
  env:
    # Single source of truth for the image reference used by both commands.
    IMG: 'localhost:30100/grove-operator:e2e'
  run: |
    make docker-build IMG="$IMG"
    docker push "$IMG"
158+
# Install the Kai scheduler chart into its own namespace and wait for the release.
# NOTE(review): global.registry points at localhost:30100 - presumably the Kai
# images are expected to be available from that registry; verify.
- name: Install Kai Scheduler
  run: |
    helm repo add kai https://nvidia.github.io/kai-scheduler
    helm repo update
    helm install kai-scheduler kai/kai-scheduler \
      --namespace=kai-system \
      --create-namespace \
      --set global.registry=localhost:30100 \
      --wait \
      --timeout=10m
158-
167+
# Deploy the operator using the image pushed to the local registry, then wait
# for the controller-manager Deployment to report Available.
- name: Deploy Grove operator
  working-directory: operator
  run: |
    make deploy IMG=localhost:30100/grove-operator:e2e
    kubectl wait deployment/grove-controller-manager \
      --namespace grove-system \
      --for=condition=available \
      --timeout=5m
170-
171- - name : Wait for Kai Scheduler to be ready
172- run : |
173- kubectl wait --for=condition=ready --timeout=5m pod -l app=kai-scheduler -n kai-system
174173
175174 - name : Create default Kai queues
176175 run : |
177- cd operator/e2e
178- go run -tags=e2e ./cmd/create-kai-queues/main.go || echo "Using inline queue creation..."
179- # Fallback: create queues directly
180176 kubectl apply -f - <<EOF
181177 apiVersion: scheduling.x-k8s.io/v1alpha1
182178 kind: Queue
@@ -189,112 +185,30 @@ jobs:
189185
# Poll the API server with a server-side dry-run create of the sample
# PodCliqueSet until it is accepted, and fail the step if it never is.
- name: Verify Grove webhook is ready
  run: |
    sample=operator/config/samples/grove_v1alpha1_podcliqueset.yaml
    max_attempts=30
    for attempt in $(seq 1 "$max_attempts"); do
      if kubectl create -f "$sample" --dry-run=server 2>&1; then
        echo "Grove webhook is ready"
        exit 0
      fi
      echo "Waiting for webhook... (attempt $attempt/$max_attempts)"
      sleep 5
    done
    # Every attempt failed: surface it here rather than letting the e2e suite
    # run against a webhook that is not serving.
    echo "::error::Grove webhook did not become ready after $max_attempts attempts"
    # Best-effort diagnostics; must not mask the failure exit code below.
    kubectl get pods -n grove-system || true
    exit 1
197196
# Run the whole e2e suite in one invocation. The 60m go test timeout stays
# below the job's 90-minute limit.
- name: Run E2E tests
  working-directory: operator/e2e
  run: go test -tags=e2e ./tests/... -v -timeout 60m
275201
# Keep any /tmp/e2e-*.log files for a week when an earlier step has failed.
- name: Upload test logs on failure
  if: failure()
  uses: actions/upload-artifact@v4
  with:
    name: e2e-test-logs
    path: /tmp/e2e-*.log
    retention-days: 7
    # A failure before any log was written should not turn into an upload error.
    if-no-files-found: ignore
284210
285- cleanup-cluster :
286- needs : e2e
287- if : always()
288- runs-on : cpu-amd-m5-2xlarge
289- timeout-minutes : 10
290- name : Cleanup E2E Cluster
291- steps :
292- - name : Install kind (for cleanup)
293- run : |
294- curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
295- chmod +x ./kind
296- sudo mv ./kind /usr/local/bin/kind || true
297-
# Always tear the cluster down, whatever the outcome of earlier steps.
# Deletion is best-effort: a missing cluster must not fail the job.
- name: Cleanup kind cluster
  if: always()
  run: kind delete cluster --name grove-e2e || true
0 commit comments