Skip to content

Split e2e cluster creation #361

Split e2e cluster creation

Split e2e cluster creation #361

Workflow file for this run

# /*
# Copyright 2025 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */
name: E2E Tests

# Runs for pull requests that target main, on the activity types listed below.
on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - labeled
      - ready_for_review
    branches:
      - main

# Keep at most one active run per workflow and pull request: a newly triggered
# run cancels the one still in progress, so stale E2E runs stop consuming
# runner resources.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true
jobs:
  e2e:
    name: E2E Tests
    # Non-draft PRs always run; a draft PR runs only when it carries the
    # 'run-e2e' label.
    if: ${{ github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e') }}
    runs-on: cpu-amd-m5-2xlarge
    # Upper bound for the whole job, in minutes.
    timeout-minutes: 90
    steps:
- name: Print runner specs
  run: |
    # Log the CPU count reported by nproc and the second field of the
    # "Mem:" row printed by free -h.
    printf 'CPUs: %s\n' "$(nproc)"
    printf 'RAM: %s\n' "$(free -h | awk '/^Mem:/ {print $2}')"
# Check out the repository; later steps work inside its operator/ directory.
- name: Checkout code
  uses: actions/checkout@v4
# NVIDIA self-hosted runners don't have make installed by default.
- name: Install build-essential for make
  run: |
    # Use apt-get rather than apt: apt is the interactive front end, its
    # command-line interface is not guaranteed stable, and it warns when
    # used from a script.
    sudo apt-get update
    sudo apt-get install -y build-essential
# Install the Go toolchain needed by the later `go test` step.
- name: Set up Go
  uses: actions/setup-go@v5
  with:
    # Quoted so the version is always read as a string, never as a number.
    go-version: '1.24.5'
# Provide Docker Buildx; the operator image build passes buildx arguments
# (DOCKER_BUILDX_ADDITIONAL_ARGS) to `make docker-build`.
- name: Set up Docker Buildx
  uses: docker/setup-buildx-action@v3
- name: Create kind cluster configuration
  run: |
    # Writes /tmp/kind-config.yaml describing one control-plane node,
    # 30 worker nodes, and a containerd registry mirror that maps
    # localhost:30100 to http://registry:5000.
    # NOTE(review): the host name "registry" has to be resolvable by
    # containerd on the kind nodes - confirm.
    kind_config=/tmp/kind-config.yaml
    worker_count=30

    cat <<'EOF' > "${kind_config}"
    kind: Cluster
    apiVersion: kind.x-k8s.io/v1alpha4
    nodes:
    - role: control-plane
    EOF

    # One list entry per worker node.
    for ((i = 0; i < worker_count; i++)); do
      echo '- role: worker' >> "${kind_config}"
    done

    cat <<'EOF' >> "${kind_config}"
    containerdConfigPatches:
    - |-
      [plugins."io.containerd.grpc.v1.cri".registry.mirrors."localhost:30100"]
        endpoint = ["http://registry:5000"]
    EOF
# Create the "grove-e2e" kind cluster from the configuration file written to
# /tmp/kind-config.yaml.
- name: Create k8s Kind Cluster
  uses: helm/kind-action@v1.13.0
  with:
    config: /tmp/kind-config.yaml
    cluster_name: grove-e2e
    # NOTE(review): presumably the kind release to install - confirm against
    # the action's documented inputs.
    version: v0.30.0
- name: Deploy image registry
  run: |
    # Runs a single-replica registry:2 Deployment in the kube-registry
    # namespace, exposes it through a NodePort Service (port 5000, node port
    # 30100), and waits up to 120 seconds for the Deployment to be Available.
    registry_ns=kube-registry

    # A failure here (for example, the namespace already exists) is ignored.
    kubectl create namespace "${registry_ns}" || true

    kubectl apply -f - <<EOF
    apiVersion: v1
    kind: Service
    metadata:
      name: registry
      namespace: ${registry_ns}
    spec:
      type: NodePort
      ports:
      - port: 5000
        nodePort: 30100
        protocol: TCP
      selector:
        app: registry
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: registry
      namespace: ${registry_ns}
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: registry
      template:
        metadata:
          labels:
            app: registry
        spec:
          containers:
          - name: registry
            image: registry:2
            ports:
            - containerPort: 5000
    EOF

    kubectl wait deployment/registry \
      --namespace "${registry_ns}" \
      --for=condition=available \
      --timeout=120s
# Run the chart preparation script from inside the operator directory.
- name: Prepare charts
  working-directory: operator
  run: ./hack/prepare-charts.sh
- name: Build and load Grove operator image
  working-directory: operator
  run: |
    operator_image=localhost:30100/grove-operator:e2e

    # Build with buildx; --load puts the result into the local Docker image
    # store so it can be pushed afterwards.
    make docker-build IMG="${operator_image}" DOCKER_BUILDX_ADDITIONAL_ARGS="--load"

    # Publish the image to the local registry.
    docker push "${operator_image}"
- name: Install Kai Scheduler
  run: |
    # Install the kai-scheduler chart into the kai-system namespace (created
    # if missing) and wait up to 10 minutes for the release to be ready.
    helm repo add kai https://nvidia.github.io/kai-scheduler
    helm repo update

    install_flags=(
      --namespace kai-system
      --create-namespace
      --set global.registry=localhost:30100
      --wait
      --timeout 10m
    )
    helm install kai-scheduler kai/kai-scheduler "${install_flags[@]}"
- name: Deploy Grove operator
  working-directory: operator
  run: |
    # Deploy with the image from the local registry, then wait up to five
    # minutes for the controller manager Deployment to become Available.
    make deploy IMG=localhost:30100/grove-operator:e2e
    kubectl wait deployment/grove-controller-manager \
      --namespace grove-system \
      --for=condition=available \
      --timeout=5m
- name: Create default Kai queues
  run: |
    # Apply a Queue named "default" with weight 1. The heredoc delimiter is
    # quoted because the manifest contains nothing for the shell to expand.
    kubectl apply -f - <<'QUEUE'
    apiVersion: scheduling.x-k8s.io/v1alpha1
    kind: Queue
    metadata:
      name: default
      namespace: default
    spec:
      weight: 1
    QUEUE
- name: Verify Grove webhook is ready
  run: |
    # Poll with a server-side dry-run create of the sample PodCliqueSet until
    # it succeeds, sleeping 5 seconds between attempts. The step fails if
    # every attempt fails, so the E2E tests never start against a webhook
    # that is not serving.
    max_attempts=30
    for attempt in $(seq 1 "${max_attempts}"); do
      if kubectl create -f operator/config/samples/grove_v1alpha1_podcliqueset.yaml --dry-run=server 2>&1; then
        echo "Grove webhook is ready"
        exit 0
      fi
      echo "Waiting for webhook... (attempt ${attempt}/${max_attempts})"
      sleep 5
    done

    # All attempts failed: surface an error annotation and fail the step
    # instead of falling through with a zero exit status.
    echo "::error::Grove webhook did not become ready after ${max_attempts} attempts"
    exit 1
# Run the e2e-tagged tests under operator/e2e verbosely, with a 60-minute
# go test timeout.
- name: Run E2E tests
  working-directory: operator/e2e
  run: go test -tags=e2e -v -timeout 60m ./tests/...
# Only after a failure: upload any /tmp/e2e-*.log files as the
# "e2e-test-logs" artifact, kept for 7 days. Having no matching files is not
# an error.
- name: Upload test logs on failure
  if: ${{ failure() }}
  uses: actions/upload-artifact@v4
  with:
    name: e2e-test-logs
    path: '/tmp/e2e-*.log'
    retention-days: 7
    if-no-files-found: ignore
# Runs regardless of earlier step results; a failed delete is ignored.
- name: Cleanup kind cluster
  if: ${{ always() }}
  run: kind delete cluster --name grove-e2e || true