Skip to content

Auto-MNNVL: add ComputeDomain component for PCS controller #365

Auto-MNNVL: add ComputeDomain component for PCS controller

Auto-MNNVL: add ComputeDomain component for PCS controller #365

Workflow file for this run

# /*
# Copyright 2025 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */
name: E2E Tests

# Runs for pull requests that target main. "labeled" and "ready_for_review"
# are listed so that adding a label to a PR, or moving it out of draft,
# re-evaluates the job-level `if` condition.
on:
  pull_request:
    types: [opened, synchronize, reopened, labeled, ready_for_review]
    branches: ["main"]

# Cancel the in-progress run when a new run is triggered for the same PR,
# so stale E2E runs do not keep consuming runner resources.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true
jobs:
  # One E2E job per test suite; each matrix entry selects its Go tests
  # through a `-run` regular expression.
  e2e:
    # Run on non-draft PRs, or on draft PRs carrying the 'run-e2e' label.
    if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
    # Use the NVIDIA self-hosted runner with 8 vCPUs and 30GB of RAM (on AWS);
    # the tests are unstable on the default runner with 4 vCPUs and 16GB of RAM.
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 60
    strategy:
      # Let the remaining suites finish even if one of them fails.
      fail-fast: false
      matrix:
        include:
          - test_name: gang_scheduling
            test_pattern: "^Test_GS"
          - test_name: rolling_updates
            test_pattern: "^Test_RU"
          - test_name: startup_ordering
            test_pattern: "^Test_SO"
          - test_name: Topology_Aware_Scheduling
            test_pattern: "^Test_TAS"
    name: E2E - ${{ matrix.test_name }}
    steps:
      # Print runner specs so we have a record in case of failures.
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"

      - name: Checkout code
        uses: actions/checkout@v4

      # NVIDIA self-hosted runners don't have make installed by default.
      - name: Install build-essential for make
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential

      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: "1.24.5"

      # `shell: bash` enables `-o pipefail`, and `curl -f` turns HTTP errors
      # into a non-zero exit, so a failed download stops the step at the
      # download itself instead of piping nothing into bash.
      # NOTE(review): the installer is fetched from the k3d `main` branch and
      # installs whatever it resolves as latest; consider pinning a release.
      - name: Install k3d
        shell: bash
        run: |
          curl -fsSL https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
          k3d version

      # `-f` prevents an HTTP error page from being saved and installed as
      # the skaffold binary.
      - name: Install skaffold
        run: |
          curl -fLo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
          sudo install skaffold /usr/local/bin/
          skaffold version

      # Matrix values are handed to the script through environment variables
      # rather than being expanded into the script text, so the shell only
      # ever sees them as quoted data.
      - name: Run e2e tests - ${{ matrix.test_name }}
        env:
          TEST_NAME: ${{ matrix.test_name }}
          TEST_PATTERN: ${{ matrix.test_pattern }}
        run: |
          cd operator
          echo "> Preparing charts (copying CRDs)..."
          ./hack/prepare-charts.sh
          echo "> Running e2e tests for ${TEST_NAME}..."
          cd e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run "${TEST_PATTERN}"

      # The test code handles cleanup via Teardown(), but this step provides
      # extra safety in case of timeout or panic. Also good practice to ensure
      # clean state even though GitHub Actions runners are ephemeral.
      # `|| true` keeps this step from failing when no cluster is left.
      - name: Cleanup k3d cluster
        if: always()
        run: |
          k3d cluster delete shared-e2e-test-cluster || true

      # Only runs when an earlier step failed; a missing log file is not
      # treated as an error.
      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-logs-${{ matrix.test_name }}
          path: /tmp/e2e-*.log
          if-no-files-found: ignore
          retention-days: 7