Auto-MNNVL: add ComputeDomain component for PCS controller #365
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /*
# Copyright 2025 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */
name: E2E Tests

# Runs for pull requests that target main. `labeled` and `ready_for_review`
# are listed alongside the usual open/update events so that adding a label or
# marking a draft as ready also starts a run.
on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - labeled
      - ready_for_review
    branches:
      - main

# One concurrency group per workflow and PR number: a newly triggered run
# cancels the in-progress run for the same PR, so stale E2E runs stop
# consuming resources.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true
jobs:
  # End-to-end test job, fanned out over the test groups in the matrix below.
  e2e:
    # Run on non-draft PRs, or on draft PRs that carry the 'run-e2e' label.
    if: github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')
    # Use the NVIDIA self-hosted runner with 8 vCPUs and 30GB of RAM (on AWS);
    # the tests are unstable on the default runner with 4 vCPUs and 16GB of RAM.
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 60
    # Restrict GITHUB_TOKEN to read-only repository contents (what checkout
    # needs) instead of inheriting the repository default.
    # NOTE(review): confirm hack/prepare-charts.sh and the e2e tests do not
    # rely on broader token scopes.
    permissions:
      contents: read
    strategy:
      # Let every test group finish even when another group fails.
      fail-fast: false
      matrix:
        # test_name labels the job and its log artifact; test_pattern is the
        # regular expression handed to `go test -run`.
        include:
          - test_name: gang_scheduling
            test_pattern: "^Test_GS"
          - test_name: rolling_updates
            test_pattern: "^Test_RU"
          - test_name: startup_ordering
            test_pattern: "^Test_SO"
          - test_name: Topology_Aware_Scheduling
            test_pattern: "^Test_TAS"
    name: E2E - ${{ matrix.test_name }}
    steps:
      # Print runner specs so we have a record in case of failures.
      - name: Print runner specs
        run: |
          echo "CPUs: $(nproc)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"

      - name: Checkout code
        uses: actions/checkout@v4

      # NVIDIA self-hosted runners don't have make installed by default.
      # apt-get is used rather than apt because apt's CLI is not stable for
      # scripted use.
      - name: Install build-essential for make
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: "1.24.5"

      # -f makes curl fail on an HTTP error instead of piping an error page
      # into bash; the following `k3d version` then fails the step.
      # NOTE(review): this installs whatever install.sh on k3d's main branch
      # resolves to; consider pinning a release for reproducible runs.
      - name: Install k3d
        run: |
          curl -fsSL https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
          k3d version

      # -f keeps an HTTP error page from being saved and installed as the
      # skaffold binary.
      # NOTE(review): `latest` is unpinned; consider pinning a release.
      - name: Install skaffold
        run: |
          curl -fsSLo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
          sudo install skaffold /usr/local/bin/
          skaffold version

      # Matrix values reach the script through environment variables instead
      # of being expanded into the script text, so the shell treats them as
      # data rather than as code.
      - name: Run e2e tests - ${{ matrix.test_name }}
        env:
          TEST_NAME: ${{ matrix.test_name }}
          TEST_PATTERN: ${{ matrix.test_pattern }}
        run: |
          cd operator
          echo "> Preparing charts (copying CRDs)..."
          ./hack/prepare-charts.sh
          echo "> Running e2e tests for ${TEST_NAME}..."
          cd e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run "${TEST_PATTERN}"

      # The test code handles cleanup via Teardown(), but this step provides
      # extra safety in case of timeout or panic. Also good practice to ensure
      # clean state even though GitHub Actions runners are ephemeral.
      # `|| true` keeps a missing cluster from failing the job.
      - name: Cleanup k3d cluster
        if: always()
        run: |
          k3d cluster delete shared-e2e-test-cluster || true

      # Only on failure: keep any /tmp/e2e-*.log files for 7 days; finding no
      # matching files is not treated as an error.
      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-logs-${{ matrix.test_name }}
          path: /tmp/e2e-*.log
          if-no-files-found: ignore
          retention-days: 7