E2E test soperator #3369
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# End-to-end test of soperator: deploys a Slurm cluster with Terraform,
# collects diagnostics and tears the cluster down again.
name: E2E test soperator

# Manual trigger only; every input has a default, which the job-level `env`
# expressions repeat as fallbacks.
on:
  workflow_dispatch:
    inputs:
      # Repository (owner/name) holding the Terraform modules.
      terraform_repo:
        description: 'terraform repository with slurm cluster deployment'
        type: string
        required: true
        default: 'nebius/nebius-solution-library'
      # Branch, tag or SHA of the Terraform repository to check out.
      terraform_repo_ref:
        description: 'ref of terraform repo to apply'
        type: string
        required: true
        default: 'main'
      # Location of the Terraform module inside the Terraform repository.
      path_to_installation:
        description: 'path inside terraform repo to terraform module to apply'
        type: string
        required: true
        default: '/soperator/installations/example'
# Least-privilege permissions for the workflow's GITHUB_TOKEN.
permissions:
  # Required by actions/checkout to clone this repository.
  contents: read
  # Required by the `gh api .../actions/workflows/.../runs` lookup and by
  # `gh run download`, which both authenticate with GITHUB_TOKEN. Without it
  # these calls only succeed while the repository is public.
  actions: read
concurrency:
  # One concurrency group per branch. GitHub still cancels older runs that are
  # WAITING in a group when a newer run for the same group arrives; keeping the
  # groups per-branch stops that from happening across branches, e.g. a run on
  # 'feature-branch' is not cancelled when a new run on `main` arrives.
  group: e2e-${{ github.ref_name }}
  # Never cancel a run that is already executing (avoids wasting resources).
  cancel-in-progress: false
jobs:
  # Single job: deploy a cluster with Terraform, dump diagnostics, destroy it.
  e2e-test:
    runs-on:
      - self-hosted
      - e2e-tests  # We have a single runner with this label => only 1 job at a time.
    environment: e2e
    env:
      # The `|| '<default>'` fallbacks apply when the run has no dispatch inputs.
      PATH_TO_INSTALLATION: "${{ github.workspace }}/terraform-repo/${{ github.event.inputs.path_to_installation || '/soperator/installations/example' }}"
      TERRAFORM_REPO: "${{ github.event.inputs.terraform_repo || 'nebius/nebius-solution-library' }}"
      TERRAFORM_REPO_REF: "${{ github.event.inputs.terraform_repo_ref || 'main' }}"
      NEBIUS_TENANT_ID: ${{ vars.E2E_TEST_NEBIUS_TENANT_ID }}
      NEBIUS_PROJECT_ID: ${{ vars.E2E_TEST_NEBIUS_PROJECT_ID }}
      NEBIUS_REGION: ${{ vars.E2E_TEST_NEBIUS_REGION }}
      WORKER_PLATFORM: ${{ vars.E2E_TEST_WORKER_PLATFORM }}
      WORKER_PRESET: ${{ vars.E2E_TEST_WORKER_PRESET }}
      INFINIBAND_FABRIC: ${{ vars.E2E_TEST_INFINIBAND_FABRIC }}
      SSH_KEYS: ${{ vars.E2E_TEST_SSH_KEYS }}
      O11Y_ACCESS_TOKEN: ${{ secrets.E2E_O11Y_ACCESS_TOKEN }}
      OUTPUT_LOG_FILE: "${{ github.workspace }}/output/output.log"
      OUTPUT_ERR_FILE: "${{ github.workspace }}/output/output.err"
    steps:
      # Full history is fetched because the job summary runs `git log` and
      # `git merge-base` against the commit of the build run.
      - name: Checkout repository
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          fetch-depth: 0

      - name: Install GO
        uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00  # v6.0.0
        with:
          go-version-file: 'go.mod'
          cache: false

      # Installs the tools via the repository Makefile and exposes
      # <workspace>/bin on PATH for the following steps.
      - name: Install E2E tools
        shell: bash
        run: |
          make install-e2e-tools
          echo "$GITHUB_WORKSPACE/bin" >> "$GITHUB_PATH"

      - name: Install AWS CLI v2
        shell: bash
        run: |
          if command -v aws >/dev/null 2>&1; then
            echo "AWS CLI already installed: $(aws --version)"
            exit 0
          fi
          echo "AWS CLI not found, installing..."
          # Unpack in a throw-away directory so the installer files do not
          # end up inside the checked-out repository.
          tmp_dir=$(mktemp -d)
          trap 'rm -rf "$tmp_dir"' EXIT
          # -f: fail on HTTP errors instead of saving an error page as the zip.
          curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "$tmp_dir/awscliv2.zip"
          unzip -q "$tmp_dir/awscliv2.zip" -d "$tmp_dir"
          sudo "$tmp_dir/aws/install" --update

      # Fails early (via `which`) if any required tool is missing from PATH.
      - name: Check tools versions
        shell: bash
        run: |
          echo "yq:"
          which yq
          yq --version
          echo ""
          echo "jq:"
          which jq
          jq --version
          echo ""
          echo "nebius:"
          which nebius
          nebius version
          echo ""
          echo "terraform:"
          which terraform
          terraform --version
          echo ""
          echo "kubectl:"
          which kubectl
          kubectl version --client
          echo ""
          echo "aws:"
          which aws
          aws --version
          echo ""
          echo "go:"
          which go
          go version

      # Looks up the newest successful run of one_job.yml on the current branch
      # and exports its id, commit, creation time and URL as step outputs.
      - name: Find latest successful build run on current branch
        id: find_build
        shell: bash
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # GITHUB_REF_NAME / GITHUB_REPOSITORY are read from the environment
          # instead of being templated into the script, so an unusual branch
          # name cannot be interpreted as shell code.
          echo "Looking for successful build run on branch: $GITHUB_REF_NAME"
          # -f sends the value as a plain string; -F would apply gh's typed
          # conversions (e.g. a leading '@' means "read from file").
          run_info=$(gh api -X GET \
            "/repos/$GITHUB_REPOSITORY/actions/workflows/one_job.yml/runs" \
            -f branch="$GITHUB_REF_NAME" \
            -f status=success \
            -F per_page=1 \
            --jq '.workflow_runs[0] | {id: .id, head_sha: .head_sha, created_at: .created_at, html_url: .html_url}')
          run_id=$(jq -r '.id' <<<"$run_info")
          if [[ "$run_id" == "null" || -z "$run_id" ]]; then
            echo "::error::No successful build found on branch $GITHUB_REF_NAME"
            exit 1
          fi
          head_sha=$(jq -r '.head_sha' <<<"$run_info")
          created_at=$(jq -r '.created_at' <<<"$run_info")
          html_url=$(jq -r '.html_url' <<<"$run_info")
          echo "Found build: $run_id"
          {
            echo "run_id=$run_id"
            echo "head_sha=$head_sha"
            echo "created_at=$created_at"
            echo "html_url=$html_url"
          } >> "$GITHUB_OUTPUT"

      # Downloads the `version` artifact of the build run found above.
      - name: Download artifact with version
        shell: bash
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          BUILD_RUN_ID: ${{ steps.find_build.outputs.run_id }}
        run: gh run download "$BUILD_RUN_ID" -n version

      # version.txt: line 1 is exported as SOPERATOR_VERSION, line 2 as
      # SOPERATOR_UNSTABLE, for all subsequent steps.
      - name: Read version
        shell: bash
        run: |
          SOPERATOR_VERSION=$(sed -n '1p' version.txt)
          SOPERATOR_UNSTABLE=$(sed -n '2p' version.txt)
          {
            echo "SOPERATOR_VERSION=$SOPERATOR_VERSION"
            echo "SOPERATOR_UNSTABLE=$SOPERATOR_UNSTABLE"
          } >> "$GITHUB_ENV"

      - name: Checkout Terraform repository
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          repository: ${{ env.TERRAFORM_REPO }}
          ref: ${{ env.TERRAFORM_REPO_REF }}
          path: "${{ github.workspace }}/terraform-repo"
          fetch-depth: 0

      - name: Terraform Apply
        shell: bash  # `source` is a bashism
        run: |
          cd "$PATH_TO_INSTALLATION"
          nebius iam session-management revoke --all-my-active
          # revoke is async operation and in different region token from source .envrc could be revoked right after it's created
          sleep 5
          source .envrc
          cd -
          # Configure AWS CLI
          # NOTE(review): AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are not set
          # by this workflow; presumably .envrc exports them - confirm.
          aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID"
          aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY"
          aws configure set region "$NEBIUS_REGION"
          aws configure set endpoint_url "https://storage.$NEBIUS_REGION.nebius.cloud:443"
          go test -v -timeout 2h --tags=e2e -run TestTerraformApply ./test/e2e/...

      # ---- Diagnostics: every step below runs even after a failure. ----

      - name: K8s Cluster Info and NodeGroups
        if: always()
        shell: bash
        run: |
          echo "=== Listing K8s clusters ==="
          clusters_json=$(nebius mk8s cluster list --parent-id "$NEBIUS_PROJECT_ID" --format json)
          echo "$clusters_json"
          echo ""
          echo "=== Listing node groups for each cluster ==="
          cluster_ids=$(echo "$clusters_json" | jq -r '.items[].metadata.id // empty')
          for cluster_id in $cluster_ids; do
            echo ""
            echo "--- Node groups for cluster: $cluster_id ---"
            nebius mk8s node-group list --parent-id "$cluster_id" --page-size 1000
          done

      - name: "K8s Cluster: Pods"
        if: always()
        shell: bash
        run: kubectl get pods -A -o wide

      - name: "K8s Cluster: Events"
        if: always()
        shell: bash
        run: kubectl get events -A --sort-by='.lastTimestamp'

      - name: "K8s Cluster: Nodes"
        if: always()
        shell: bash
        run: |
          kubectl get nodes
          echo ""
          kubectl get nodes -o yaml

      - name: "K8s Cluster: Jobs"
        if: always()
        shell: bash
        run: |
          kubectl -n soperator get job
          echo ""
          kubectl -n soperator get job -o yaml

      - name: "K8s Cluster: Helm Releases"
        if: always()
        shell: bash
        run: |
          kubectl get helmreleases -n flux-system
          echo ""
          kubectl get helmreleases -n flux-system -o yaml

      - name: "K8s Cluster: Slurm Cluster CRs"
        if: always()
        shell: bash
        run: |
          kubectl get slurmclusters -A
          echo ""
          kubectl get slurmclusters -A -o yaml

      - name: "K8s Cluster: Slurm Active Checks CRs"
        if: always()
        shell: bash
        run: |
          kubectl get activechecks -A
          echo ""
          kubectl get activechecks -A -o yaml

      # Runs the Slurm CLI tools inside the controller-0 pod.
      - name: Slurm Cluster State
        if: always()
        shell: bash
        run: |
          kubectl exec -n soperator controller-0 -- sinfo -N
          kubectl exec -n soperator controller-0 -- squeue
          kubectl exec -n soperator controller-0 -- sacct --parsable2 --allusers --starttime=now-6hours | column -t -s'|'

      - name: Collect Full Kubernetes Cluster Info
        if: always()
        shell: bash
        run: |
          mkdir -p ./cluster-info
          kubectl cluster-info dump --namespaces=kruise-system,soperator-system,soperator,flux-system --output-directory=./cluster-info

      # TODO(review): upload-artifact is referenced by a mutable tag while the
      # other actions in this job are pinned to a commit SHA; pin it as well.
      - name: Upload Full Kubernetes Cluster Info
        if: always()
        uses: actions/upload-artifact@v5
        with:
          name: cluster-info
          path: ./cluster-info
          retention-days: 7

      - name: Collect Soperator Outputs
        if: always()
        shell: bash
        run: |
          mkdir -p ./soperator-outputs
          kubectl cp soperator/controller-0:/mnt/jail/opt/soperator-outputs ./soperator-outputs

      - name: Upload Soperator Outputs
        if: always()
        uses: actions/upload-artifact@v5
        with:
          name: soperator-outputs
          path: ./soperator-outputs
          retention-days: 7

      # ---- Teardown ----

      # Always attempted so that a failed run does not leave the cluster behind.
      - name: Terraform Destroy
        if: always()
        shell: bash  # `source` is a bashism
        run: |
          cd "$PATH_TO_INSTALLATION"
          source .envrc
          cd -
          # Configure AWS CLI (in case env was lost)
          aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID"
          aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY"
          aws configure set region "$NEBIUS_REGION"
          aws configure set endpoint_url "https://storage.$NEBIUS_REGION.nebius.cloud:443"
          go test -v -timeout 30m --tags=e2e -run TestTerraformDestroy ./test/e2e/...

      # Best-effort fallback when any earlier step failed: every command is
      # allowed to fail so the step itself never aborts half-way.
      - name: Force cleanup compute instances on failure
        if: failure()
        shell: bash
        run: |
          echo "=== Node groups state before cleanup ==="
          clusters_json=$(nebius mk8s cluster list --parent-id "$NEBIUS_PROJECT_ID" --format json 2>/dev/null || true)
          cluster_ids=$(echo "$clusters_json" | jq -r '.items[].metadata.id // empty' 2>/dev/null || true)
          for cluster_id in $cluster_ids; do
            echo ""
            echo "--- Node groups for cluster: $cluster_id ---"
            nebius mk8s node-group list --parent-id "$cluster_id" --page-size 1000 --format yaml || true
          done
          echo ""
          echo "=== Forcing cleanup of remaining compute instances ==="
          # Get nodes from the cluster (ignore errors if cluster is already gone)
          nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
          if [ -z "$nodes" ]; then
            echo "No nodes found or cluster not accessible"
            exit 0
          fi
          echo "Found nodes: $nodes"
          operations=()
          # NOTE(review): node names are passed as compute instance ids;
          # this assumes the two are identical - confirm.
          for node in $nodes; do
            echo "Deleting compute instance: $node"
            if op=$(nebius compute instance delete --id="$node" --async 2>/dev/null); then
              operations+=("$op")
            else
              echo "Failed to delete instance $node (may already be deleted)"
            fi
          done
          echo ""
          echo "=== Operation statuses ==="
          for op in "${operations[@]}"; do
            echo "--- $op ---"
            nebius compute instance operation get --id="$op" || true
          done
          echo "=== Cleanup complete ==="

      # ---- Reporting ----

      # Writes build and commit information to the job summary. Values coming
      # from step outputs are passed through `env` rather than templated into
      # the script.
      - name: Add build info to job summary
        if: always()
        shell: bash
        env:
          BUILD_HEAD_SHA: ${{ steps.find_build.outputs.head_sha }}
          BUILD_CREATED_AT: ${{ steps.find_build.outputs.created_at }}
          BUILD_HTML_URL: ${{ steps.find_build.outputs.html_url }}
          BUILD_RUN_ID: ${{ steps.find_build.outputs.run_id }}
        run: |
          head_sha="$BUILD_HEAD_SHA"
          created_at="$BUILD_CREATED_AT"
          html_url="$BUILD_HTML_URL"
          run_id="$BUILD_RUN_ID"
          repo_url="https://github.com/$GITHUB_REPOSITORY"
          terraform_repo_url="https://github.com/$TERRAFORM_REPO"
          # The commit subject is the LAST field of the log format so that a
          # '|' inside a subject cannot shift the other fields.
          log_format='%h|%ar|%an|%s'
          {
            echo "### Image Build Information"
            if [[ -n "$run_id" && "$run_id" != "null" ]]; then
              echo "[Run $run_id]($html_url) at $created_at"
              echo "### Latest Commits on $GITHUB_REF_NAME"
              # `|| true`: a failing `git log` must not abort the summary
              # (the bash shell runs with -eo pipefail).
              git log --format="$log_format" -3 "$head_sha" 2>/dev/null | while IFS='|' read -r hash date author msg; do
                # &lt; / &gt; keep the author visible; a literal <name> is
                # parsed as an HTML tag in the Markdown summary.
                echo "- [$hash]($repo_url/commit/$hash): $msg ($date) &lt;$author&gt;"
              done || true
              # Get commits not covered by the build (first-parent only)
              if git merge-base --is-ancestor "$head_sha" HEAD 2>/dev/null; then
                not_covered=$(git log --format="$log_format" --first-parent "$head_sha"..HEAD 2>/dev/null || true)
                if [[ -n "$not_covered" ]]; then
                  echo ""
                  echo "### Not Covered Commits"
                  while IFS='|' read -r hash date author msg; do
                    echo "- [$hash]($repo_url/commit/$hash): $msg ($date) &lt;$author&gt;"
                  done <<< "$not_covered"
                fi
              else
                echo ""
                echo "### Warning"
                echo "Build commit $head_sha is not reachable from current HEAD. Cannot determine uncovered commits."
              fi
            else
              echo "No successful build was found."
            fi
            # Terraform repo latest commits
            echo ""
            echo "### Terraform Repo Latest Commits on $TERRAFORM_REPO_REF"
            git -C "$GITHUB_WORKSPACE/terraform-repo" log --format="$log_format" -3 2>/dev/null | while IFS='|' read -r hash date author msg; do
              echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) &lt;$author&gt;"
            done || true
          } >> "$GITHUB_STEP_SUMMARY"

      # The error file may not exist when the job failed before anything was
      # written to it; skip instead of failing on a missing file.
      - name: Add errors output to job summary
        if: always()
        shell: bash
        run: |
          if [[ -f "$OUTPUT_ERR_FILE" ]]; then
            cat "$OUTPUT_ERR_FILE" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "No error output file found at $OUTPUT_ERR_FILE"
          fi

      - name: Upload terraform output
        if: always()
        uses: actions/upload-artifact@v5
        with:
          name: terraform-output
          path: "${{ github.workspace }}/output"