Skip to content

E2E test soperator #3369

E2E test soperator

E2E test soperator #3369

Workflow file for this run

# Manually dispatched end-to-end test: applies a Terraform installation of a
# Slurm cluster, dumps diagnostics, then destroys it again.
name: E2E test soperator
on:
  workflow_dispatch:
    inputs:
      # Repository (owner/name) that holds the Terraform code to deploy.
      terraform_repo:
        type: string
        required: true
        default: "nebius/nebius-solution-library"
        description: "terraform repository with slurm cluster deployment"
      # Branch, tag or SHA of that repository to check out.
      terraform_repo_ref:
        type: string
        required: true
        default: "main"
        description: "ref of terraform repo to apply"
      # Directory of the Terraform module, relative to the repository root.
      path_to_installation:
        type: string
        required: true
        default: "/soperator/installations/example"
        description: "path inside terraform repo to terraform module to apply"
# The token only gets read access to repository contents.
permissions:
  contents: read
concurrency:
  # One group per branch: GitHub cancels older WAITING runs of a group when a
  # newer run arrives, so separate groups keep a run on 'feature-branch' from
  # being cancelled by a new run on `main`.
  group: e2e-${{ github.ref_name }}
  # Never cancel a job that is already running (avoids resource waste).
  cancel-in-progress: false
jobs:
  e2e-test:
    # We have a single runner with the `e2e-tests` label => only 1 job at a time.
    runs-on: [self-hosted, e2e-tests]
    environment: e2e
    env:
      # Where to find the Terraform code. Each value falls back to the same
      # default as the matching workflow_dispatch input when no input is present.
      TERRAFORM_REPO: "${{ github.event.inputs.terraform_repo || 'nebius/nebius-solution-library' }}"
      TERRAFORM_REPO_REF: "${{ github.event.inputs.terraform_repo_ref || 'main' }}"
      PATH_TO_INSTALLATION: "${{ github.workspace }}/terraform-repo/${{ github.event.inputs.path_to_installation || '/soperator/installations/example' }}"
      # Target cloud settings, taken from repository/environment variables.
      NEBIUS_TENANT_ID: ${{ vars.E2E_TEST_NEBIUS_TENANT_ID }}
      NEBIUS_PROJECT_ID: ${{ vars.E2E_TEST_NEBIUS_PROJECT_ID }}
      NEBIUS_REGION: ${{ vars.E2E_TEST_NEBIUS_REGION }}
      WORKER_PLATFORM: ${{ vars.E2E_TEST_WORKER_PLATFORM }}
      WORKER_PRESET: ${{ vars.E2E_TEST_WORKER_PRESET }}
      INFINIBAND_FABRIC: ${{ vars.E2E_TEST_INFINIBAND_FABRIC }}
      SSH_KEYS: ${{ vars.E2E_TEST_SSH_KEYS }}
      O11Y_ACCESS_TOKEN: ${{ secrets.E2E_O11Y_ACCESS_TOKEN }}
      # Files under <workspace>/output; that directory is uploaded as the
      # `terraform-output` artifact and the .err file is appended to the summary.
      OUTPUT_LOG_FILE: "${{ github.workspace }}/output/output.log"
      OUTPUT_ERR_FILE: "${{ github.workspace }}/output/output.err"
    steps:
steps:
# Check out this repository with its complete history; the job-summary step
# later runs `git log` / `git merge-base` against older commits.
- name: Checkout repository
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
  with:
    fetch-depth: 0
# Install the Go toolchain version declared in go.mod, with the action's
# built-in caching switched off.
- name: Install GO
  uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
  with:
    cache: false
    go-version-file: go.mod
# Build/install the e2e helper tools and expose <workspace>/bin on PATH for
# all following steps (presumably that is where the make target puts them).
- name: Install E2E tools
  run: |
    make install-e2e-tools
    echo "${{ github.workspace }}/bin" >> $GITHUB_PATH
# Install AWS CLI v2 unless the runner already has an `aws` binary on PATH.
- name: Install AWS CLI v2
  run: |
    if command -v aws >/dev/null 2>&1; then
      echo "AWS CLI already installed: $(aws --version)"
      exit 0
    fi
    echo "AWS CLI not found, installing..."
    # Use a throwaway directory so neither the archive nor the unpacked
    # installer is left behind in the checked-out workspace.
    tmp_dir=$(mktemp -d)
    trap 'rm -rf "$tmp_dir"' EXIT
    # --fail makes an HTTP error abort the step here instead of saving the
    # error page as "awscliv2.zip" and failing later inside unzip.
    curl --fail --silent --show-error --location \
      "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" \
      -o "$tmp_dir/awscliv2.zip"
    unzip -q "$tmp_dir/awscliv2.zip" -d "$tmp_dir"
    sudo "$tmp_dir/aws/install" --update
# Log the location and version of every CLI the job depends on. The step fails
# on the first tool that is missing, because the shell runs with errexit.
- name: Check tools versions
  shell: bash
  run: |
    # Prints "<tool>:", the resolved path of the binary, then its version.
    # $1 is the binary name; remaining arguments are its version flag/subcommand.
    show_tool() {
      local tool="$1"
      shift
      echo "${tool}:"
      which "$tool"
      "$tool" "$@"
    }
    show_tool yq --version
    echo ""
    show_tool jq --version
    echo ""
    show_tool nebius version
    echo ""
    show_tool terraform --version
    echo ""
    show_tool kubectl version --client
    echo ""
    show_tool aws --version
    echo ""
    show_tool go version
# Look up the newest successful run of the `one_job.yml` workflow on the
# current branch and expose it as step outputs:
#   run_id, head_sha, created_at, html_url
# Fails the step when no such run exists.
- name: Find latest successful build run on current branch
  id: find_build
  shell: bash
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Handed over as environment variables rather than expanded into the script
    # text, so an unusual branch name can never be parsed as shell syntax.
    REF_NAME: ${{ github.ref_name }}
    REPOSITORY: ${{ github.repository }}
  run: |
    echo "Looking for successful build run on branch: $REF_NAME"
    # -f sends the branch name as a raw string; -F would apply gh's typed
    # conversion (numbers, booleans, @file) to it.
    run_info=$(gh api -X GET \
      "/repos/$REPOSITORY/actions/workflows/one_job.yml/runs" \
      -f branch="$REF_NAME" \
      -F status=success \
      -F per_page=1 \
      --jq '.workflow_runs[0] | {id: .id, head_sha: .head_sha, created_at: .created_at, html_url: .html_url}')
    run_id=$(jq -r '.id' <<<"$run_info")
    # With no matching run, `.workflow_runs[0]` is null and jq prints "null".
    if [[ "$run_id" == "null" || -z "$run_id" ]]; then
      echo "::error::No successful build found on branch $REF_NAME"
      exit 1
    fi
    head_sha=$(jq -r '.head_sha' <<<"$run_info")
    created_at=$(jq -r '.created_at' <<<"$run_info")
    html_url=$(jq -r '.html_url' <<<"$run_info")
    echo "Found build: $run_id"
    {
      echo "run_id=$run_id"
      echo "head_sha=$head_sha"
      echo "created_at=$created_at"
      echo "html_url=$html_url"
    } >> "$GITHUB_OUTPUT"
# Fetch the `version` artifact of the build run found by the `find_build` step
# into the current directory (the next step reads version.txt from it).
- name: Download artifact with version
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: gh run download ${{ steps.find_build.outputs.run_id }} -n version
# Export line 1 and line 2 of version.txt to later steps as
# SOPERATOR_VERSION and SOPERATOR_UNSTABLE.
- name: Read version
  run: |
    version_line=$(sed -n '1p' version.txt)
    unstable_line=$(sed -n '2p' version.txt)
    {
      echo "SOPERATOR_VERSION=$version_line"
      echo "SOPERATOR_UNSTABLE=$unstable_line"
    } >> $GITHUB_ENV
# Check out the Terraform repository/ref selected by the job-level env vars
# into <workspace>/terraform-repo, with full history.
- name: Checkout Terraform repository
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
  with:
    path: "${{ github.workspace }}/terraform-repo"
    repository: ${{ env.TERRAFORM_REPO }}
    ref: ${{ env.TERRAFORM_REPO_REF }}
    fetch-depth: 0
# Provision the cluster: revoke old Nebius sessions, load the installation's
# .envrc, configure the AWS CLI for the Nebius storage endpoint, then run the
# TestTerraformApply e2e test (2h timeout).
- name: Terraform Apply
  run: |
    # PATH_TO_INSTALLATION is a job-level env var built from a workflow_dispatch
    # input. Read it as a quoted shell variable so spaces or shell
    # metacharacters in the input cannot alter the command.
    cd "$PATH_TO_INSTALLATION"
    nebius iam session-management revoke --all-my-active
    # revoke is async operation and in different region token from source .envrc could be revoked right after it's created
    sleep 5
    source .envrc
    cd -
    # Configure AWS CLI. The credentials are presumably exported by .envrc;
    # `:?` aborts with a clear message if one is unset or empty.
    aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID:?not set after sourcing .envrc}"
    aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY:?not set after sourcing .envrc}"
    aws configure set region "${NEBIUS_REGION:?not set}"
    aws configure set endpoint_url "https://storage.$NEBIUS_REGION.nebius.cloud:443"
    go test -v -timeout 2h --tags=e2e -run TestTerraformApply ./test/e2e/...
# Diagnostics (runs even after a failure): list all managed K8s clusters of
# the project as JSON, then the node groups of each cluster.
- name: K8s Cluster Info and NodeGroups
  shell: bash
  if: ${{ always() }}
  run: |
    echo "=== Listing K8s clusters ==="
    clusters_json=$(nebius mk8s cluster list --parent-id "$NEBIUS_PROJECT_ID" --format json)
    echo "$clusters_json"
    echo ""
    echo "=== Listing node groups for each cluster ==="
    # `// empty` drops clusters without an id instead of printing "null".
    ids=$(jq -r '.items[].metadata.id // empty' <<<"$clusters_json")
    for id in $ids; do
      echo ""
      echo "--- Node groups for cluster: $id ---"
      nebius mk8s node-group list --parent-id "$id" --page-size 1000
    done
# Diagnostics (runs even after a failure): all pods in all namespaces.
- name: "K8s Cluster: Pods"
  shell: bash
  if: ${{ always() }}
  run: |
    kubectl get pods -A -o wide
# Diagnostics (runs even after a failure): events of all namespaces, ordered
# by their last timestamp.
- name: "K8s Cluster: Events"
  shell: bash
  if: ${{ always() }}
  run: |
    kubectl get events -A --sort-by='.lastTimestamp'
# Diagnostics (runs even after a failure): node table, then full node YAML.
- name: "K8s Cluster: Nodes"
  shell: bash
  if: ${{ always() }}
  run: |
    query=(kubectl get nodes)
    "${query[@]}"
    echo ""
    "${query[@]}" -o yaml
# Diagnostics (runs even after a failure): Jobs of the `soperator` namespace
# as a table, then as full YAML.
- name: "K8s Cluster: Jobs"
  shell: bash
  if: ${{ always() }}
  run: |
    query=(kubectl -n soperator get job)
    "${query[@]}"
    echo ""
    "${query[@]}" -o yaml
# Diagnostics (runs even after a failure): HelmRelease objects of the
# `flux-system` namespace as a table, then as full YAML.
- name: "K8s Cluster: Helm Releases"
  shell: bash
  if: ${{ always() }}
  run: |
    query=(kubectl get helmreleases -n flux-system)
    "${query[@]}"
    echo ""
    "${query[@]}" -o yaml
# Diagnostics (runs even after a failure): SlurmCluster custom resources of
# all namespaces as a table, then as full YAML.
- name: "K8s Cluster: Slurm Cluster CRs"
  shell: bash
  if: ${{ always() }}
  run: |
    query=(kubectl get slurmclusters -A)
    "${query[@]}"
    echo ""
    "${query[@]}" -o yaml
# Diagnostics (runs even after a failure): ActiveCheck custom resources of
# all namespaces as a table, then as full YAML.
- name: "K8s Cluster: Slurm Active Checks CRs"
  shell: bash
  if: ${{ always() }}
  run: |
    query=(kubectl get activechecks -A)
    "${query[@]}"
    echo ""
    "${query[@]}" -o yaml
# Diagnostics (runs even after a failure): Slurm node list, job queue and the
# accounting records of the last six hours, queried inside the controller pod.
- name: Slurm Cluster State
  shell: bash
  if: ${{ always() }}
  run: |
    # Runs the given command in pod `controller-0` of namespace `soperator`.
    in_controller() {
      kubectl exec -n soperator controller-0 -- "$@"
    }
    in_controller sinfo -N
    in_controller squeue
    # sacct emits "|"-separated fields; `column` aligns them into a table.
    in_controller sacct --parsable2 --allusers --starttime=now-6hours | column -t -s'|'
# Diagnostics (runs even after a failure): write a `kubectl cluster-info dump`
# of the listed namespaces into ./cluster-info for the following upload step.
- name: Collect Full Kubernetes Cluster Info
  shell: bash
  if: ${{ always() }}
  run: |
    dump_dir=./cluster-info
    mkdir -p "$dump_dir"
    kubectl cluster-info dump --namespaces=kruise-system,soperator-system,soperator,flux-system --output-directory="$dump_dir"
# Publish ./cluster-info as the `cluster-info` artifact, kept for 7 days.
# NOTE(review): the checkout/setup-go actions in this job are pinned to a
# commit SHA while this one uses a moving tag - consider pinning it as well.
- name: Upload Full Kubernetes Cluster Info
  uses: actions/upload-artifact@v5
  if: ${{ always() }}
  with:
    path: ./cluster-info
    name: cluster-info
    retention-days: 7
# Diagnostics (runs even after a failure): copy /mnt/jail/opt/soperator-outputs
# out of the controller pod into ./soperator-outputs for the upload step.
- name: Collect Soperator Outputs
  shell: bash
  if: ${{ always() }}
  run: |
    local_dir=./soperator-outputs
    remote_dir=soperator/controller-0:/mnt/jail/opt/soperator-outputs
    mkdir -p "$local_dir"
    kubectl cp "$remote_dir" "$local_dir"
# Publish ./soperator-outputs as the `soperator-outputs` artifact, kept for
# 7 days.
# NOTE(review): the checkout/setup-go actions in this job are pinned to a
# commit SHA while this one uses a moving tag - consider pinning it as well.
- name: Upload Soperator Outputs
  uses: actions/upload-artifact@v5
  if: ${{ always() }}
  with:
    path: ./soperator-outputs
    name: soperator-outputs
    retention-days: 7
# Tear the cluster down again, no matter how the previous steps ended: reload
# the installation's .envrc, reconfigure the AWS CLI and run the
# TestTerraformDestroy e2e test (30m timeout).
- name: Terraform Destroy
  if: always()
  run: |
    # PATH_TO_INSTALLATION is a job-level env var built from a workflow_dispatch
    # input. Read it as a quoted shell variable so spaces or shell
    # metacharacters in the input cannot alter the command.
    cd "$PATH_TO_INSTALLATION"
    source .envrc
    cd -
    # Configure AWS CLI (in case env was lost). The credentials are presumably
    # exported by .envrc; `:?` aborts with a clear message if one is missing.
    aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID:?not set after sourcing .envrc}"
    aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY:?not set after sourcing .envrc}"
    aws configure set region "${NEBIUS_REGION:?not set}"
    aws configure set endpoint_url "https://storage.$NEBIUS_REGION.nebius.cloud:443"
    go test -v -timeout 30m --tags=e2e -run TestTerraformDestroy ./test/e2e/...
# Last-resort cleanup, only when an earlier step failed: print the node groups
# of every cluster, then request asynchronous deletion of the compute instance
# behind every node still listed by kubectl. Every call is best-effort, so the
# step itself does not fail on CLI errors.
- name: Force cleanup compute instances on failure
  shell: bash
  if: ${{ failure() }}
  run: |
    echo "=== Node groups state before cleanup ==="
    clusters_json=$(nebius mk8s cluster list --parent-id "$NEBIUS_PROJECT_ID" --format json 2>/dev/null || true)
    cluster_ids=$(echo "$clusters_json" | jq -r '.items[].metadata.id // empty' 2>/dev/null || true)
    for cid in $cluster_ids; do
      echo ""
      echo "--- Node groups for cluster: $cid ---"
      nebius mk8s node-group list --parent-id "$cid" --page-size 1000 --format yaml || true
    done
    echo ""
    echo "=== Forcing cleanup of remaining compute instances ==="
    # Get nodes from the cluster (ignore errors if cluster is already gone)
    node_names=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
    if [ -z "$node_names" ]; then
      echo "No nodes found or cluster not accessible"
      exit 0
    fi
    echo "Found nodes: $node_names"
    # The node name is passed as the compute instance id.
    # NOTE(review): the whole stdout of the delete call is stored and later
    # passed as --id to `operation get` - confirm it is just an operation id.
    started_ops=()
    for node_name in $node_names; do
      echo "Deleting compute instance: $node_name"
      if op_out=$(nebius compute instance delete --id="$node_name" --async 2>/dev/null); then
        started_ops+=("$op_out")
      else
        echo "Failed to delete instance $node_name (may already be deleted)"
      fi
    done
    echo ""
    echo "=== Operation statuses ==="
    for op_out in "${started_ops[@]}"; do
      echo "--- $op_out ---"
      nebius compute instance operation get --id="$op_out" || true
    done
    echo "=== Cleanup complete ==="
# Append a markdown report to the job summary: the build run that was tested,
# its latest commits, the commits on HEAD that the build does not cover, and
# the latest commits of the Terraform repository.
- name: Add build info to job summary
  if: always()
  shell: bash
  env:
    # Handed over as environment variables rather than expanded into the script
    # text, so their content can never be parsed as shell syntax. All four
    # outputs are empty when the `find_build` step did not succeed.
    RUN_ID: ${{ steps.find_build.outputs.run_id }}
    HEAD_SHA: ${{ steps.find_build.outputs.head_sha }}
    CREATED_AT: ${{ steps.find_build.outputs.created_at }}
    HTML_URL: ${{ steps.find_build.outputs.html_url }}
    REF_NAME: ${{ github.ref_name }}
    REPO_URL: "https://github.com/${{ github.repository }}"
  run: |
    terraform_repo_url="https://github.com/$TERRAFORM_REPO"
    # Fields are joined with the ASCII unit separator (0x1f) instead of "|",
    # so a "|" inside a commit subject cannot shift the columns.
    log_format='%h%x1f%s%x1f%ar%x1f%an'

    # Reads `git log --format=$log_format` lines on stdin and prints one
    # markdown bullet per commit. $1 is the base URL of the repository.
    commits_to_markdown() {
      local base_url="$1" hash msg date author
      while IFS=$'\x1f' read -r hash msg date author; do
        echo "- [$hash]($base_url/commit/$hash): $msg ($date) <$author>"
      done
    }

    {
      echo "### Image Build Information"
      if [[ -n "$RUN_ID" && "$RUN_ID" != "null" ]]; then
        echo "[Run $RUN_ID]($HTML_URL) at $CREATED_AT"
        echo "### Latest Commits on $REF_NAME"
        git log --format="$log_format" -3 "$HEAD_SHA" 2>/dev/null | commits_to_markdown "$REPO_URL"
        # Get commits not covered by the build (first-parent only)
        if git merge-base --is-ancestor "$HEAD_SHA" HEAD 2>/dev/null; then
          not_covered=$(git log --format="$log_format" --first-parent "$HEAD_SHA"..HEAD 2>/dev/null || true)
          if [[ -n "$not_covered" ]]; then
            echo ""
            echo "### Not Covered Commits"
            commits_to_markdown "$REPO_URL" <<< "$not_covered"
          fi
        else
          echo ""
          echo "### Warning"
          echo "Build commit $HEAD_SHA is not reachable from current HEAD. Cannot determine uncovered commits."
        fi
      else
        echo "No successful build was found."
      fi
      # Terraform repo latest commits
      echo ""
      echo "### Terraform Repo Latest Commits on $TERRAFORM_REPO_REF"
      # This step always runs, so the Terraform checkout may be missing when an
      # earlier step failed; report that instead of failing the summary.
      git -C "${{ github.workspace }}/terraform-repo" log --format="$log_format" -3 2>/dev/null \
        | commits_to_markdown "$terraform_repo_url" \
        || echo "Terraform repository checkout is not available."
    } >> "$GITHUB_STEP_SUMMARY"
# Append the captured error output ($OUTPUT_ERR_FILE, a job-level env var) to
# the job summary.
- name: Add errors output to job summary
  if: ${{ always() }}
  run: |
    # This step always runs, so the file may not exist when the job failed
    # before anything wrote it; a bare `cat` would then fail this step too.
    if [[ -s "$OUTPUT_ERR_FILE" ]]; then
      cat "$OUTPUT_ERR_FILE" >> "$GITHUB_STEP_SUMMARY"
    else
      echo "No error output found at $OUTPUT_ERR_FILE"
    fi
# Publish <workspace>/output (the directory holding OUTPUT_LOG_FILE and
# OUTPUT_ERR_FILE) as the `terraform-output` artifact, even after a failure.
# NOTE(review): the checkout/setup-go actions in this job are pinned to a
# commit SHA while this one uses a moving tag - consider pinning it as well.
- name: Upload terraform output
  uses: actions/upload-artifact@v5
  if: ${{ always() }}
  with:
    path: "${{ github.workspace }}/output"
    name: terraform-output