Skip to content

ci: OPS-3142: Add multi-gpu test job (#6189) #2

ci: OPS-3142: Add multi-gpu test job (#6189)

ci: OPS-3142: Add multi-gpu test job (#6189) #2

Workflow file for this run

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: PR

# Triggers: pushes to main or to branches named pull-request/<number>,
# plus manual runs via workflow_dispatch.
on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      # Note: release/* branches are handled by release.yml workflow
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is not read by any job visible in this file;
      # confirm whether it is still needed.
      run_deploy_operator:
        description: 'Run deploy operator and deployment tests'
        required: false
        type: boolean
        default: false

concurrency:
  # On 'main' the group is keyed by run_id, so every push to main gets its
  # own group. On any other ref the group is keyed by ref_name, so runs for
  # the same PR/branch share a group and can cancel one another.
  group: docker-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
  # In-progress runs are cancelled everywhere except on refs/heads/main.
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # Builder name derived from the run id and attempt number of this run.
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# ============================================================================
# SETUP & DETECTION JOBS
# ============================================================================
# Detects which areas of the repository changed and republishes the
# workflow-level builder name as a job output for downstream jobs.
changed-files:
  runs-on: ubuntu-latest
  # Manual dispatches run under the 'protected-deploy' environment;
  # all other events resolve to an empty environment name.
  environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
  outputs:
    # Change flags forwarded from the local changed-files action.
    core: ${{ steps.changes.outputs.core }}
    operator: ${{ steps.changes.outputs.operator }}
    deploy: ${{ steps.changes.outputs.deploy }}
    vllm: ${{ steps.changes.outputs.vllm }}
    sglang: ${{ steps.changes.outputs.sglang }}
    trtllm: ${{ steps.changes.outputs.trtllm }}
    builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      with:
        # Full history (no shallow clone).
        fetch-depth: 0

    - name: Check for changes
      id: changes
      uses: ./.github/actions/changed-files
      with:
        gh_token: ${{ github.token }}

    - name: Export builder name
      id: export-builder-name
      run: |
        echo "builder_name=${{ env.BUILDER_NAME }}" >> $GITHUB_OUTPUT
# Aggregate gate: passes only when every job listed in `needs` finished as
# "success" or "skipped". Runs even when upstream jobs fail (if: always()).
backend-status-check:
  runs-on: ubuntu-latest
  needs: [changed-files, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator]  # THIS list determines blocking jobs
  if: always()
  steps:
    - name: "Check all dependent jobs"
      env:
        # Pass the needs context through the environment instead of splicing
        # it into the script text: a single quote inside any job output would
        # otherwise terminate the shell string and break (or inject into) the
        # script.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # jq -e exits non-zero when the filter yields false, failing the step.
        jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))' <<< "${NEEDS_JSON}"
# ============================================================================
# Operator
# ============================================================================
# Lints, tests, builds and pushes the operator container image.
# Only runs when the changed-files job reports operator changes.
operator:
  name: Operator
  needs: changed-files
  if: needs.changed-files.outputs.operator == 'true'
  runs-on: prod-default-v2
  env:
    IMAGE_REGISTRY: ai-dynamo
    IMAGE_REPOSITORY: dynamo
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  outputs:
    # Tag of the image pushed by this job ("<sha>-operator").
    operator_default_tag: ${{ steps.build-and-push-image.outputs.operator_default_tag }}
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    - name: Initialize Dynamo Builder
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: ${{ needs.changed-files.outputs.builder_name }}
        flavor: general
        all_arch: 'true'

    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    # Linter and tester are Dockerfile targets; both are built for arm64 only.
    - name: Linter
      shell: bash
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .

    - name: Tester
      shell: bash
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .

    - name: Set up Go
      uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
      with:
        go-version: '1.25'

    - name: Check for uncommitted changes
      shell: bash
      working-directory: ./deploy/operator
      run: |
        make check

    - name: Build and push Container
      id: build-and-push-image
      shell: bash
      working-directory: ./deploy/operator
      env:
        NO_CACHE_FLAG: '' # placeholder for future logic to add no cache flag if needed
      run: |
        ECR_DEFAULT_IMAGE_BASE="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        DEFAULT_TAG="${{ github.sha }}-operator"
        ACR_IMAGE_BASE="${{ secrets.AZURE_ACR_HOSTNAME }}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"

        # Every run pushes the sha-based tag to both registries.
        IMAGE_URIS=(
          "${ECR_DEFAULT_IMAGE_BASE}:${DEFAULT_TAG}"
          "${ACR_IMAGE_BASE}:${DEFAULT_TAG}"
        )
        # Runs on main additionally move the floating main-operator tag.
        if [[ "${{ github.ref_name }}" == "main" ]]; then
          IMAGE_URIS+=(
            "${ECR_DEFAULT_IMAGE_BASE}:main-operator"
            "${ACR_IMAGE_BASE}:main-operator"
          )
        fi

        echo "operator_default_tag=${DEFAULT_TAG}" >> $GITHUB_OUTPUT

        # Expand the URI list into repeated "-t <uri>" arguments.
        TAGGING_FLAGS=$(printf -- '-t %s ' "${IMAGE_URIS[@]}")
        echo "flags for docker buildx: ${TAGGING_FLAGS}"

        if [[ "$NO_CACHE_FLAG" == "true" ]]; then
          NO_CACHE_FLAG="--no-cache"
        fi

        docker buildx build --push ${NO_CACHE_FLAG} \
          --platform linux/amd64,linux/arm64 \
          --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
          ${TAGGING_FLAGS} -f Dockerfile .

        # Publish the pushed image URIs as a table in the job summary.
        echo "### 🐳 Operator Container Images" >> $GITHUB_STEP_SUMMARY
        echo "" >> $GITHUB_STEP_SUMMARY
        echo "| Image URI |" >> $GITHUB_STEP_SUMMARY
        echo "|-----|" >> $GITHUB_STEP_SUMMARY
        for image_uri in "${IMAGE_URIS[@]}"; do
          echo "| \`${image_uri}\` |" >> $GITHUB_STEP_SUMMARY
        done
# ============================================================================
# FRAMEWORK PIPELINES (Build → Test → Copy)
# ============================================================================
# ============================================================================
# VLLM PIPELINE
# ============================================================================
# Calls the shared flavor-matrix workflow for the vllm framework.
# Runs when core or vllm files changed.
vllm-pipeline:
  needs: [changed-files]
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  with:
    framework: vllm
    target: runtime
    # JSON-encoded lists, passed to the called workflow as strings.
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    # Extra tags are only non-empty for runs on main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
      ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
    builder_name: ${{ needs.changed-files.outputs.builder_name }}
    run_multi_gpu_tests: true
    test_gpu_timeout_minutes: 35
    # 120 minutes on main, 60 minutes elsewhere.
    build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
  secrets: inherit
# ============================================================================
# SGLANG PIPELINE
# ============================================================================
# Calls the shared flavor-matrix workflow for the sglang framework.
# Runs when core or sglang files changed.
sglang-pipeline:
  needs: [changed-files]
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  with:
    framework: sglang
    target: runtime
    # JSON-encoded lists, passed to the called workflow as strings.
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    # Extra tags are only non-empty for runs on main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
      ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
    builder_name: ${{ needs.changed-files.outputs.builder_name }}
    run_multi_gpu_tests: true
    # 120 minutes on main, 60 minutes elsewhere.
    build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
  secrets: inherit
# ============================================================================
# TRTLLM PIPELINE
# ============================================================================
# Calls the shared flavor-matrix workflow for the trtllm framework.
# Runs when core or trtllm files changed.
trtllm-pipeline:
  needs: [changed-files]
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true'
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  with:
    framework: trtllm
    target: runtime
    # JSON-encoded lists, passed to the called workflow as strings.
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    # Extra tags are only non-empty for runs on main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
      ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
    builder_name: ${{ needs.changed-files.outputs.builder_name }}
    run_multi_gpu_tests: true
    # 120 minutes on main, 60 minutes elsewhere.
    build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
  secrets: inherit
# ============================================================================
# DEPLOYMENT JOBS
# Deploy operator and run end-to-end tests on Kubernetes cluster
# ============================================================================
# Creates an ephemeral Kubernetes namespace for this run and installs the
# dynamo-platform Helm chart into it. The namespace name is exported as the
# NAMESPACE job output for the deploy-test and cleanup jobs.
deploy-operator:
  runs-on: prod-default-small-v2
  # Run when any deploy test will run: if core, any framework, or deploy files changed
  # always() is required so the job is still evaluated when `operator` was
  # skipped; the last clause then accepts only a successful or skipped operator.
  if: |
    always() &&
    (needs.changed-files.outputs.core == 'true' ||
    needs.changed-files.outputs.vllm == 'true' ||
    needs.changed-files.outputs.sglang == 'true' ||
    needs.changed-files.outputs.trtllm == 'true' ||
    needs.changed-files.outputs.deploy == 'true') &&
    (needs.operator.result == 'success' || needs.operator.result == 'skipped')
  needs: [changed-files, operator]
  outputs:
    NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
  steps:
    # Pinned to the same commit SHA as every other checkout in this workflow.
    - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    # Use the image built by this run when the operator job ran; otherwise
    # fall back to the floating main-operator tag.
    - name: Determine operator image tag
      id: operator-tag
      run: |
        if [ "${{ needs.operator.result }}" == "success" ]; then
          echo "tag=${{ needs.operator.outputs.operator_default_tag }}" >> $GITHUB_OUTPUT
          echo "Using newly built operator image: ${{ needs.operator.outputs.operator_default_tag }}"
        else
          echo "tag=main-operator" >> $GITHUB_OUTPUT
          echo "Using stable operator image: main-operator"
        fi

    - name: Deploy Operator
      id: deploy-operator-step
      env:
        BRANCH: ${{ github.ref_name }}
      run: |
        set -x
        # Set namespace
        # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
        BRANCH_SANITIZED="${BRANCH//\//-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
        # Cap at 10 chars
        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
        NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
        # Exported before any cluster call so later steps and jobs can still
        # find the namespace if this step fails part-way through.
        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"

        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        kubectl config current-context

        # Create a namespace for this job
        echo "Creating an ephemeral namespace..."
        kubectl create namespace $NAMESPACE
        echo "Attaching the labels for secrets and cleanup"
        kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
        # Set the namespace as default
        kubectl config set-context --current --namespace=$NAMESPACE

        # Check if Istio is installed
        kubectl get pods -n istio-system
        # Check if default storage class exists
        kubectl get storageclass

        # Install Helm chart
        export VIRTUAL_ENV=/opt/dynamo/venv
        export KUBE_NS=$NAMESPACE
        export ISTIO_ENABLED=true
        export ISTIO_GATEWAY=istio-system/ingress-alb
        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true

        # Install dynamo env secrets
        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true

        # Install helm dependencies
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/helm/charts/platform/
        helm dep build .

        # Install platform with namespace restriction for single profile testing
        helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
          --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
          --set dynamo-operator.controllerManager.manager.image.tag=${{ steps.operator-tag.outputs.tag }} \
          --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret

        # Wait for all deployments to be ready
        timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch

    - name: 🔍 Report Unhealthy Pods
      if: failure()
      env:
        # Shell variables do not survive across steps, so the namespace chosen
        # by the deploy step has to be re-imported from its step output.
        NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
      run: |
        # If the failure happened before the namespace was chosen there is
        # nothing to inspect.
        if [[ -z "${NAMESPACE}" ]]; then
          echo "No namespace was recorded by the deploy step; skipping pod report."
          exit 0
        fi

        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        kubectl config current-context

        # Descriptive header for the summary
        echo "### ⚠️ OPERATOR DEPLOYMENT FAILED: Unhealthy Pods Report" >> $GITHUB_STEP_SUMMARY
        echo "Unhealthy pods:" >> $GITHUB_STEP_SUMMARY
        echo "" >> $GITHUB_STEP_SUMMARY

        # Get pods, exclude healthy ones, and format output
        # If the namespace is empty or all pods are healthy, the grep/awk won't output anything, which is handled gracefully.
        kubectl get pods -n "${NAMESPACE}" --no-headers \
          | grep -v -E '(Running|Completed)' \
          | awk '{print "- 🔴 **" $1 "** | Status: `" $3 "`"}' >> $GITHUB_STEP_SUMMARY || true
# ============================================================================
# DEPLOYMENT TESTS
# End-to-end tests for each framework with various deployment profiles
# ============================================================================
# End-to-end deployment tests for vllm, one matrix entry per profile,
# run one at a time (max-parallel: 1) in the namespace created by
# deploy-operator.
deploy-test-vllm:
  # Run if core, vllm, or deploy is changed
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
  runs-on: prod-default-small-v2
  # changed-files must be listed here: the `needs` context only exposes direct
  # dependencies, so without it every needs.changed-files.outputs.* lookup in
  # the `if` above is empty and the job can never run.
  needs: [changed-files, deploy-operator, vllm-pipeline]
  permissions:
    contents: read
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  name: deploy-test-vllm (${{ matrix.profile }})
  env:
    FRAMEWORK: vllm
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    - name: Run Dynamo Deploy Test
      id: deploy-test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        deployment_file: "deploy/${{ matrix.profile }}.yaml"
        framework: ${{ env.FRAMEWORK }}
        profile: ${{ matrix.profile }}
        # Image tagged with this commit's sha for vllm / cuda12 / amd64.
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-amd64
        platform_arch: amd64
# End-to-end deployment tests for sglang, one matrix entry per profile,
# run one at a time (max-parallel: 1) in the namespace created by
# deploy-operator.
deploy-test-sglang:
  name: deploy-test-sglang (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs: [changed-files, deploy-operator, sglang-pipeline]
  # Run if core, sglang, or deploy is changed
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
  permissions:
    contents: read
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
  env:
    FRAMEWORK: sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    - name: Run Dynamo Deploy Test
      id: deploy-test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        deployment_file: "deploy/${{ matrix.profile }}.yaml"
        framework: ${{ env.FRAMEWORK }}
        profile: ${{ matrix.profile }}
        # Image tagged with this commit's sha for sglang / cuda12 / amd64.
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-amd64
        platform_arch: amd64
# End-to-end deployment tests for trtllm, one matrix entry per profile,
# run one at a time (max-parallel: 1) in the namespace created by
# deploy-operator.
deploy-test-trtllm:
  name: deploy-test-trtllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs: [changed-files, deploy-operator, trtllm-pipeline]
  # Run if core, trtllm, or deploy is changed
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
  permissions:
    contents: read
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  env:
    FRAMEWORK: trtllm
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    - name: Run Dynamo Deploy Test
      id: deploy-test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        deployment_file: "deploy/${{ matrix.profile }}.yaml"
        framework: ${{ env.FRAMEWORK }}
        profile: ${{ matrix.profile }}
        # Image tagged with this commit's sha for trtllm / cuda13 / amd64.
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-amd64
        platform_arch: amd64
# ============================================================================
# CLEANUP JOBS
# Clean up ephemeral Kubernetes namespace and resources
# ============================================================================
# Removes the buildx builder named for this run once the build pipelines and
# the operator job have finished, whatever their outcome (if: always()).
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator, changed-files]
  if: always()
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    # Failures in this step are tolerated (continue-on-error), so the removal
    # step below still runs.
    - name: Create K8s builders (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      continue-on-error: true
      with:
        builder_name: ${{ needs.changed-files.outputs.builder_name }}
        buildkit_worker_addresses: '' # k8s builder
        skip_bootstrap: true

    # Best-effort removal: "|| true" keeps the step green if the removal fails.
    - name: Builder Cleanup in case of k8s builder
      shell: bash
      run: |
        docker buildx rm ${{ needs.changed-files.outputs.builder_name }} || true
# Tears down the ephemeral namespace created by deploy-operator: deletes all
# DynamoGraphDeployments, uninstalls the dynamo-platform Helm release, then
# deletes the namespace. Runs regardless of upstream results (if: always()).
cleanup:
  name: Cleanup AKS resources
  runs-on: prod-default-small-v2
  if: always()
  needs: [deploy-operator, deploy-test-sglang, deploy-test-trtllm, deploy-test-vllm]
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    - name: Setup Kubeconfig
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        set -x
        # Because of always(), this job also runs when deploy-operator was
        # skipped or failed before choosing a namespace. In that case there is
        # nothing to point the kube context at.
        if [[ -z "${NAMESPACE}" ]]; then
          echo "deploy-operator produced no namespace; skipping kubeconfig setup."
          exit 0
        fi
        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        kubectl config current-context

    - name: Cleanup
      # NOTE(review): the step is capped at 5 minutes while helm uninstall
      # below is given --timeout 10m; confirm which limit is intended.
      timeout-minutes: 5
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        set -x
        # Never run delete/uninstall commands with an empty namespace.
        if [[ -z "${NAMESPACE}" ]]; then
          echo "deploy-operator produced no namespace; nothing to clean up."
          exit 0
        fi

        # Recreate the kubeconfig so this step does not depend on files left
        # behind by the previous step.
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"

        # For debugging purposes, list all the resources before we uninstall.
        # These listings are informational only: a failure here must not abort
        # the script before the namespace is deleted.
        kubectl get dynamographdeployments || true
        kubectl get all || true

        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n $NAMESPACE || true

        # Uninstall the helm chart
        helm ls || true
        helm uninstall dynamo-platform --namespace $NAMESPACE --timeout 10m || true

        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace $NAMESPACE || true
        echo "Namespace $NAMESPACE completed."