# Skip to content
#
# fix
#
# fix #11

name: GKE DeepSeek-V2-Lite Perf Smoke

# Triggers: pushes that touch the listed paths, or a manual dispatch whose
# inputs override the repository variables / literal defaults used by the job.
on:
  push:
    paths:
      # NOTE(review): confirm this path matches this workflow file's real name.
      - ".github/workflows/gke-connectivity-smoke.yml"
      - "test/srt/nightly-test/gke_deepseek_v2_lite_perf_entrypoint.py"
  workflow_dispatch:
    inputs:
      project_id:
        description: "GCP project ID. Defaults to vars.GCP_PROJECT_ID."
        required: false
        type: string
      cluster_name:
        description: "GKE cluster name. Defaults to vars.GKE_CLUSTER_NAME."
        required: false
        type: string
      cluster_location:
        description: "GKE cluster zone or region. Defaults to vars.GKE_CLUSTER_LOCATION."
        required: false
        type: string
      workload_identity_provider:
        description: "Workload Identity Provider. Defaults to vars.GCP_WORKLOAD_IDENTITY_PROVIDER."
        required: false
        type: string
      node_pool:
        description: "Existing GKE node pool to verify."
        required: true
        default: "v6e-4x4-flex-perf-tests"
        type: string
      namespace:
        description: "Namespace for Kubernetes RBAC checks."
        required: true
        default: "default"
        type: string
      bucket_name:
        description: "GCS bucket mounted at /models for model cache."
        required: false
        default: "inference-model-storage-sgl"
        type: string
      service_account_name:
        description: "Kubernetes service account for GCSFuse access."
        required: false
        default: "gcs-account"
        type: string
      repo_ref:
        description: "Git ref to test. Defaults to current workflow SHA."
        required: false
        type: string

# One concurrency group per ref; a newer run does not cancel a run that is
# already in progress.
concurrency:
  group: gke-deepseek-v2-lite-perf-smoke-${{ github.ref }}
  cancel-in-progress: false

# `id-token: write` lets the job request a GitHub OIDC token, which the
# Workload Identity Provider based auth steps in this workflow rely on.
permissions:
  contents: read
  id-token: write

jobs:
  # Applies a headless Service plus a 4-pod Indexed Job to an existing GKE TPU
  # node pool, streams the pod logs, checks the Job result, and always deletes
  # the created resources afterwards.
  deepseek-v2-lite-perf:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    env:
      # Dispatch inputs take precedence; repository variables or the literal
      # defaults are the fallback (inputs are empty on push-triggered runs).
      PROJECT_ID: ${{ inputs.project_id || vars.GCP_PROJECT_ID }}
      CLUSTER_NAME: ${{ inputs.cluster_name || vars.GKE_CLUSTER_NAME }}
      CLUSTER_LOCATION: ${{ inputs.cluster_location || vars.GKE_CLUSTER_LOCATION }}
      WORKLOAD_IDENTITY_PROVIDER: ${{ inputs.workload_identity_provider || vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
      # Optional: when empty, the auth step without `service_account` runs.
      SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
      NODE_POOL: ${{ inputs.node_pool || 'v6e-4x4-flex-perf-tests' }}
      NAMESPACE: ${{ inputs.namespace || 'default' }}
      BUCKET_NAME: ${{ inputs.bucket_name || 'inference-model-storage-sgl' }}
      K8S_SERVICE_ACCOUNT: ${{ inputs.service_account_name || 'gcs-account' }}
      REPO_REF: ${{ inputs.repo_ref || github.sha }}
      # The run id keeps the Job and Service names unique per workflow run.
      WORKLOAD_NAME: gke-deepseek-v2-lite-perf-${{ github.run_id }}
    steps:
      # Fails early, listing every missing setting, before any cloud call.
      - name: Validate smoke test configuration
        shell: bash
        run: |
          set -euo pipefail
          missing=0
          for name in PROJECT_ID CLUSTER_NAME CLUSTER_LOCATION WORKLOAD_IDENTITY_PROVIDER NODE_POOL NAMESPACE BUCKET_NAME K8S_SERVICE_ACCOUNT; do
            # Indirect expansion: check the value of the variable named by $name.
            if [[ -z "${!name}" ]]; then
              echo "::error::${name} is required. Set the workflow input or the matching repository variable."
              missing=1
            fi
          done
          if [[ "${missing}" -ne 0 ]]; then
            exit 1
          fi
          echo "project=${PROJECT_ID}"
          echo "cluster=${CLUSTER_NAME}"
          echo "location=${CLUSTER_LOCATION}"
          echo "node_pool=${NODE_POOL}"
          echo "namespace=${NAMESPACE}"
          echo "bucket=${BUCKET_NAME}"
          echo "repo_ref=${REPO_REF}"

      # Exactly one of the two auth steps runs, depending on SERVICE_ACCOUNT.
      - name: Authenticate to Google Cloud
        if: env.SERVICE_ACCOUNT == ''
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ env.PROJECT_ID }}
          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}

      - name: Authenticate to Google Cloud with service account
        if: env.SERVICE_ACCOUNT != ''
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ env.PROJECT_ID }}
          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ env.SERVICE_ACCOUNT }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v3
        with:
          version: ">= 490.0.0"
          project_id: ${{ env.PROJECT_ID }}

      # Confirms the node pool exists and is readable before touching kubectl.
      - name: Verify GKE node pool through GCP API
        shell: bash
        run: |
          set -euo pipefail
          gcloud container node-pools describe "${NODE_POOL}" \
            --project="${PROJECT_ID}" \
            --cluster="${CLUSTER_NAME}" \
            --location="${CLUSTER_LOCATION}" \
            --format="table(name,status,config.machineType,autoscaling.enabled,autoscaling.minNodeCount,autoscaling.maxNodeCount)"

      - name: Get GKE credentials
        uses: google-github-actions/get-gke-credentials@v3
        with:
          project_id: ${{ env.PROJECT_ID }}
          cluster_name: ${{ env.CLUSTER_NAME }}
          location: ${{ env.CLUSTER_LOCATION }}

      # `kubectl auth can-i` exits non-zero on "no", so a missing permission
      # fails this step under `set -e`.
      - name: Verify Kubernetes access
        shell: bash
        run: |
          set -euo pipefail
          kubectl cluster-info
          kubectl auth can-i get nodes
          kubectl auth can-i create jobs --namespace "${NAMESPACE}"
          kubectl auth can-i create services --namespace "${NAMESPACE}"
          kubectl get nodes -l "cloud.google.com/gke-nodepool=${NODE_POOL}" -o wide

      - name: Run DeepSeek-V2-Lite multi-host perf smoke
        shell: bash
        run: |
          set -euo pipefail
          NUM_HOSTS=4
          TPU_PROCESS_PORT=8471
          HEADLESS_SERVICE_NAME="${WORKLOAD_NAME}-headless"
          JOB_SELECTOR="job-name=${WORKLOAD_NAME}"
          POD_READY_TIMEOUT="45m"
          JOB_COMPLETE_TIMEOUT_SECONDS=1800

          # Build the comma-separated peer lists, one entry per Job index:
          # <job>-<index>.<headless service>[:<port>].
          TPU_PROCESS_ADDRESSES=""
          TPU_WORKER_HOSTNAMES=""
          for ((index = 0; index < NUM_HOSTS; index++)); do
            host="${WORKLOAD_NAME}-${index}.${HEADLESS_SERVICE_NAME}"
            TPU_PROCESS_ADDRESSES+="${TPU_PROCESS_ADDRESSES:+,}${host}:${TPU_PROCESS_PORT}"
            TPU_WORKER_HOSTNAMES+="${TPU_WORKER_HOSTNAMES:+,}${host}"
          done

          # The heredoc delimiter is intentionally unquoted: the runner shell
          # substitutes every variable reference (including REPO_REF inside the
          # container command) before the manifest is applied.
          cat > /tmp/tpu-smoke.yaml <<EOF
          apiVersion: v1
          kind: Service
          metadata:
            name: "${HEADLESS_SERVICE_NAME}"
            namespace: "${NAMESPACE}"
          spec:
            clusterIP: None
            selector:
              job-name: "${WORKLOAD_NAME}"
            ports:
              - name: http
                port: 30000
              - name: dist-init
                port: 10011
              - name: tpu-process
                port: ${TPU_PROCESS_PORT}
              - name: control
                port: 18080
          ---
          apiVersion: batch/v1
          kind: Job
          metadata:
            name: "${WORKLOAD_NAME}"
            namespace: "${NAMESPACE}"
          spec:
            completionMode: Indexed
            parallelism: ${NUM_HOSTS}
            completions: ${NUM_HOSTS}
            backoffLimit: 0
            activeDeadlineSeconds: 7200
            ttlSecondsAfterFinished: 600
            template:
              metadata:
                annotations:
                  gke-gcsfuse/volumes: "true"
              spec:
                subdomain: "${HEADLESS_SERVICE_NAME}"
                restartPolicy: Never
                serviceAccountName: "${K8S_SERVICE_ACCOUNT}"
                nodeSelector:
                  cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
                  cloud.google.com/gke-tpu-topology: 4x4
                  cloud.google.com/gke-nodepool: "${NODE_POOL}"
                tolerations:
                  - key: google.com/tpu
                    operator: Equal
                    value: present
                    effect: NoSchedule
                containers:
                  - name: deepseek-v2-lite-perf
                    image: us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.8.1-rev1
                    command:
                      - /bin/bash
                      - -lc
                      - |
                        set -euxo pipefail
                        git clone https://github.com/sgl-project/sglang-jax.git /tmp/sglang-jax
                        cd /tmp/sglang-jax
                        git fetch origin "${REPO_REF}" || true
                        git checkout "${REPO_REF}"
                        python3 -m pip install --upgrade pip
                        python3 -m pip install -e "python[all]"
                        python3 test/srt/nightly-test/gke_deepseek_v2_lite_perf_entrypoint.py
                    env:
                      - name: JOB_COMPLETION_INDEX
                        valueFrom:
                          fieldRef:
                            fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
                      - name: TPU_WORKER_ID
                        valueFrom:
                          fieldRef:
                            fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
                      - name: TPU_PROCESS_PORT
                        value: "${TPU_PROCESS_PORT}"
                      - name: TPU_PROCESS_ADDRESSES
                        value: "${TPU_PROCESS_ADDRESSES}"
                      - name: TPU_WORKER_HOSTNAMES
                        value: "${TPU_WORKER_HOSTNAMES}"
                      - name: WORKLOAD_NAME
                        value: "${WORKLOAD_NAME}"
                      - name: HEADLESS_SERVICE_NAME
                        value: "${HEADLESS_SERVICE_NAME}"
                      - name: NNODES
                        value: "${NUM_HOSTS}"
                      - name: CI_MOUNT_ROOT
                        value: "/models"
                      - name: SGLANG_JAX_MODEL_CACHE
                        value: "/models/model_scope"
                      - name: JAX_COMPILATION_CACHE_DIR
                        value: "/tmp/jax_compilation_cache"
                      - name: HF_HUB_DOWNLOAD_TIMEOUT
                        value: "600"
                    ports:
                      - containerPort: 30000
                        name: http
                      - containerPort: 10011
                        name: dist-init
                      - containerPort: ${TPU_PROCESS_PORT}
                        name: tpu-process
                      - containerPort: 18080
                        name: control
                    resources:
                      requests:
                        google.com/tpu: "4"
                      limits:
                        google.com/tpu: "4"
                    volumeMounts:
                      - mountPath: /models
                        name: model-storage
                        readOnly: true
                      - mountPath: /dev/shm
                        name: dev-shm
                      - mountPath: /tmp/jax_compilation_cache
                        name: jax-cache
                volumes:
                  - name: dev-shm
                    emptyDir:
                      medium: Memory
                      sizeLimit: 64Gi
                  - name: jax-cache
                    emptyDir: {}
                  - name: gke-gcsfuse-cache
                    emptyDir: {}
                  - name: model-storage
                    csi:
                      driver: gcsfuse.csi.storage.gke.io
                      readOnly: true
                      volumeAttributes:
                        bucketName: "${BUCKET_NAME}"
                        mountOptions: "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:409600,file-cache:cache-file-for-range-read:true,file-system:kernel-list-cache-ttl-secs:-1,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:128,read_ahead_kb=1024"
          EOF

          kubectl apply -f /tmp/tpu-smoke.yaml
          kubectl get job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" -o wide

          # `kubectl wait` with a selector fails immediately when nothing
          # matches yet, so give the Job controller time to create the pods.
          for _ in $(seq 1 60); do
            pod_count="$(kubectl get pods --namespace "${NAMESPACE}" --selector="${JOB_SELECTOR}" -o name 2>/dev/null | wc -l || true)"
            if [[ "${pod_count}" -ge "${NUM_HOSTS}" ]]; then
              break
            fi
            sleep 5
          done

          kubectl wait --for=condition=Ready pod \
            --selector="${JOB_SELECTOR}" \
            --namespace "${NAMESPACE}" \
            --timeout="${POD_READY_TIMEOUT}"

          # --tail=-1: with a selector kubectl otherwise starts from only the
          # last 10 lines per container. Streaming is best-effort; the Job
          # conditions checked below decide the step result.
          kubectl logs --selector="${JOB_SELECTOR}" \
            --namespace "${NAMESPACE}" \
            --all-containers=true \
            --prefix=true \
            --follow=true \
            --tail=-1 \
            --max-log-requests=8 \
            || echo "::warning::Log streaming ended with an error; relying on the Job status for the result."

          # Poll for either terminal condition so a failed Job is reported
          # right away instead of waiting out the whole timeout for Complete.
          deadline=$((SECONDS + JOB_COMPLETE_TIMEOUT_SECONDS))
          while true; do
            job_conditions="$(kubectl get job "${WORKLOAD_NAME}" \
              --namespace "${NAMESPACE}" \
              -o jsonpath='{range .status.conditions[?(@.status=="True")]}{.type}{"\n"}{end}')"
            if grep -qx "Complete" <<<"${job_conditions}"; then
              echo "Job ${WORKLOAD_NAME} completed successfully."
              break
            fi
            if grep -qx "Failed" <<<"${job_conditions}"; then
              echo "::error::Job ${WORKLOAD_NAME} reported the Failed condition."
              exit 1
            fi
            if ((SECONDS >= deadline)); then
              echo "::error::Timed out after ${JOB_COMPLETE_TIMEOUT_SECONDS}s waiting for Job ${WORKLOAD_NAME} to finish."
              exit 1
            fi
            sleep 10
          done

      # Best-effort diagnostics: every command tolerates failure so that all
      # of them are attempted.
      - name: Diagnose TPU workload failure
        if: failure()
        shell: bash
        run: |
          set -euo pipefail
          kubectl describe job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" || true
          kubectl get pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" -o wide || true
          kubectl describe pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" || true
          # --tail=-1 prints full logs; the selector default is 10 lines.
          kubectl logs --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" --all-containers=true --prefix=true --tail=-1 || true

      # Always attempt both deletions, then report the last failure (if any).
      - name: Cleanup TPU workload smoke test
        if: always()
        shell: bash
        run: |
          set -uo pipefail
          status=0
          kubectl delete job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" --ignore-not-found=true || status=$?
          kubectl delete service "${WORKLOAD_NAME}-headless" --namespace "${NAMESPACE}" --ignore-not-found=true || status=$?
          exit "${status}"