3636 default : " default"
3737 type : string
# Exposed to the job as RUN_WORKLOAD; push events force it to true.
run_workload:
  description: "Create and run a multi-host TPU Kubernetes Job."
  required: true
  default: false
  type: boolean
# Exposed as WORKLOAD_TYPE; selects which manifest the workload step writes
# to /tmp/tpu-smoke.yaml.
workload_type:
  description: "Workload to run when run_workload is true."
  required: true
  default: "connectivity"
  type: choice
  options:
    - connectivity
    - deepseek_v2_lite_perf
# Exposed as BUCKET_NAME; becomes the gcsfuse CSI volume's bucketName in the
# deepseek_v2_lite_perf Job.
bucket_name:
  description: "GCS bucket mounted at /models for model cache."
  required: true
  default: "inference-model-storage-sgl"
  type: string
# Exposed as K8S_SERVICE_ACCOUNT; used as the Job pod's serviceAccountName.
service_account_name:
  description: "Kubernetes service account for GCSFuse access."
  required: true
  default: "gcs-account"
  type: string
# Exposed as REPO_REF (falls back to github.sha when empty); this ref is
# fetched and checked out inside the deepseek_v2_lite_perf container.
repo_ref:
  description: "Git ref to test. Defaults to current workflow SHA."
  required: false
  type: string
4365
4466concurrency :
4567 group : gke-connectivity-smoke-${{ github.ref }}
@@ -52,7 +74,7 @@ permissions:
5274jobs :
5375 connectivity :
5476 runs-on : ubuntu-latest
55- timeout-minutes : 60
77+ timeout-minutes : 180
5678 env :
5779 PROJECT_ID : ${{ inputs.project_id || vars.GCP_PROJECT_ID }}
5880 CLUSTER_NAME : ${{ inputs.cluster_name || vars.GKE_CLUSTER_NAME }}
6284 NODE_POOL : ${{ inputs.node_pool || 'v6e-4x4-flex-perf-tests' }}
6385 NAMESPACE : ${{ inputs.namespace || 'default' }}
6486 RUN_WORKLOAD : ${{ inputs.run_workload || github.event_name == 'push' }}
87+ WORKLOAD_TYPE : ${{ inputs.workload_type || 'connectivity' }}
88+ BUCKET_NAME : ${{ inputs.bucket_name || 'inference-model-storage-sgl' }}
89+ K8S_SERVICE_ACCOUNT : ${{ inputs.service_account_name || 'gcs-account' }}
90+ REPO_REF : ${{ inputs.repo_ref || github.sha }}
6591 WORKLOAD_NAME : gke-tpu-smoke-${{ github.run_id }}
6692 steps :
6793 - name : Validate smoke test configuration
@@ -77,6 +103,15 @@ jobs:
77103 fi
78104 done
79105
# Workload-specific validation: the deepseek_v2_lite_perf Job needs a bucket
# and a Kubernetes service account, so flag either one that is empty. The
# shared `missing` flag is checked by the exit test that follows.
if [[ "${RUN_WORKLOAD}" == "true" ]]; then
  case "${WORKLOAD_TYPE}" in
    deepseek_v2_lite_perf)
      for name in BUCKET_NAME K8S_SERVICE_ACCOUNT; do
        # Indirect expansion: look up the variable whose name is in $name.
        [[ -n "${!name}" ]] && continue
        echo "::error::${name} is required for deepseek_v2_lite_perf."
        missing=1
      done
      ;;
  esac
fi
114+
80115 if [[ "${missing}" -ne 0 ]]; then
81116 exit 1
82117 fi
86121 echo "location=${CLUSTER_LOCATION}"
87122 echo "node_pool=${NODE_POOL}"
88123 echo "namespace=${NAMESPACE}"
124+ echo "run_workload=${RUN_WORKLOAD}"
125+ echo "workload_type=${WORKLOAD_TYPE}"
126+ echo "repo_ref=${REPO_REF}"
89127
90128 - name : Authenticate to Google Cloud
91129 if : env.SERVICE_ACCOUNT == ''
@@ -143,11 +181,15 @@ jobs:
143181 run : |
144182 set -euo pipefail
145183
146- cat > /tmp/tpu-smoke.yaml <<EOF
184+ HEADLESS_SERVICE_NAME="${WORKLOAD_NAME}-headless"
185+ JOB_COMPLETE_TIMEOUT="5m"
186+
187+ if [[ "${WORKLOAD_TYPE}" == "connectivity" ]]; then
188+ cat > /tmp/tpu-smoke.yaml <<EOF
147189 apiVersion: v1
148190 kind: Service
149191 metadata:
150- name: ${WORKLOAD_NAME}-headless
192+ name: ${HEADLESS_SERVICE_NAME}
151193 namespace: ${NAMESPACE}
152194 spec:
153195 clusterIP: None
@@ -168,7 +210,7 @@ jobs:
168210 ttlSecondsAfterFinished: 600
169211 template:
170212 spec:
171- subdomain: ${WORKLOAD_NAME}-headless
213+ subdomain: ${HEADLESS_SERVICE_NAME}
172214 restartPolicy: Never
173215 nodeSelector:
174216 cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
@@ -197,20 +239,162 @@ jobs:
197239 limits:
198240 google.com/tpu: "4"
199241 EOF
242+ elif [[ "${WORKLOAD_TYPE}" == "deepseek_v2_lite_perf" ]]; then
# The deepseek perf run gets a longer completion window than the default.
JOB_COMPLETE_TIMEOUT="30m"

# Build the comma-separated peer lists for the four indexed Job pods. Each
# host is addressed as <job-name>-<index>.<headless-service>; the process
# address list additionally carries the TPU process port 8471.
TPU_PROCESS_ADDRESSES=""
TPU_WORKER_HOSTNAMES=""
for worker_index in 0 1 2 3; do
  worker_host="${WORKLOAD_NAME}-${worker_index}.${HEADLESS_SERVICE_NAME}"
  # ${VAR:+,} emits the separator only once the list is non-empty.
  TPU_PROCESS_ADDRESSES+="${TPU_PROCESS_ADDRESSES:+,}${worker_host}:8471"
  TPU_WORKER_HOSTNAMES+="${TPU_WORKER_HOSTNAMES:+,}${worker_host}"
done
246+
247+ cat > /tmp/tpu-smoke.yaml <<EOF
248+ apiVersion: v1
249+ kind: Service
250+ metadata:
251+ name: ${HEADLESS_SERVICE_NAME}
252+ namespace: ${NAMESPACE}
253+ spec:
254+ clusterIP: None
255+ selector:
256+ job-name: ${WORKLOAD_NAME}
257+ ports:
258+ - name: http
259+ port: 30000
260+ - name: dist-init
261+ port: 10011
262+ - name: tpu-process
263+ port: 8471
264+ - name: control
265+ port: 18080
266+ ---
267+ apiVersion: batch/v1
268+ kind: Job
269+ metadata:
270+ name: ${WORKLOAD_NAME}
271+ namespace: ${NAMESPACE}
272+ spec:
273+ completionMode: Indexed
274+ parallelism: 4
275+ completions: 4
276+ backoffLimit: 0
277+ activeDeadlineSeconds: 7200
278+ ttlSecondsAfterFinished: 600
279+ template:
280+ metadata:
281+ annotations:
282+ gke-gcsfuse/volumes: "true"
283+ spec:
284+ subdomain: ${HEADLESS_SERVICE_NAME}
285+ restartPolicy: Never
286+ serviceAccountName: ${K8S_SERVICE_ACCOUNT}
287+ nodeSelector:
288+ cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
289+ cloud.google.com/gke-tpu-topology: 4x4
290+ cloud.google.com/gke-nodepool: ${NODE_POOL}
291+ tolerations:
292+ - key: google.com/tpu
293+ operator: Equal
294+ value: present
295+ effect: NoSchedule
296+ containers:
297+ - name: deepseek-v2-lite-perf
298+ image: us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.8.1-rev1
299+ command:
300+ - /bin/bash
301+ - -lc
302+ - |
303+ set -euxo pipefail
304+ git clone https://github.com/sgl-project/sglang-jax.git /tmp/sglang-jax
305+ cd /tmp/sglang-jax
306+ git fetch origin "${REPO_REF}" || true
307+ git checkout "${REPO_REF}"
308+ python3 -m pip install --upgrade pip
309+ python3 -m pip install -e "python[all]"
310+ python3 test/srt/nightly-test/gke_deepseek_v2_lite_perf_entrypoint.py
311+ env:
312+ - name: JOB_COMPLETION_INDEX
313+ valueFrom:
314+ fieldRef:
315+ fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
316+ - name: TPU_WORKER_ID
317+ valueFrom:
318+ fieldRef:
319+ fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
320+ - name: TPU_PROCESS_PORT
321+ value: "8471"
322+ - name: TPU_PROCESS_ADDRESSES
323+ value: "${TPU_PROCESS_ADDRESSES}"
324+ - name: TPU_WORKER_HOSTNAMES
325+ value: "${TPU_WORKER_HOSTNAMES}"
326+ - name: WORKLOAD_NAME
327+ value: "${WORKLOAD_NAME}"
328+ - name: HEADLESS_SERVICE_NAME
329+ value: "${HEADLESS_SERVICE_NAME}"
330+ - name: NNODES
331+ value: "4"
332+ - name: CI_MOUNT_ROOT
333+ value: "/models"
334+ - name: SGLANG_JAX_MODEL_CACHE
335+ value: "/models/model_scope"
336+ - name: JAX_COMPILATION_CACHE_DIR
337+ value: "/tmp/jax_compilation_cache"
338+ - name: HF_HUB_DOWNLOAD_TIMEOUT
339+ value: "600"
340+ ports:
341+ - containerPort: 30000
342+ name: http
343+ - containerPort: 10011
344+ name: dist-init
345+ - containerPort: 8471
346+ name: tpu-process
347+ - containerPort: 18080
348+ name: control
349+ resources:
350+ requests:
351+ google.com/tpu: "4"
352+ limits:
353+ google.com/tpu: "4"
354+ volumeMounts:
355+ - mountPath: /models
356+ name: model-storage
357+ readOnly: true
358+ - mountPath: /dev/shm
359+ name: dev-shm
360+ - mountPath: /tmp/jax_compilation_cache
361+ name: jax-cache
362+ volumes:
363+ - name: dev-shm
364+ emptyDir:
365+ medium: Memory
366+ sizeLimit: 64Gi
367+ - name: jax-cache
368+ emptyDir: {}
369+ - name: gke-gcsfuse-cache
370+ emptyDir: {}
371+ - name: model-storage
372+ csi:
373+ driver: gcsfuse.csi.storage.gke.io
374+ readOnly: true
375+ volumeAttributes:
376+ bucketName: ${BUCKET_NAME}
377+ mountOptions: implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:409600,file-cache:cache-file-for-range-read:true,file-system:kernel-list-cache-ttl-secs:-1,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:128,read_ahead_kb=1024
378+ EOF
379+ else
380+ echo "::error::Unsupported WORKLOAD_TYPE=${WORKLOAD_TYPE}"
381+ exit 1
382+ fi
200383
# Submit the generated Service + Job manifest and show the Job's initial state.
kubectl apply -f /tmp/tpu-smoke.yaml
kubectl get job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" -o wide

# Block until every pod of the Job reports Ready.
kubectl wait --for=condition=Ready pod \
  --selector="job-name=${WORKLOAD_NAME}" \
  --namespace "${NAMESPACE}" \
  --timeout=45m

# Stream logs from all pods until their containers exit.
# --max-log-requests: following by selector is capped at 5 concurrent streams
#   by default. The Job runs 4 pods, and the deepseek pods request the GCSFuse
#   sidecar through their annotation, so --all-containers can exceed that cap
#   and make kubectl error out instead of streaming.
# --tail=-1: with a selector kubectl otherwise starts at the last 10 lines per
#   container, dropping the start of each pod's output.
kubectl logs --selector="job-name=${WORKLOAD_NAME}" \
  --namespace "${NAMESPACE}" \
  --all-containers=true \
  --prefix=true \
  --follow=true \
  --tail=-1 \
  --max-log-requests=16

# The log stream ending does not prove success; require the Job's Complete
# condition within the workload-specific timeout.
kubectl wait --for=condition=complete "job/${WORKLOAD_NAME}" \
  --namespace "${NAMESPACE}" \
  --timeout="${JOB_COMPLETE_TIMEOUT}"
214398
215399 - name : Diagnose TPU workload failure
216400 if : failure() && env.RUN_WORKLOAD == 'true'
0 commit comments