Skip to content

Commit 25a1552

Browse files
committed
test deepseek perf tests on GKE
1 parent 8d3b33e commit 25a1552

3 files changed

Lines changed: 463 additions & 8 deletions

File tree

.github/workflows/gke-connectivity-smoke.yml

Lines changed: 192 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,32 @@ on:
3636
default: "default"
3737
type: string
3838
run_workload:
39-
description: "Create and run a short multi-host TPU Kubernetes Job."
39+
description: "Create and run a multi-host TPU Kubernetes Job."
4040
required: true
4141
default: false
4242
type: boolean
43+
# Selects which workload manifest is rendered; only consulted when
# run_workload is true.
workload_type:
  description: 'Workload to run when run_workload is true.'
  type: choice
  required: true
  default: 'connectivity'
  options:
    - connectivity
    - deepseek_v2_lite_perf
# Bucket exposed to the perf workload as its model cache.
bucket_name:
  description: 'GCS bucket mounted at /models for model cache.'
  type: string
  required: true
  default: 'inference-model-storage-sgl'
# Kubernetes service account the perf pods run as to reach the bucket.
service_account_name:
  description: 'Kubernetes service account for GCSFuse access.'
  type: string
  required: true
  default: 'gcs-account'
# Optional override of the revision under test; no default is declared
# here, so an empty value is possible.
repo_ref:
  description: 'Git ref to test. Defaults to current workflow SHA.'
  type: string
  required: false
4365

4466
concurrency:
4567
group: gke-connectivity-smoke-${{ github.ref }}
@@ -52,7 +74,7 @@ permissions:
5274
jobs:
5375
connectivity:
5476
runs-on: ubuntu-latest
55-
timeout-minutes: 60
77+
timeout-minutes: 180
5678
env:
5779
PROJECT_ID: ${{ inputs.project_id || vars.GCP_PROJECT_ID }}
5880
CLUSTER_NAME: ${{ inputs.cluster_name || vars.GKE_CLUSTER_NAME }}
@@ -62,6 +84,10 @@ jobs:
6284
NODE_POOL: ${{ inputs.node_pool || 'v6e-4x4-flex-perf-tests' }}
6385
NAMESPACE: ${{ inputs.namespace || 'default' }}
6486
RUN_WORKLOAD: ${{ inputs.run_workload || github.event_name == 'push' }}
87+
WORKLOAD_TYPE: ${{ inputs.workload_type || 'connectivity' }}
88+
BUCKET_NAME: ${{ inputs.bucket_name || 'inference-model-storage-sgl' }}
89+
K8S_SERVICE_ACCOUNT: ${{ inputs.service_account_name || 'gcs-account' }}
90+
REPO_REF: ${{ inputs.repo_ref || github.sha }}
6591
WORKLOAD_NAME: gke-tpu-smoke-${{ github.run_id }}
6692
steps:
6793
- name: Validate smoke test configuration
@@ -77,6 +103,15 @@ jobs:
77103
fi
78104
done
79105
106+
# Extra settings needed only by the deepseek_v2_lite_perf workload: flag
# each one that is empty so the step can fail with every problem listed.
if [[ "${RUN_WORKLOAD}" == "true" ]] && [[ "${WORKLOAD_TYPE}" == "deepseek_v2_lite_perf" ]]; then
  for name in BUCKET_NAME K8S_SERVICE_ACCOUNT; do
    # ${!name} is bash indirect expansion: the value of the variable
    # whose name is stored in "name".
    [[ -n "${!name}" ]] && continue
    echo "::error::${name} is required for deepseek_v2_lite_perf."
    missing=1
  done
fi
114+
80115
if [[ "${missing}" -ne 0 ]]; then
81116
exit 1
82117
fi
@@ -86,6 +121,9 @@ jobs:
86121
echo "location=${CLUSTER_LOCATION}"
87122
echo "node_pool=${NODE_POOL}"
88123
echo "namespace=${NAMESPACE}"
124+
echo "run_workload=${RUN_WORKLOAD}"
125+
echo "workload_type=${WORKLOAD_TYPE}"
126+
echo "repo_ref=${REPO_REF}"
89127
90128
- name: Authenticate to Google Cloud
91129
if: env.SERVICE_ACCOUNT == ''
@@ -143,11 +181,15 @@ jobs:
143181
run: |
144182
set -euo pipefail
145183
146-
cat > /tmp/tpu-smoke.yaml <<EOF
184+
HEADLESS_SERVICE_NAME="${WORKLOAD_NAME}-headless"
185+
JOB_COMPLETE_TIMEOUT="5m"
186+
187+
if [[ "${WORKLOAD_TYPE}" == "connectivity" ]]; then
188+
cat > /tmp/tpu-smoke.yaml <<EOF
147189
apiVersion: v1
148190
kind: Service
149191
metadata:
150-
name: ${WORKLOAD_NAME}-headless
192+
name: ${HEADLESS_SERVICE_NAME}
151193
namespace: ${NAMESPACE}
152194
spec:
153195
clusterIP: None
@@ -168,7 +210,7 @@ jobs:
168210
ttlSecondsAfterFinished: 600
169211
template:
170212
spec:
171-
subdomain: ${WORKLOAD_NAME}-headless
213+
subdomain: ${HEADLESS_SERVICE_NAME}
172214
restartPolicy: Never
173215
nodeSelector:
174216
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
@@ -197,20 +239,162 @@ jobs:
197239
limits:
198240
google.com/tpu: "4"
199241
EOF
242+
elif [[ "${WORKLOAD_TYPE}" == "deepseek_v2_lite_perf" ]]; then
243+
# Allow more time for the Job to report completion for this workload.
JOB_COMPLETE_TIMEOUT="30m"

# Comma-separated peer lists for worker indices 0-3, in the form
# <workload>-<index>.<headless-service>, with and without the 8471 port.
TPU_WORKER_HOSTNAMES=""
TPU_PROCESS_ADDRESSES=""
for worker_index in 0 1 2 3; do
  worker_host="${WORKLOAD_NAME}-${worker_index}.${HEADLESS_SERVICE_NAME}"
  # ${VAR:+,} yields a comma only once the list is non-empty.
  TPU_WORKER_HOSTNAMES+="${TPU_WORKER_HOSTNAMES:+,}${worker_host}"
  TPU_PROCESS_ADDRESSES+="${TPU_PROCESS_ADDRESSES:+,}${worker_host}:8471"
done
246+
247+
# Render the headless Service and the 4-way Indexed Job for the DeepSeek
# perf run into /tmp/tpu-smoke.yaml.
#
# The heredoc delimiter is unquoted, so every ${...} below is expanded by
# this shell at render time. That includes ${REPO_REF} inside the container
# script: the pod receives the already-substituted ref, not a variable.
#
# Every value substituted from workflow configuration is double-quoted so
# that YAML keeps it a string even when it looks like a number or boolean
# (for example an all-digit bucket or namespace name).
cat > /tmp/tpu-smoke.yaml <<EOF
apiVersion: v1
kind: Service
metadata:
  name: "${HEADLESS_SERVICE_NAME}"
  namespace: "${NAMESPACE}"
spec:
  # Headless: no cluster IP is allocated for this Service.
  clusterIP: None
  selector:
    job-name: "${WORKLOAD_NAME}"
  ports:
    - name: http
      port: 30000
    - name: dist-init
      port: 10011
    - name: tpu-process
      port: 8471
    - name: control
      port: 18080
---
apiVersion: batch/v1
kind: Job
metadata:
  name: "${WORKLOAD_NAME}"
  namespace: "${NAMESPACE}"
spec:
  completionMode: Indexed
  parallelism: 4
  completions: 4
  # A single pod failure fails the whole Job; no retries.
  backoffLimit: 0
  activeDeadlineSeconds: 7200
  ttlSecondsAfterFinished: 600
  template:
    metadata:
      annotations:
        # Opts the pod into GKE's Cloud Storage FUSE CSI integration.
        gke-gcsfuse/volumes: "true"
    spec:
      subdomain: "${HEADLESS_SERVICE_NAME}"
      restartPolicy: Never
      serviceAccountName: "${K8S_SERVICE_ACCOUNT}"
      nodeSelector:
        cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
        cloud.google.com/gke-tpu-topology: 4x4
        cloud.google.com/gke-nodepool: "${NODE_POOL}"
      tolerations:
        - key: google.com/tpu
          operator: Equal
          value: present
          effect: NoSchedule
      containers:
        - name: deepseek-v2-lite-perf
          image: us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.8.1-rev1
          command:
            - /bin/bash
            - -lc
            - |
              set -euxo pipefail
              git clone https://github.com/sgl-project/sglang-jax.git /tmp/sglang-jax
              cd /tmp/sglang-jax
              # Best-effort fetch so a ref missing from the clone can
              # still be checked out; checkout is the step that must pass.
              git fetch origin "${REPO_REF}" || true
              git checkout "${REPO_REF}"
              python3 -m pip install --upgrade pip
              python3 -m pip install -e "python[all]"
              python3 test/srt/nightly-test/gke_deepseek_v2_lite_perf_entrypoint.py
          env:
            # Both variables carry the pod's completion index (0-3).
            - name: JOB_COMPLETION_INDEX
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
            - name: TPU_WORKER_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
            - name: TPU_PROCESS_PORT
              value: "8471"
            - name: TPU_PROCESS_ADDRESSES
              value: "${TPU_PROCESS_ADDRESSES}"
            - name: TPU_WORKER_HOSTNAMES
              value: "${TPU_WORKER_HOSTNAMES}"
            - name: WORKLOAD_NAME
              value: "${WORKLOAD_NAME}"
            - name: HEADLESS_SERVICE_NAME
              value: "${HEADLESS_SERVICE_NAME}"
            - name: NNODES
              value: "4"
            - name: CI_MOUNT_ROOT
              value: "/models"
            - name: SGLANG_JAX_MODEL_CACHE
              value: "/models/model_scope"
            - name: JAX_COMPILATION_CACHE_DIR
              value: "/tmp/jax_compilation_cache"
            - name: HF_HUB_DOWNLOAD_TIMEOUT
              value: "600"
          ports:
            - containerPort: 30000
              name: http
            - containerPort: 10011
              name: dist-init
            - containerPort: 8471
              name: tpu-process
            - containerPort: 18080
              name: control
          resources:
            requests:
              google.com/tpu: "4"
            limits:
              google.com/tpu: "4"
          volumeMounts:
            - mountPath: /models
              name: model-storage
              readOnly: true
            - mountPath: /dev/shm
              name: dev-shm
            - mountPath: /tmp/jax_compilation_cache
              name: jax-cache
      volumes:
        - name: dev-shm
          emptyDir:
            medium: Memory
            sizeLimit: 64Gi
        - name: jax-cache
          emptyDir: {}
        # NOTE(review): not mounted by the workload container; presumably
        # picked up by the GCS FUSE sidecar as its cache volume. Confirm.
        - name: gke-gcsfuse-cache
          emptyDir: {}
        - name: model-storage
          csi:
            driver: gcsfuse.csi.storage.gke.io
            readOnly: true
            volumeAttributes:
              bucketName: "${BUCKET_NAME}"
              mountOptions: "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:409600,file-cache:cache-file-for-range-read:true,file-system:kernel-list-cache-ttl-secs:-1,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:128,read_ahead_kb=1024"
EOF
379+
else
380+
echo "::error::Unsupported WORKLOAD_TYPE=${WORKLOAD_TYPE}"
381+
exit 1
382+
fi
200383
201384
# Create the rendered Service and Job, then block until the Job's pods
# report Ready.
kubectl apply -f /tmp/tpu-smoke.yaml
kubectl get job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" -o wide
kubectl wait --for=condition=Ready pod \
  --selector="job-name=${WORKLOAD_NAME}" \
  --namespace "${NAMESPACE}" \
  --timeout=45m

# Stream logs until the containers exit. Two selector-mode defaults of
# kubectl logs are overridden:
#   --tail: with a selector only the last 10 lines per container are
#     printed, which would drop everything logged before this point.
#   --max-log-requests: --follow refuses to open more than 5 concurrent
#     streams. The perf Job runs 4 pods and --all-containers also includes
#     the GCS FUSE sidecar requested by the gke-gcsfuse/volumes annotation,
#     so the default is exceeded and the command would fail the step.
kubectl logs --selector="job-name=${WORKLOAD_NAME}" \
  --namespace "${NAMESPACE}" \
  --all-containers=true \
  --prefix=true \
  --tail=-1 \
  --max-log-requests=16 \
  --follow=true

# The end of the log stream does not prove success; the Job's Complete
# condition is the authoritative result.
kubectl wait --for=condition=complete "job/${WORKLOAD_NAME}" \
  --namespace "${NAMESPACE}" \
  --timeout="${JOB_COMPLETE_TIMEOUT}"
214398
215399
- name: Diagnose TPU workload failure
216400
if: failure() && env.RUN_WORKLOAD == 'true'

0 commit comments

Comments
 (0)