@@ -227,7 +227,7 @@ jobs:
227227 completions: 4
228228 backoffLimit: 0
229229 activeDeadlineSeconds: 7200
230- ttlSecondsAfterFinished: 600
230+ ttlSecondsAfterFinished: 3600
231231 template:
232232 metadata:
233233 annotations:
@@ -239,7 +239,7 @@ jobs:
239239 nodeSelector:
240240 cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
241241 cloud.google.com/gke-tpu-topology: 4x4
242- # cloud.google.com/gke-nodepool: ${NODE_POOL}
242+ cloud.google.com/gke-nodepool: ${NODE_POOL}
243243 tolerations:
244244 - key: google.com/tpu
245245 operator: Equal
@@ -326,7 +326,7 @@ jobs:
326326 readOnly: true
327327 volumeAttributes:
328328 bucketName: ${BUCKET_NAME}
329- mountOptions: implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:409600,file-cache:cache-file-for-range-read:true,file-system:kernel-list-cache-ttl-secs:-1,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:128,read_ahead_kb=1024
329+ mountOptions: implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:0,file-system:kernel-list-cache-ttl-secs:-1,read_ahead_kb=1024
330330 EOF
331331
332332 kubectl apply -f /tmp/tpu-smoke.yaml
@@ -341,9 +341,28 @@ jobs:
341341 --prefix=true \
342342 --follow=true \
343343 --max-log-requests=8
344- kubectl wait --for=condition=complete "job/${WORKLOAD_NAME}" \
345- --namespace "${NAMESPACE}" \
346- --timeout="${JOB_COMPLETE_TIMEOUT}"
344+
345+ # Wait for the Job to reach a terminal state (Complete or Failed). Plain
346+ # `kubectl wait --for=condition=complete` only matches success and would block
347+ # until --timeout on a Failed Job. Budget: JOB_WAIT_SECONDS (default 1800s).
348+ deadline=$(($(date +%s) + ${JOB_WAIT_SECONDS:-1800}))
349+ while [[ $(date +%s) -lt $deadline ]]; do
350+   conds=$(kubectl get job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" \
351+     -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}|{.status.conditions[?(@.type=="Failed")].status}') || conds=""  # lookup error: retry next tick instead of aborting under errexit
352+   case "${conds}" in  # conds is "<Complete status>|<Failed status>"; either side may be empty
353+     "True|"*)
354+       echo "Job completed successfully"
355+       exit 0
356+       ;;
357+     *"|True")  # leading * also matches when the Complete field is non-empty (e.g. "False|True")
358+       echo "Job failed (condition=Failed). See diagnose step for details."
359+       exit 1
360+       ;;
361+   esac
362+   sleep 10
363+ done
364+ echo "Timed out waiting for Job to reach a terminal state"
365+ exit 1
347366
348367 - name: Diagnose TPU workload failure
349368 if: failure()
@@ -353,6 +372,11 @@ jobs:
353372
354373 kubectl get pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" -o wide || true
355374 kubectl describe pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" || true
375+ kubectl describe job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" || true
376+ kubectl get events --namespace "${NAMESPACE}" --sort-by='.lastTimestamp' \
377+ --field-selector="involvedObject.name=${WORKLOAD_NAME}" || true
378+ kubectl get events --namespace "${NAMESPACE}" --sort-by='.lastTimestamp' \
379+ | grep -E "${WORKLOAD_NAME}|Evicted|DiskPressure|MemoryPressure|OOMKill|FailedScheduling" || true
356380 kubectl logs --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" --all-containers=true --prefix=true || true
357381
358382 - name: Cleanup TPU workload smoke test
0 commit comments