Skip to content

Commit fcb22b4

Browse files
committed
fix
1 parent f34ac08 commit fcb22b4

1 file changed

Lines changed: 29 additions & 5 deletions

File tree

.github/workflows/gke-connectivity-smoke.yml

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ jobs:
227227
completions: 4
228228
backoffLimit: 0
229229
activeDeadlineSeconds: 7200
230-
ttlSecondsAfterFinished: 600
230+
ttlSecondsAfterFinished: 3600
231231
template:
232232
metadata:
233233
annotations:
@@ -326,7 +326,7 @@ jobs:
326326
readOnly: true
327327
volumeAttributes:
328328
bucketName: ${BUCKET_NAME}
329-
mountOptions: implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:409600,file-cache:cache-file-for-range-read:true,file-system:kernel-list-cache-ttl-secs:-1,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:128,read_ahead_kb=1024
329+
mountOptions: implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:0,file-system:kernel-list-cache-ttl-secs:-1,read_ahead_kb=1024
330330
EOF
331331
332332
kubectl apply -f /tmp/tpu-smoke.yaml
@@ -341,9 +341,28 @@ jobs:
341341
--prefix=true \
342342
--follow=true \
343343
--max-log-requests=8
344-
kubectl wait --for=condition=complete "job/${WORKLOAD_NAME}" \
345-
--namespace "${NAMESPACE}" \
346-
--timeout="${JOB_COMPLETE_TIMEOUT}"
344+
345+
# Wait for Job to reach a terminal state (Complete or Failed). Plain
346+
# `kubectl wait --for=condition=complete` only matches success and
347+
# would block until --timeout even if the Job has already Failed.
348+
deadline=$(($(date +%s) + 1800))
349+
while [[ $(date +%s) -lt $deadline ]]; do
350+
conds=$(kubectl get job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" \
351+
-o jsonpath='{.status.conditions[?(@.type=="Complete")].status}|{.status.conditions[?(@.type=="Failed")].status}')
352+
case "${conds}" in
353+
True\|*)
354+
echo "Job completed successfully"
355+
exit 0
356+
;;
357+
\|True)
358+
echo "Job failed (condition=Failed). See diagnose step for details."
359+
exit 1
360+
;;
361+
esac
362+
sleep 10
363+
done
364+
echo "Timed out waiting for Job to reach a terminal state"
365+
exit 1
347366
348367
- name: Diagnose TPU workload failure
349368
if: failure()
@@ -353,6 +372,11 @@ jobs:
353372
354373
kubectl get pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" -o wide || true
355374
kubectl describe pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" || true
375+
kubectl describe job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" || true
376+
kubectl get events --namespace "${NAMESPACE}" --sort-by='.lastTimestamp' \
377+
--field-selector="involvedObject.name=${WORKLOAD_NAME}" || true
378+
kubectl get events --namespace "${NAMESPACE}" --sort-by='.lastTimestamp' \
379+
| grep -E "${WORKLOAD_NAME}|Evicted|DiskPressure|MemoryPressure|OOMKill|FailedScheduling" || true
356380
kubectl logs --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" --all-containers=true --prefix=true || true
357381
358382
- name: Cleanup TPU workload smoke test

0 commit comments

Comments
 (0)