Skip to content

Commit b798cb9

Browse files
fix: add retry logic to sprint-list-epics.sh for worktree startup race condition
When invoked during worktree creation, the ticket reducer can transiently fail (e.g., symlink not yet resolved), causing the script to silently report "No open epics found." The retry loop detects this (empty index but non-empty tracker dir) and retries up to SPRINT_MAX_RETRIES times. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ce1552b commit b798cb9

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

plugins/dso/scripts/sprint-list-epics.sh

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,26 @@ REDUCER="$SCRIPT_DIR/ticket-reducer.py"
3535
TRACKER_DIR="${TICKETS_TRACKER_DIR:-$REPO_ROOT/.tickets-tracker}"
3636

3737
# ---------------------------------------------------------------------------
38-
# Build index from v3 reducer.
38+
# Retry configuration for worktree startup race conditions.
39+
# When the tracker dir has entries but the reducer returns an empty index,
40+
# retry after a short wait. This handles the case where the tracker symlink
41+
# or filesystem isn't fully ready yet (common during worktree creation).
42+
# ---------------------------------------------------------------------------
43+
# REVIEW-DEFENSE: MAX_RETRIES is the number of additional attempts after the initial build,
44+
# not the total number of attempts. Total attempts = MAX_RETRIES + 1 (initial attempt + retries).
45+
# The retry loop condition `attempt < MAX_RETRIES` is intentional: attempt starts at 0 and
46+
# increments after each retry, so the loop runs at most MAX_RETRIES times (additional attempts).
47+
MAX_RETRIES="${SPRINT_MAX_RETRIES:-3}"
48+
RETRY_WAIT="${SPRINT_RETRY_WAIT:-1}"
49+
50+
# ---------------------------------------------------------------------------
51+
# Build index from v3 reducer (with retry on transient failure).
3952
# ---------------------------------------------------------------------------
4053
export _SPRINT_TRACKER_DIR="$TRACKER_DIR"
4154
export _SPRINT_REDUCER="$REDUCER"
42-
index_and_counts=$(python3 -c "
55+
56+
_build_index() {
57+
python3 -c "
4358
import json, os, sys, importlib.util, collections
4459
4560
tracker_dir = os.environ['_SPRINT_TRACKER_DIR']
@@ -90,7 +105,38 @@ for entry_name in os.listdir(tracker_dir):
90105
child_counts[parent_id] += 1
91106
92107
print(json.dumps({'index': idx, 'child_counts': dict(child_counts)}))
93-
" 2>/dev/null || echo '{"index":{},"child_counts":{}}')
108+
" 2>/dev/null || echo '{"index":{},"child_counts":{}}'
109+
}
110+
111+
# Count non-hidden subdirectories in tracker to detect "has entries but reducer failed"
112+
_tracker_has_entries() {
113+
local count
114+
count=$(find "$TRACKER_DIR" -mindepth 1 -maxdepth 1 -type d ! -name '.*' 2>/dev/null | head -1)
115+
[ -n "$count" ]
116+
}
117+
118+
# Build index with retry on transient failure
119+
index_and_counts=$(_build_index)
120+
121+
attempt=0
122+
while [ "$attempt" -lt "$MAX_RETRIES" ]; do
123+
# Check if the index is empty (no tickets resolved)
124+
index_key_count=$(echo "$index_and_counts" | python3 -c "import json,sys; print(len(json.load(sys.stdin).get('index',{})))" 2>/dev/null || echo "0")
125+
126+
if [ "$index_key_count" -gt 0 ]; then
127+
break # Index has entries — proceed normally
128+
fi
129+
130+
# Index is empty — check if tracker dir has entries (indicating transient failure)
131+
if ! _tracker_has_entries; then
132+
break # Tracker genuinely has no tickets — no point retrying
133+
fi
134+
135+
# Tracker has entries but reducer returned empty — transient failure, retry
136+
attempt=$(( attempt + 1 ))
137+
sleep "$RETRY_WAIT"
138+
index_and_counts=$(_build_index)
139+
done
94140

95141
SPRINT_INDEX_JSON=$(echo "$index_and_counts" | python3 -c "import json,sys; print(json.dumps(json.load(sys.stdin)['index']))")
96142
child_counts_json=$(echo "$index_and_counts" | python3 -c "import json,sys; print(json.dumps(json.load(sys.stdin)['child_counts']))")

tests/scripts/test-sprint-list-epics.sh

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,77 @@ else
459459
(( FAIL++ ))
460460
fi
461461

462+
# ── Test 25: Retry when tracker dir has entries but reducer returns empty ──────
463+
echo "Test 25: test_retry_on_transient_reducer_failure — retries when tracker not ready"
464+
test_retry_on_transient_reducer_failure() {
465+
local TDIR25 TRACKER25
466+
TDIR25=$(mktemp -d)
467+
TRACKER25="$TDIR25/tracker"
468+
mkdir -p "$TRACKER25"
469+
470+
# Create a valid v3 epic
471+
make_v3_ticket "$TRACKER25" "epic-retry" "epic" "open" "1" "" "Retry Epic"
472+
473+
# Simulate transient reducer failure by making the epic dir temporarily unreadable.
474+
# The retry mechanism should detect that the tracker has entries but the index is empty,
475+
# wait, then retry — at which point the dir is readable and the epic is found.
476+
477+
# Make the epic dir unreadable (reducer will fail to read events, returns empty index)
478+
chmod 000 "$TRACKER25/epic-retry"
479+
480+
# Restore permissions quickly — well before the first retry fires.
481+
# SPRINT_RETRY_WAIT=0.8 gives an 8x margin over the 0.1s background delay.
482+
(sleep 0.1 && chmod 755 "$TRACKER25/epic-retry") &
483+
local restore_pid=$!
484+
485+
local out25 exit25=0
486+
out25=$(TICKETS_TRACKER_DIR="$TRACKER25" SPRINT_MAX_RETRIES=3 SPRINT_RETRY_WAIT=0.8 \
487+
bash "$SCRIPT" 2>/dev/null) || exit25=$?
488+
489+
wait "$restore_pid" 2>/dev/null || true
490+
chmod -R 755 "$TRACKER25" 2>/dev/null || true
491+
rm -rf "$TDIR25"
492+
493+
# The script should have retried and found the epic
494+
[ "$exit25" -eq 0 ] || return 1
495+
echo "$out25" | grep -q "epic-retry" || return 1
496+
}
497+
if test_retry_on_transient_reducer_failure; then
498+
echo " PASS: script retries on transient reducer failure"
499+
(( PASS++ ))
500+
else
501+
echo " FAIL: script did not retry — epic-retry not found after transient failure" >&2
502+
(( FAIL++ ))
503+
fi
504+
505+
# ── Test 26: Retry env vars are respected (SPRINT_MAX_RETRIES=0 means no retry) ─
506+
echo "Test 26: test_no_retry_when_disabled — SPRINT_MAX_RETRIES=0 skips retry"
507+
test_no_retry_when_disabled() {
508+
local TDIR26
509+
TDIR26=$(mktemp -d)
510+
511+
# Empty tracker — no epics at all. With retry disabled, should exit 1 immediately.
512+
local exit26=0 start_time end_time elapsed
513+
start_time=$(python3 -c "import time; print(time.time())")
514+
TICKETS_TRACKER_DIR="$TDIR26" SPRINT_MAX_RETRIES=0 SPRINT_RETRY_WAIT=2 \
515+
bash "$SCRIPT" >/dev/null 2>&1 || exit26=$?
516+
end_time=$(python3 -c "import time; print(time.time())")
517+
elapsed=$(python3 -c "print(float('$end_time') - float('$start_time'))")
518+
519+
rm -rf "$TDIR26"
520+
521+
# Should exit 1 (no epics) and not wait 2 seconds for a retry
522+
[ "$exit26" -eq 1 ] || return 1
523+
python3 -c "exit(0 if float('$elapsed') < 1.5 else 1)" || return 1
524+
}
525+
if test_no_retry_when_disabled; then
526+
echo " PASS: no retry when SPRINT_MAX_RETRIES=0"
527+
(( PASS++ ))
528+
else
529+
echo " FAIL: retry occurred even with SPRINT_MAX_RETRIES=0" >&2
530+
(( FAIL++ ))
531+
fi
532+
462533
echo ""
463534
echo "Results: $PASS passed, $FAIL failed"
464535
[ "$FAIL" -eq 0 ]

0 commit comments

Comments
 (0)