Skip to content

Commit df617d4

Browse files
fix: add retry logic to sprint-list-epics.sh for worktree startup race condition (merge worktree-20260325-170522)
2 parents ce1552b + b798cb9 commit df617d4

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

plugins/dso/scripts/sprint-list-epics.sh

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,26 @@ REDUCER="$SCRIPT_DIR/ticket-reducer.py"
3535
TRACKER_DIR="${TICKETS_TRACKER_DIR:-$REPO_ROOT/.tickets-tracker}"
3636

3737
# ---------------------------------------------------------------------------
38-
# Build index from v3 reducer.
38+
# Retry configuration for worktree startup race conditions.
39+
# When the tracker dir has entries but the reducer returns an empty index,
40+
# retry after a short wait. This handles the case where the tracker symlink
41+
# or filesystem isn't fully ready yet (common during worktree creation).
42+
# ---------------------------------------------------------------------------
43+
# REVIEW-DEFENSE: MAX_RETRIES is the number of additional attempts after the initial build,
44+
# not the total number of attempts. Total attempts = MAX_RETRIES + 1 (initial attempt + retries).
45+
# The retry loop condition `attempt < MAX_RETRIES` is intentional: attempt starts at 0 and
46+
# increments after each retry, so the loop runs at most MAX_RETRIES times (additional attempts).
47+
MAX_RETRIES="${SPRINT_MAX_RETRIES:-3}"
# MAX_RETRIES is later compared with `[ ... -lt ... ]`, which raises
# "integer expression expected" on anything that is not a plain integer and
# would silently disable retries. Reject such values up front and fall back
# to the documented default instead.
case "$MAX_RETRIES" in
  '' | *[!0-9]*)
    echo "WARNING: ignoring invalid SPRINT_MAX_RETRIES='$MAX_RETRIES' (expected a non-negative integer); using 3" >&2
    MAX_RETRIES=3
    ;;
esac
# Delay between attempts. Passed verbatim to `sleep`, so it is deliberately
# not restricted to integers (fractional values such as 0.8 are used by the
# test suite).
RETRY_WAIT="${SPRINT_RETRY_WAIT:-1}"
49+
50+
# ---------------------------------------------------------------------------
51+
# Build index from v3 reducer (with retry on transient failure).
3952
# ---------------------------------------------------------------------------
4053
# Pass the tracker directory and reducer path to the embedded Python program
# through the environment (the tracker dir is read there via os.environ)
# instead of interpolating shell values into the program text.
export _SPRINT_TRACKER_DIR="$TRACKER_DIR" _SPRINT_REDUCER="$REDUCER"
42-
index_and_counts=$(python3 -c "
55+
56+
_build_index() {
57+
python3 -c "
4358
import json, os, sys, importlib.util, collections
4459
4560
tracker_dir = os.environ['_SPRINT_TRACKER_DIR']
@@ -90,7 +105,38 @@ for entry_name in os.listdir(tracker_dir):
90105
child_counts[parent_id] += 1
91106
92107
print(json.dumps({'index': idx, 'child_counts': dict(child_counts)}))
93-
" 2>/dev/null || echo '{"index":{},"child_counts":{}}')
108+
" 2>/dev/null || echo '{"index":{},"child_counts":{}}'
109+
}
110+
111+
# Report whether the tracker directory contains at least one non-hidden
# subdirectory. Used to tell "the tracker has entries but the reducer produced
# an empty index" apart from "the tracker has no entries at all".
# Globals:  TRACKER_DIR (read)
# Returns:  0 if at least one non-hidden subdirectory exists directly under
#           TRACKER_DIR; non-zero otherwise (including when TRACKER_DIR is
#           missing or unreadable).
_tracker_has_entries() {
  local first_entry
  # Only existence matters, so keep just the first match. find's diagnostics
  # are discarded so a missing/unreadable directory reads as "no entries".
  # `|| true` is intentional: the verdict comes from the emptiness check
  # below, and a failing find must not abort callers running under
  # errexit/pipefail.
  first_entry=$(find "$TRACKER_DIR" -mindepth 1 -maxdepth 1 -type d ! -name '.*' 2>/dev/null | head -n 1) || true
  [ -n "$first_entry" ]
}
117+
118+
# Initial build, followed by up to MAX_RETRIES rebuilds for as long as the
# result looks like a transient failure: an empty index even though the
# tracker directory visibly contains entries.
index_and_counts="$(_build_index)"

attempt=0
while [ "$attempt" -lt "$MAX_RETRIES" ]; do
  # Number of tickets the last build resolved; unparseable output counts as 0.
  index_key_count="$(
    echo "$index_and_counts" \
      | python3 -c "import json,sys; print(len(json.load(sys.stdin).get('index',{})))" 2>/dev/null \
      || echo "0"
  )"

  # Stop when the index is populated, or when the tracker genuinely holds no
  # tickets (an empty index is then the correct answer, not a failure).
  if [ "$index_key_count" -gt 0 ] || ! _tracker_has_entries; then
    break
  fi

  # Entries exist but none were resolved: wait, then rebuild.
  attempt=$(( attempt + 1 ))
  sleep "$RETRY_WAIT"
  index_and_counts="$(_build_index)"
done
94140

95141
SPRINT_INDEX_JSON=$(echo "$index_and_counts" | python3 -c "import json,sys; print(json.dumps(json.load(sys.stdin)['index']))")
96142
child_counts_json=$(echo "$index_and_counts" | python3 -c "import json,sys; print(json.dumps(json.load(sys.stdin)['child_counts']))")

tests/scripts/test-sprint-list-epics.sh

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,77 @@ else
459459
(( FAIL++ ))
460460
fi
461461

462+
# ── Test 25: Retry when tracker dir has entries but reducer returns empty ──────
echo "Test 25: test_retry_on_transient_reducer_failure — retries when tracker not ready"
# Builds a tracker holding one valid epic, makes that epic's directory
# unreadable for a short window, and checks that the script under test still
# ends up listing the epic.
# Returns: 0 when the script exits 0 and its output mentions "epic-retry",
#          1 otherwise.
test_retry_on_transient_reducer_failure() {
  local sandbox tracker epic_dir
  sandbox=$(mktemp -d)
  tracker="$sandbox/tracker"
  epic_dir="$tracker/epic-retry"
  mkdir -p "$tracker"

  # One valid v3 epic for the script to discover.
  make_v3_ticket "$tracker" "epic-retry" "epic" "open" "1" "" "Retry Epic"

  # Simulate the transient failure: the epic directory starts out unreadable,
  # so the tracker has an entry while the first index build is expected to
  # come back empty.
  chmod 000 "$epic_dir"

  # Restore access in the background after 0.1s, well before the first retry
  # fires (SPRINT_RETRY_WAIT=0.8 leaves an 8x margin over that delay).
  (sleep 0.1 && chmod 755 "$epic_dir") &
  local restorer=$!

  local output status=0
  output=$(TICKETS_TRACKER_DIR="$tracker" SPRINT_MAX_RETRIES=3 SPRINT_RETRY_WAIT=0.8 \
    bash "$SCRIPT" 2>/dev/null) || status=$?

  # Reap the background job and make everything removable before cleanup.
  wait "$restorer" 2>/dev/null || true
  chmod -R 755 "$tracker" 2>/dev/null || true
  rm -rf "$sandbox"

  # The script should have retried and found the epic.
  if [ "$status" -eq 0 ] && echo "$output" | grep -q "epic-retry"; then
    return 0
  fi
  return 1
}
if test_retry_on_transient_reducer_failure; then
  echo " PASS: script retries on transient reducer failure"
  (( PASS++ ))
else
  echo " FAIL: script did not retry — epic-retry not found after transient failure" >&2
  (( FAIL++ ))
fi
504+
505+
# ── Test 26: Retry env vars are respected (SPRINT_MAX_RETRIES=0 means no retry) ─
echo "Test 26: test_no_retry_when_disabled — SPRINT_MAX_RETRIES=0 skips retry"
# Runs the script with retries disabled against a tracker that would otherwise
# trigger the retry path, and checks that it fails fast.
# Returns: 0 when the script exits 1 in under 1.5 seconds, 1 otherwise.
test_no_retry_when_disabled() {
  local TDIR26
  TDIR26=$(mktemp -d)

  # The tracker must contain a non-hidden subdirectory, otherwise the script
  # skips retrying regardless of SPRINT_MAX_RETRIES and this test could not
  # detect the variable being ignored.
  # NOTE(review): assumes the reducer produces no index entry for a ticket
  # directory with no event files, and that the script then exits 1 as it
  # does for an empty tracker — TODO confirm against ticket-reducer.py.
  mkdir "$TDIR26/unresolved-ticket"

  local exit26=0 start_time end_time
  start_time=$(python3 -c "import time; print(time.time())")
  TICKETS_TRACKER_DIR="$TDIR26" SPRINT_MAX_RETRIES=0 SPRINT_RETRY_WAIT=2 \
    bash "$SCRIPT" >/dev/null 2>&1 || exit26=$?
  end_time=$(python3 -c "import time; print(time.time())")

  rm -rf "$TDIR26"

  # Should exit 1 (no epics) and not wait 2 seconds for a retry.
  [ "$exit26" -eq 1 ] || return 1
  # Timestamps are passed as arguments rather than spliced into the program
  # text; a single retry would add SPRINT_RETRY_WAIT=2s and exceed the bound.
  python3 -c "import sys; sys.exit(0 if float(sys.argv[2]) - float(sys.argv[1]) < 1.5 else 1)" \
    "$start_time" "$end_time" || return 1
}
if test_no_retry_when_disabled; then
  echo " PASS: no retry when SPRINT_MAX_RETRIES=0"
  (( PASS++ ))
else
  echo " FAIL: retry occurred even with SPRINT_MAX_RETRIES=0" >&2
  (( FAIL++ ))
fi
532+
462533
echo ""
463534
echo "Results: $PASS passed, $FAIL failed"
464535
[ "$FAIL" -eq 0 ]

0 commit comments

Comments
 (0)