Skip to content

Commit d66d5cd

Browse files
committed
fix(worktrees): preserve unmerged work on non-main repos and recover stale GC lock
Two reliability bugs in the worktree garbage collector: - Graveyard preservation compared agent branches against a hardcoded `main`, so on a repository whose default branch is not `main` the rev-list check failed and was read as "nothing to preserve", letting cleanup_all_stale delete unmerged agent commits. The base is now resolved from the repo default (origin/HEAD, main, master, and their remote refs), and an inconclusive check preserves the branch to the graveyard instead of dropping it. - The GC lock had no stale-lock recovery: a crashed or killed GC left the lock behind and wedged every future `bernstein worktrees gc`. The lock already recorded the owner pid and start time; acquisition now reclaims a lock once when the owning process is gone or the lock is older than a generous bound, while still refusing a lock held by a live recent process.
1 parent 7269e73 commit d66d5cd

4 files changed

Lines changed: 274 additions & 21 deletions

File tree

src/bernstein/cli/commands/worktrees_cmd.py

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from rich.console import Console
2626
from rich.table import Table
2727

28+
from bernstein.core.process_utils import is_process_alive
2829
from bernstein.core.worktrees.classifier import (
2930
GC_LOCK_RELPATH,
3031
WORKTREE_GC_LIFECYCLE_EVENT,
@@ -136,21 +137,71 @@ class GcLockError(RuntimeError):
136137
"""Raised when the GC lock cannot be acquired."""
137138

138139

140+
# A GC that has "owned" the lock longer than this is treated as a crashed
141+
# leftover. Generous so a legitimately long sweep is never reclaimed under it.
142+
_GC_LOCK_MAX_AGE_S = 6 * 3600
143+
144+
145+
def _read_gc_lock(lock_path: Path) -> dict[str, object] | None:
146+
"""Return the lock's recorded ``{pid, started_at}`` payload, or None."""
147+
try:
148+
data = json.loads(lock_path.read_text(encoding="utf-8"))
149+
except (OSError, ValueError):
150+
return None
151+
return data if isinstance(data, dict) else None
152+
153+
154+
def _gc_lock_is_stale(meta: dict[str, object] | None) -> bool:
155+
"""True when the lock's owning process is gone or the lock is too old.
156+
157+
An unreadable / mid-write payload is NOT treated as stale, so a lock another
158+
process just created (between ``O_EXCL`` and the write) is never reclaimed.
159+
The crashed-GC case the recovery targets always leaves a fully written
160+
``{pid, started_at}`` payload, which the pid-liveness check below detects.
161+
"""
162+
if not isinstance(meta, dict):
163+
return False
164+
pid = meta.get("pid")
165+
if isinstance(pid, int) and pid > 0 and not is_process_alive(pid):
166+
return True
167+
started = meta.get("started_at")
168+
return isinstance(started, (int, float)) and (time.time() - started) > _GC_LOCK_MAX_AGE_S
169+
170+
171+
def _acquire_gc_lock_fd(lock_path: Path) -> int:
172+
"""Open the GC lock with ``O_EXCL``, reclaiming a provably stale lock once."""
173+
try:
174+
return os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
175+
except FileExistsError as exc:
176+
meta = _read_gc_lock(lock_path)
177+
if not _gc_lock_is_stale(meta):
178+
owner = f" held by pid {meta['pid']}" if isinstance(meta, dict) and meta.get("pid") else ""
179+
raise GcLockError(f"another worktree GC is already running ({lock_path}{owner})") from exc
180+
logger.warning("Reclaiming stale worktree GC lock %s (previous owner gone): %s", lock_path, meta)
181+
# A competing acquirer may win the race and re-create the lock; the
182+
# retry below resolves that case.
183+
with contextlib.suppress(OSError):
184+
lock_path.unlink()
185+
try:
186+
return os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
187+
except FileExistsError as retry_exc:
188+
raise GcLockError(f"another worktree GC is already running ({lock_path})") from retry_exc
189+
190+
139191
@contextlib.contextmanager
140192
def lock_gc(repo_root: Path): # type: ignore[no-untyped-def]
141193
"""Acquire :data:`GC_LOCK_RELPATH` exclusively, yielding the lock path.
142194
143-
The lock file is created with ``O_EXCL``; a concurrent invocation
144-
sees the file and raises :class:`GcLockError`. The lock is removed
145-
on context exit even when the body raises.
195+
The lock file is created with ``O_EXCL``; a concurrent invocation by a live
196+
process raises :class:`GcLockError`. A lock left behind by a crashed or
197+
killed GC (its recorded pid is no longer alive, or it is older than
198+
:data:`_GC_LOCK_MAX_AGE_S`) is reclaimed once instead of wedging every future
199+
run. The lock is removed on context exit even when the body raises.
146200
"""
147201
lock_path = repo_root / GC_LOCK_RELPATH
148202
lock_path.parent.mkdir(parents=True, exist_ok=True)
149203

150-
try:
151-
fd = os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
152-
except FileExistsError as exc:
153-
raise GcLockError(f"another worktree GC is already running ({lock_path})") from exc
204+
fd = _acquire_gc_lock_fd(lock_path)
154205

155206
try:
156207
with os.fdopen(fd, "w", encoding="utf-8") as fh:

src/bernstein/core/git/worktree.py

Lines changed: 78 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -336,20 +336,80 @@ def setup_worktree_env(
336336
logger.warning("Failed to run worktree setup command: %s", exc)
337337

338338

339+
def _branch_exists(repo_root: Path, branch: str) -> bool:
340+
"""Return True if *branch* resolves to a commit in *repo_root*."""
341+
try:
342+
result = subprocess.run(
343+
["git", "rev-parse", "--verify", "--quiet", branch],
344+
cwd=repo_root,
345+
capture_output=True,
346+
text=True,
347+
timeout=_GRAVEYARD_GIT_TIMEOUT_S,
348+
)
349+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
350+
return False
351+
return result.returncode == 0
352+
353+
354+
def _resolve_graveyard_base(repo_root: Path) -> str:
355+
"""Resolve the default-branch ref used as the merge baseline for graveyard capture.
356+
357+
A repository whose default branch is not ``main`` (``master``, ``trunk``,
358+
...) would otherwise lose unmerged agent work, because ``git rev-list
359+
<branch> ^main`` fails when ``main`` does not exist and the failure was
360+
read as "nothing to preserve". Resolution order mirrors the project's
361+
canonical default-branch detection: ``origin/HEAD``, then the conventional
362+
names locally and as remote-tracking refs. Falls back to ``"main"`` only as
363+
a last resort.
364+
"""
365+
try:
366+
head = subprocess.run(
367+
["git", "symbolic-ref", "refs/remotes/origin/HEAD"],
368+
cwd=repo_root,
369+
capture_output=True,
370+
text=True,
371+
timeout=_GRAVEYARD_GIT_TIMEOUT_S,
372+
)
373+
if head.returncode == 0:
374+
ref = head.stdout.strip()
375+
if ref.startswith("refs/remotes/origin/"):
376+
return ref.removeprefix("refs/remotes/origin/")
377+
for candidate in ("main", "master", "refs/remotes/origin/main", "refs/remotes/origin/master"):
378+
probe = subprocess.run(
379+
["git", "rev-parse", "--verify", "--quiet", candidate],
380+
cwd=repo_root,
381+
capture_output=True,
382+
text=True,
383+
timeout=_GRAVEYARD_GIT_TIMEOUT_S,
384+
)
385+
if probe.returncode == 0:
386+
return candidate
387+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
388+
pass
389+
return "main"
390+
391+
339392
def _count_unmerged_commits(repo_root: Path, branch: str, base: str = "main") -> int:
340393
"""Return how many commits on *branch* are not reachable from *base*.
341394
342-
Uses ``git rev-list <branch> ^<base> --count``. If the branch is missing
343-
or the command fails, returns ``0`` - callers treat that as "nothing to
344-
preserve" which is safe because graveyard capture is best-effort.
395+
Uses ``git rev-list <branch> ^<base> --count``.
396+
397+
Returns the real count, ``0`` when the branch is gone or genuinely fully
398+
merged, and ``-1`` when the branch still exists but the count could not be
399+
computed (for example the *base* ref does not exist). Callers treat any
400+
non-zero result, including ``-1``, as "preserve to the graveyard" so that
401+
unmerged work is never silently dropped on an inconclusive check.
345402
346403
Args:
347404
repo_root: Repository root directory.
348405
branch: Branch name to compare against *base* (e.g. ``agent/<sid>``).
349-
base: Reference to compare against. Defaults to ``main``.
406+
base: Reference to compare against. Resolve it with
407+
:func:`_resolve_graveyard_base` so it matches the repo default
408+
branch rather than assuming ``main``.
350409
351410
Returns:
352-
Number of commits on *branch* not in *base*. ``0`` on any failure.
411+
Number of commits on *branch* not in *base*; ``0`` when nothing is at
412+
risk; ``-1`` when the check was inconclusive but the branch exists.
353413
"""
354414
try:
355415
result = subprocess.run(
@@ -363,18 +423,18 @@ def _count_unmerged_commits(repo_root: Path, branch: str, base: str = "main") ->
363423
)
364424
except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as exc:
365425
logger.debug("rev-list for %s failed: %s", branch, exc)
366-
return 0
426+
return -1 if _branch_exists(repo_root, branch) else 0
367427
if result.returncode != 0:
368428
logger.debug("rev-list for %s exited %d: %s", branch, result.returncode, result.stderr.strip())
369-
return 0
429+
return -1 if _branch_exists(repo_root, branch) else 0
370430
raw = result.stdout.strip()
371431
if not raw:
372432
return 0
373433
try:
374434
return int(raw)
375435
except ValueError:
376436
logger.debug("rev-list for %s produced non-integer output: %r", branch, raw)
377-
return 0
437+
return -1 if _branch_exists(repo_root, branch) else 0
378438

379439

380440
def _resolve_ref(repo_root: Path, ref: str) -> str | None:
@@ -849,16 +909,20 @@ def cleanup_all_stale(self) -> int:
849909
# remove --force`` followed by ``git branch -D`` would make
850910
# those commits unreachable and gc-eligible.
851911
branch_name = f"agent/{session_id}"
912+
base_ref = _resolve_graveyard_base(self.repo_root)
852913
try:
853-
unmerged = _count_unmerged_commits(self.repo_root, branch_name, base="main")
854-
except Exception as exc: # defensive - never block cleanup
914+
unmerged = _count_unmerged_commits(self.repo_root, branch_name, base=base_ref)
915+
except Exception as exc: # defensive - preserve rather than drop work
855916
logger.debug("Graveyard pre-check failed for %s: %s", session_id, exc)
856-
unmerged = 0
857-
if unmerged > 0:
917+
unmerged = -1 if _branch_exists(self.repo_root, branch_name) else 0
918+
# Any non-zero result preserves: >0 is a real count, -1 means the
919+
# check was inconclusive but the branch exists, so we keep the work.
920+
if unmerged != 0:
858921
logger.warning(
859-
"Stale worktree %s has %d unmerged commit(s); preserving to graveyard",
922+
"Stale worktree %s has %s unmerged commit(s) (base %s); preserving to graveyard",
860923
session_id,
861-
unmerged,
924+
unmerged if unmerged > 0 else "an unknown number of",
925+
base_ref,
862926
)
863927
try:
864928
preserve_branch_to_graveyard(self.repo_root, session_id, branch=branch_name)

tests/unit/test_worktree_graveyard.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,61 @@ def test_purge_graveyard_zero_days_purges_everything(repo: Path) -> None:
244244
def test_purge_graveyard_rejects_negative_age(repo: Path) -> None:
245245
with pytest.raises(ValueError, match=">= 0"):
246246
purge_graveyard(repo, older_than_days=-1)
247+
248+
249+
# ----------------------------------------------------------------------------
250+
# Non-`main` default branch must not lose unmerged work (gc-reliability)
251+
# ----------------------------------------------------------------------------
252+
253+
254+
def _init_repo_on(tmp_path: Path, branch: str) -> Path:
255+
repo = tmp_path / f"repo_{branch}"
256+
repo.mkdir()
257+
_run(["git", "init", "-b", branch], repo)
258+
_run(["git", "config", "user.email", "test@example.com"], repo)
259+
_run(["git", "config", "user.name", "Test User"], repo)
260+
_run(["git", "config", "commit.gpgsign", "false"], repo)
261+
(repo / "README.md").write_text("seed\n", encoding="utf-8")
262+
_run(["git", "add", "README.md"], repo)
263+
_run(["git", "commit", "-m", "seed"], repo)
264+
return repo
265+
266+
267+
def test_resolve_graveyard_base_finds_master(tmp_path: Path) -> None:
268+
from bernstein.core.git.worktree import _resolve_graveyard_base
269+
270+
repo = _init_repo_on(tmp_path, "master")
271+
assert _resolve_graveyard_base(repo) == "master"
272+
273+
274+
def test_count_unmerged_uncertain_when_base_missing_branch_exists(tmp_path: Path) -> None:
275+
"""On a master-default repo, the old hardcoded ^main check failed -> 0 (delete).
276+
277+
It must now report -1 (inconclusive but the branch exists) so the caller
278+
preserves the unmerged work instead of silently dropping it.
279+
"""
280+
repo = _init_repo_on(tmp_path, "master")
281+
wt = repo / ".sdd" / "worktrees" / "sid-x"
282+
wt.parent.mkdir(parents=True, exist_ok=True)
283+
_run(["git", "worktree", "add", "-b", "agent/sid-x", str(wt)], repo)
284+
(wt / "f.txt").write_text("work\n", encoding="utf-8")
285+
_run(["git", "add", "f.txt"], wt)
286+
_run(["git", "commit", "-m", "unmerged work"], wt)
287+
288+
assert _count_unmerged_commits(repo, "agent/sid-x", base="main") == -1
289+
290+
291+
def test_count_unmerged_accurate_with_resolved_base(tmp_path: Path) -> None:
292+
from bernstein.core.git.worktree import _resolve_graveyard_base
293+
294+
repo = _init_repo_on(tmp_path, "master")
295+
wt = repo / ".sdd" / "worktrees" / "sid-y"
296+
wt.parent.mkdir(parents=True, exist_ok=True)
297+
_run(["git", "worktree", "add", "-b", "agent/sid-y", str(wt)], repo)
298+
for i in range(2):
299+
(wt / f"f{i}.txt").write_text("x\n", encoding="utf-8")
300+
_run(["git", "add", f"f{i}.txt"], wt)
301+
_run(["git", "commit", "-m", f"c{i}"], wt)
302+
303+
base = _resolve_graveyard_base(repo)
304+
assert _count_unmerged_commits(repo, "agent/sid-y", base=base) == 2

tests/unit/test_worktrees_cmd.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,3 +772,83 @@ def test_tampering_with_recorded_head_breaks_verify(repo_root: Path) -> None:
772772
valid, errors = fresh.verify()
773773
assert valid is False
774774
assert any("HMAC mismatch" in e or "non-canonical" in e for e in errors)
775+
776+
777+
# ----------------------------------------------------------------------------
778+
# GC lock stale recovery (gc-reliability): a crashed GC must not wedge future runs
779+
# ----------------------------------------------------------------------------
780+
781+
782+
def test_gc_lock_stale_when_pid_dead() -> None:
783+
import time as _time
784+
from unittest.mock import patch
785+
786+
from bernstein.cli.commands.worktrees_cmd import _gc_lock_is_stale
787+
788+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=False):
789+
assert _gc_lock_is_stale({"pid": 999999, "started_at": _time.time()}) is True
790+
791+
792+
def test_gc_lock_not_stale_when_pid_alive() -> None:
793+
import os as _os
794+
import time as _time
795+
from unittest.mock import patch
796+
797+
from bernstein.cli.commands.worktrees_cmd import _gc_lock_is_stale
798+
799+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=True):
800+
assert _gc_lock_is_stale({"pid": _os.getpid(), "started_at": _time.time()}) is False
801+
802+
803+
def test_gc_lock_stale_when_too_old() -> None:
804+
import os as _os
805+
import time as _time
806+
from unittest.mock import patch
807+
808+
from bernstein.cli.commands.worktrees_cmd import _gc_lock_is_stale
809+
810+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=True):
811+
assert _gc_lock_is_stale({"pid": _os.getpid(), "started_at": _time.time() - 7 * 3600}) is True
812+
813+
814+
def test_gc_lock_unreadable_is_not_stale() -> None:
815+
from bernstein.cli.commands.worktrees_cmd import _gc_lock_is_stale
816+
817+
assert _gc_lock_is_stale(None) is False
818+
819+
820+
def test_lock_gc_reclaims_lock_from_dead_owner(repo_root: Path) -> None:
821+
"""A lock left by a crashed GC (dead pid) is reclaimed, not wedged forever."""
822+
import json as _json
823+
import time as _time
824+
from unittest.mock import patch
825+
826+
from bernstein.cli.commands.worktrees_cmd import GC_LOCK_RELPATH
827+
828+
lock_path = repo_root / GC_LOCK_RELPATH
829+
lock_path.parent.mkdir(parents=True, exist_ok=True)
830+
lock_path.write_text(_json.dumps({"pid": 999999, "started_at": _time.time()}), encoding="utf-8")
831+
832+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=False):
833+
with lock_gc(repo_root):
834+
assert lock_path.exists()
835+
assert not lock_path.exists()
836+
837+
838+
def test_lock_gc_respects_live_owner(repo_root: Path) -> None:
839+
"""A lock held by a live, recent process is NOT reclaimed."""
840+
import json as _json
841+
import os as _os
842+
import time as _time
843+
from unittest.mock import patch
844+
845+
from bernstein.cli.commands.worktrees_cmd import GC_LOCK_RELPATH
846+
847+
lock_path = repo_root / GC_LOCK_RELPATH
848+
lock_path.parent.mkdir(parents=True, exist_ok=True)
849+
lock_path.write_text(_json.dumps({"pid": _os.getpid(), "started_at": _time.time()}), encoding="utf-8")
850+
851+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=True):
852+
with pytest.raises(GcLockError):
853+
with lock_gc(repo_root):
854+
pass

0 commit comments

Comments
 (0)