Skip to content

Commit e539eb6

Browse files
committed
feat(worktrees): add worktrees unlock to inspect and recover a stuck GC lock
`worktrees gc` takes an exclusive O_EXCL lock; a crashed or killed run can leave it behind. The new `unlock` command reports who holds the lock (pid, liveness, age) and clears it when the owner process is gone, refusing a lock held by a live, recent process unless `--force` is passed. The unlock is recorded as a tamper-evident `worktree.gc_unlock` event in the HMAC-chained audit log, so an operator recovery is forensically reconstructable rather than a silent file deletion. Documented in docs/operations/worktrees.md.
1 parent 27da59e commit e539eb6

3 files changed

Lines changed: 242 additions & 4 deletions

File tree

docs/operations/worktrees.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ only ever blocks deletion, it never deletes more.
6464
```text
6565
bernstein worktrees list [--workdir DIR] [--json]
6666
bernstein worktrees gc [--workdir DIR] [--yes] [--dry] [--force-unsaved]
67+
bernstein worktrees unlock [--workdir DIR] [--force] [--json]
6768
```
6869

6970
- `list` - tabular dump with path, task id, state, age, size, PID. Use
@@ -78,14 +79,23 @@ bernstein worktrees gc [--workdir DIR] [--yes] [--dry] [--force-unsaved]
7879
confirmation (unless `--yes` is also passed) and mirrors the
7980
`maintenance` command's `--force`. The reap is recorded with
8081
`forced=true` in the audit trail.
82+
- `unlock` - inspect and clear a stuck GC lock. It reports who holds the
83+
lock (pid, liveness, age) and removes it when the owner process is gone;
84+
pass `--force` to clear a lock whose owner still looks alive. The unlock
85+
is recorded as a `worktree.gc_unlock` event in the audit trail, so a
86+
manual recovery is never silent.
8187

8288
## GC lock
8389

8490
A single-file lock at `.sdd/runtime/worktree-gc.lock` is held via
85-
`O_EXCL` for the duration of `gc`. The lock is released on exception.
86-
87-
Exit code `2` indicates a lock collision (another `gc` is in flight or
88-
the lock file is stale).
91+
`O_EXCL` for the duration of `gc`. The lock file records the owner pid and
92+
start time, and is released on exception.
93+
94+
A lock left behind by a crashed or killed `gc` is reclaimed automatically on
95+
the next run once its owner process is gone (or the lock is older than a
96+
generous bound), so a crash no longer wedges future runs. Exit code `2`
97+
still indicates a genuine collision with a live, recent `gc`. To inspect or
98+
clear a lock by hand, use `bernstein worktrees unlock`.
8999

90100
## Lifecycle event
91101

src/bernstein/cli/commands/worktrees_cmd.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,3 +577,138 @@ def _append_reap_event(
577577
"dry_run": dry_run,
578578
},
579579
)
580+
581+
582+
# ---------------------------------------------------------------------------
583+
# Lock inspection + recovery (`worktrees unlock`)
584+
# ---------------------------------------------------------------------------
585+
586+
#: Audit event-type appended to the HMAC-chained log when an operator clears
587+
#: a stuck GC lock, so a recovery is forensically reconstructable rather than a
588+
#: silent file deletion.
589+
_WORKTREE_GC_UNLOCK_EVENT = "worktree.gc_unlock"
590+
591+
592+
def _gc_lock_status(repo_root: Path) -> dict[str, object]:
593+
"""Return the current GC lock status without mutating it."""
594+
lock_path = repo_root / GC_LOCK_RELPATH
595+
if not lock_path.exists():
596+
return {"held": False}
597+
meta = _read_gc_lock(lock_path)
598+
pid = meta.get("pid") if isinstance(meta, dict) else None
599+
started = meta.get("started_at") if isinstance(meta, dict) else None
600+
age: float | None = None
601+
if isinstance(started, (int, float)):
602+
age = max(0.0, time.time() - started)
603+
alive: bool | None = None
604+
if isinstance(pid, int) and pid > 0:
605+
alive = is_process_alive(pid)
606+
return {
607+
"held": True,
608+
"path": str(lock_path),
609+
"pid": pid,
610+
"alive": alive,
611+
"age_seconds": age,
612+
"stale": _gc_lock_is_stale(meta),
613+
"readable": isinstance(meta, dict),
614+
}
615+
616+
617+
def _append_gc_unlock_event(repo_root: Path, status: dict[str, object], *, forced: bool) -> None:
618+
"""Record the unlock on the HMAC-chained audit log.
619+
620+
Unlike a reap (which destroys work and is fail-closed), clearing a lock
621+
file destroys nothing, so the caller treats a failed append as a warning
622+
and still recovers: refusing to clear the lock when the audit log is
623+
unavailable would recreate the very wedge this command exists to fix.
624+
"""
625+
age = status.get("age_seconds")
626+
log = _open_audit_log(repo_root)
627+
log.log(
628+
event_type=_WORKTREE_GC_UNLOCK_EVENT,
629+
actor=_AUDIT_ACTOR,
630+
resource_type="worktree_gc_lock",
631+
resource_id=str(repo_root / GC_LOCK_RELPATH),
632+
details={
633+
"previous_pid": status.get("pid"),
634+
"previous_owner_alive": status.get("alive"),
635+
"age_seconds": int(age) if isinstance(age, (int, float)) else None,
636+
"stale": status.get("stale"),
637+
"forced": forced,
638+
},
639+
)
640+
641+
642+
@worktrees_group.command("unlock")
643+
@click.option(
644+
"--workdir",
645+
type=click.Path(path_type=Path, file_okay=False),
646+
default=Path(),
647+
show_default=True,
648+
help="Project root containing .sdd/.",
649+
)
650+
@click.option(
651+
"--force",
652+
is_flag=True,
653+
default=False,
654+
help="Remove the lock even when its owner process still looks alive.",
655+
)
656+
@click.option("--json", "as_json", is_flag=True, default=False, help="Emit machine-readable JSON.")
657+
def unlock_cmd(workdir: Path, force: bool, as_json: bool) -> None:
658+
"""Inspect and clear a stuck worktree GC lock.
659+
660+
``worktrees gc`` takes an exclusive lock; a crashed or killed run can leave
661+
it behind and wedge every later gc. This reports who holds the lock and
662+
removes it when the owner process is gone (or with ``--force``), recording
663+
the unlock as a tamper-evident ``worktree.gc_unlock`` audit event so the
664+
recovery is never silent. A lock held by a live, recent process is refused
665+
unless ``--force`` is passed.
666+
"""
667+
repo_root = workdir.resolve()
668+
status = _gc_lock_status(repo_root)
669+
console = Console()
670+
671+
if not status["held"]:
672+
if as_json:
673+
click.echo(json.dumps({"held": False, "removed": False}, default=str))
674+
else:
675+
console.print("[green]No worktree GC lock is held.[/green]")
676+
return
677+
678+
removable = bool(status["stale"]) or force
679+
removed = False
680+
if removable:
681+
forced_live = force and not status["stale"]
682+
try:
683+
_append_gc_unlock_event(repo_root, status, forced=forced_live)
684+
except Exception as exc: # best-effort: never wedge recovery on audit failure
685+
logger.warning("worktree.gc_unlock audit append failed (continuing): %s", exc)
686+
lock_path = repo_root / GC_LOCK_RELPATH
687+
with contextlib.suppress(OSError):
688+
lock_path.unlink()
689+
removed = not lock_path.exists()
690+
691+
pid = status.get("pid")
692+
alive = status.get("alive")
693+
age = status.get("age_seconds")
694+
age_str = f"{int(age)}s" if isinstance(age, (int, float)) else "unknown age"
695+
owner = f"pid {pid}" if pid else "an unknown process"
696+
liveness = "alive" if alive else ("not running" if alive is False else "liveness unknown")
697+
698+
if as_json:
699+
click.echo(json.dumps({**status, "removed": removed}, default=str))
700+
if not removable:
701+
raise SystemExit(1)
702+
return
703+
704+
if removed:
705+
console.print(f"[green]Cleared worktree GC lock[/green] (previous owner {owner}, {liveness}, {age_str}).")
706+
console.print("[dim]Recorded as a worktree.gc_unlock audit event.[/dim]")
707+
elif removable:
708+
console.print("[yellow]GC lock was already gone.[/yellow]")
709+
else:
710+
console.print(
711+
f"[red]A worktree GC appears to be running ({owner}, {liveness}, {age_str}).[/red]\n"
712+
"Pass --force only if you are sure that process is dead."
713+
)
714+
raise SystemExit(1)

tests/unit/test_worktrees_cmd.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -852,3 +852,96 @@ def test_lock_gc_respects_live_owner(repo_root: Path) -> None:
852852
with pytest.raises(GcLockError):
853853
with lock_gc(repo_root):
854854
pass
855+
856+
857+
# ----------------------------------------------------------------------------
858+
# `worktrees unlock` (GC lock inspection + recovery)
859+
# ----------------------------------------------------------------------------
860+
861+
862+
def _write_lock(repo_root: Path, pid: int, started_at: float) -> Path:
863+
import json as _json
864+
865+
from bernstein.cli.commands.worktrees_cmd import GC_LOCK_RELPATH
866+
867+
lock_path = repo_root / GC_LOCK_RELPATH
868+
lock_path.parent.mkdir(parents=True, exist_ok=True)
869+
lock_path.write_text(_json.dumps({"pid": pid, "started_at": started_at}), encoding="utf-8")
870+
return lock_path
871+
872+
873+
def test_gc_lock_status_reports_no_lock(repo_root: Path) -> None:
874+
from bernstein.cli.commands.worktrees_cmd import _gc_lock_status
875+
876+
assert _gc_lock_status(repo_root) == {"held": False}
877+
878+
879+
def test_gc_lock_status_flags_dead_owner(repo_root: Path) -> None:
880+
import time as _time
881+
from unittest.mock import patch
882+
883+
from bernstein.cli.commands.worktrees_cmd import _gc_lock_status
884+
885+
_write_lock(repo_root, pid=999999, started_at=_time.time())
886+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=False):
887+
st = _gc_lock_status(repo_root)
888+
assert st["held"] is True
889+
assert st["alive"] is False
890+
assert st["stale"] is True
891+
892+
893+
def test_unlock_no_lock(repo_root: Path) -> None:
894+
from click.testing import CliRunner
895+
896+
from bernstein.cli.commands.worktrees_cmd import worktrees_group
897+
898+
result = CliRunner().invoke(worktrees_group, ["unlock", "--workdir", str(repo_root)])
899+
assert result.exit_code == 0, result.output
900+
assert "No worktree GC lock is held" in result.output
901+
902+
903+
def test_unlock_clears_dead_owner_lock(repo_root: Path) -> None:
904+
import time as _time
905+
from unittest.mock import patch
906+
907+
from click.testing import CliRunner
908+
909+
lock_path = _write_lock(repo_root, pid=999999, started_at=_time.time())
910+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=False):
911+
result = CliRunner().invoke(worktrees_group_ref(), ["unlock", "--workdir", str(repo_root)])
912+
assert result.exit_code == 0, result.output
913+
assert "Cleared worktree GC lock" in result.output
914+
assert not lock_path.exists()
915+
916+
917+
def test_unlock_refuses_live_owner_without_force(repo_root: Path) -> None:
918+
import time as _time
919+
from unittest.mock import patch
920+
921+
from click.testing import CliRunner
922+
923+
lock_path = _write_lock(repo_root, pid=4321, started_at=_time.time())
924+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=True):
925+
result = CliRunner().invoke(worktrees_group_ref(), ["unlock", "--workdir", str(repo_root)])
926+
assert result.exit_code == 1
927+
assert "appears to be running" in result.output
928+
assert lock_path.exists() # not removed
929+
930+
931+
def test_unlock_force_clears_live_owner(repo_root: Path) -> None:
932+
import time as _time
933+
from unittest.mock import patch
934+
935+
from click.testing import CliRunner
936+
937+
lock_path = _write_lock(repo_root, pid=4321, started_at=_time.time())
938+
with patch("bernstein.cli.commands.worktrees_cmd.is_process_alive", return_value=True):
939+
result = CliRunner().invoke(worktrees_group_ref(), ["unlock", "--workdir", str(repo_root), "--force"])
940+
assert result.exit_code == 0, result.output
941+
assert not lock_path.exists()
942+
943+
944+
def worktrees_group_ref(): # small helper so the import lives next to use
945+
from bernstein.cli.commands.worktrees_cmd import worktrees_group
946+
947+
return worktrees_group

0 commit comments

Comments
 (0)