Skip to content

Commit 0b882fb

Browse files
aarthy-dkclaude
andcommitted
fix(testgen): kill the whole subprocess tree on tg start/install shutdown
`proc.terminate()` on Windows calls `TerminateProcess` and only kills the parent — pixeltable-pgserver's `postgres.exe` children get orphaned. The symptom: `tg delete` fails with "Could not remove TestGen data directory" because postgres holds the data files open. New `stop_app_tree(proc, timeout=...)` walks the descendant tree: - Windows: shells out to `taskkill /F /T /PID` to walk the WMI process tree. - POSIX: sends SIGTERM to the process group (parent is now spawned with `start_new_session=True`), escalating to SIGKILL on timeout. Routed all three termination sites in `start_testgen_app` through the helper (port-readiness timeout, KeyboardInterrupt handler, finally cleanup). Added unit tests covering the platform branching and the SIGKILL escalation path. Note: this is a force-kill safety net for uncatchable termination paths (installer crash, terminal X-button close, hang). Graceful shutdown on Ctrl+C is a separate fix in pixeltable-pgserver / testgen's signal handling — see the issue we drafted for the testgen team. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e8aec12 commit 0b882fb

2 files changed

Lines changed: 126 additions & 23 deletions

File tree

dk-installer.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import re
2121
import secrets
2222
import shutil
23+
import signal
2324
import socket
2425
import ssl
2526
import stat
@@ -2464,6 +2465,40 @@ def read_testgen_config_env() -> dict[str, str]:
24642465
return config
24652466

24662467

2468+
def stop_app_tree(proc: subprocess.Popen, timeout: int = 10) -> None:
2469+
"""Terminate ``proc`` and all of its descendants.
2470+
2471+
Plain ``proc.terminate()`` only kills the parent — pixeltable-pgserver
2472+
spawns ``postgres`` children that get orphaned otherwise. Cross-platform:
2473+
on Windows we shell out to ``taskkill /F /T``; on POSIX we send SIGTERM
2474+
to the whole process group (the parent was started with
2475+
``start_new_session=True``).
2476+
"""
2477+
if proc.poll() is not None:
2478+
return
2479+
if platform.system() == "Windows":
2480+
with contextlib.suppress(Exception):
2481+
subprocess.run(
2482+
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2483+
stdout=subprocess.DEVNULL,
2484+
stderr=subprocess.DEVNULL,
2485+
creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
2486+
check=False,
2487+
)
2488+
else:
2489+
with contextlib.suppress(Exception):
2490+
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
2491+
try:
2492+
proc.wait(timeout=timeout)
2493+
except subprocess.TimeoutExpired:
2494+
if platform.system() != "Windows":
2495+
with contextlib.suppress(Exception):
2496+
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
2497+
proc.kill()
2498+
with contextlib.suppress(subprocess.TimeoutExpired):
2499+
proc.wait(timeout=5)
2500+
2501+
24672502
def start_testgen_app(action, args) -> None:
24682503
"""Start ``testgen run-app`` and block until the user interrupts.
24692504
@@ -2496,15 +2531,17 @@ def start_testgen_app(action, args) -> None:
24962531
stdout=subprocess.DEVNULL,
24972532
stderr=subprocess.DEVNULL,
24982533
stdin=subprocess.DEVNULL,
2534+
# POSIX-only: put the parent in its own process group so we can
2535+
# signal the whole tree (postgres included) on shutdown. Windows
2536+
# gets the same effect via ``taskkill /T`` in ``stop_app_tree``.
2537+
start_new_session=(platform.system() != "Windows"),
24992538
)
25002539
except FileNotFoundError as e:
25012540
raise InstallerError(f"Could not start TestGen: {e}") from e
25022541

25032542
try:
25042543
if not wait_for_tcp_port(port, timeout=TESTGEN_APP_READY_TIMEOUT):
2505-
proc.terminate()
2506-
with contextlib.suppress(subprocess.TimeoutExpired):
2507-
proc.wait(timeout=5)
2544+
stop_app_tree(proc, timeout=5)
25082545
raise InstallerError(
25092546
f"TestGen did not start within {TESTGEN_APP_READY_TIMEOUT} seconds. "
25102547
f"See {simplify_path(TESTGEN_LOG_FILE_PATH)} for details."
@@ -2526,19 +2563,11 @@ def start_testgen_app(action, args) -> None:
25262563
# Reset the cursor to column 0 — the terminal echoed `^C` mid-line.
25272564
print("")
25282565
CONSOLE.msg("Stopping TestGen...")
2529-
proc.terminate()
2530-
try:
2531-
proc.wait(timeout=10)
2532-
except subprocess.TimeoutExpired:
2533-
proc.kill()
2534-
proc.wait()
2566+
stop_app_tree(proc, timeout=10)
25352567
CONSOLE.msg("TestGen stopped.")
25362568
CONSOLE.msg(f"To start it again, {command_hint(args.prod, 'start', 'Start TestGen')}.")
25372569
finally:
2538-
if proc.poll() is None:
2539-
proc.terminate()
2540-
with contextlib.suppress(subprocess.TimeoutExpired):
2541-
proc.wait(timeout=5)
2570+
stop_app_tree(proc, timeout=5)
25422571

25432572

25442573
class UvToolUpgradeStep(Step):

tests/test_tg_start.py

Lines changed: 84 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
InstallerError,
1111
TestgenStartAction,
1212
start_testgen_app,
13+
stop_app_tree,
1314
InstallMarker,
1415
)
1516

@@ -93,41 +94,114 @@ def test_start_testgen_app_aborts_on_port_timeout(app_action, args_mock, proc_ru
9394
patch("tests.installer.resolve_testgen_path", return_value="/bin/testgen"),
9495
patch("tests.installer.subprocess.Popen", return_value=proc_running_then_stops),
9596
patch("tests.installer.wait_for_tcp_port", return_value=False),
97+
patch("tests.installer.stop_app_tree") as stop_mock,
9698
pytest.raises(InstallerError, match="did not start within"),
9799
):
98100
start_testgen_app(app_action, args_mock)
99101

100-
proc_running_then_stops.terminate.assert_called()
102+
stop_mock.assert_called_with(proc_running_then_stops, timeout=5)
101103

102104

103105
@pytest.mark.unit
104106
def test_start_testgen_app_handles_keyboard_interrupt(app_action, args_mock, console_msg_mock, empty_tg_config):
105-
"""User Ctrl+C during run is the expected stop signal — terminate cleanly,
106-
don't propagate the exception, and hint at the start command for next time."""
107+
"""User Ctrl+C during run is the expected stop signal — kill the whole
108+
process tree (postgres + parent), don't propagate the exception, and hint
109+
at the start command for next time."""
107110
args_mock.prod = "tg"
108111

109112
proc = MagicMock()
110-
# poll() returns None while alive; after terminate() it transitions to 0.
111113
proc.poll.return_value = None
112-
113-
def _on_terminate():
114-
proc.poll.return_value = 0
115-
116-
proc.terminate.side_effect = _on_terminate
117114
proc.wait.side_effect = [KeyboardInterrupt(), 0]
118115

119116
with (
120117
patch("tests.installer.resolve_testgen_path", return_value="/bin/testgen"),
121118
patch("tests.installer.subprocess.Popen", return_value=proc),
122119
patch("tests.installer.wait_for_tcp_port", return_value=True),
120+
patch("tests.installer.stop_app_tree") as stop_mock,
123121
):
124122
start_testgen_app(app_action, args_mock)
125123

126-
proc.terminate.assert_called()
124+
# Called once for the keyboard-interrupt branch (timeout=10) and again in
125+
# the ``finally`` cleanup (timeout=5; no-op since proc already stopped).
126+
assert stop_mock.call_args_list[0].args[0] is proc
127+
assert stop_mock.call_args_list[0].kwargs == {"timeout": 10}
127128
console_msg_mock.assert_any_msg_contains("TestGen stopped")
128129
console_msg_mock.assert_any_msg_contains("tg start")
129130

130131

132+
# --- stop_app_tree ------------------------------------------------------------
133+
134+
135+
@pytest.mark.unit
136+
def test_stop_app_tree_no_op_when_proc_already_exited():
137+
proc = MagicMock()
138+
proc.poll.return_value = 0 # already exited
139+
140+
with patch("tests.installer.subprocess.run") as run_mock:
141+
stop_app_tree(proc)
142+
143+
run_mock.assert_not_called()
144+
proc.wait.assert_not_called()
145+
146+
147+
@pytest.mark.unit
148+
def test_stop_app_tree_windows_uses_taskkill_tree():
149+
proc = MagicMock()
150+
proc.poll.return_value = None
151+
proc.pid = 4242
152+
proc.wait.return_value = 0
153+
154+
with (
155+
patch("tests.installer.platform.system", return_value="Windows"),
156+
patch("tests.installer.subprocess.run") as run_mock,
157+
):
158+
stop_app_tree(proc, timeout=3)
159+
160+
cmd = run_mock.call_args.args[0]
161+
assert cmd[:4] == ["taskkill", "/F", "/T", "/PID"]
162+
assert cmd[4] == "4242"
163+
proc.wait.assert_called_with(timeout=3)
164+
165+
166+
@pytest.mark.unit
167+
def test_stop_app_tree_posix_signals_process_group():
168+
proc = MagicMock()
169+
proc.poll.return_value = None
170+
proc.pid = 4242
171+
proc.wait.return_value = 0
172+
173+
with (
174+
patch("tests.installer.platform.system", return_value="Linux"),
175+
patch("tests.installer.os.killpg") as killpg_mock,
176+
patch("tests.installer.os.getpgid", return_value=4242),
177+
):
178+
stop_app_tree(proc, timeout=3)
179+
180+
killpg_mock.assert_called_once()
181+
assert killpg_mock.call_args.args[0] == 4242 # pgid
182+
proc.wait.assert_called_with(timeout=3)
183+
184+
185+
@pytest.mark.unit
186+
def test_stop_app_tree_falls_through_to_kill_on_timeout():
187+
"""If SIGTERM doesn't take, escalate to SIGKILL / proc.kill()."""
188+
import subprocess as sp
189+
190+
proc = MagicMock()
191+
proc.poll.return_value = None
192+
proc.pid = 4242
193+
proc.wait.side_effect = [sp.TimeoutExpired(cmd="x", timeout=1), 0]
194+
195+
with (
196+
patch("tests.installer.platform.system", return_value="Linux"),
197+
patch("tests.installer.os.killpg"),
198+
patch("tests.installer.os.getpgid", return_value=4242),
199+
):
200+
stop_app_tree(proc, timeout=1)
201+
202+
proc.kill.assert_called_once()
203+
204+
131205
# --- TestgenStartAction -------------------------------------------------------
132206

133207

0 commit comments

Comments
 (0)