Skip to content

Commit c4b1e70

Browse files
authored
Merge pull request #88 from DataKitchen/fix-tg-orphan-postgres
fix(testgen): kill the whole subprocess tree on tg start/install shutdown
2 parents e8aec12 + 5c31680 commit c4b1e70

5 files changed

Lines changed: 187 additions & 41 deletions

File tree

dk-installer.py

Lines changed: 79 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import re
2121
import secrets
2222
import shutil
23+
import signal
2324
import socket
2425
import ssl
2526
import stat
@@ -1179,6 +1180,15 @@ def run(self, parent=None):
11791180
("The Docker engine is not running.", "Start the Docker engine and try again."),
11801181
label="Docker engine running",
11811182
)
1183+
REQ_DOCKER_COMPOSE = Requirement(
1184+
"DOCKER_COMPOSE",
1185+
("docker", "compose", "version"),
1186+
(
1187+
"The Docker Compose plugin is not available.",
1188+
"Install the Docker Compose plugin and try again.",
1189+
),
1190+
label="Docker Compose installed",
1191+
)
11821192
REQ_TESTGEN_IMAGE = Requirement(
11831193
"TESTGEN_IMAGE",
11841194
("docker", "manifest", "inspect", "{image}"),
@@ -1412,7 +1422,7 @@ def delete_compose_volumes(self, args):
14121422

14131423
class ComposeDeleteAction(Action, ComposeActionMixin):
14141424
args_cmd = "delete"
1415-
requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON]
1425+
requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
14161426

14171427
def execute(self, args):
14181428
if self.get_compose_file_path(args).exists():
@@ -1675,7 +1685,7 @@ class ObsUpgradeAction(MultiStepAction, ComposeActionMixin):
16751685
label = "Upgrade"
16761686
title = "Upgrade Observability"
16771687
args_cmd = "upgrade"
1678-
requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON]
1688+
requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
16791689

16801690
steps = [
16811691
ObsFetchCurrentVersionStep,
@@ -1918,7 +1928,7 @@ class ObsInstallAction(AnalyticsMultiStepAction, ComposeActionMixin):
19181928
intro_text = ["This process may take 5~15 minutes depending on your system resources and network speed."]
19191929

19201930
args_cmd = "install"
1921-
requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON]
1931+
requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
19221932

19231933
def get_parser(self, sub_parsers):
19241934
parser = super().get_parser(sub_parsers)
@@ -2464,6 +2474,40 @@ def read_testgen_config_env() -> dict[str, str]:
24642474
return config
24652475

24662476

2477+
def stop_app_tree(proc: subprocess.Popen, timeout: int = 10) -> None:
2478+
"""Terminate ``proc`` and all of its descendants.
2479+
2480+
Plain ``proc.terminate()`` only kills the parent — pixeltable-pgserver
2481+
spawns ``postgres`` children that get orphaned otherwise. Cross-platform:
2482+
on Windows we shell out to ``taskkill /F /T``; on POSIX we send SIGTERM
2483+
to the whole process group (the parent was started with
2484+
``start_new_session=True``).
2485+
"""
2486+
if proc.poll() is not None:
2487+
return
2488+
if platform.system() == "Windows":
2489+
with contextlib.suppress(Exception):
2490+
subprocess.run(
2491+
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2492+
stdout=subprocess.DEVNULL,
2493+
stderr=subprocess.DEVNULL,
2494+
creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
2495+
check=False,
2496+
)
2497+
else:
2498+
with contextlib.suppress(Exception):
2499+
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
2500+
try:
2501+
proc.wait(timeout=timeout)
2502+
except subprocess.TimeoutExpired:
2503+
if platform.system() != "Windows":
2504+
with contextlib.suppress(Exception):
2505+
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
2506+
proc.kill()
2507+
with contextlib.suppress(subprocess.TimeoutExpired):
2508+
proc.wait(timeout=5)
2509+
2510+
24672511
def start_testgen_app(action, args) -> None:
24682512
"""Start ``testgen run-app`` and block until the user interrupts.
24692513
@@ -2496,15 +2540,17 @@ def start_testgen_app(action, args) -> None:
24962540
stdout=subprocess.DEVNULL,
24972541
stderr=subprocess.DEVNULL,
24982542
stdin=subprocess.DEVNULL,
2543+
# POSIX-only: put the parent in its own process group so we can
2544+
# signal the whole tree (postgres included) on shutdown. Windows
2545+
# gets the same effect via ``taskkill /T`` in ``stop_app_tree``.
2546+
start_new_session=(platform.system() != "Windows"),
24992547
)
25002548
except FileNotFoundError as e:
25012549
raise InstallerError(f"Could not start TestGen: {e}") from e
25022550

25032551
try:
25042552
if not wait_for_tcp_port(port, timeout=TESTGEN_APP_READY_TIMEOUT):
2505-
proc.terminate()
2506-
with contextlib.suppress(subprocess.TimeoutExpired):
2507-
proc.wait(timeout=5)
2553+
stop_app_tree(proc, timeout=5)
25082554
raise InstallerError(
25092555
f"TestGen did not start within {TESTGEN_APP_READY_TIMEOUT} seconds. "
25102556
f"See {simplify_path(TESTGEN_LOG_FILE_PATH)} for details."
@@ -2526,19 +2572,11 @@ def start_testgen_app(action, args) -> None:
25262572
# Reset the cursor to column 0 — the terminal echoed `^C` mid-line.
25272573
print("")
25282574
CONSOLE.msg("Stopping TestGen...")
2529-
proc.terminate()
2530-
try:
2531-
proc.wait(timeout=10)
2532-
except subprocess.TimeoutExpired:
2533-
proc.kill()
2534-
proc.wait()
2575+
stop_app_tree(proc, timeout=10)
25352576
CONSOLE.msg("TestGen stopped.")
25362577
CONSOLE.msg(f"To start it again, {command_hint(args.prod, 'start', 'Start TestGen')}.")
25372578
finally:
2538-
if proc.poll() is None:
2539-
proc.terminate()
2540-
with contextlib.suppress(subprocess.TimeoutExpired):
2541-
proc.wait(timeout=5)
2579+
stop_app_tree(proc, timeout=5)
25422580

25432581

25442582
class UvToolUpgradeStep(Step):
@@ -2668,7 +2706,7 @@ class TestgenInstallAction(ComposeActionMixin, AnalyticsMultiStepAction):
26682706
"Installing TestGen with Docker Compose.",
26692707
"This process may take 5~10 minutes depending on your system resources and network speed.",
26702708
]
2671-
docker_requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_TESTGEN_IMAGE]
2709+
docker_requirements = [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE, REQ_TESTGEN_IMAGE]
26722710

26732711
args_cmd = "install"
26742712
label = "Installation"
@@ -2791,7 +2829,12 @@ def _auto_select_mode(self, args):
27912829
CONSOLE.msg("[d] Docker Compose (Recommended)")
27922830
CONSOLE.msg(" The most stable TestGen experience for persistent use.")
27932831
CONSOLE.msg(" Provides a fully managed environment with an isolated PostgreSQL container.")
2794-
prereq_status = " ".join(f"{'(✓)' if ok else '(X)'} {req.label or req.key}" for req, ok in prereq_results)
2832+
# Hide REQ_DOCKER from the picker — REQ_DOCKER_COMPOSE failure implies
2833+
# the same fix, so showing both bloats the prereq line. The actual
2834+
# check below (and the per-prereq fail messages later) still uses all four.
2835+
prereq_status = " ".join(
2836+
f"{'(✓)' if ok else '(X)'} {req.label or req.key}" for req, ok in prereq_results if req is not REQ_DOCKER
2837+
)
27952838
CONSOLE.msg(f" Prerequisites: {prereq_status}")
27962839
CONSOLE.space()
27972840
CONSOLE.msg("[p] Pip + embedded PostgreSQL")
@@ -2896,6 +2939,7 @@ def get_requirements(self, args):
28962939
return [
28972940
REQ_DOCKER,
28982941
REQ_DOCKER_DAEMON,
2942+
REQ_DOCKER_COMPOSE,
28992943
Requirement(
29002944
"TG_COMPOSE_FILE",
29012945
(
@@ -2949,7 +2993,7 @@ def check_requirements(self, args):
29492993

29502994
def get_requirements(self, args):
29512995
if self._resolved_mode == INSTALL_MODE_DOCKER:
2952-
return [REQ_DOCKER, REQ_DOCKER_DAEMON]
2996+
return [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
29532997
return []
29542998

29552999
def _resolve_install_mode(self, args):
@@ -3023,7 +3067,7 @@ def check_requirements(self, args):
30233067

30243068
def get_requirements(self, args):
30253069
if self._resolved_mode == INSTALL_MODE_DOCKER:
3026-
return [REQ_DOCKER, REQ_DOCKER_DAEMON]
3070+
return [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
30273071
return []
30283072

30293073
def _resolve_install_mode(self, args):
@@ -3116,10 +3160,14 @@ def check_requirements(self, args):
31163160
super().check_requirements(args)
31173161

31183162
def get_requirements(self, args):
3119-
# Docker mode requires Docker. For pip mode, Docker is only needed when
3120-
# the user asked to export to Observability (the dk-demo container
3121-
# generates the export payload).
3122-
if self._resolved_mode == INSTALL_MODE_DOCKER or getattr(args, "obs_export", False):
3163+
# Docker mode requires Docker + Compose (we shell into the engine
3164+
# container via ``docker compose exec``). For pip mode, Docker is only
3165+
# needed when the user asked to export to Observability — the dk-demo
3166+
# container that generates the export payload runs via ``docker run``,
3167+
# so Compose isn't required there.
3168+
if self._resolved_mode == INSTALL_MODE_DOCKER:
3169+
return [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
3170+
if getattr(args, "obs_export", False):
31233171
return [REQ_DOCKER, REQ_DOCKER_DAEMON]
31243172
return []
31253173

@@ -3190,9 +3238,13 @@ def check_requirements(self, args):
31903238
super().check_requirements(args)
31913239

31923240
def get_requirements(self, args):
3193-
# Docker mode requires Docker. For pip mode, the dk-demo container
3194-
# call below is wrapped in try/except so Docker absence is non-fatal.
3195-
return [REQ_DOCKER, REQ_DOCKER_DAEMON] if self._resolved_mode == INSTALL_MODE_DOCKER else []
3241+
# Docker mode requires Docker + Compose (we shell into the engine
3242+
# container via ``docker compose exec``). For pip mode, the dk-demo
3243+
# container call below is wrapped in try/except so Docker absence is
3244+
# non-fatal.
3245+
if self._resolved_mode == INSTALL_MODE_DOCKER:
3246+
return [REQ_DOCKER, REQ_DOCKER_DAEMON, REQ_DOCKER_COMPOSE]
3247+
return []
31963248

31973249
def _resolve_install_mode(self, args):
31983250
# Like delete: idempotent, so "no install" returns rather than aborts.

tests/conftest.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,22 @@ def _no_real_browser_launch():
1717
yield mock
1818

1919

20+
@pytest.fixture(autouse=True)
21+
def _no_real_process_group_signals():
22+
"""Belt-and-suspenders: stop ``stop_app_tree`` from accidentally signalling
23+
a real process group when a Popen-mocked test exercises it. ``MagicMock``'s
24+
auto ``__index__`` returns 1, so ``os.getpgid(mock_proc.pid)`` resolves to
25+
pgid 1 (init) and ``os.killpg(1, SIGTERM)`` from a root container actually
26+
signals init → CI runner shutdown. Tests that need to assert on these
27+
explicitly override the patches inside their own ``with patch(...)``.
28+
"""
29+
with (
30+
patch("tests.installer.os.killpg") as killpg_mock,
31+
patch("tests.installer.os.getpgid", return_value=99999),
32+
):
33+
yield killpg_mock
34+
35+
2036
@pytest.fixture
2137
def stdout_mock():
2238
return Mock(return_value=[])

tests/test_tg_pip_install.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,9 @@ def test_auto_mode_picks_pip_when_docker_unavailable(install_action, args_mock,
204204
@pytest.mark.integration
205205
def test_auto_mode_displays_prereq_status_when_docker_unavailable(install_action, args_mock, console_msg_mock):
206206
"""Docker probe fails → the prereq display lists each requirement with a marker and (for failures) a fix hint."""
207-
# Only the first prereq passes — exercises the mixed pass/fail rendering.
207+
# Only the Compose plugin probe passes — exercises the mixed pass/fail rendering.
208208
def selective_check(req_self, *_, **__):
209-
return req_self.key == "DOCKER"
209+
return req_self.key == "DOCKER_COMPOSE"
210210

211211
with (
212212
patch("tests.installer.Requirement.check_availability", autospec=True, side_effect=selective_check),
@@ -216,8 +216,12 @@ def selective_check(req_self, *_, **__):
216216

217217
console_msg_mock.assert_any_msg_contains("two installation modes")
218218
console_msg_mock.assert_any_msg_contains("Prerequisites:")
219-
console_msg_mock.assert_any_msg_contains("(✓) Docker installed")
219+
console_msg_mock.assert_any_msg_contains("(✓) Docker Compose installed")
220220
console_msg_mock.assert_any_msg_contains("(X) Docker engine running")
221+
# REQ_DOCKER is checked but not displayed — REQ_DOCKER_COMPOSE failure
222+
# implies the same fix, so the picker hides it to keep the line short.
223+
rendered = " ".join(call.args[0] for call in console_msg_mock.call_args_list if call.args)
224+
assert "Docker installed" not in rendered
221225

222226

223227
@pytest.mark.integration

0 commit comments

Comments
 (0)