Skip to content

Commit 7ecfe42

Browse files
agnersclaudeCopilotmdegat01
authored
apps: log container exit code when app exits non-zero (#6848)
* apps: log container exit code when app exits non-zero Issue #6840 reports that stopping an app whose process exits 143 (SIGTERM default disposition) leaves the app in AppState.ERROR. ERROR is the right state for that — Docker itself treats any non-zero exit as a failure (e.g. `--restart on-failure`), and 143 specifically means the SIGTERM grace period was wasted because the app never installed a handler. But Supervisor previously logged nothing about it, leaving authors with no hint that their image is misbehaving. Plumb the exit code through DockerContainerStateEvent and log it from App.container_state_changed on transitions to FAILED: a warning for 143 nudging the author to trap SIGTERM and exit 0, and an error for any other non-zero code (crashes, SIGKILL after grace, app's own error exit). Refactor _container_state_from_model to return (state, exit_code) so the docker event monitor and DockerInterface.attach feed the same exit code through one code path instead of re-reading State.ExitCode in the caller. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * apps: address review feedback on exit-code logging - Replace bare 143 with EXIT_CODE_SIGTERM_DEFAULT (128 + signal.SIGTERM) in supervisor/docker/const.py so the reasoning is documented in code, not just in the log string. - Stop populating exit_code on STOPPED transitions. Previously the refactor made DockerInterface.attach emit exit_code=0 for cleanly stopped containers, while the monitor only emitted an exit code for abnormal exits. Align both paths so exit_code is only set on FAILED. - Add test_app_failed_logs_exit_code covering the three new branches (warning on 143, error on other non-zero, silent when None) and extend test_attach_existing_container to assert the event's exit_code field per state. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * docker/monitor: flatten exit_code branch to satisfy pylint The previous if/else inside the `die` branch pushed the function over pylint's too-many-nested-blocks threshold (6/5). Collapse it back into a pair of conditional expressions: container_state via ternary on the exit code, exit_code via `die_exit_code or None` so 0 stays None. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> * Update supervisor/apps/app.py Co-authored-by: Mike Degatano <michael.degatano@gmail.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: Mike Degatano <michael.degatano@gmail.com>
1 parent 267fc6c commit 7ecfe42

7 files changed

Lines changed: 141 additions & 28 deletions

File tree

supervisor/apps/app.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
)
6060
from ..coresys import CoreSys
6161
from ..docker.app import DockerApp
62-
from ..docker.const import ContainerState
62+
from ..docker.const import EXIT_CODE_SIGTERM_DEFAULT, ContainerState
6363
from ..docker.manager import ExecReturn
6464
from ..docker.monitor import DockerContainerStateEvent
6565
from ..docker.stats import DockerStats
@@ -1669,6 +1669,21 @@ async def container_state_changed(self, event: DockerContainerStateEvent) -> Non
16691669
elif event.state == ContainerState.STOPPED:
16701670
self.state = AppState.STOPPED
16711671
elif event.state == ContainerState.FAILED:
1672+
if event.exit_code == EXIT_CODE_SIGTERM_DEFAULT:
1673+
_LOGGER.warning(
1674+
"App %s did not handle SIGTERM and was terminated by the "
1675+
"default signal handler (exit code %d). The app should "
1676+
"trap SIGTERM, shut down cleanly, and exit with code 0. "
1677+
"Please report this to the app developer.",
1678+
self.name,
1679+
EXIT_CODE_SIGTERM_DEFAULT,
1680+
)
1681+
elif event.exit_code is not None:
1682+
_LOGGER.error(
1683+
"App %s exited with non-zero exit code %d",
1684+
self.name,
1685+
event.exit_code,
1686+
)
16721687
self.state = AppState.ERROR
16731688

16741689
async def watchdog_container(self, event: DockerContainerStateEvent) -> None:

supervisor/docker/const.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from enum import StrEnum
77
from pathlib import PurePath
88
import re
9+
import signal
910
from typing import Any
1011

1112
from ..const import MACHINE_ID
@@ -26,6 +27,10 @@
2627
# GitHub Container Registry identifier
2728
GITHUB_CONTAINER_REGISTRY = "ghcr.io"
2829

30+
# Exit code reported when a process is terminated by SIGTERM with the default
31+
# disposition (i.e. no userspace handler ran). POSIX convention is 128 + signal.
32+
EXIT_CODE_SIGTERM_DEFAULT = 128 + signal.SIGTERM
33+
2934

3035
class Capabilities(StrEnum):
3136
"""Linux Capabilities."""

supervisor/docker/interface.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,29 @@ def _restart_policy_from_model(meta_host: dict[str, Any]) -> RestartPolicy | Non
7979
return RestartPolicy.NO
8080

8181

82-
def _container_state_from_model(container_metadata: dict[str, Any]) -> ContainerState:
83-
"""Get container state from model."""
82+
def _container_state_from_model(
83+
container_metadata: dict[str, Any],
84+
) -> tuple[ContainerState, int | None]:
85+
"""Get container state and exit code from model."""
8486
if "State" not in container_metadata:
85-
return ContainerState.UNKNOWN
87+
return ContainerState.UNKNOWN, None
8688

87-
if container_metadata["State"]["Status"] == "running":
88-
if "Health" in container_metadata["State"]:
89+
state_obj = container_metadata["State"]
90+
if state_obj["Status"] == "running":
91+
if "Health" in state_obj:
8992
return (
9093
ContainerState.HEALTHY
91-
if container_metadata["State"]["Health"]["Status"] == "healthy"
92-
else ContainerState.UNHEALTHY
94+
if state_obj["Health"]["Status"] == "healthy"
95+
else ContainerState.UNHEALTHY,
96+
None,
9397
)
94-
return ContainerState.RUNNING
98+
return ContainerState.RUNNING, None
9599

96-
if container_metadata["State"]["ExitCode"] > 0:
97-
return ContainerState.FAILED
100+
exit_code = state_obj["ExitCode"]
101+
if exit_code > 0:
102+
return ContainerState.FAILED, exit_code
98103

99-
return ContainerState.STOPPED
104+
return ContainerState.STOPPED, None
100105

101106

102107
class DockerInterface(JobGroup, ABC):
@@ -422,7 +427,8 @@ async def is_running(self) -> bool:
422427
async def current_state(self) -> ContainerState:
423428
"""Return current state of container."""
424429
if container_metadata := await self._get_container():
425-
return _container_state_from_model(container_metadata)
430+
state, _ = _container_state_from_model(container_metadata)
431+
return state
426432
return ContainerState.UNKNOWN
427433

428434
@Job(name="docker_interface_attach", concurrency=JobConcurrency.GROUP_QUEUE)
@@ -435,7 +441,7 @@ async def attach(
435441
self._meta = await docker_container.show()
436442
self.sys_docker.monitor.watch_container(self._meta)
437443

438-
state = _container_state_from_model(self._meta)
444+
state, exit_code = _container_state_from_model(self._meta)
439445
if not (
440446
skip_state_event_if_down
441447
and state in [ContainerState.STOPPED, ContainerState.FAILED]
@@ -444,7 +450,7 @@ async def attach(
444450
self.sys_bus.fire_event(
445451
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
446452
DockerContainerStateEvent(
447-
self.name, state, docker_container.id, int(time())
453+
self.name, state, docker_container.id, int(time()), exit_code
448454
),
449455
)
450456

supervisor/docker/monitor.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,17 @@
2121

2222
@dataclass(slots=True, frozen=True)
2323
class DockerContainerStateEvent:
24-
"""Event for docker container state change."""
24+
"""Event for docker container state change.
25+
26+
exit_code is populated for FAILED transitions where the source event or
27+
inspect data carried an exit code; None otherwise.
28+
"""
2529

2630
name: str
2731
state: ContainerState
2832
id: str
2933
time: int
34+
exit_code: int | None = None
3035

3136

3237
@dataclass(slots=True, frozen=True)
@@ -109,16 +114,21 @@ async def _run(self, events: ChannelSubscriber) -> None:
109114
or attributes.get("name") in self._unlabeled_managed_containers
110115
):
111116
container_state: ContainerState | None = None
117+
exit_code: int | None = None
112118
action: str = event["Action"]
113119

114120
if action == "start":
115121
container_state = ContainerState.RUNNING
116122
elif action == "die":
123+
die_exit_code = int(
124+
event["Actor"]["Attributes"]["exitCode"]
125+
)
117126
container_state = (
118-
ContainerState.STOPPED
119-
if int(event["Actor"]["Attributes"]["exitCode"]) == 0
120-
else ContainerState.FAILED
127+
ContainerState.FAILED
128+
if die_exit_code
129+
else ContainerState.STOPPED
121130
)
131+
exit_code = die_exit_code or None
122132
elif action == "health_status: healthy":
123133
container_state = ContainerState.HEALTHY
124134
elif action == "health_status: unhealthy":
@@ -130,6 +140,7 @@ async def _run(self, events: ChannelSubscriber) -> None:
130140
state=container_state,
131141
id=event["Actor"]["ID"],
132142
time=event["time"],
143+
exit_code=exit_code,
133144
)
134145
tasks = self.sys_bus.fire_event(
135146
BusEvent.DOCKER_CONTAINER_STATE_CHANGE, state_event

tests/apps/test_app.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from datetime import timedelta
55
import errno
66
from http import HTTPStatus
7+
import logging
78
from pathlib import Path, PurePath
89
from typing import Any
910
from unittest.mock import AsyncMock, MagicMock, PropertyMock, call, patch
@@ -51,7 +52,12 @@
5152
from tests.const import TEST_ADDON_SLUG
5253

5354

54-
async def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState) -> None:
55+
async def _fire_test_event(
56+
coresys: CoreSys,
57+
name: str,
58+
state: ContainerState,
59+
exit_code: int | None = None,
60+
) -> None:
5561
"""Fire a test event and await the listener tasks the bus spawned."""
5662
await fire_bus_event(
5763
coresys,
@@ -61,6 +67,7 @@ async def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState) -
6167
state=state,
6268
id="abc123",
6369
time=1,
70+
exit_code=exit_code,
6471
),
6572
)
6673

@@ -153,6 +160,51 @@ async def test_app_state_listener(coresys: CoreSys, install_app_ssh: App) -> Non
153160
assert install_app_ssh.state == AppState.ERROR
154161

155162

163+
async def test_app_failed_logs_exit_code(
164+
coresys: CoreSys,
165+
install_app_ssh: App,
166+
caplog: pytest.LogCaptureFixture,
167+
) -> None:
168+
"""Test FAILED transition logs exit code with appropriate severity."""
169+
with patch.object(DockerApp, "attach"):
170+
await install_app_ssh.load()
171+
172+
with patch.object(App, "watchdog_container"):
173+
# Exit 143 (SIGTERM default disposition): warning nudging author
174+
caplog.clear()
175+
await _fire_test_event(
176+
coresys,
177+
f"addon_{TEST_ADDON_SLUG}",
178+
ContainerState.FAILED,
179+
exit_code=143,
180+
)
181+
assert install_app_ssh.state == AppState.ERROR
182+
warnings = [
183+
r for r in caplog.records if r.levelno == logging.WARNING and r.message
184+
]
185+
assert any("did not handle SIGTERM" in r.message for r in warnings)
186+
187+
# Any other non-zero exit: error
188+
caplog.clear()
189+
await _fire_test_event(
190+
coresys,
191+
f"addon_{TEST_ADDON_SLUG}",
192+
ContainerState.FAILED,
193+
exit_code=1,
194+
)
195+
errors = [r for r in caplog.records if r.levelno == logging.ERROR]
196+
assert any("exit code 1" in r.message for r in errors)
197+
198+
# No exit code available: stay silent
199+
caplog.clear()
200+
await _fire_test_event(
201+
coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED
202+
)
203+
assert not any(
204+
"exit code" in r.message or "SIGTERM" in r.message for r in caplog.records
205+
)
206+
207+
156208
async def test_app_watchdog(coresys: CoreSys, install_app_ssh: App) -> None:
157209
"""Test app watchdog works correctly."""
158210
with patch.object(DockerApp, "attach"):

tests/docker/test_interface.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -235,23 +235,31 @@ async def test_current_state_failures(coresys: CoreSys):
235235

236236

237237
@pytest.mark.parametrize(
238-
"attrs,expected,fired_when_skip_down",
238+
"attrs,expected,expected_exit_code,fired_when_skip_down",
239239
[
240-
({"State": {"Status": "running"}}, ContainerState.RUNNING, True),
241-
({"State": {"Status": "exited", "ExitCode": 0}}, ContainerState.STOPPED, False),
240+
({"State": {"Status": "running"}}, ContainerState.RUNNING, None, True),
241+
(
242+
{"State": {"Status": "exited", "ExitCode": 0}},
243+
ContainerState.STOPPED,
244+
None,
245+
False,
246+
),
242247
(
243248
{"State": {"Status": "exited", "ExitCode": 137}},
244249
ContainerState.FAILED,
250+
137,
245251
False,
246252
),
247253
(
248254
{"State": {"Status": "running", "Health": {"Status": "healthy"}}},
249255
ContainerState.HEALTHY,
256+
None,
250257
True,
251258
),
252259
(
253260
{"State": {"Status": "running", "Health": {"Status": "unhealthy"}}},
254261
ContainerState.UNHEALTHY,
262+
None,
255263
True,
256264
),
257265
],
@@ -261,6 +269,7 @@ async def test_attach_existing_container(
261269
container: DockerContainer,
262270
attrs: dict[str, Any],
263271
expected: ContainerState,
272+
expected_exit_code: int | None,
264273
fired_when_skip_down: bool,
265274
):
266275
"""Test attaching to existing container."""
@@ -279,7 +288,9 @@ async def test_attach_existing_container(
279288
] == [
280289
call(
281290
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
282-
DockerContainerStateEvent("homeassistant", expected, "abc123", 1),
291+
DockerContainerStateEvent(
292+
"homeassistant", expected, "abc123", 1, expected_exit_code
293+
),
283294
)
284295
]
285296

0 commit comments

Comments
 (0)