Skip to content

Commit 0bcedf5

Browse files
authored
Don't fail Supervisor setup when an app image is missing (#6816)
* Don't fail Supervisor setup when an app image is missing

  A missing builder image (docker:<version>-cli) during a build-required app load aborted Supervisor setup entirely, leaving the system stuck in setup state where every subsequent operation was blocked by the not-healthy guard. Triggered in practice when the host's Docker patch version had no matching `-cli` tag published on Docker Hub.

  Two issues compounded the failure: `images.pull` in `run_command` leaked a raw `aiodocker.DockerError` past the `@Job` decorator, which rewrapped it as `JobException` and bypassed the `suppress(DockerError, ...)` guard in `addon.load()`; and the load path treated all Docker errors the same whether the image was simply missing or the daemon itself was misbehaving.

  Wrap the pull error in `run_command` so it propagates as Supervisor's `DockerError` (a `HassioError`) and is preserved by the decorator. Distinguish 404s in `attach()` and `check_image()` by raising `DockerNotFound`/`DockerAPIError` instead of generic `DockerError`. In `addon.load()`, only the `DockerNotFound` path is treated as "image missing": for build-required apps we skip the inline build and surface a `MISSING_IMAGE` repair so the resolution autofix loop handles it off the critical path; for pull-based apps we still attempt install during load and create the repair on failure. Other `DockerError`s (daemon trouble or a failed internal install in `check_image`) are logged at CRITICAL — which the Sentry logging integration captures — and the addon is left detached rather than masked as a misleading missing-image repair.

  In the autofix path, swallow `DockerBuildError`, `DockerNoSpaceOnDevice`, `DockerRegistryAuthError`, and `DockerRegistryRateLimitExceeded` as `ResolutionFixupError` so they don't generate Sentry events on every retry. The repair stays available for manual retry once the underlying cause (registry tag published, disk freed, credentials fixed, rate limit expired) is resolved.
* Clarify outer DockerError comment in App.load()

  The comment claimed "a future load will reattempt and surface a MISSING_IMAGE repair if appropriate", but App.load() is only called at Supervisor startup, on fresh install, and on backup restore — there is no automatic retry mechanism. Reword to match reality: the CRITICAL log captures the issue for diagnostics (Sentry), and the user can trigger a manual repair once the daemon is healthy.

* Clarify comment about user interaction
1 parent 9dfbfb2 commit 0bcedf5

5 files changed

Lines changed: 124 additions & 63 deletions

File tree

supervisor/apps/app.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
DockerBuildError,
8282
DockerContainerPortConflict,
8383
DockerError,
84+
DockerNotFound,
8485
DockerRegistryAuthError,
8586
HostAppArmorError,
8687
StoreAppNotFoundError,
@@ -258,14 +259,42 @@ async def load(self) -> None:
258259

259260
# Ensure we are using correct image for this system
260261
await self.instance.check_image(self.version, default_image, self.arch)
261-
except DockerError:
262+
except DockerNotFound:
262263
_LOGGER.info("No %s app Docker image %s found", self.slug, self.image)
263-
with suppress(DockerError, AppNotSupportedError):
264-
await self.instance.install(self.version, default_image, arch=self.arch)
264+
if self.need_build:
265+
# Don't run a local build during setup. Surface a repair so
266+
# the resolution autofix loop can handle it off the critical
267+
# path.
268+
self._create_missing_image_issue()
269+
else:
270+
try:
271+
await self.instance.install(
272+
self.version, default_image, arch=self.arch
273+
)
274+
except (DockerError, AppNotSupportedError):
275+
self._create_missing_image_issue()
276+
except DockerError as err:
277+
# Docker error other than a clean "image not found" - we can't
278+
# tell whether the image is actually missing. Log so the issue
279+
# is visible (CRITICAL is captured by the Sentry integration)
280+
# and leave the app detached; the user can attempt a manual
281+
# rebuild from the app page.
282+
_LOGGER.critical(
283+
"Docker error loading app %s, leaving detached: %s", self.slug, err
284+
)
265285

266286
self.persist[ATTR_IMAGE] = default_image
267287
await self.save_persist()
268288

289+
def _create_missing_image_issue(self) -> None:
290+
"""Surface a repair suggestion for a missing app image."""
291+
self.sys_resolution.create_issue(
292+
IssueType.MISSING_IMAGE,
293+
ContextType.ADDON,
294+
reference=self.slug,
295+
suggestions=[SuggestionType.EXECUTE_REPAIR],
296+
)
297+
269298
@property
270299
def ip_address(self) -> IPv4Address:
271300
"""Return IP of app instance."""

supervisor/docker/interface.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,15 +454,20 @@ async def attach(
454454
),
455455
)
456456

457-
with suppress(aiodocker.DockerError):
458-
if not self._meta and self.image:
457+
if not self._meta and self.image:
458+
try:
459459
self._meta = await self.sys_docker.images.inspect(
460460
f"{self.image}:{version!s}"
461461
)
462+
except aiodocker.DockerError as err:
463+
if err.status != HTTPStatus.NOT_FOUND:
464+
raise DockerAPIError(
465+
f"Docker API error inspecting image {self.image}:{version!s}: {err!s}"
466+
) from err
462467

463468
# Successful?
464469
if not self._meta:
465-
raise DockerError(
470+
raise DockerNotFound(
466471
f"Could not get metadata on container or image for {self.name}"
467472
)
468473
_LOGGER.info("Attaching to %s with version %s", self.image, self.version)
@@ -558,7 +563,11 @@ async def check_image(
558563
try:
559564
image = await self.sys_docker.images.inspect(image_name)
560565
except aiodocker.DockerError as err:
561-
raise DockerError(
566+
if err.status == HTTPStatus.NOT_FOUND:
567+
raise DockerNotFound(
568+
f"Image {image_name} not found", _LOGGER.info
569+
) from err
570+
raise DockerAPIError(
562571
f"Could not get {image_name} for check due to: {err!s}",
563572
_LOGGER.error,
564573
) from err

supervisor/docker/manager.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -640,13 +640,17 @@ async def run_command(
640640
try:
641641
await self.images.inspect(f"{image}:{tag}")
642642
except aiodocker.DockerError as err:
643-
if err.status == HTTPStatus.NOT_FOUND:
644-
_LOGGER.info("Pulling image %s:%s", image, tag)
645-
await self.images.pull(image, tag=tag)
646-
else:
643+
if err.status != HTTPStatus.NOT_FOUND:
647644
raise DockerError(
648645
f"Can't inspect image {image}:{tag}: {err}", _LOGGER.error
649646
) from err
647+
_LOGGER.info("Pulling image %s:%s", image, tag)
648+
try:
649+
await self.images.pull(image, tag=tag)
650+
except aiodocker.DockerError as pull_err:
651+
raise DockerError(
652+
f"Can't pull image {image}:{tag}: {pull_err}", _LOGGER.error
653+
) from pull_err
650654

651655
try:
652656
container = await self._run(

supervisor/resolution/fixups/app_execute_repair.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
import logging
44

55
from ...coresys import CoreSys
6+
from ...exceptions import (
7+
DockerBuildError,
8+
DockerNoSpaceOnDevice,
9+
DockerRegistryAuthError,
10+
DockerRegistryRateLimitExceeded,
11+
ResolutionFixupError,
12+
)
613
from ..const import ContextType, IssueType, SuggestionType
714
from .base import FixupBase
815

@@ -44,7 +51,21 @@ async def process_fixup(self, reference: str | None = None) -> None:
4451

4552
_LOGGER.info("Installing image for app %s", reference)
4653
self.attempts += 1
47-
await app.instance.install(app.version)
54+
try:
55+
await app.instance.install(app.version)
56+
except (
57+
DockerBuildError,
58+
DockerNoSpaceOnDevice,
59+
DockerRegistryAuthError,
60+
DockerRegistryRateLimitExceeded,
61+
) as err:
62+
# These failures won't be resolved by an immediate retry (broken
63+
# Dockerfile or unavailable base/builder image; disk full; bad
64+
# credentials; registry rate limit). Surface as a fixup error so
65+
# FixupBase swallows it without a Sentry event. The repair stays
66+
# available for manual retry once the underlying cause is fixed.
67+
_LOGGER.warning("Cannot repair app %s: %s", reference, err)
68+
raise ResolutionFixupError() from err
4869

4970
@property
5071
def suggestion(self) -> SuggestionType:

tests/apps/test_app.py

Lines changed: 49 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from supervisor.apps.app import App
1919
from supervisor.apps.const import AppBackupMode
2020
from supervisor.apps.model import AppModel
21-
from supervisor.config import CoreConfig
2221
from supervisor.const import ATTR_ADVANCED, AppBoot, AppState, BusEvent
2322
from supervisor.coresys import CoreSys
2423
from supervisor.docker.app import DockerApp
@@ -1128,75 +1127,70 @@ async def test_app_loads_wrong_image(
11281127

11291128

11301129
@pytest.mark.usefixtures("mock_amd64_arch_supported")
1131-
async def test_app_loads_missing_image(coresys: CoreSys, install_app_ssh: App):
1132-
"""Test app corrects a missing image on load."""
1130+
async def test_app_loads_missing_image_build(coresys: CoreSys, install_app_ssh: App):
1131+
"""Test build-required app surfaces a repair when image is missing on load."""
11331132
coresys.docker.images.inspect.side_effect = aiodocker.DockerError(
11341133
HTTPStatus.NOT_FOUND, {"message": "missing"}
11351134
)
11361135

1137-
with (
1138-
patch("pathlib.Path.is_file", return_value=True),
1139-
patch.object(
1140-
coresys.docker,
1141-
"run_command",
1142-
return_value=CommandReturn(0, ["Build successful"]),
1143-
) as mock_run_command,
1144-
patch.object(
1145-
type(coresys.config),
1146-
"local_to_extern_path",
1147-
return_value=PurePath("/addon/path/on/host"),
1148-
),
1149-
):
1136+
with patch.object(
1137+
coresys.docker,
1138+
"run_command",
1139+
return_value=CommandReturn(0, ["Build successful"]),
1140+
) as mock_run_command:
11501141
await install_app_ssh.load()
11511142

1152-
mock_run_command.assert_called_once()
1153-
assert mock_run_command.call_args.args[0] == "docker"
1154-
assert mock_run_command.call_args.kwargs["tag"] == "1.0.0-cli"
1155-
command = mock_run_command.call_args.kwargs["command"]
1156-
assert is_in_list(
1157-
["--platform", "linux/amd64"],
1158-
command,
1143+
# Build-required apps must not run a build during load. A repair is
1144+
# raised so the resolution autofix loop handles it off the critical path.
1145+
mock_run_command.assert_not_called()
1146+
issue = Issue(
1147+
IssueType.MISSING_IMAGE, ContextType.ADDON, reference=install_app_ssh.slug
11591148
)
1160-
assert is_in_list(
1161-
["--tag", "local/amd64-addon-ssh:9.2.1"],
1162-
command,
1149+
assert issue in coresys.resolution.issues
1150+
suggestions = coresys.resolution.suggestions_for_issue(issue)
1151+
assert any(s.type == SuggestionType.EXECUTE_REPAIR for s in suggestions)
1152+
1153+
1154+
@pytest.mark.usefixtures("mock_amd64_arch_supported")
1155+
async def test_app_loads_missing_image_pull(coresys: CoreSys, install_app_ssh: App):
1156+
"""Test pullable app installs the missing image during load."""
1157+
install_app_ssh.data["image"] = "test/amd64-addon-ssh"
1158+
coresys.docker.images.inspect.side_effect = aiodocker.DockerError(
1159+
HTTPStatus.NOT_FOUND, {"message": "missing"}
11631160
)
1164-
assert install_app_ssh.image == "local/amd64-addon-ssh"
1161+
1162+
with patch.object(DockerAPI, "pull_image") as mock_pull_image:
1163+
await install_app_ssh.load()
1164+
1165+
mock_pull_image.assert_called_once()
1166+
issue = Issue(
1167+
IssueType.MISSING_IMAGE, ContextType.ADDON, reference=install_app_ssh.slug
1168+
)
1169+
assert issue not in coresys.resolution.issues
11651170

11661171

11671172
@pytest.mark.usefixtures("container", "mock_amd64_arch_supported")
11681173
async def test_app_load_succeeds_with_docker_errors(
11691174
coresys: CoreSys, install_app_ssh: App, caplog: pytest.LogCaptureFixture
11701175
):
1171-
"""Docker errors while building/pulling an image during load should not raise and fail setup."""
1172-
# Build env invalid failure
1176+
"""Docker errors during load should not raise and fail setup."""
1177+
issue = Issue(
1178+
IssueType.MISSING_IMAGE, ContextType.ADDON, reference=install_app_ssh.slug
1179+
)
1180+
1181+
# Build-required app with missing image: repair issue raised, no exception
11731182
coresys.docker.images.inspect.side_effect = aiodocker.DockerError(
11741183
HTTPStatus.NOT_FOUND, {"message": "missing"}
11751184
)
11761185
caplog.clear()
11771186
await install_app_ssh.load()
1178-
assert "Cannot build app 'local_ssh' because dockerfile is missing" in caplog.text
1179-
1180-
# Image build failure
1181-
caplog.clear()
1182-
with (
1183-
patch("pathlib.Path.is_file", return_value=True),
1184-
patch.object(
1185-
CoreConfig,
1186-
"local_to_extern_path",
1187-
return_value=PurePath("/addon/path/on/host"),
1188-
),
1189-
patch.object(
1190-
DockerAPI, "run_command", return_value=CommandReturn(1, ["error"])
1191-
),
1192-
):
1193-
await install_app_ssh.load()
1194-
assert (
1195-
"Docker build failed for local/amd64-addon-ssh:9.2.1 (exit code 1). Build output:\nerror"
1196-
in caplog.text
1197-
)
1187+
assert issue in coresys.resolution.issues
11981188

1199-
# Image pull failure
1189+
# Pull-based app where check_image's internal install fails: addon left
1190+
# detached, no exception escapes to abort setup. The next load will hit
1191+
# DockerNotFound and trigger the proper repair path.
1192+
stored = coresys.resolution.get_issue_if_present(issue)
1193+
coresys.resolution.dismiss_issue(stored)
12001194
install_app_ssh.data["image"] = "test/amd64-addon-ssh"
12011195
caplog.clear()
12021196
with patch.object(
@@ -1205,7 +1199,11 @@ async def test_app_load_succeeds_with_docker_errors(
12051199
side_effect=aiodocker.DockerError(400, {"message": "error"}),
12061200
):
12071201
await install_app_ssh.load()
1208-
assert "Can't install test/amd64-addon-ssh:9.2.1:" in caplog.text
1202+
assert "Docker error loading app local_ssh, leaving detached" in caplog.text
1203+
assert any(
1204+
"Docker error loading app local_ssh" in r.message and r.levelname == "CRITICAL"
1205+
for r in caplog.records
1206+
)
12091207

12101208

12111209
@pytest.mark.usefixtures("coresys")

0 commit comments

Comments
 (0)