Skip to content

Commit afc1165

Browse files
committed
Don't fail Supervisor setup when an app image is missing
A missing builder image (docker:<version>-cli) during a build-required app load aborted Supervisor setup entirely, leaving the system stuck in the setup state where every subsequent operation was blocked by the not-healthy guard. Triggered in practice when the host's Docker patch version had no matching `-cli` tag published on Docker Hub.

Two issues compounded the failure: `images.pull` in `run_command` leaked a raw `aiodocker.DockerError` past the `@Job` decorator, which rewrapped it as `JobException` and bypassed the `suppress(DockerError, ...)` guard in `addon.load()`; and the load path treated all Docker errors the same whether the image was simply missing or the daemon itself was misbehaving.

Wrap the pull error in `run_command` so it propagates as Supervisor's `DockerError` (a `HassioError`) and is preserved by the decorator. Distinguish 404s in `attach()` and `check_image()` by raising `DockerNotFound`/`DockerAPIError` instead of generic `DockerError`.

In `addon.load()`, only the `DockerNotFound` path is treated as "image missing": for build-required apps we skip the inline build and surface a `MISSING_IMAGE` repair so the resolution autofix loop handles it off the critical path; for pull-based apps we still attempt install during load and create the repair on failure. Other `DockerError`s (daemon trouble or a failed internal install in `check_image`) are logged at CRITICAL — which the Sentry logging integration captures — and the addon is left detached rather than masked as a misleading missing-image repair.

In the autofix path, swallow `DockerBuildError`, `DockerNoSpaceOnDevice`, `DockerRegistryAuthError`, and `DockerRegistryRateLimitExceeded` as `ResolutionFixupError` so they don't generate Sentry events on every retry. The repair stays available for manual retry once the underlying cause (registry tag published, disk freed, credentials fixed, rate limit expired) is resolved.
1 parent d815c09 commit afc1165

5 files changed

Lines changed: 123 additions & 63 deletions

File tree

supervisor/addons/addon.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
DockerBuildError,
8282
DockerContainerPortConflict,
8383
DockerError,
84+
DockerNotFound,
8485
DockerRegistryAuthError,
8586
HostAppArmorError,
8687
StoreAppNotFoundError,
@@ -258,14 +259,41 @@ async def load(self) -> None:
258259

259260
# Ensure we are using correct image for this system
260261
await self.instance.check_image(self.version, default_image, self.arch)
261-
except DockerError:
262+
except DockerNotFound:
262263
_LOGGER.info("No %s app Docker image %s found", self.slug, self.image)
263-
with suppress(DockerError, AppNotSupportedError):
264-
await self.instance.install(self.version, default_image, arch=self.arch)
264+
if self.need_build:
265+
# Don't run a local build during setup. Surface a repair so
266+
# the resolution autofix loop can handle it off the critical
267+
# path.
268+
self._create_missing_image_issue()
269+
else:
270+
try:
271+
await self.instance.install(
272+
self.version, default_image, arch=self.arch
273+
)
274+
except (DockerError, AppNotSupportedError):
275+
self._create_missing_image_issue()
276+
except DockerError as err:
277+
# Docker error other than a clean "image not found" - we can't
278+
# tell whether the image is actually missing. Log and leave the
279+
# addon detached; a future load will reattempt and surface a
280+
# MISSING_IMAGE repair if appropriate.
281+
_LOGGER.critical(
282+
"Docker error loading app %s, leaving detached: %s", self.slug, err
283+
)
265284

266285
self.persist[ATTR_IMAGE] = default_image
267286
await self.save_persist()
268287

288+
def _create_missing_image_issue(self) -> None:
289+
"""Surface a repair suggestion for a missing app image."""
290+
self.sys_resolution.create_issue(
291+
IssueType.MISSING_IMAGE,
292+
ContextType.ADDON,
293+
reference=self.slug,
294+
suggestions=[SuggestionType.EXECUTE_REPAIR],
295+
)
296+
269297
@property
270298
def ip_address(self) -> IPv4Address:
271299
"""Return IP of app instance."""

supervisor/docker/interface.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -446,15 +446,20 @@ async def attach(
446446
),
447447
)
448448

449-
with suppress(aiodocker.DockerError):
450-
if not self._meta and self.image:
449+
if not self._meta and self.image:
450+
try:
451451
self._meta = await self.sys_docker.images.inspect(
452452
f"{self.image}:{version!s}"
453453
)
454+
except aiodocker.DockerError as err:
455+
if err.status != HTTPStatus.NOT_FOUND:
456+
raise DockerAPIError(
457+
f"Docker API error inspecting image {self.image}:{version!s}: {err!s}"
458+
) from err
454459

455460
# Successful?
456461
if not self._meta:
457-
raise DockerError(
462+
raise DockerNotFound(
458463
f"Could not get metadata on container or image for {self.name}"
459464
)
460465
_LOGGER.info("Attaching to %s with version %s", self.image, self.version)
@@ -550,7 +555,11 @@ async def check_image(
550555
try:
551556
image = await self.sys_docker.images.inspect(image_name)
552557
except aiodocker.DockerError as err:
553-
raise DockerError(
558+
if err.status == HTTPStatus.NOT_FOUND:
559+
raise DockerNotFound(
560+
f"Image {image_name} not found", _LOGGER.info
561+
) from err
562+
raise DockerAPIError(
554563
f"Could not get {image_name} for check due to: {err!s}",
555564
_LOGGER.error,
556565
) from err

supervisor/docker/manager.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -640,13 +640,17 @@ async def run_command(
640640
try:
641641
await self.images.inspect(f"{image}:{tag}")
642642
except aiodocker.DockerError as err:
643-
if err.status == HTTPStatus.NOT_FOUND:
644-
_LOGGER.info("Pulling image %s:%s", image, tag)
645-
await self.images.pull(image, tag=tag)
646-
else:
643+
if err.status != HTTPStatus.NOT_FOUND:
647644
raise DockerError(
648645
f"Can't inspect image {image}:{tag}: {err}", _LOGGER.error
649646
) from err
647+
_LOGGER.info("Pulling image %s:%s", image, tag)
648+
try:
649+
await self.images.pull(image, tag=tag)
650+
except aiodocker.DockerError as pull_err:
651+
raise DockerError(
652+
f"Can't pull image {image}:{tag}: {pull_err}", _LOGGER.error
653+
) from pull_err
650654

651655
try:
652656
container = await self._run(

supervisor/resolution/fixups/addon_execute_repair.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
import logging
44

55
from ...coresys import CoreSys
6+
from ...exceptions import (
7+
DockerBuildError,
8+
DockerNoSpaceOnDevice,
9+
DockerRegistryAuthError,
10+
DockerRegistryRateLimitExceeded,
11+
ResolutionFixupError,
12+
)
613
from ..const import ContextType, IssueType, SuggestionType
714
from .base import FixupBase
815

@@ -44,7 +51,21 @@ async def process_fixup(self, reference: str | None = None) -> None:
4451

4552
_LOGGER.info("Installing image for app %s", reference)
4653
self.attempts += 1
47-
await app.instance.install(app.version)
54+
try:
55+
await app.instance.install(app.version)
56+
except (
57+
DockerBuildError,
58+
DockerNoSpaceOnDevice,
59+
DockerRegistryAuthError,
60+
DockerRegistryRateLimitExceeded,
61+
) as err:
62+
# These failures won't be resolved by an immediate retry (broken
63+
# Dockerfile or unavailable base/builder image; disk full; bad
64+
# credentials; registry rate limit). Surface as a fixup error so
65+
# FixupBase swallows it without a Sentry event. The repair stays
66+
# available for manual retry once the underlying cause is fixed.
67+
_LOGGER.warning("Cannot repair app %s: %s", reference, err)
68+
raise ResolutionFixupError() from err
4869

4970
@property
5071
def suggestion(self) -> SuggestionType:

tests/addons/test_addon.py

Lines changed: 49 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from supervisor.addons.addon import App
1818
from supervisor.addons.const import AppBackupMode
1919
from supervisor.addons.model import AppModel
20-
from supervisor.config import CoreConfig
2120
from supervisor.const import ATTR_ADVANCED, AppBoot, AppState, BusEvent
2221
from supervisor.coresys import CoreSys
2322
from supervisor.docker.addon import DockerApp
@@ -1078,75 +1077,70 @@ async def test_app_loads_wrong_image(
10781077

10791078

10801079
@pytest.mark.usefixtures("mock_amd64_arch_supported")
1081-
async def test_app_loads_missing_image(coresys: CoreSys, install_app_ssh: App):
1082-
"""Test app corrects a missing image on load."""
1080+
async def test_app_loads_missing_image_build(coresys: CoreSys, install_app_ssh: App):
1081+
"""Test build-required app surfaces a repair when image is missing on load."""
10831082
coresys.docker.images.inspect.side_effect = aiodocker.DockerError(
10841083
HTTPStatus.NOT_FOUND, {"message": "missing"}
10851084
)
10861085

1087-
with (
1088-
patch("pathlib.Path.is_file", return_value=True),
1089-
patch.object(
1090-
coresys.docker,
1091-
"run_command",
1092-
return_value=CommandReturn(0, ["Build successful"]),
1093-
) as mock_run_command,
1094-
patch.object(
1095-
type(coresys.config),
1096-
"local_to_extern_path",
1097-
return_value=PurePath("/addon/path/on/host"),
1098-
),
1099-
):
1086+
with patch.object(
1087+
coresys.docker,
1088+
"run_command",
1089+
return_value=CommandReturn(0, ["Build successful"]),
1090+
) as mock_run_command:
11001091
await install_app_ssh.load()
11011092

1102-
mock_run_command.assert_called_once()
1103-
assert mock_run_command.call_args.args[0] == "docker"
1104-
assert mock_run_command.call_args.kwargs["tag"] == "1.0.0-cli"
1105-
command = mock_run_command.call_args.kwargs["command"]
1106-
assert is_in_list(
1107-
["--platform", "linux/amd64"],
1108-
command,
1093+
# Build-required apps must not run a build during load. A repair is
1094+
# raised so the resolution autofix loop handles it off the critical path.
1095+
mock_run_command.assert_not_called()
1096+
issue = Issue(
1097+
IssueType.MISSING_IMAGE, ContextType.ADDON, reference=install_app_ssh.slug
11091098
)
1110-
assert is_in_list(
1111-
["--tag", "local/amd64-addon-ssh:9.2.1"],
1112-
command,
1099+
assert issue in coresys.resolution.issues
1100+
suggestions = coresys.resolution.suggestions_for_issue(issue)
1101+
assert any(s.type == SuggestionType.EXECUTE_REPAIR for s in suggestions)
1102+
1103+
1104+
@pytest.mark.usefixtures("mock_amd64_arch_supported")
1105+
async def test_app_loads_missing_image_pull(coresys: CoreSys, install_app_ssh: App):
1106+
"""Test pullable app installs the missing image during load."""
1107+
install_app_ssh.data["image"] = "test/amd64-addon-ssh"
1108+
coresys.docker.images.inspect.side_effect = aiodocker.DockerError(
1109+
HTTPStatus.NOT_FOUND, {"message": "missing"}
11131110
)
1114-
assert install_app_ssh.image == "local/amd64-addon-ssh"
1111+
1112+
with patch.object(DockerAPI, "pull_image") as mock_pull_image:
1113+
await install_app_ssh.load()
1114+
1115+
mock_pull_image.assert_called_once()
1116+
issue = Issue(
1117+
IssueType.MISSING_IMAGE, ContextType.ADDON, reference=install_app_ssh.slug
1118+
)
1119+
assert issue not in coresys.resolution.issues
11151120

11161121

11171122
@pytest.mark.usefixtures("container", "mock_amd64_arch_supported")
11181123
async def test_app_load_succeeds_with_docker_errors(
11191124
coresys: CoreSys, install_app_ssh: App, caplog: pytest.LogCaptureFixture
11201125
):
1121-
"""Docker errors while building/pulling an image during load should not raise and fail setup."""
1122-
# Build env invalid failure
1126+
"""Docker errors during load should not raise and fail setup."""
1127+
issue = Issue(
1128+
IssueType.MISSING_IMAGE, ContextType.ADDON, reference=install_app_ssh.slug
1129+
)
1130+
1131+
# Build-required app with missing image: repair issue raised, no exception
11231132
coresys.docker.images.inspect.side_effect = aiodocker.DockerError(
11241133
HTTPStatus.NOT_FOUND, {"message": "missing"}
11251134
)
11261135
caplog.clear()
11271136
await install_app_ssh.load()
1128-
assert "Cannot build app 'local_ssh' because dockerfile is missing" in caplog.text
1129-
1130-
# Image build failure
1131-
caplog.clear()
1132-
with (
1133-
patch("pathlib.Path.is_file", return_value=True),
1134-
patch.object(
1135-
CoreConfig,
1136-
"local_to_extern_path",
1137-
return_value=PurePath("/addon/path/on/host"),
1138-
),
1139-
patch.object(
1140-
DockerAPI, "run_command", return_value=CommandReturn(1, ["error"])
1141-
),
1142-
):
1143-
await install_app_ssh.load()
1144-
assert (
1145-
"Docker build failed for local/amd64-addon-ssh:9.2.1 (exit code 1). Build output:\nerror"
1146-
in caplog.text
1147-
)
1137+
assert issue in coresys.resolution.issues
11481138

1149-
# Image pull failure
1139+
# Pull-based app where check_image's internal install fails: addon left
1140+
# detached, no exception escapes to abort setup. The next load will hit
1141+
# DockerNotFound and trigger the proper repair path.
1142+
stored = coresys.resolution.get_issue_if_present(issue)
1143+
coresys.resolution.dismiss_issue(stored)
11501144
install_app_ssh.data["image"] = "test/amd64-addon-ssh"
11511145
caplog.clear()
11521146
with patch.object(
@@ -1155,7 +1149,11 @@ async def test_app_load_succeeds_with_docker_errors(
11551149
side_effect=aiodocker.DockerError(400, {"message": "error"}),
11561150
):
11571151
await install_app_ssh.load()
1158-
assert "Can't install test/amd64-addon-ssh:9.2.1:" in caplog.text
1152+
assert "Docker error loading app local_ssh, leaving detached" in caplog.text
1153+
assert any(
1154+
"Docker error loading app local_ssh" in r.message and r.levelname == "CRITICAL"
1155+
for r in caplog.records
1156+
)
11591157

11601158

11611159
@pytest.mark.usefixtures("coresys")

0 commit comments

Comments
 (0)