Skip to content

Commit c0618be

Browse files
committed
fix(dashboard-api): async hygiene in routers/extensions.py
Three async-hygiene defects in routers/extensions.py: - extension_logs and extensions_catalog called blocking urllib.urlopen directly on the event-loop thread. With the Console modal polling every 2s and a 30s agent timeout, one slow host-agent response could stall the dashboard-api for up to 30s at a time. - _call_agent, _call_agent_invalidate_compose_cache, and _call_agent_compose_rename caught `except Exception`, swallowing non-network programmer errors with a misleading "host agent unreachable" log. Narrow to (URLError, HTTPError, OSError, TimeoutError) and log the actual exception. - _cleanup_stale_progress was dispatched via run_in_executor with a discarded Future, so failures surfaced only as "Future exception was never retrieved" warnings in stderr. Offload blocking urllib calls via asyncio.to_thread (matching the existing pattern in main.py's api_settings_env_save). Attach a log-on-exception done-callback to the cleanup future. Tests cover the URLError swallow path, the new re-raise behaviour on non-network errors, and the cleanup callback logging.
1 parent d5154c3 commit c0618be

2 files changed

Lines changed: 102 additions & 12 deletions

File tree

dream-server/extensions/services/dashboard-api/routers/extensions.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,18 @@ def _get_service_data_info(service_id: str) -> dict | None:
475475
_AGENT_LOG_TIMEOUT = 30 # seconds — log fetches should be fast
476476

477477

478+
def _fetch_agent_logs(url: str, headers: dict, data: bytes, timeout: int) -> str:
479+
"""Blocking POST to host agent that returns the response body as text.
480+
481+
Extracted so async handlers can offload the urllib call via
482+
``asyncio.to_thread``. urllib.error.HTTPError / URLError raised inside
483+
propagate back to the caller and are handled there.
484+
"""
485+
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
486+
with urllib.request.urlopen(req, timeout=timeout) as resp:
487+
return resp.read().decode()
488+
489+
478490
def _call_agent(action: str, service_id: str) -> bool:
479491
"""Call host agent to start/stop a service. Returns True on success."""
480492
url = f"{AGENT_URL}/v1/extension/{action}"
@@ -487,8 +499,11 @@ def _call_agent(action: str, service_id: str) -> bool:
487499
try:
488500
with urllib.request.urlopen(req, timeout=_AGENT_TIMEOUT) as resp:
489501
return resp.status == 200
490-
except Exception:
491-
logger.warning("Host agent unreachable at %s — fallback to restart_required", AGENT_URL)
502+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError) as exc:
503+
logger.warning(
504+
"Host agent unreachable at %s — fallback to restart_required: %s",
505+
AGENT_URL, exc,
506+
)
492507
return False
493508

494509

@@ -503,9 +518,10 @@ def _call_agent_invalidate_compose_cache() -> None:
503518
logger.warning(
504519
"compose-flags cache invalidation returned HTTP %d", resp.status,
505520
)
506-
except Exception:
521+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError) as exc:
507522
logger.warning(
508-
"Host agent unreachable for compose-flags invalidation at %s", AGENT_URL,
523+
"Host agent unreachable for compose-flags invalidation at %s: %s",
524+
AGENT_URL, exc,
509525
)
510526

511527

@@ -583,8 +599,10 @@ def _call_agent_compose_rename(action: str, service_id: str) -> bool:
583599
try:
584600
with urllib.request.urlopen(req, timeout=_AGENT_LOG_TIMEOUT) as resp:
585601
return resp.status == 200
586-
except Exception:
587-
logger.warning("Host agent unreachable for compose rename at %s", AGENT_URL)
602+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError) as exc:
603+
logger.warning(
604+
"Host agent unreachable for compose rename at %s: %s", AGENT_URL, exc,
605+
)
588606
return False
589607

590608

@@ -603,7 +621,7 @@ def _check_agent_health() -> bool:
603621
req = urllib.request.Request(f"{AGENT_URL}/health")
604622
with urllib.request.urlopen(req, timeout=3) as resp:
605623
available = resp.status == 200
606-
except Exception:
624+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError):
607625
available = False
608626
with _agent_cache_lock:
609627
_agent_cache.update(available=available, checked_at=time.monotonic())
@@ -645,7 +663,16 @@ async def extensions_catalog(
645663
api_key: str = Depends(verify_api_key),
646664
):
647665
"""Get the extensions catalog with computed status."""
648-
asyncio.get_running_loop().run_in_executor(None, _cleanup_stale_progress)
666+
_cleanup_future = asyncio.get_running_loop().run_in_executor(
667+
None, _cleanup_stale_progress,
668+
)
669+
670+
def _log_cleanup_error(f: asyncio.Future) -> None:
671+
exc = f.exception()
672+
if exc is not None:
673+
logger.error("stale-progress cleanup failed: %s", exc, exc_info=exc)
674+
675+
_cleanup_future.add_done_callback(_log_cleanup_error)
649676

650677
from helpers import get_cached_services, get_all_services
651678

@@ -752,12 +779,14 @@ async def extensions_catalog(
752779
except OSError:
753780
lib_available = False
754781

782+
agent_available = await asyncio.to_thread(_check_agent_health)
783+
755784
return {
756785
"extensions": extensions,
757786
"summary": summary,
758787
"gpu_backend": GPU_BACKEND,
759788
"library_available": lib_available,
760-
"agent_available": _check_agent_health(),
789+
"agent_available": agent_available,
761790
}
762791

763792

@@ -863,10 +892,11 @@ async def extension_logs(
863892
"Authorization": f"Bearer {DREAM_AGENT_KEY}",
864893
}
865894
data = json.dumps({"service_id": service_id, "tail": 100}).encode()
866-
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
867895
try:
868-
with urllib.request.urlopen(req, timeout=_AGENT_LOG_TIMEOUT) as resp:
869-
return json.loads(resp.read().decode())
896+
body = await asyncio.to_thread(
897+
_fetch_agent_logs, url, headers, data, _AGENT_LOG_TIMEOUT,
898+
)
899+
return json.loads(body)
870900
except urllib.error.HTTPError as exc:
871901
try:
872902
err_body = json.loads(exc.read().decode())

dream-server/extensions/services/dashboard-api/tests/test_extensions.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2278,3 +2278,63 @@ def test_assert_not_core_blocks_always_on(self, service_id):
22782278
with pytest.raises(HTTPException) as exc_info:
22792279
_assert_not_core(service_id)
22802280
assert exc_info.value.status_code == 403
2281+
2282+
2283+
class TestCallAgentErrorNarrowing:
2284+
"""_call_agent swallows network errors but not programmer errors."""
2285+
2286+
def test_call_agent_returns_false_on_urlerror(self, monkeypatch, caplog):
2287+
"""Network failures produce (False, warning) — callers rely on this."""
2288+
import logging
2289+
import urllib.error
2290+
from routers import extensions as ext_module
2291+
2292+
def _raise(*_args, **_kwargs):
2293+
raise urllib.error.URLError("timeout")
2294+
2295+
monkeypatch.setattr(ext_module.urllib.request, "urlopen", _raise)
2296+
with caplog.at_level(logging.WARNING, logger="routers.extensions"):
2297+
result = ext_module._call_agent("start", "svc-x")
2298+
2299+
assert result is False
2300+
assert any("Host agent unreachable" in r.message for r in caplog.records)
2301+
2302+
def test_call_agent_reraises_non_network_errors(self, monkeypatch):
2303+
"""Programmer errors (e.g. AttributeError) must not be swallowed."""
2304+
from routers import extensions as ext_module
2305+
2306+
def _raise(*_args, **_kwargs):
2307+
raise AttributeError("boom")
2308+
2309+
monkeypatch.setattr(ext_module.urllib.request, "urlopen", _raise)
2310+
with pytest.raises(AttributeError):
2311+
ext_module._call_agent("start", "svc-x")
2312+
2313+
def test_catalog_logs_when_cleanup_future_fails(
2314+
self, test_client, monkeypatch, tmp_path, caplog,
2315+
):
2316+
"""Stale-progress cleanup failures are logged, not lost to fire-and-forget."""
2317+
import logging
2318+
2319+
catalog = [_make_catalog_ext("test-svc", "Test Service")]
2320+
_patch_extensions_config(monkeypatch, catalog, tmp_path=tmp_path)
2321+
2322+
def _boom():
2323+
raise RuntimeError("cleanup exploded")
2324+
2325+
monkeypatch.setattr(
2326+
"routers.extensions._cleanup_stale_progress", _boom,
2327+
)
2328+
2329+
with caplog.at_level(logging.ERROR, logger="routers.extensions"):
2330+
with patch("helpers.get_all_services", new_callable=AsyncMock,
2331+
return_value=[]):
2332+
resp = test_client.get(
2333+
"/api/extensions/catalog",
2334+
headers=test_client.auth_headers,
2335+
)
2336+
2337+
assert resp.status_code == 200
2338+
assert any(
2339+
"stale-progress cleanup failed" in r.message for r in caplog.records
2340+
)

0 commit comments

Comments
 (0)