Skip to content

Commit 9055007

Browse files
Merge pull request #1022 from yasinBursali/fix/extensions-async-hygiene
fix(dashboard-api): async hygiene in routers/extensions.py
2 parents 5893142 + c0618be commit 9055007

2 files changed

Lines changed: 102 additions & 12 deletions

File tree

dream-server/extensions/services/dashboard-api/routers/extensions.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,18 @@ def _get_service_data_info(service_id: str) -> dict | None:
539539
_AGENT_LOG_TIMEOUT = 30 # seconds — log fetches should be fast
540540

541541

542+
def _fetch_agent_logs(url: str, headers: dict, data: bytes, timeout: int) -> str:
543+
"""Blocking POST to host agent that returns the response body as text.
544+
545+
Extracted so async handlers can offload the urllib call via
546+
``asyncio.to_thread``. urllib.error.HTTPError / URLError raised inside
547+
propagate back to the caller and are handled there.
548+
"""
549+
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
550+
with urllib.request.urlopen(req, timeout=timeout) as resp:
551+
return resp.read().decode()
552+
553+
542554
def _call_agent(action: str, service_id: str) -> bool:
543555
"""Call host agent to start/stop a service. Returns True on success."""
544556
url = f"{AGENT_URL}/v1/extension/{action}"
@@ -551,8 +563,11 @@ def _call_agent(action: str, service_id: str) -> bool:
551563
try:
552564
with urllib.request.urlopen(req, timeout=_AGENT_TIMEOUT) as resp:
553565
return resp.status == 200
554-
except Exception:
555-
logger.warning("Host agent unreachable at %s — fallback to restart_required", AGENT_URL)
566+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError) as exc:
567+
logger.warning(
568+
"Host agent unreachable at %s — fallback to restart_required: %s",
569+
AGENT_URL, exc,
570+
)
556571
return False
557572

558573

@@ -567,9 +582,10 @@ def _call_agent_invalidate_compose_cache() -> None:
567582
logger.warning(
568583
"compose-flags cache invalidation returned HTTP %d", resp.status,
569584
)
570-
except Exception:
585+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError) as exc:
571586
logger.warning(
572-
"Host agent unreachable for compose-flags invalidation at %s", AGENT_URL,
587+
"Host agent unreachable for compose-flags invalidation at %s: %s",
588+
AGENT_URL, exc,
573589
)
574590

575591

@@ -647,8 +663,10 @@ def _call_agent_compose_rename(action: str, service_id: str) -> bool:
647663
try:
648664
with urllib.request.urlopen(req, timeout=_AGENT_LOG_TIMEOUT) as resp:
649665
return resp.status == 200
650-
except Exception:
651-
logger.warning("Host agent unreachable for compose rename at %s", AGENT_URL)
666+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError) as exc:
667+
logger.warning(
668+
"Host agent unreachable for compose rename at %s: %s", AGENT_URL, exc,
669+
)
652670
return False
653671

654672

@@ -667,7 +685,7 @@ def _check_agent_health() -> bool:
667685
req = urllib.request.Request(f"{AGENT_URL}/health")
668686
with urllib.request.urlopen(req, timeout=3) as resp:
669687
available = resp.status == 200
670-
except Exception:
688+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError):
671689
available = False
672690
with _agent_cache_lock:
673691
_agent_cache.update(available=available, checked_at=time.monotonic())
@@ -709,7 +727,16 @@ async def extensions_catalog(
709727
api_key: str = Depends(verify_api_key),
710728
):
711729
"""Get the extensions catalog with computed status."""
712-
asyncio.get_running_loop().run_in_executor(None, _cleanup_stale_progress)
730+
_cleanup_future = asyncio.get_running_loop().run_in_executor(
731+
None, _cleanup_stale_progress,
732+
)
733+
734+
def _log_cleanup_error(f: asyncio.Future) -> None:
735+
exc = f.exception()
736+
if exc is not None:
737+
logger.error("stale-progress cleanup failed: %s", exc, exc_info=exc)
738+
739+
_cleanup_future.add_done_callback(_log_cleanup_error)
713740

714741
from helpers import get_cached_services, get_all_services
715742

@@ -816,12 +843,14 @@ async def extensions_catalog(
816843
except OSError:
817844
lib_available = False
818845

846+
agent_available = await asyncio.to_thread(_check_agent_health)
847+
819848
return {
820849
"extensions": extensions,
821850
"summary": summary,
822851
"gpu_backend": GPU_BACKEND,
823852
"library_available": lib_available,
824-
"agent_available": _check_agent_health(),
853+
"agent_available": agent_available,
825854
}
826855

827856

@@ -927,10 +956,11 @@ async def extension_logs(
927956
"Authorization": f"Bearer {DREAM_AGENT_KEY}",
928957
}
929958
data = json.dumps({"service_id": service_id, "tail": 100}).encode()
930-
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
931959
try:
932-
with urllib.request.urlopen(req, timeout=_AGENT_LOG_TIMEOUT) as resp:
933-
return json.loads(resp.read().decode())
960+
body = await asyncio.to_thread(
961+
_fetch_agent_logs, url, headers, data, _AGENT_LOG_TIMEOUT,
962+
)
963+
return json.loads(body)
934964
except urllib.error.HTTPError as exc:
935965
try:
936966
err_body = json.loads(exc.read().decode())

dream-server/extensions/services/dashboard-api/tests/test_extensions.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2552,3 +2552,63 @@ def test_assert_not_core_blocks_always_on(self, service_id):
25522552
with pytest.raises(HTTPException) as exc_info:
25532553
_assert_not_core(service_id)
25542554
assert exc_info.value.status_code == 403
2555+
2556+
2557+
class TestCallAgentErrorNarrowing:
2558+
"""_call_agent swallows network errors but not programmer errors."""
2559+
2560+
def test_call_agent_returns_false_on_urlerror(self, monkeypatch, caplog):
2561+
"""Network failures produce (False, warning) — callers rely on this."""
2562+
import logging
2563+
import urllib.error
2564+
from routers import extensions as ext_module
2565+
2566+
def _raise(*_args, **_kwargs):
2567+
raise urllib.error.URLError("timeout")
2568+
2569+
monkeypatch.setattr(ext_module.urllib.request, "urlopen", _raise)
2570+
with caplog.at_level(logging.WARNING, logger="routers.extensions"):
2571+
result = ext_module._call_agent("start", "svc-x")
2572+
2573+
assert result is False
2574+
assert any("Host agent unreachable" in r.message for r in caplog.records)
2575+
2576+
def test_call_agent_reraises_non_network_errors(self, monkeypatch):
2577+
"""Programmer errors (e.g. AttributeError) must not be swallowed."""
2578+
from routers import extensions as ext_module
2579+
2580+
def _raise(*_args, **_kwargs):
2581+
raise AttributeError("boom")
2582+
2583+
monkeypatch.setattr(ext_module.urllib.request, "urlopen", _raise)
2584+
with pytest.raises(AttributeError):
2585+
ext_module._call_agent("start", "svc-x")
2586+
2587+
def test_catalog_logs_when_cleanup_future_fails(
2588+
self, test_client, monkeypatch, tmp_path, caplog,
2589+
):
2590+
"""Stale-progress cleanup failures are logged, not lost to fire-and-forget."""
2591+
import logging
2592+
2593+
catalog = [_make_catalog_ext("test-svc", "Test Service")]
2594+
_patch_extensions_config(monkeypatch, catalog, tmp_path=tmp_path)
2595+
2596+
def _boom():
2597+
raise RuntimeError("cleanup exploded")
2598+
2599+
monkeypatch.setattr(
2600+
"routers.extensions._cleanup_stale_progress", _boom,
2601+
)
2602+
2603+
with caplog.at_level(logging.ERROR, logger="routers.extensions"):
2604+
with patch("helpers.get_all_services", new_callable=AsyncMock,
2605+
return_value=[]):
2606+
resp = test_client.get(
2607+
"/api/extensions/catalog",
2608+
headers=test_client.auth_headers,
2609+
)
2610+
2611+
assert resp.status_code == 200
2612+
assert any(
2613+
"stale-progress cleanup failed" in r.message for r in caplog.records
2614+
)

0 commit comments

Comments
 (0)