Skip to content

Commit e1e7705

Browse files
fregataaclaude
andcommitted
refactor: restore RouteHealthCheckFilter as in-memory post-filter on the repo
Bring back ``RouteHealthCheckFilter`` as a thin in-memory gate on the revision's ``health_check_config`` presence, applied by ``DeploymentRepository.get_routes_by_statuses`` after the SQL query materializes rows. The SQL surface stays a flat ``(lifecycle, health, traffic)`` selector — no widening, no JSON predicates. Anything more specific (per-status conditional gates) lives in the calling handler. - ``RouteHealthCheckFilter`` exposes one knob: ``health_check_required`` (``True``/``False``/``None``). The repo runs ``_passes_health_check`` over each row. - ``RouteHandler.health_check_filter()`` is abstract; every handler declares its filter explicitly. Lifecycle handlers return a no-op filter; ``HealthCheckRouteHandler`` returns ``health_check_required=True``. - ``AppProxySyncRouteHandler`` widens ``target.health`` to ``[HEALTHY, NOT_CHECKED]`` so revisions that opted out of health_check (which stay NOT_CHECKED for life) remain candidates, and rejects the ``NOT_CHECKED + hc-enabled`` rows in ``execute`` — those routes are waiting for their first probe and must not be announced yet. - Coordinator threads the handler's filter (and the observer's ``health_check_required=True``) into the repo call. - Repo tests cover the (lifecycle x health x traffic) filter plus the ``health_check_required`` True/False post-filter cases. Handler tests assert the declared filter and (for AppProxy) the in-memory eligibility predicate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 296026e commit e1e7705

16 files changed

Lines changed: 218 additions & 87 deletions

File tree

src/ai/backend/manager/data/deployment/types.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,18 @@ class RouteTargetStatuses:
261261
traffic: RouteTrafficStatus | None = None
262262

263263

264+
@dataclass(frozen=True)
265+
class RouteHealthCheckFilter:
266+
"""In-memory gating on a revision's ``health_check_config``.
267+
268+
``True`` keeps only routes whose revision has a config; ``False``
269+
keeps only routes without; ``None`` skips the filter. Anything more
270+
specific belongs in the calling handler.
271+
"""
272+
273+
health_check_required: bool | None = None
274+
275+
264276
@dataclass(frozen=True)
265277
class RouteTransitionTarget:
266278
"""Target state for a route transition (lifecycle + health)."""

src/ai/backend/manager/repositories/deployment/db_source/db_source.py

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
ModelDeploymentAutoScalingRuleData,
7272
ModelRevisionData,
7373
RevisionSearchResult,
74+
RouteHealthCheckFilter,
7475
RouteHealthStatus,
7576
RouteInfo,
7677
RouteSearchResult,
@@ -231,6 +232,20 @@ def _project_health_check_config(
231232
return model_definition.health_check_config()
232233

233234

235+
def _passes_health_check(
236+
has_health_check_config: bool,
237+
health_check_required: bool | None,
238+
) -> bool:
239+
"""Whether the revision's ``health_check_config`` presence matches
240+
``health_check_required`` (``None`` skips the check).
241+
"""
242+
if health_check_required is True:
243+
return has_health_check_config
244+
if health_check_required is False:
245+
return not has_health_check_config
246+
return True
247+
248+
234249
def _project_preset_slots(
235250
preset_row: DeploymentRevisionPresetRow | None,
236251
slot_entries: list[tuple[str, Decimal]],
@@ -1618,13 +1633,15 @@ async def scale_routes(
16181633
async def get_routes_by_statuses(
16191634
self,
16201635
target: RouteTargetStatuses,
1636+
health_check_filter: RouteHealthCheckFilter,
16211637
) -> list[RouteData]:
1622-
"""Routes matching ``(lifecycle, health, traffic)``.
1638+
"""Routes matching ``(lifecycle, health, traffic)`` with
1639+
revision-level ``health_check_config`` gating applied in memory.
16231640
16241641
``model_definition`` is selected so the resolved
1625-
``ModelHealthCheck`` (or ``None``) can be attached to each
1626-
:class:`RouteData`; callers gate on ``RouteData.health_check_config``
1627-
in memory when revision-level health-check enablement matters.
1642+
``ModelHealthCheck`` (or ``None``) is attached to each
1643+
:class:`RouteData`; ``health_check_filter`` then runs over those
1644+
materialized rows.
16281645
"""
16291646
async with self._begin_readonly_session_read_committed() as db_sess:
16301647
query = (
@@ -1646,23 +1663,29 @@ async def get_routes_by_statuses(
16461663
if target.traffic is not None:
16471664
query = query.where(RoutingRow.traffic_status == target.traffic)
16481665
result = await db_sess.execute(query)
1649-
return [
1650-
RouteData(
1651-
route_id=row.id,
1652-
deployment_id=row.endpoint,
1653-
session_data=_build_session_data(row.session, session_status),
1654-
status=row.status,
1655-
health_status=row.health_status,
1656-
traffic_ratio=row.traffic_ratio,
1657-
created_at=row.created_at,
1658-
revision_id=DeploymentRevisionID(row.revision),
1659-
replica_host=row.replica_host,
1660-
replica_port=row.replica_port,
1661-
error_data=row.error_data or {},
1662-
health_check_config=_project_health_check_config(model_definition),
1666+
routes: list[RouteData] = []
1667+
for row, model_definition, session_status in result.all():
1668+
health_check_config = _project_health_check_config(model_definition)
1669+
has_config = health_check_config is not None
1670+
if not _passes_health_check(has_config, health_check_filter.health_check_required):
1671+
continue
1672+
routes.append(
1673+
RouteData(
1674+
route_id=row.id,
1675+
deployment_id=row.endpoint,
1676+
session_data=_build_session_data(row.session, session_status),
1677+
status=row.status,
1678+
health_status=row.health_status,
1679+
traffic_ratio=row.traffic_ratio,
1680+
created_at=row.created_at,
1681+
revision_id=DeploymentRevisionID(row.revision),
1682+
replica_host=row.replica_host,
1683+
replica_port=row.replica_port,
1684+
error_data=row.error_data or {},
1685+
health_check_config=health_check_config,
1686+
)
16631687
)
1664-
for row, model_definition, session_status in result.all()
1665-
]
1688+
return routes
16661689

16671690
async def update_route_status_bulk(
16681691
self,

src/ai/backend/manager/repositories/deployment/repository.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
ModelDeploymentAutoScalingRuleData,
6868
ModelRevisionData,
6969
RevisionSearchResult,
70+
RouteHealthCheckFilter,
7071
RouteInfo,
7172
RouteSearchResult,
7273
RouteStatus,
@@ -631,9 +632,12 @@ async def scale_routes(
631632
async def get_routes_by_statuses(
632633
self,
633634
target: RouteTargetStatuses,
635+
health_check_filter: RouteHealthCheckFilter,
634636
) -> list[RouteData]:
635-
"""Routes matching ``(lifecycle, health, traffic)``."""
636-
return await self._db_source.get_routes_by_statuses(target)
637+
"""Routes matching ``(lifecycle, health, traffic)`` with the
638+
in-memory ``health_check_config`` post-filter applied.
639+
"""
640+
return await self._db_source.get_routes_by_statuses(target, health_check_filter)
637641

638642
@deployment_repository_resilience.apply()
639643
async def update_route_status_bulk(

src/ai/backend/manager/sokovan/deployment/route/coordinator.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from ai.backend.manager.clients.appproxy.client import AppProxyClientPool
2222
from ai.backend.manager.config.provider import ManagerConfigProvider
2323
from ai.backend.manager.data.deployment.types import (
24+
RouteHealthCheckFilter,
2425
RouteHealthStatus,
2526
RouteStatus,
2627
RouteTargetStatuses,
@@ -202,7 +203,10 @@ async def process_route_lifecycle(
202203
await stack.enter_async_context(self._lock_factory(handler.lock_id, lock_lifetime))
203204

204205
target = handler.target_statuses()
205-
routes = await self._deployment_repository.get_routes_by_statuses(target)
206+
routes = await self._deployment_repository.get_routes_by_statuses(
207+
target,
208+
handler.health_check_filter(),
209+
)
206210
if not routes:
207211
log.trace("No routes to process for handler: {}", handler.name())
208212
return
@@ -235,6 +239,7 @@ async def _process_observer(self, observer: RouteObserver) -> None:
235239
lifecycle=[RouteStatus.RUNNING],
236240
health=list(RouteHealthStatus),
237241
),
242+
RouteHealthCheckFilter(health_check_required=True),
238243
)
239244
if not routes:
240245
return

src/ai/backend/manager/sokovan/deployment/route/executor.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -559,9 +559,10 @@ async def sync_service_discovery(self, routes: Sequence[RouteData]) -> RouteExec
559559
async def sync_appproxy(self, routes: Sequence[RouteData]) -> RouteExecutionResult:
560560
"""Push the routing tables for affected endpoints to AppProxy.
561561
562-
The input routes are pre-filtered by the handler (HEALTHY or
563-
revision-disabled health_check), so they already represent the
564-
authoritative set to register. Each route's
562+
The input routes are pre-filtered by the coordinator on
563+
``(lifecycle, health, traffic)`` plus the
564+
``include_health_check_disabled`` post-filter, so they already
565+
represent the authoritative set to register. Each route's
565566
``replica_host``/``replica_port`` is kept up to date by
566567
``check_running_routes`` and is used directly here — same pattern
567568
as :meth:`register_routes_now`. Routes with missing replica info

src/ai/backend/manager/sokovan/deployment/route/handlers/appproxy_sync.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from ai.backend.logging import BraceStyleAdapter
88
from ai.backend.manager.data.deployment.types import (
99
RouteHandlerCategory,
10+
RouteHealthCheckFilter,
1011
RouteHealthStatus,
1112
RouteStatus,
1213
RouteStatusTransitions,
@@ -57,15 +58,18 @@ def category(cls) -> RouteHandlerCategory:
5758

5859
@classmethod
5960
def target_statuses(cls) -> RouteTargetStatuses:
60-
# Includes NOT_CHECKED so routes whose revision disabled health_check
61-
# (they stay NOT_CHECKED forever) remain candidates; ``execute`` then
62-
# narrows the in-memory set to HEALTHY or hc-disabled rows.
61+
# NOT_CHECKED rows pass through for hc-disabled revisions; ``execute``
62+
# rejects hc-enabled NOT_CHECKED rows whose probe is still pending.
6363
return RouteTargetStatuses(
6464
lifecycle=[RouteStatus.RUNNING],
6565
health=[RouteHealthStatus.HEALTHY, RouteHealthStatus.NOT_CHECKED],
6666
traffic=RouteTrafficStatus.ACTIVE,
6767
)
6868

69+
@classmethod
70+
def health_check_filter(cls) -> RouteHealthCheckFilter:
71+
return RouteHealthCheckFilter()
72+
6973
@classmethod
7074
def status_transitions(cls) -> RouteStatusTransitions:
7175
# Pure side-effect handler; never mutates lifecycle/health columns.
@@ -76,10 +80,7 @@ def status_transitions(cls) -> RouteStatusTransitions:
7680
)
7781

7882
async def execute(self, routes: Sequence[RouteData]) -> RouteExecutionResult:
79-
# Register HEALTHY routes plus revisions that opted out of health_check
80-
# (those carry ``health_check_config is None`` and stay NOT_CHECKED);
81-
# NOT_CHECKED routes whose revision still has a probe config must wait
82-
# for their first successful probe before being announced.
83+
# Skip NOT_CHECKED hc-enabled rows: their first probe hasn't run yet.
8384
eligible = [
8485
r
8586
for r in routes

src/ai/backend/manager/sokovan/deployment/route/handlers/base.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from ai.backend.manager.data.deployment.types import (
77
RouteHandlerCategory,
8+
RouteHealthCheckFilter,
89
RouteStatusTransitions,
910
RouteTargetStatuses,
1011
)
@@ -59,6 +60,12 @@ def status_transitions(cls) -> RouteStatusTransitions:
5960
"""
6061
raise NotImplementedError("Subclasses must implement status_transitions()")
6162

63+
@classmethod
64+
@abstractmethod
65+
def health_check_filter(cls) -> RouteHealthCheckFilter:
66+
"""Revision-level ``health_check_config`` gating applied by the repo."""
67+
raise NotImplementedError("Subclasses must implement health_check_filter()")
68+
6269
@abstractmethod
6370
async def execute(self, routes: Sequence[RouteData]) -> RouteExecutionResult:
6471
"""Execute the route operation."""

src/ai/backend/manager/sokovan/deployment/route/handlers/health_check.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from ai.backend.logging import BraceStyleAdapter
88
from ai.backend.manager.data.deployment.types import (
99
RouteHandlerCategory,
10+
RouteHealthCheckFilter,
1011
RouteHealthStatus,
1112
RouteStatus,
1213
RouteStatusTransitions,
@@ -69,19 +70,15 @@ def status_transitions(cls) -> RouteStatusTransitions:
6970
stale=RouteTransitionTarget(health_status=RouteHealthStatus.DEGRADED),
7071
)
7172

72-
async def execute(self, routes: Sequence[RouteData]) -> RouteExecutionResult:
73-
"""Execute health check for routes.
74-
75-
Routes whose revision opted out of ``service.health_check`` carry no
76-
``health_check_config`` and never had a ``RouteHealthRecord`` registered;
77-
skipping them avoids classifying their missing record as stale/degraded.
78-
"""
79-
eligible = [r for r in routes if r.health_check_config is not None]
80-
if not eligible:
81-
return RouteExecutionResult(successes=[], errors=[], stale=[])
73+
@classmethod
74+
def health_check_filter(cls) -> RouteHealthCheckFilter:
75+
# Skip revisions with no probe — they have no RouteHealthRecord.
76+
return RouteHealthCheckFilter(health_check_required=True)
8277

83-
log.debug("Checking health for {} routes", len(eligible))
84-
return await self._route_executor.check_route_health(eligible)
78+
async def execute(self, routes: Sequence[RouteData]) -> RouteExecutionResult:
79+
"""Execute health check for routes."""
80+
log.debug("Checking health for {} routes", len(routes))
81+
return await self._route_executor.check_route_health(routes)
8582

8683
async def post_process(self, result: RouteExecutionResult) -> None:
8784
"""Log health-check results.

src/ai/backend/manager/sokovan/deployment/route/handlers/provisioning.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from ai.backend.logging import BraceStyleAdapter
88
from ai.backend.manager.data.deployment.types import (
99
RouteHandlerCategory,
10+
RouteHealthCheckFilter,
1011
RouteHealthStatus,
1112
RouteStatus,
1213
RouteStatusTransitions,
@@ -55,6 +56,10 @@ def target_statuses(cls) -> RouteTargetStatuses:
5556
health=list(RouteHealthStatus),
5657
)
5758

59+
@classmethod
60+
def health_check_filter(cls) -> RouteHealthCheckFilter:
61+
return RouteHealthCheckFilter()
62+
5863
@classmethod
5964
def status_transitions(cls) -> RouteStatusTransitions:
6065
"""Provisioning → RUNNING + NOT_CHECKED on success, FAILED_TO_START on failure."""

src/ai/backend/manager/sokovan/deployment/route/handlers/route_eviction.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from ai.backend.logging import BraceStyleAdapter
88
from ai.backend.manager.data.deployment.types import (
99
RouteHandlerCategory,
10+
RouteHealthCheckFilter,
1011
RouteHealthStatus,
1112
RouteStatus,
1213
RouteStatusTransitions,
@@ -70,6 +71,10 @@ def target_statuses(cls) -> RouteTargetStatuses:
7071
health=list(RouteHealthStatus),
7172
)
7273

74+
@classmethod
75+
def health_check_filter(cls) -> RouteHealthCheckFilter:
76+
return RouteHealthCheckFilter()
77+
7378
@classmethod
7479
def status_transitions(cls) -> RouteStatusTransitions:
7580
"""Eviction: success → TERMINATING, failure → no change."""

0 commit comments

Comments
 (0)