Skip to content

Commit 5c102fc

Browse files
HyeockJinKim and claude committed
fix(BA-6035): treat no-health-check RUNNING routes as healthy in rolling update
- Add health_check: ModelHealthCheck | None to RouteInfo (populated from RoutingRow)
- Rolling update _classify_routes: RUNNING routes without health check count as new_healthy_count so DEPLOYING→READY transition can complete
- DB health_status stays DEGRADED (no probe data — correct behaviour)
- Add TestNoHealthCheck unit tests
- Fix CLI revision add --auto-activate flag (nest under options dict)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 6f50520 commit 5c102fc

6 files changed

Lines changed: 85 additions & 3 deletions

File tree

src/ai/backend/client/cli/v2/deployment/revision.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,8 @@ def add(deployment_id: str, config: str, preset_id: str | None, auto_activate: b
4949
data["deployment_id"] = deployment_id
5050
if preset_id is not None:
5151
data["revision_preset_id"] = preset_id
52-
data["auto_activate"] = auto_activate
52+
if auto_activate:
53+
data.setdefault("options", {})["auto_activate"] = True
5354
body = AddRevisionInput.model_validate(data)
5455

5556
async def _run() -> None:

src/ai/backend/manager/data/deployment/types.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import yarl
1313
from pydantic import ConfigDict, Field
1414

15-
from ai.backend.common.config import ModelDefinition, ModelDefinitionDraft
15+
from ai.backend.common.config import ModelDefinition, ModelDefinitionDraft, ModelHealthCheck
1616
from ai.backend.common.data.endpoint.types import EndpointLifecycle, ScalingState
1717
from ai.backend.common.data.model_deployment.types import (
1818
ActivenessStatus,
@@ -802,6 +802,7 @@ class RouteInfo:
802802
created_at: datetime
803803
revision_id: UUID
804804
traffic_status: RouteTrafficStatus
805+
health_check: ModelHealthCheck | None
805806
error_data: dict[str, Any] = field(default_factory=dict)
806807

807808
@property

src/ai/backend/manager/models/routing/row.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,5 +271,6 @@ def to_route_info(self) -> RouteInfo:
271271
created_at=self.created_at,
272272
revision_id=self.revision,
273273
traffic_status=self.traffic_status,
274+
health_check=self.health_check,
274275
error_data=self.error_data or {},
275276
)

src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,11 @@ def _classify_routes(
129129
elif route.status.is_inactive():
130130
classified.new_failed_count += 1
131131
elif route.status == RouteStatus.RUNNING:
132-
if route.health_status == RouteHealthStatus.HEALTHY:
132+
if route.health_status == RouteHealthStatus.HEALTHY or route.health_check is None:
133+
# Routes without a health check have no probe data, so we treat them
134+
# as ready once their process is RUNNING (health_status stays DEGRADED
135+
# in DB because no Valkey entry exists — that is correct behaviour, but
136+
# it must not block the DEPLOYING → READY transition).
133137
classified.new_healthy_count += 1
134138
else:
135139
# UNHEALTHY / DEGRADED / NOT_CHECKED all count here:

tests/unit/manager/services/deployment/test_deployment_crud_actions.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -314,6 +314,7 @@ def route_info(self, endpoint_id: uuid.UUID) -> RouteInfo:
314314
created_at=datetime(2024, 1, 1, tzinfo=UTC),
315315
revision_id=uuid.uuid4(),
316316
traffic_status=RouteTrafficStatus.ACTIVE,
317+
health_check=None,
317318
)
318319

319320
async def test_existing_replica_returns_data(
@@ -364,6 +365,7 @@ async def test_zero_weight_traffic_inactive(
364365
created_at=datetime(2024, 1, 1, tzinfo=UTC),
365366
revision_id=uuid.uuid4(),
366367
traffic_status=RouteTrafficStatus.INACTIVE,
368+
health_check=None,
367369
)
368370
mock_deployment_repository.get_route = AsyncMock(return_value=inactive_route)
369371

@@ -391,6 +393,7 @@ async def test_unassigned_session_id_is_none(
391393
created_at=datetime(2024, 1, 1, tzinfo=UTC),
392394
revision_id=uuid.uuid4(),
393395
traffic_status=RouteTrafficStatus.ACTIVE,
396+
health_check=None,
394397
)
395398
mock_deployment_repository.get_route = AsyncMock(return_value=route)
396399

@@ -416,6 +419,7 @@ def route_info(self, endpoint_id: uuid.UUID) -> RouteInfo:
416419
created_at=datetime(2024, 1, 1, tzinfo=UTC),
417420
revision_id=uuid.uuid4(),
418421
traffic_status=RouteTrafficStatus.ACTIVE,
422+
health_check=None,
419423
)
420424

421425
async def test_default_pagination(

tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py

Lines changed: 71 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
import pytest
2424
from pydantic import ValidationError
2525

26+
from ai.backend.common.config import ModelHealthCheck
2627
from ai.backend.common.data.endpoint.types import EndpointLifecycle, ScalingState
2728
from ai.backend.common.dto.manager.v2.deployment.types import IntOrPercent
2829
from ai.backend.common.exception import BackendAISchemaValidationFailed
@@ -61,6 +62,7 @@ def make_int_or_percent(value: int | float) -> IntOrPercent:
6162

6263

6364
OLD_REV = UUID("11111111-1111-1111-1111-111111111111")
65+
_STUB_HEALTH_CHECK = ModelHealthCheck(path="/health", interval=10.0, initial_delay=30.0)
6466
NEW_REV = UUID("22222222-2222-2222-2222-222222222222")
6567
PROJECT_ID = UUID("cccccccc-cccc-cccc-cccc-cccccccccccc")
6668
USER_ID = UUID("dddddddd-dddd-dddd-dddd-dddddddddddd")
@@ -146,6 +148,7 @@ def make_route(
146148
health_status: RouteHealthStatus = RouteHealthStatus.HEALTHY,
147149
endpoint_id: UUID = ENDPOINT_ID,
148150
route_id: UUID | None = None,
151+
health_check: ModelHealthCheck | None = _STUB_HEALTH_CHECK,
149152
) -> RouteInfo:
150153
return RouteInfo(
151154
route_id=route_id or uuid4(),
@@ -159,6 +162,7 @@ def make_route(
159162
traffic_status=RouteTrafficStatus.ACTIVE
160163
if status.is_active()
161164
else RouteTrafficStatus.INACTIVE,
165+
health_check=health_check,
162166
)
163167

164168

@@ -1132,3 +1136,70 @@ def test_small_fraction_with_few_replicas(self) -> None:
11321136
result = RollingUpdateStrategy().evaluate_cycle(deployment, routes, spec)
11331137

11341138
assert len(result.route_changes.rollout_specs) == 1
1139+
1140+
1141+
# ===========================================================================
1142+
# No-health-check scenario
1143+
# ===========================================================================
1144+
1145+
1146+
class TestNoHealthCheck:
1147+
"""Routes without health_check stay DEGRADED in DB but must allow READY transition."""
1148+
1149+
def test_running_degraded_no_health_check_completes(self) -> None:
1150+
"""RUNNING + DEGRADED + no health_check → counts as healthy → COMPLETED."""
1151+
deployment = make_deployment(desired=1)
1152+
spec = RollingUpdateSpec(
1153+
max_surge=make_int_or_percent(1), max_unavailable=make_int_or_percent(0)
1154+
)
1155+
routes = [
1156+
make_route(
1157+
revision_id=NEW_REV,
1158+
status=RouteStatus.RUNNING,
1159+
health_status=RouteHealthStatus.DEGRADED,
1160+
health_check=None,
1161+
)
1162+
]
1163+
1164+
result = RollingUpdateStrategy().evaluate_cycle(deployment, routes, spec)
1165+
1166+
assert result.sub_step == DeploymentLifecycleSubStep.DEPLOYING_COMPLETED
1167+
1168+
def test_running_degraded_with_health_check_does_not_complete(self) -> None:
1169+
"""RUNNING + DEGRADED + has health_check → still unhealthy → PROVISIONING."""
1170+
deployment = make_deployment(desired=1)
1171+
spec = RollingUpdateSpec(
1172+
max_surge=make_int_or_percent(1), max_unavailable=make_int_or_percent(0)
1173+
)
1174+
routes = [
1175+
make_route(
1176+
revision_id=NEW_REV,
1177+
status=RouteStatus.RUNNING,
1178+
health_status=RouteHealthStatus.DEGRADED,
1179+
health_check=_STUB_HEALTH_CHECK,
1180+
)
1181+
]
1182+
1183+
result = RollingUpdateStrategy().evaluate_cycle(deployment, routes, spec)
1184+
1185+
assert result.sub_step == DeploymentLifecycleSubStep.DEPLOYING_PROVISIONING
1186+
1187+
def test_multiple_replicas_no_health_check_completes(self) -> None:
1188+
"""2 desired, 2 RUNNING DEGRADED no-health-check → COMPLETED."""
1189+
deployment = make_deployment(desired=2)
1190+
spec = RollingUpdateSpec(
1191+
max_surge=make_int_or_percent(1), max_unavailable=make_int_or_percent(0)
1192+
)
1193+
routes = [
1194+
make_route(
1195+
revision_id=NEW_REV,
1196+
status=RouteStatus.RUNNING,
1197+
health_status=RouteHealthStatus.DEGRADED,
1198+
health_check=None,
1199+
)
1200+
for _ in range(2)
1201+
]
1202+
1203+
result = RollingUpdateStrategy().evaluate_cycle(deployment, routes, spec)
1204+
1205+
assert result.sub_step == DeploymentLifecycleSubStep.DEPLOYING_COMPLETED

0 commit comments

Comments (0)