Skip to content

Commit 957c68b

Browse files
HyeockJinKim and claude authored
feat(BA-6031): extend coordinator layer to support sub_status and traffic_status (#11606)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 23c2883 commit 957c68b

31 files changed

Lines changed: 351 additions & 143 deletions

File tree

changes/11606.enhance.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Extend route coordinator to support sub_status and traffic_status transitions across all lifecycle handler axes.

src/ai/backend/common/dto/manager/scheduling_history/response.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ class RouteHistoryDTO(BaseResponseModel):
7575
phase: str
7676
from_status: str | None = None
7777
to_status: str | None = None
78+
from_sub_status: str | None = None
79+
to_sub_status: str | None = None
7880
result: str
7981
error_code: str | None = None
8082
message: str | None = None

src/ai/backend/common/dto/manager/v2/scheduling_history/response.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,20 +75,17 @@ class DeploymentHistoryNode(BaseResponseModel):
7575

7676

7777
class RouteHistoryNode(BaseResponseModel):
78-
"""Node model representing a route scheduling history record.
79-
80-
from_status/to_status values depend on category:
81-
- category=lifecycle: lifecycle status values (provisioning, running, etc.)
82-
- category=health: health status values (healthy, unhealthy, etc.)
83-
"""
78+
"""Node model representing a route scheduling history record."""
8479

8580
id: UUID = Field(description="History record ID")
8681
route_id: UUID = Field(description="Route ID this history belongs to")
8782
deployment_id: UUID = Field(description="Deployment ID the route belongs to")
8883
category: str = Field(description="Handler category: 'lifecycle' or 'health'")
8984
phase: str = Field(description="Scheduling phase")
90-
from_status: str | None = Field(default=None, description="Status before transition")
91-
to_status: str | None = Field(default=None, description="Status after transition")
85+
from_status: str | None = Field(default=None, description="Lifecycle status before transition")
86+
to_status: str | None = Field(default=None, description="Lifecycle status after transition")
87+
from_sub_status: str | None = Field(default=None, description="Sub-status before transition")
88+
to_sub_status: str | None = Field(default=None, description="Sub-status after transition")
9289
result: str = Field(description="Result of the scheduling attempt")
9390
error_code: str | None = Field(default=None, description="Error code if scheduling failed")
9491
message: str | None = Field(default=None, description="Human-readable message or error detail")

src/ai/backend/manager/api/adapters/scheduling_history/adapter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,8 @@ def _route_data_to_dto(data: RouteHistoryData) -> RouteHistoryNode:
760760
phase=data.phase,
761761
from_status=data.from_status,
762762
to_status=data.to_status,
763+
from_sub_status=data.from_sub_status,
764+
to_sub_status=data.to_sub_status,
763765
result=data.result.value,
764766
error_code=data.error_code,
765767
message=data.message,

src/ai/backend/manager/api/rest/scheduling_history/adapter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,8 @@ def convert_route_history_to_dto(self, data: RouteHistoryData) -> RouteHistoryDT
375375
phase=data.phase,
376376
from_status=data.from_status,
377377
to_status=data.to_status,
378+
from_sub_status=data.from_sub_status,
379+
to_sub_status=data.to_sub_status,
378380
result=data.result.value,
379381
error_code=data.error_code,
380382
message=data.message,

src/ai/backend/manager/data/deployment/types.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -265,23 +265,26 @@ class DeploymentTargetStatuses:
265265

266266
@dataclass(frozen=True)
267267
class RouteTargetStatuses:
268-
"""Target statuses for route handler filtering (lifecycle x health x traffic).
268+
"""Target statuses for route handler filtering.
269269
270-
``traffic=None`` skips the traffic-status predicate. Pass a non-empty list
271-
to restrict to specific ``RouteTrafficStatus`` values.
270+
Each axis is optional — ``None`` skips that predicate entirely.
271+
Pass a non-empty list to restrict to specific values on that axis.
272272
"""
273273

274-
lifecycle: list[RouteStatus]
275-
health: list[RouteHealthStatus]
274+
lifecycle: list[RouteStatus] | None = None
275+
health: list[RouteHealthStatus] | None = None
276276
traffic: list[RouteTrafficStatus] | None = None
277+
sub_status: list[RouteSubStatus] | None = None
277278

278279

279280
@dataclass(frozen=True)
280281
class RouteTransitionTarget:
281-
"""Target state for a route transition (lifecycle + health)."""
282+
"""Target state for a route transition."""
282283

283284
status: RouteStatus | None = None
284285
health_status: RouteHealthStatus | None = None
286+
sub_status: RouteSubStatus | None = None
287+
traffic_status: RouteTrafficStatus | None = None
285288

286289

287290
@dataclass(frozen=True)
@@ -1148,21 +1151,18 @@ class DeploymentHistoryData:
11481151

11491152
@dataclass
11501153
class RouteHistoryData:
1151-
"""Domain model for route history.
1152-
1153-
from_status/to_status contain the relevant status for the category:
1154-
- category=lifecycle: lifecycle status values (provisioning, running, etc.)
1155-
- category=health: health status values (healthy, unhealthy, etc.)
1156-
"""
1154+
"""Domain model for route history."""
11571155

11581156
id: UUID
11591157
route_id: UUID
11601158
deployment_id: UUID
11611159

1162-
category: str # RouteHandlerCategory value
1160+
category: RouteHandlerCategory
11631161
phase: str # RouteLifecycleType value
11641162
from_status: str | None
11651163
to_status: str | None
1164+
from_sub_status: str | None
1165+
to_sub_status: str | None
11661166

11671167
result: SchedulingResult
11681168
error_code: str | None
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Add sub_status columns to route_history and drop health_status columns.
2+
3+
Revision ID: b2c3d4e5f6a7
4+
Revises: d52fd86275bf
5+
Create Date: 2026-05-14
6+
7+
"""
8+
9+
# Part of: 26.5.0
10+
11+
from alembic import op
12+
13+
# revision identifiers, used by Alembic.
14+
revision = "b2c3d4e5f6a7"
15+
down_revision = "d52fd86275bf"
16+
branch_labels = None
17+
depends_on = None
18+
19+
20+
def upgrade() -> None:
21+
conn = op.get_bind()
22+
for col in ("from_sub_status", "to_sub_status"):
23+
conn.exec_driver_sql(
24+
f"ALTER TABLE route_history ADD COLUMN IF NOT EXISTS {col} VARCHAR(64)"
25+
)
26+
for col in ("from_health_status", "to_health_status"):
27+
conn.exec_driver_sql(f"ALTER TABLE route_history DROP COLUMN IF EXISTS {col}")
28+
29+
30+
def downgrade() -> None:
31+
conn = op.get_bind()
32+
for col in ("to_sub_status", "from_sub_status"):
33+
op.drop_column("route_history", col)
34+
for col in ("from_health_status", "to_health_status"):
35+
conn.exec_driver_sql(
36+
f"ALTER TABLE route_history ADD COLUMN IF NOT EXISTS {col} VARCHAR(64)"
37+
)

src/ai/backend/manager/models/routing/conditions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ai.backend.manager.data.deployment.types import (
1111
RouteHealthStatus,
1212
RouteStatus,
13+
RouteSubStatus,
1314
RouteTrafficStatus,
1415
)
1516
from ai.backend.manager.models.routing import RoutingRow
@@ -117,6 +118,13 @@ def inner() -> sa.sql.expression.ColumnElement[bool]:
117118

118119
return inner
119120

121+
@staticmethod
122+
def by_sub_statuses(sub_statuses: list[RouteSubStatus]) -> QueryCondition:
123+
def inner() -> sa.sql.expression.ColumnElement[bool]:
124+
return RoutingRow.sub_status.in_(sub_statuses)
125+
126+
return inner
127+
120128
@staticmethod
121129
def by_revision_ids(revision_ids: Collection[uuid.UUID]) -> QueryCondition:
122130
def inner() -> sa.sql.expression.ColumnElement[bool]:

src/ai/backend/manager/models/scheduling_history/row.py

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from ai.backend.manager.data.deployment.types import (
1212
DeploymentHandlerCategory,
1313
DeploymentHistoryData,
14+
RouteHandlerCategory,
1415
RouteHistoryData,
1516
)
1617
from ai.backend.manager.data.kernel.types import (
@@ -252,19 +253,22 @@ class RouteHistoryRow(Base): # type: ignore[misc]
252253
"deployment_id", GUID, nullable=False, index=True
253254
)
254255

255-
category: Mapped[str] = mapped_column(
256-
"category", sa.String(length=32), nullable=False, server_default=sa.text("'lifecycle'")
256+
category: Mapped[RouteHandlerCategory] = mapped_column(
257+
"category",
258+
StrEnumType(RouteHandlerCategory),
259+
nullable=False,
260+
server_default=sa.text("'lifecycle'"),
257261
)
258262
phase: Mapped[str] = mapped_column("phase", sa.String(length=64), nullable=False)
259263
from_status: Mapped[str | None] = mapped_column(
260264
"from_status", sa.String(length=64), nullable=True
261265
)
262266
to_status: Mapped[str | None] = mapped_column("to_status", sa.String(length=64), nullable=True)
263-
from_health_status: Mapped[str | None] = mapped_column(
264-
"from_health_status", sa.String(length=64), nullable=True
267+
from_sub_status: Mapped[str | None] = mapped_column(
268+
"from_sub_status", sa.String(length=64), nullable=True
265269
)
266-
to_health_status: Mapped[str | None] = mapped_column(
267-
"to_health_status", sa.String(length=64), nullable=True
270+
to_sub_status: Mapped[str | None] = mapped_column(
271+
"to_sub_status", sa.String(length=64), nullable=True
268272
)
269273

270274
result: Mapped[str] = mapped_column("result", sa.String(length=32), nullable=False)
@@ -296,37 +300,23 @@ class RouteHistoryRow(Base): # type: ignore[misc]
296300
)
297301

298302
def should_merge_with(self, new_row: RouteHistoryRow) -> bool:
299-
"""Check if a new entry should be merged with this one.
300-
301-
Merge conditions:
302-
- Same category, phase, error_code
303-
- For lifecycle category: same to_status
304-
- For health category: same to_health_status
305-
"""
306-
if self.category != new_row.category:
307-
return False
308-
if self.phase != new_row.phase or self.error_code != new_row.error_code:
309-
return False
310-
if self.category == "health":
311-
return self.to_health_status == new_row.to_health_status
312-
return self.to_status == new_row.to_status
303+
return (
304+
self.category == new_row.category
305+
and self.phase == new_row.phase
306+
and self.error_code == new_row.error_code
307+
)
313308

314309
def to_data(self) -> RouteHistoryData:
315-
# API exposes unified from_status/to_status based on category
316-
if self.category == "health":
317-
from_val = self.from_health_status
318-
to_val = self.to_health_status
319-
else:
320-
from_val = self.from_status
321-
to_val = self.to_status
322310
return RouteHistoryData(
323311
id=self.id,
324312
route_id=self.route_id,
325313
deployment_id=self.deployment_id,
326314
category=self.category,
327315
phase=self.phase,
328-
from_status=from_val,
329-
to_status=to_val,
316+
from_status=self.from_status,
317+
to_status=self.to_status,
318+
from_sub_status=self.from_sub_status,
319+
to_sub_status=self.to_sub_status,
330320
result=SchedulingResult(self.result),
331321
error_code=self.error_code,
332322
message=self.message,

src/ai/backend/manager/repositories/deployment/creators/route.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,21 @@
33
from __future__ import annotations
44

55
import uuid
6-
from dataclasses import dataclass
6+
from dataclasses import dataclass, field
77
from typing import Any, override
88

99
from ai.backend.common.identifier.deployment import DeploymentID
1010
from ai.backend.common.identifier.deployment_revision import DeploymentRevisionID
1111
from ai.backend.manager.data.deployment.types import (
1212
RouteHealthStatus,
1313
RouteStatus,
14+
RouteSubStatus,
1415
RouteTrafficStatus,
1516
)
1617
from ai.backend.manager.models.routing import RoutingRow
1718
from ai.backend.manager.repositories.base import CreatorSpec
1819
from ai.backend.manager.repositories.base.updater import BatchUpdaterSpec
20+
from ai.backend.manager.types import OptionalState, TriState
1921

2022

2123
@dataclass
@@ -54,14 +56,16 @@ def build_row(self) -> RoutingRow:
5456
class RouteBatchUpdaterSpec(BatchUpdaterSpec[RoutingRow]):
5557
"""BatchUpdaterSpec for batch updating routes.
5658
57-
Accepts optional fields and only updates fields that are specified.
58-
This allows flexible partial updates for various route operations.
59+
Each axis uses the appropriate optional type:
60+
- :class:`OptionalState` for status fields that cannot be nullified
61+
- :class:`TriState` for ``sub_status`` which must support explicit ``None``
62+
(NULLIFY) when a route exits the PROVISIONING stage
5963
"""
6064

61-
status: RouteStatus | None = None
62-
health_status: RouteHealthStatus | None = None
63-
traffic_ratio: float | None = None
64-
traffic_status: RouteTrafficStatus | None = None
65+
status: OptionalState[RouteStatus] = field(default_factory=OptionalState.nop)
66+
health_status: OptionalState[RouteHealthStatus] = field(default_factory=OptionalState.nop)
67+
traffic_status: OptionalState[RouteTrafficStatus] = field(default_factory=OptionalState.nop)
68+
sub_status: TriState[RouteSubStatus] = field(default_factory=TriState.nop)
6569

6670
@property
6771
@override
@@ -71,12 +75,8 @@ def row_class(self) -> type[RoutingRow]:
7175
@override
7276
def build_values(self) -> dict[str, Any]:
7377
values: dict[str, Any] = {}
74-
if self.status is not None:
75-
values["status"] = self.status
76-
if self.health_status is not None:
77-
values["health_status"] = self.health_status
78-
if self.traffic_ratio is not None:
79-
values["traffic_ratio"] = self.traffic_ratio
80-
if self.traffic_status is not None:
81-
values["traffic_status"] = self.traffic_status
78+
self.status.update_dict(values, "status")
79+
self.health_status.update_dict(values, "health_status")
80+
self.traffic_status.update_dict(values, "traffic_status")
81+
self.sub_status.update_dict(values, "sub_status")
8282
return values

0 commit comments

Comments
 (0)