Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/9997.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implement Rolling Update deployment strategy
2 changes: 2 additions & 0 deletions src/ai/backend/manager/models/deployment_policy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from .row import (
BlueGreenSpec,
DeploymentPolicyRow,
DeploymentStrategySpec,
RollingUpdateSpec,
)

# Public names of this package; each one is imported from the ``.row`` module.
__all__ = (
    "BlueGreenSpec",
    "DeploymentPolicyRow",
    "DeploymentStrategySpec",
    "RollingUpdateSpec",
)
4 changes: 4 additions & 0 deletions src/ai/backend/manager/models/deployment_policy/row.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# Names exported by this module via ``from ... import *``.
__all__ = (
    "BlueGreenSpec",
    "DeploymentPolicyRow",
    "DeploymentStrategySpec",
    "RollingUpdateSpec",
)

Expand Down Expand Up @@ -61,6 +62,9 @@ class BlueGreenSpec(BaseModel):
promote_delay_seconds: int = 0


# Union of the per-strategy spec models (rolling update or blue-green).
# Strategy ``evaluate_cycle`` methods annotate their ``spec`` argument with it.
DeploymentStrategySpec = RollingUpdateSpec | BlueGreenSpec


def _get_endpoint_join_condition() -> sa.ColumnElement[bool]:
from ai.backend.manager.models.endpoint import EndpointRow

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,20 @@
DeploymentInfo,
RouteInfo,
)
from ai.backend.manager.models.deployment_policy import BlueGreenSpec
from ai.backend.manager.models.deployment_policy import DeploymentStrategySpec

from .types import AbstractDeploymentStrategy, StrategyCycleResult


class BlueGreenStrategy(AbstractDeploymentStrategy):
    """Blue-green deployment strategy FSM.

    Placeholder: ``evaluate_cycle`` unconditionally raises
    ``NotImplementedError``.
    """

    # NOTE(review): this span comes from a diff view in which removed and added
    # lines are interleaved without +/- markers. The evaluator hunk in the same
    # diff changes ``entry.strategy_cls(spec)`` to ``entry.strategy_cls()`` and
    # passes the spec to ``evaluate_cycle`` instead, so this constructor is
    # presumably a removed line -- confirm against the merged file.
    def __init__(self, spec: BlueGreenSpec) -> None:
        super().__init__(spec)
        self._spec = spec

    @override
    def evaluate_cycle(
        self,
        deployment: DeploymentInfo,
        routes: Sequence[RouteInfo],
        spec: DeploymentStrategySpec,
    ) -> StrategyCycleResult:
        """Evaluate one cycle of blue-green deployment for a single deployment.

        Args:
            deployment: The deployment to evaluate.
            routes: Routes passed in by the caller; presumably those belonging
                to ``deployment`` -- verify against the evaluator.
            spec: Strategy spec supplied per call (the evaluator passes
                ``policy.strategy_spec``). Not read by this placeholder.

        Raises:
            NotImplementedError: Always; blue-green is not implemented yet.
        """
        raise NotImplementedError("Blue-green deployment strategy is not yet implemented")
47 changes: 36 additions & 11 deletions src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,16 @@
InvalidDeploymentStrategy,
InvalidDeploymentStrategySpec,
)
from ai.backend.manager.models.deployment_policy.conditions import DeploymentPolicyConditions
from ai.backend.manager.models.routing.conditions import RouteConditions
from ai.backend.manager.repositories.base import BatchQuerier, NoPagination
from ai.backend.manager.repositories.base import (
BatchQuerier,
NoPagination,
QueryCondition,
combine_conditions_or,
)
from ai.backend.manager.repositories.deployment.options import (
DeploymentPolicyConditions,
RouteConditions,
)
from ai.backend.manager.repositories.deployment.repository import DeploymentRepository
from ai.backend.manager.sokovan.deployment.recorder import DeploymentRecorderContext

Expand Down Expand Up @@ -79,15 +86,33 @@ async def evaluate(
)
)
policy_map = {policy.endpoint: policy for policy in policy_search.items}
# Fetch all non-terminated routes so the strategy can detect rollback
# conditions (e.g. FAILED_TO_START routes after a coordinator crash).
# Fetch non-terminated routes + terminated routes belonging to a
# deploying revision. The FSM needs terminated new-revision routes
# to count accumulated failures for rollback detection, but old
# terminated routes are irrelevant and would bloat the result set.
deploying_revision_ids = {
deployment.deploying_revision_id
for deployment in deployments
if deployment.deploying_revision_id is not None
}
route_conditions: list[QueryCondition] = [
RouteConditions.by_endpoint_ids(endpoint_ids),
]
if deploying_revision_ids:
route_conditions.append(
combine_conditions_or([
RouteConditions.exclude_statuses([RouteStatus.TERMINATED]),
RouteConditions.by_revision_ids(deploying_revision_ids),
])
)
Comment on lines +89 to +107
Copy link

Copilot AI Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment and query logic here say terminated new-revision routes are needed for "rollback detection", but RollingUpdateStrategy currently never uses new_failed_count for any decision (it only logs it) and the tests/docs state rollback is handled by coordinator timeout. If rollback detection is no longer part of the FSM, consider simplifying the route query back to excluding TERMINATED routes (or updating the comment to reflect the real reason for including terminated routes) to avoid extra result-set bloat and confusion.

Copilot uses AI. Check for mistakes.
else:
route_conditions.append(
RouteConditions.exclude_statuses([RouteStatus.TERMINATED]),
)
route_search = await self._deployment_repo.search_routes(
BatchQuerier(
pagination=NoPagination(),
conditions=[
RouteConditions.by_endpoint_ids(endpoint_ids),
RouteConditions.exclude_statuses([RouteStatus.TERMINATED]),
],
conditions=route_conditions,
)
)
route_map: defaultdict[UUID, list[RouteInfo]] = defaultdict(list)
Expand All @@ -108,7 +133,7 @@ async def evaluate(

try:
strategy = self._create_strategy(policy.strategy, policy)
cycle_result = strategy.evaluate_cycle(deployment, routes)
cycle_result = strategy.evaluate_cycle(deployment, routes, policy.strategy_spec)
except BackendAIError as e:
log.warning("deployment {}: evaluation error — {}", deployment.id, e)
result.errors.append(EvaluationErrorData(deployment=deployment, reason=str(e)))
Expand Down Expand Up @@ -170,4 +195,4 @@ def _create_strategy(
f" got {type(spec).__name__}"
),
)
return entry.strategy_cls(spec)
return entry.strategy_cls()
Loading
Loading