Skip to content

Commit 3d455ad

Browse files
committed
wip
1 parent ee4c3e9 commit 3d455ad

6 files changed

Lines changed: 41 additions & 47 deletions

File tree

src/ai/backend/manager/data/deployment/types.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -164,21 +164,15 @@ class DeploymentSubStatus(enum.StrEnum):
164164
class DeploymentSubStep(DeploymentSubStatus):
165165
"""Sub-steps for the DEPLOYING lifecycle phase.
166166
167-
Active states:
168-
- PROVISIONING: New revision routes are being provisioned; waiting for readiness.
169-
- PROGRESSING: Actively replacing old routes with new routes.
170-
- ROLLING_BACK: Actively rolling back failed new routes to previous revision.
171-
172-
Terminal markers (no handler execution, trigger transition only):
173-
- COMPLETED: All strategy conditions satisfied; ready for revision swap.
174-
- ROLLED_BACK: Rollback finished; ready for cleanup and transition to READY.
167+
- PROVISIONING: New revision routes are being provisioned and old routes
168+
are being drained. The main handler for rolling updates.
169+
- ROLLING_BACK: Clearing deploying_revision and transitioning to READY.
170+
- COMPLETED: All strategy conditions satisfied; triggers revision swap.
175171
"""
176172

177173
PROVISIONING = "provisioning"
178-
PROGRESSING = "progressing"
179174
ROLLING_BACK = "rolling_back"
180175
COMPLETED = "completed"
181-
ROLLED_BACK = "rolled_back"
182176

183177

184178
@dataclass(frozen=True)

src/ai/backend/manager/sokovan/deployment/handlers/deploying.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,6 @@ def target_statuses(cls) -> list[DeploymentLifecycleStatus]:
307307
lifecycle=EndpointLifecycle.DEPLOYING,
308308
sub_status=DeploymentSubStep.PROVISIONING,
309309
),
310-
DeploymentLifecycleStatus(
311-
lifecycle=EndpointLifecycle.DEPLOYING,
312-
sub_status=DeploymentSubStep.PROGRESSING,
313-
),
314310
]
315311

316312
@classmethod

src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,10 @@ def evaluate_cycle(
7070
7171
FSM flow:
7272
1. Classify routes into old / new by revision_id.
73-
2. If any new route is PROVISIONING -> PROVISIONING (wait).
74-
3. If no old routes remain and new_healthy >= desired -> COMPLETED.
75-
4. Compute allowed surge/unavailable, decide create/terminate -> PROGRESSING.
73+
2. If any new route is PROVISIONING → PROVISIONING (wait).
74+
3. If no old routes remain and new_healthy >= desired → COMPLETED.
75+
4. Compute allowed surge/unavailable, decide create/terminate
76+
→ PROVISIONING (with route mutations).
7677
7778
Rollback is not decided by the FSM — the coordinator's timeout
7879
sweep handles it by transitioning to ROLLING_BACK when the
@@ -165,7 +166,7 @@ def _compute_progressing(
165166
classified: _ClassifiedRoutes,
166167
desired: int,
167168
) -> StrategyCycleResult:
168-
"""Compute surge/unavailable budget and return PROGRESSING with route mutations."""
169+
"""Compute surge/unavailable budget and return PROVISIONING with route mutations."""
169170
max_surge = self._spec.max_surge # extra routes allowed above desired
170171
max_unavailable = self._spec.max_unavailable # routes allowed to be down
171172

@@ -190,7 +191,7 @@ def _compute_progressing(
190191
route_changes.drain_route_ids.append(route.route_id)
191192

192193
log.debug(
193-
"deployment {}: PROGRESSING create={}, terminate={}, "
194+
"deployment {}: PROVISIONING create={}, terminate={}, "
194195
"old_active={}, new_healthy={}, new_unhealthy={}, new_prov={}",
195196
deployment.id,
196197
to_create,
@@ -202,7 +203,7 @@ def _compute_progressing(
202203
)
203204

204205
return StrategyCycleResult(
205-
sub_step=DeploymentSubStep.PROGRESSING,
206+
sub_step=DeploymentSubStep.PROVISIONING,
206207
route_changes=route_changes,
207208
)
208209

src/ai/backend/manager/sokovan/deployment/strategy/types.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ class RouteChanges:
3131
class StrategyCycleResult:
3232
"""Result of evaluating a single deployment's strategy cycle.
3333
34-
``sub_step`` indicates the next state: PROVISIONING, PROGRESSING,
35-
or COMPLETED.
34+
``sub_step`` indicates the next state: PROVISIONING or COMPLETED.
3635
"""
3736

3837
sub_step: DeploymentSubStep

tests/unit/manager/sokovan/deployment/strategy/test_applier.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def summary_with_rollout() -> StrategyEvaluationSummary:
7070
@pytest.fixture
7171
def summary_with_drain() -> StrategyEvaluationSummary:
7272
return _build_summary(
73-
{uuid4(): DeploymentSubStep.PROGRESSING},
73+
{uuid4(): DeploymentSubStep.PROVISIONING},
7474
route_changes=RouteChanges(drain_route_ids=[uuid4()]),
7575
)
7676

@@ -90,7 +90,7 @@ def completed_summary() -> tuple[StrategyEvaluationSummary, set[UUID]]:
9090
@pytest.fixture
9191
def rolled_back_summary() -> tuple[StrategyEvaluationSummary, set[UUID]]:
9292
ep_id = uuid4()
93-
summary = _build_summary({ep_id: DeploymentSubStep.ROLLED_BACK})
93+
summary = _build_summary({ep_id: DeploymentSubStep.PROVISIONING})
9494
return summary, {ep_id}
9595

9696

@@ -103,7 +103,7 @@ def mixed_summary() -> tuple[StrategyEvaluationSummary, UUID, UUID, UUID]:
103103
{
104104
provisioning_id: DeploymentSubStep.PROVISIONING,
105105
completed_id: DeploymentSubStep.COMPLETED,
106-
rolled_back_id: DeploymentSubStep.ROLLED_BACK,
106+
rolled_back_id: DeploymentSubStep.PROVISIONING,
107107
},
108108
route_changes=RouteChanges(
109109
rollout_specs=[MagicMock()],

tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
"""Tests for the rolling update FSM evaluation (BEP-1049).
22
33
Tests cover:
4-
- FSM state transitions: PROVISIONING, PROGRESSING, ROLLED_BACK, COMPLETED
4+
- FSM state transitions: PROVISIONING and COMPLETED
55
- max_surge / max_unavailable budget calculations
66
- Multi-cycle progression and termination priority
77
- Edge cases and boundary conditions
8+
9+
Note: Rollback is not decided by the FSM — the coordinator's timeout
10+
sweep handles it. The FSM only returns PROVISIONING (with or without
11+
route mutations) or COMPLETED.
812
"""
913

1014
from __future__ import annotations
@@ -136,13 +140,13 @@ class TestBasicFSMStates:
136140
"""Test fundamental FSM transitions."""
137141

138142
def test_no_routes_creates_new(self) -> None:
139-
"""First cycle with 0 routes → PROGRESSING, creates desired count."""
143+
"""First cycle with 0 routes → PROVISIONING with route creation."""
140144
deployment = make_deployment(desired=1)
141145
spec = RollingUpdateSpec(max_surge=1, max_unavailable=0)
142146

143147
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, [])
144148

145-
assert result.sub_step == DeploymentSubStep.PROGRESSING
149+
assert result.sub_step == DeploymentSubStep.PROVISIONING
146150
assert len(result.route_changes.rollout_specs) == 1
147151
assert len(result.route_changes.drain_route_ids) == 0
148152

@@ -181,8 +185,11 @@ def test_completed_when_all_new_healthy_and_no_old(self) -> None:
181185
pytest.param(RouteStatus.TERMINATED, id="terminated"),
182186
],
183187
)
184-
def test_rollback_when_all_new_in_terminal_state(self, failed_status: RouteStatus) -> None:
185-
"""All new routes in terminal state (with old still present) → ROLLED_BACK."""
188+
def test_all_new_failed_retries_creation(self, failed_status: RouteStatus) -> None:
189+
"""All new routes failed → FSM retries by creating new routes.
190+
191+
Rollback is handled by the coordinator's timeout sweep, not the FSM.
192+
"""
186193
deployment = make_deployment(desired=1)
187194
spec = RollingUpdateSpec(max_surge=1, max_unavailable=0)
188195
routes = [
@@ -192,7 +199,7 @@ def test_rollback_when_all_new_in_terminal_state(self, failed_status: RouteStatu
192199

193200
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
194201

195-
assert result.sub_step == DeploymentSubStep.ROLLED_BACK
202+
assert result.sub_step == DeploymentSubStep.PROVISIONING
196203

197204

198205
# ===========================================================================
@@ -388,7 +395,7 @@ def test_not_completed_when_old_still_exists(self) -> None:
388395

389396
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
390397

391-
assert result.sub_step == DeploymentSubStep.PROGRESSING
398+
assert result.sub_step == DeploymentSubStep.PROVISIONING
392399
assert len(result.route_changes.drain_route_ids) == 1
393400

394401

@@ -400,11 +407,8 @@ def test_not_completed_when_old_still_exists(self) -> None:
400407
class TestRouteStatusClassification:
401408
"""Test how different route statuses affect classification."""
402409

403-
# FIXME: Code classifies DEGRADED as new_provisioning (same as PROVISIONING),
404-
# so this test should expect DeploymentSubStep.PROVISIONING, not ROLLED_BACK.
405-
# Verify the intended behavior and update accordingly.
406-
def test_degraded_new_triggers_rollback(self) -> None:
407-
"""DEGRADED new routes — see FIXME above for classification concern."""
410+
def test_degraded_new_waits_provisioning(self) -> None:
411+
"""DEGRADED new routes are treated as PROVISIONING (still warming up)."""
408412
deployment = make_deployment(desired=1)
409413
spec = RollingUpdateSpec(max_surge=1, max_unavailable=0)
410414
routes = [
@@ -413,10 +417,10 @@ def test_degraded_new_triggers_rollback(self) -> None:
413417

414418
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
415419

416-
assert result.sub_step == DeploymentSubStep.ROLLED_BACK
420+
assert result.sub_step == DeploymentSubStep.PROVISIONING
417421

418-
def test_unhealthy_new_triggers_rollback(self) -> None:
419-
"""All new UNHEALTHY (none healthy, none provisioning) → ROLLED_BACK."""
422+
def test_unhealthy_new_retries(self) -> None:
423+
"""All new UNHEALTHY → PROVISIONING (retries, timeout handles rollback)."""
420424
deployment = make_deployment(desired=1)
421425
spec = RollingUpdateSpec(max_surge=1, max_unavailable=0)
422426
routes = [
@@ -425,7 +429,7 @@ def test_unhealthy_new_triggers_rollback(self) -> None:
425429

426430
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
427431

428-
assert result.sub_step == DeploymentSubStep.ROLLED_BACK
432+
assert result.sub_step == DeploymentSubStep.PROVISIONING
429433

430434
@pytest.mark.parametrize(
431435
"inactive_status",
@@ -459,7 +463,7 @@ def test_partial_new_failure_continues_progress(self) -> None:
459463

460464
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
461465

462-
assert result.sub_step == DeploymentSubStep.PROGRESSING
466+
assert result.sub_step == DeploymentSubStep.PROVISIONING
463467

464468
def test_old_provisioning_counted_as_active(self) -> None:
465469
"""Old routes in PROVISIONING are counted as old_active."""
@@ -472,7 +476,7 @@ def test_old_provisioning_counted_as_active(self) -> None:
472476

473477
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
474478

475-
assert result.sub_step == DeploymentSubStep.PROGRESSING
479+
assert result.sub_step == DeploymentSubStep.PROVISIONING
476480

477481

478482
# ===========================================================================
@@ -548,7 +552,7 @@ def test_more_new_healthy_than_desired_still_completes(self) -> None:
548552
assert result.sub_step == DeploymentSubStep.COMPLETED
549553

550554
def test_only_failed_new_no_old_rolls_back(self) -> None:
551-
"""Only failed new routes, no old → ROLLED_BACK."""
555+
"""Only failed new routes, no old → PROVISIONING (retries creation)."""
552556
deployment = make_deployment(desired=2)
553557
spec = RollingUpdateSpec(max_surge=1, max_unavailable=0)
554558
routes = [
@@ -558,7 +562,7 @@ def test_only_failed_new_no_old_rolls_back(self) -> None:
558562

559563
result = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
560564

561-
assert result.sub_step == DeploymentSubStep.ROLLED_BACK
565+
assert result.sub_step == DeploymentSubStep.PROVISIONING
562566

563567
def test_all_old_inactive_no_new_creates_desired(self) -> None:
564568
"""All old routes are terminated, no new → create desired."""
@@ -579,7 +583,7 @@ def test_deploying_rev_none_rejected(self) -> None:
579583
spec = RollingUpdateSpec(max_surge=1, max_unavailable=0)
580584
routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)]
581585

582-
with pytest.raises(ValueError, match="deploying_revision_id must not be None"):
586+
with pytest.raises(Exception): # InvalidEndpointState
583587
RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes)
584588

585589
def test_route_without_revision_classified_as_old(self) -> None:
@@ -676,7 +680,7 @@ def test_step_by_step_rolling_update(self) -> None:
676680
r4 = RollingUpdateStrategy(spec).evaluate_cycle(deployment, routes_c4)
677681
assert len(r4.route_changes.rollout_specs) == 0
678682
assert len(r4.route_changes.drain_route_ids) == 1
679-
assert r4.sub_step == DeploymentSubStep.PROGRESSING
683+
assert r4.sub_step == DeploymentSubStep.PROVISIONING
680684

681685
# Cycle 5: 0 old, 5 new healthy → completed
682686
routes_c5 = [make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY) for _ in range(5)]

0 commit comments

Comments
 (0)