Skip to content

Commit 5a6c7a2

Browse files
committed
refactor(BA-6057): remove legacy scheduler package superseded by Sokovan
- Delete src/ai/backend/manager/scheduler/ (FIFO/LIFO/DRF schedulers, agent selectors, predicates, utils) — all functionality has been ported to manager/sokovan/scheduler/.
- Move ScheduleType enum into manager/sokovan/scheduler/types.py and update its six importers.
- Remove dead settle_agent_alloc() and KernelAgentBinding usage from AgentRegistry (no callers since the Phase 3 JSONB write removal).
- Drop the orphaned backendai_scheduler_v10 and backendai_agentselector_v10 entrypoint groups from BUILD and plugin entrypoint metadata.
- Update tests/unit/plugin/test_entrypoint.py fixtures, model row comments, and manager/CLAUDE.md Scheduler guardrail to point at the Sokovan location.
1 parent 8dbee20 commit 5a6c7a2

21 files changed

Lines changed: 54 additions & 1462 deletions

File tree

src/ai/backend/manager/BUILD

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,6 @@ python_distribution(
4040
"mgr": "ai.backend.manager.cli.__main__:main",
4141
"mgr.start-server": "ai.backend.manager.cli.start_server:main",
4242
},
43-
"backendai_scheduler_v10": {
44-
"fifo": "ai.backend.manager.scheduler.fifo:FIFOSlotScheduler",
45-
"lifo": "ai.backend.manager.scheduler.fifo:LIFOSlotScheduler",
46-
"drf": "ai.backend.manager.scheduler.drf:DRFScheduler",
47-
},
48-
"backendai_agentselector_v10": {
49-
"legacy": "ai.backend.manager.scheduler.agent_selector:LegacyAgentSelector",
50-
"roundrobin": "ai.backend.manager.scheduler.agent_selector:RoundRobinAgentSelector",
51-
"concentrated": "ai.backend.manager.scheduler.agent_selector:ConcentratedAgentSelector",
52-
"dispersed": "ai.backend.manager.scheduler.agent_selector:DispersedAgentSelector",
53-
},
5443
"backendai_error_monitor_v20": {
5544
"intrinsic": "ai.backend.manager.plugin.error_monitor:ErrorMonitor",
5645
},

src/ai/backend/manager/CLAUDE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,5 @@ Each layer has its own `CLAUDE.md`. Read it before modifying code in that direct
3131

3232
## Scheduler
3333

34-
- Scheduling logic belongs in `manager/scheduler/` — do not add scheduling decisions inside
35-
API handlers or service methods.
34+
- Scheduling logic belongs in `manager/sokovan/scheduler/` — do not add scheduling decisions
35+
inside API handlers or service methods.

src/ai/backend/manager/event_dispatcher/handlers/schedule.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
from ai.backend.common.types import AgentId
1919
from ai.backend.logging.utils import BraceStyleAdapter
2020
from ai.backend.manager.data.deployment.types import DeploymentLifecycleSubStep
21-
from ai.backend.manager.scheduler.types import ScheduleType
2221
from ai.backend.manager.sokovan.deployment.coordinator import DeploymentCoordinator
2322
from ai.backend.manager.sokovan.deployment.route.coordinator import RouteCoordinator
2423
from ai.backend.manager.sokovan.deployment.route.types import RouteLifecycleType
2524
from ai.backend.manager.sokovan.deployment.types import DeploymentLifecycleType
2625
from ai.backend.manager.sokovan.scheduler.coordinator import ScheduleCoordinator
26+
from ai.backend.manager.sokovan.scheduler.types import ScheduleType
2727
from ai.backend.manager.sokovan.scheduling_controller import SchedulingController
2828

2929
log = BraceStyleAdapter(logging.getLogger(__spec__.name))

src/ai/backend/manager/models/kernel/row.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ class KernelRow(Base): # type: ignore[misc]
358358
# // an ISO 8601 formatted timestamp of the last attempt
359359
# "failed_predicates": [
360360
# { "name": "concurrency", "msg": "You cannot run more than 30 concurrent sessions." },
361-
# // see the manager.scheduler.predicates module for possible messages
361+
# // see the sokovan scheduler validator modules for possible messages
362362
# ...
363363
# ],
364364
# "passed_predicates": [ {"name": "reserved_time"}, ... ], // names only

src/ai/backend/manager/models/session/row.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@ class SessionRow(Base): # type: ignore[misc]
553553
# // an ISO 8601 formatted timestamp of the last attempt
554554
# "failed_predicates": [
555555
# { "name": "concurrency", "msg": "You cannot run more than 30 concurrent sessions." },
556-
# // see the manager.scheduler.predicates module for possible messages
556+
# // see the sokovan scheduler validator modules for possible messages
557557
# ...
558558
# ],
559559
# "passed_predicates": [ {"name": "reserved_time"}, ... ], // names only

src/ai/backend/manager/registry.py

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,6 @@
184184
from .models.vfolder import (
185185
verify_vfolder_name,
186186
)
187-
from .scheduler.types import KernelAgentBinding
188187
from .types import UserScope
189188

190189
type MSetType = Mapping[str | bytes, bytes | float | int | str]
@@ -1383,47 +1382,6 @@ async def update_scaling_group(self, agent_id: AgentId, scaling_group: str) -> N
13831382
async with self._agent_client_pool.acquire(verified_agent_id) as client:
13841383
await client.update_scaling_group(scaling_group)
13851384

1386-
async def settle_agent_alloc(
1387-
self,
1388-
kernel_agent_bindings: Sequence[KernelAgentBinding],
1389-
) -> None:
1390-
"""
1391-
Tries to settle down agent row's occupied_slots with real value. This must be called
1392-
after kernel creation is completed, to prevent fraction of resource dropped by agent scheduler
1393-
during kernel creation still being reported as used.
1394-
"""
1395-
1396-
keyfunc = lambda item: item.agent_alloc_ctx.agent_id
1397-
for agent_id, group_iterator in itertools.groupby(
1398-
sorted(kernel_agent_bindings, key=keyfunc),
1399-
key=keyfunc,
1400-
):
1401-
actual_allocated_slots = ResourceSlot()
1402-
requested_slots = ResourceSlot()
1403-
1404-
for kernel_agent_binding in group_iterator:
1405-
# this value must be set while running _post_create_kernel
1406-
actual_allocated_slot = self._kernel_actual_allocated_resources.get(
1407-
kernel_agent_binding.kernel.id
1408-
)
1409-
requested_slots += kernel_agent_binding.kernel.requested_slots
1410-
if actual_allocated_slot is not None:
1411-
actual_allocated_slots += ResourceSlot.from_json(actual_allocated_slot)
1412-
del self._kernel_actual_allocated_resources[kernel_agent_binding.kernel.id]
1413-
else: # something's wrong; just fall back to requested slot value
1414-
actual_allocated_slots += kernel_agent_binding.kernel.requested_slots
1415-
1416-
# Phase 3 (BA-4308): Legacy JSONB write to agents.occupied_slots removed.
1417-
# Agent occupied slots are now solely managed by the normalized
1418-
# agent_resources table. The agents.occupied_slots JSONB column is
1419-
# retained for historical audit but no longer written to.
1420-
if actual_allocated_slots != requested_slots:
1421-
log.debug(
1422-
"agent {} has slot calibration diff (requested != actual); "
1423-
"agent_resources table is the source of truth",
1424-
agent_id,
1425-
)
1426-
14271385
async def recalc_resource_usage(self, do_fullscan: bool = False) -> None:
14281386
async def _recalc() -> Mapping[AccessKey, ConcurrencyUsed]:
14291387
access_key_to_concurrency_used: dict[AccessKey, ConcurrencyUsed] = {}

src/ai/backend/manager/scheduler/__init__.py

Whitespace-only changes.

src/ai/backend/manager/scheduler/agent_selector.py

Lines changed: 0 additions & 255 deletions
This file was deleted.

0 commit comments

Comments
 (0)