Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/scheduler/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ def is_started_request(req: LlmRequest) -> bool:
req_it += 1
continue

was_scheduled = self._try_scheduling_request(
was_scheduled, num_scheduled_peft_pages = self._try_scheduling_request(
scheduler,
req,
scheduled_requests,
Expand Down Expand Up @@ -1065,13 +1065,13 @@ def _try_scheduling_request(
scheduled_blocks_manager: "MaxUtilizationScheduledBlocksManager",
num_scheduled_peft_pages: int,
seen_task_ids: set[int],
) -> bool:
) -> tuple[bool, int]:
if len(scheduled_requests) >= scheduler.max_num_requests:
return False
return False, num_scheduled_peft_pages

blocks_if_scheduled = scheduled_blocks_manager.prepare_blocks_if_schedulable(req)
if blocks_if_scheduled is None:
return False
return False, num_scheduled_peft_pages

# PEFT check only when needed
if scheduler.peft_cache_manager is not None:
Expand All @@ -1084,16 +1084,17 @@ def _try_scheduling_request(
)
max_peft_pages = scheduler._get_max_peft_pages()
if num_required_peft_pages + num_scheduled_peft_pages > max_peft_pages:
return False
return False, num_scheduled_peft_pages
logger.debug(
f"MaxUtilizationScheduler: scheduled peft pages: {num_required_peft_pages}"
)
num_scheduled_peft_pages += num_required_peft_pages
if is_new_task:
seen_task_ids.add(lora_task_id)

scheduled_blocks_manager.update_scheduled_blocks(blocks_if_scheduled)
scheduled_requests.append(req)
return True
return True, num_scheduled_peft_pages


class NoEvictScheduledBlocksManager:
Expand Down
25 changes: 25 additions & 0 deletions tests/unittest/_torch/executor/test_py_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2073,6 +2073,31 @@ def test_lora_doesnt_fit(self):
# First task: 10 pages, second task: 10 pages, total 20 > 15
assert len(fitting) == 1

def test_max_utilization_peft_page_accumulation(self):
    """
    Verify that the MAX_UTILIZATION policy tracks scheduled PEFT pages
    cumulatively across requests in a batch, so the per-batch page
    budget is actually enforced.

    Each new LoRA task costs 10 pages against a 25-page budget:
    request 0 (task 1) brings the total to 10 <= 25 -> admitted,
    request 1 (task 2) brings it to 20 <= 25 -> admitted,
    request 2 (task 3) would bring it to 30 > 25 -> rejected.
    """
    # Plenty of KV blocks so only the PEFT page budget can reject requests.
    kv_mgr = MockKVCacheManager(num_free_blocks=100, blocks_per_request=5)
    peft_mgr = MockPeftCacheManager(max_pages=25, pages_per_request=10)
    capacity_scheduler = PyCapacityScheduler(
        max_num_requests=4,
        kv_cache_manager=kv_mgr,
        peft_cache_manager=peft_mgr,
        scheduler_policy=CapacitySchedulerPolicy.MAX_UTILIZATION,
    )
    # Three requests, each with a distinct (never-seen) LoRA task id.
    requests = [
        _make_request(req_id, lora_task_id=req_id + 1) for req_id in range(3)
    ]
    fitting, _disagg, _paused = capacity_scheduler.schedule_request(requests)
    # Tasks 1 and 2 consume 20 of 25 pages; task 3 would need 30 > 25.
    assert len(fitting) == 2


# ############################################################################
#
Expand Down
Loading