@@ -1851,7 +1851,7 @@ def _prepare_chunked_prefill(self, req_id: str):
 
         # Case III
 
-        No left paddings and more than one chunk
+        No left padding and more than one chunk
 
         13 tokens
         4 blocks
@@ -1862,8 +1862,8 @@ def _prepare_chunked_prefill(self, req_id: str):
 
         NOTE: The goal of this "illustration" is to depict strategies to write
         code to create the chunks, not necessarily enumerate the possible
-        scenario. Of course there are interpretations where these cases
-        overlaps.
+        scenarios. Of course there are interpretations where these cases
+        overlap.
 
         '''
 
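To make the chunking cases above concrete, here is a small standalone sketch (not code from this file) of how a 13-token prompt maps onto 4 blocks when chunks are block-aligned and there is no left padding, as in Case III; `split_into_chunks` is a hypothetical helper, not a function from this repository:

```python
# Illustrative only: chunk a prompt into block-sized pieces with no
# left padding (Case III of the docstring above).
import math

def split_into_chunks(prompt_len: int, block_size: int) -> list[range]:
    """Return one range of token positions per block-sized chunk."""
    n_blocks = math.ceil(prompt_len / block_size)
    return [
        range(i * block_size, min((i + 1) * block_size, prompt_len))
        for i in range(n_blocks)
    ]

# 13 tokens with block_size=4 -> 4 blocks, the last one only partly full:
# [range(0, 4), range(4, 8), range(8, 12), range(12, 13)]
print(split_into_chunks(13, 4))
```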
@@ -2105,7 +2105,7 @@ def add_new_request(self, request: NewRequestData):
         new_tokens = (sampling_params.max_tokens
                       if sampling_params is not None else 0)
         total_tokens = prompt_len + new_tokens - 1
-        # subtract the padding blocks from the reserved blocks
+        # calculate the number of reserved blocks
         n_reserved_blocks = math.ceil(total_tokens / self.block_size)
 
         self.req_ids2num_reserved_blocks[req_id] = n_reserved_blocks
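As a quick sanity check of the reservation formula in this hunk, the arithmetic for an illustrative request looks like this (the values are made up, and the reading of the `-1` is our assumption, not stated in the diff):

```python
import math

# Made-up request values, chosen to match the 13-token example above.
block_size = 4
prompt_len = 13
max_tokens = 4                     # sampling_params.max_tokens

# prompt + new tokens, minus one: we assume the last sampled token is
# returned but never written back, so it needs no KV-cache slot.
total_tokens = prompt_len + max_tokens - 1          # 16
n_reserved_blocks = math.ceil(total_tokens / block_size)
print(n_reserved_blocks)                            # 4
```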
@@ -2216,8 +2216,7 @@ def check_incomplete_prefill(self, scheduler_output: SchedulerOutput):
             return False
 
         # possible prefill
-        req_id = new_reqs[0].req_id if \
-            len(new_reqs) == 1 else \
+        req_id = new_reqs[0].req_id if len(new_reqs) == 1 else \
             cached_reqs.req_ids[0]
 
         num_scheduled_tokens = \
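The hunk above only shows how the request id is picked; the body of the check is not part of the diff. A rough guess at the overall shape of an incomplete-prefill test, with `is_incomplete_prefill` and all of its parameters being hypothetical stand-ins:

```python
# Assumed shape of the check; the real method body is not shown in this
# hunk, so the names and signature here are hypothetical.
def is_incomplete_prefill(num_computed_tokens: int,
                          num_scheduled_tokens: int,
                          prompt_len: int) -> bool:
    # The prefill is still incomplete if, even after this step's chunk,
    # fewer tokens than the full prompt will have been computed.
    return num_computed_tokens + num_scheduled_tokens < prompt_len

# 8 tokens done, 4 scheduled now, 13 in the prompt -> one chunk remains.
print(is_incomplete_prefill(8, 4, 13))   # True
```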
@@ -2302,6 +2301,9 @@ def execute_model(
         if not self.is_driver_worker:
             return self.get_empty_output()
 
+        t1 = time.time() - t0
+        logger.debug("t_forward_pass: %.2fms [prefill single chunk]",
+                     (t1 * 1000))
         return CPSpyreModelRunnerOutput(
             req_ids=list(req_id_to_index.keys()),
             req_id_to_index=req_id_to_index,
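The added lines follow the usual stdlib timing-plus-lazy-logging pattern; reduced to a self-contained example (the logger setup here is ours, the runner obtains its logger elsewhere):

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

t0 = time.time()
sum(x * x for x in range(1_000_000))    # stand-in for the forward pass
t1 = time.time() - t0
# %-style arguments keep formatting lazy: it only happens if DEBUG
# logging is actually enabled.
logger.debug("t_forward_pass: %.2fms [prefill single chunk]", t1 * 1000)
```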
@@ -2319,19 +2321,14 @@ def execute_model(
             sampling_metadata=self.get_sampling_metadata(is_prefill),
         )
         t1 = time.time() - t0
-        logger.debug("t_token: %.2fms", (t1 * 1000))
-
-        # Add the sampled token(s) to the request cache
-        req_ids = ([r.req_id for r in scheduler_output.scheduled_new_reqs]
-                   if len(scheduler_output.scheduled_new_reqs) > 0 \
-                   else self.input_batch.sorted_requests_ids)
+        step_type = "[prefill last chunk]" if is_prefill else "[decode]"
+        logger.debug("t_token: %.2fms %s", (t1 * 1000), step_type)
 
         # Get the right batch: if this is the last chunk to conclude the
         # prefill, we'll generate a token and we should get it from the
         # prefill batch, because input_batch may have other requests that
         # were not processed at this step.
-        batch = self.prefill_batch if is_prefill \
-            else self.input_batch
+        batch = self.prefill_batch if is_prefill else self.input_batch
 
         # Add the sampled token(s) to the request cache
         req_ids = ([r.req_id for r in scheduler_output.scheduled_new_reqs]
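To see why the comment above insists on reading from the prefill batch, here is a toy model (none of these classes are the real vllm-spyre ones) in which `input_batch` still holds a request that was not processed in this step:

```python
from dataclasses import dataclass, field

@dataclass
class Batch:
    # Toy stand-in; the real batch types carry far more state.
    req_ids: list = field(default_factory=list)

prefill_batch = Batch(req_ids=["req-a"])          # only the request just prefilled
input_batch = Batch(req_ids=["req-a", "req-b"])   # "req-b" was not run this step

is_prefill = True                                 # last prefill chunk just finished
batch = prefill_batch if is_prefill else input_batch
print(batch.req_ids)                              # ['req-a']
```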