Commit c8ac1fe

Dynamic Inference | Evict and re-compute context requests. (NVIDIA#2738)
1 parent 8b10a64 commit c8ac1fe

File tree

11 files changed: +408 -188 lines changed

examples/inference/gpt/gpt_dynamic_inference.py

Lines changed: 3 additions & 1 deletion
@@ -174,6 +174,7 @@ def get_inference_context(
         ),
         block_size_tokens=args.inference_dynamic_batching_block_size,
         buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
+        paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb,
         max_requests=args.inference_dynamic_batching_max_requests,
         max_tokens=args.inference_dynamic_batching_max_tokens,
         tensor_model_parallel_size=args.tensor_model_parallel_size,
@@ -369,6 +370,7 @@ def _add_request():
         request.time_end = get_curr_time()
         request.state = "finished"
         request.request_id = finished_request.request_id
+        request.events = finished_request.events
 
         # Update prompt, in case engine has been suspended and resumed.
         request.prompt_tokens = finished_request.prompt_tokens.tolist()
@@ -543,7 +545,7 @@ def escape_str(s):
     # ---- Prompt summary line ----
     prompt_len = len(requests[request_idxs[0]].prompt_tokens)
     escaped_prompt_text = escape_str(prompt_text)
-    print(f"{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}")
+    print(f"\n{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}")
 
     # ---- Group all outputs for this prompt ----
     output_map = defaultdict(list)
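
Taken together, this file's change threads a new `paused_buffer_size_gb` setting from the argument namespace into the inference context. The short sketch below only restates the keyword arguments visible in the diff; the helper function and its return-a-dict shape are assumptions for illustration, not code from the commit.

# Sketch: the context kwargs that gpt_dynamic_inference.py now forwards,
# including the new paused-buffer budget. The Namespace field names come from
# the diff; the helper itself is hypothetical.
from argparse import Namespace

def collect_context_kwargs(args: Namespace) -> dict:
    return dict(
        block_size_tokens=args.inference_dynamic_batching_block_size,
        buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
        # New in this commit: a separate buffer budget for paused requests,
        # sized independently of the main KV buffer.
        paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb,
        max_requests=args.inference_dynamic_batching_max_requests,
        max_tokens=args.inference_dynamic_batching_max_tokens,
        tensor_model_parallel_size=args.tensor_model_parallel_size,
    )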

examples/inference/gpt/utils.py

Lines changed: 3 additions & 3 deletions
@@ -72,7 +72,7 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
         help="Add a deterministic number of requests per step. This arg is "
         "prioritized over `--incoming-requests-per-sec` below (which is non-"
         "deterministic). Note that the number of requests added per step is "
-        "additionally limited by the inference context's `max_active_requests`, "
+        "additionally limited by the inference context's `max_requests`, "
         "`max_tokens`, and KV buffer size.",
     )
     group.add_argument(
@@ -393,7 +393,7 @@ def build_dynamic_engine_setup_prefix(
 
     Args:
         args (Namespace): Command-line arguments for this run.
-        context (DynamicInferenceContext): Stores limits such as `max_active_requests`,
+        context (DynamicInferenceContext): Stores limits such as `max_requests`,
             `max_tokens`, and `gtd_request_count`.
         requests (List[DynamicInferenceRequest]): List of inference requests.
@@ -430,7 +430,7 @@ def build_dynamic_engine_setup_prefix(
     buffer_limits_str = (
         f"bf: {get_mem_size_str(args.inference_dynamic_batching_buffer_size_gb*1024**3)}, "
         f"{context.block_allocator.active_count} chunks "
-        f"[r {context.max_active_requests}, t {context.max_tokens}]"
+        f"[r {context.max_requests}, t {context.max_tokens}]"
     )
 
     parts = [
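
The rename from `max_active_requests` to `max_requests` also shows up in the setup-prefix string above. Below is a self-contained sketch of that formatting, with plain stand-ins for the real DynamicInferenceContext and `get_mem_size_str` (both assumed here); the literal numbers are illustrative only.

# Sketch of the buffer-limits line after the rename; SimpleNamespace objects
# replace the real context, and the values are made up for the example.
from types import SimpleNamespace

def format_buffer_limits(buffer_size_gb: float, context) -> str:
    buffer_bytes = buffer_size_gb * 1024**3
    return (
        f"bf: {buffer_bytes / 1024**3:.1f} gb, "
        f"{context.block_allocator.active_count} chunks "
        f"[r {context.max_requests}, t {context.max_tokens}]"  # was max_active_requests
    )

context = SimpleNamespace(
    max_requests=512,
    max_tokens=8192,
    block_allocator=SimpleNamespace(active_count=1024),
)
print(format_buffer_limits(16.0, context))  # bf: 16.0 gb, 1024 chunks [r 512, t 8192]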

megatron/core/inference/contexts/dynamic_block_allocator.py

Lines changed: 17 additions & 13 deletions
@@ -16,21 +16,20 @@ class BlockAllocator:
 
     Args:
         context (DynamicInferenceContext): Dynamic inference context.
-        active_count (int): Total number of active blocks available in the buffer.
-            The full buffer size is 2*active_count, to accommodate an equal-size
-            space for paused requests that live on the CPU.
+        total_count (int): Total number of blocks in the buffer.
+        paused_count (int): Number of paused blocks in the buffer. Must be less
+            than `total_count`.
     """
 
-    def __init__(self, context: "DynamicInferenceContext", total_count: int):
+    def __init__(self, context: "DynamicInferenceContext", total_count: int, paused_count: int):
 
         self.context = context
 
-        active_count = (total_count - 1) // 2  # -1 for dummy_block_idx (see below)
-        active_count = max(1, active_count)  # need at least one block
-        self.total_count = 2 * active_count + 1  # +1 for dummy_block_idx
-        self.total_avail = self.total_count - 1  # -1 for dummy_block_idx
-        self.active_count = active_count
-        self.paused_count = self.total_count - self.active_count - 1  # -1 for dummy_block_idx
+        self.total_count = total_count
+        self.total_avail = total_count - 1  # -1 for dummy_block_idx (see below)
+        self.paused_count = paused_count
+        self.active_count = total_count - paused_count - 1  # -1 for dummy_block_idx
+        assert self.active_count >= 1  # ensures paused_count < total_count - 1
         self.dummy_block_idx = self.total_count - 1
 
         # Initialize block pool as a "stack" data structure
@@ -40,10 +39,15 @@ def __init__(self, context: "DynamicInferenceContext", total_count: int):
 
     def __str__(self):
         return (
-            f"total avail {self.total_avail} / {self.total_count - 1}"
-            f"; active {self.active_count}"
+            f"using: total {self.get_total_used()}/{self.total_count - 1}"
+            f"; active {self.get_active_used()}/{self.active_count}"
+            f"; paused {self.get_paused_used()}/{self.paused_count}"
         )
 
+    def get_total_used(self):
+        """Compute number of total blocks used."""
+        return self.total_count - self.total_avail - 1
+
     def get_active_used(self):
         """Compute number of active blocks used."""
         return (
@@ -77,7 +81,7 @@ def is_memory_available(self, num_blocks: int) -> bool:
         Return:
             (bool) Is memory available?
         """
-        return self.get_active_avail() >= num_blocks
+        return self.total_avail >= num_blocks
 
     def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]:
         """Allocate memory blocks if available, else return None.
