Skip to content

Commit 04b3472

Browse files
committed
Refactor LlmRequest and PyExecutor for improved state management and deep copy logic.
- Added a comment to clarify the condition for deep copying in LlmRequest. - Adjusted state-management logic in PyExecutor for generation requests to skip unnecessary checks. - Added assertions in unit tests to validate expected behavior in generation scenarios.

Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com>
1 parent dbe7722 commit 04b3472

File tree

3 files changed

+9
-6
lines changed

3 files changed

+9
-6
lines changed

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -639,8 +639,10 @@ def create_response(self,
639639
result, is_final = super().create_serialized_result(
640640
use_fast_logits, mpi_world_rank)
641641

642+
# When using beam search we cannot incrementally update the logprobs in the result.
643+
# Instead we need to update all logprobs. In that case no deep copy is needed.
642644
need_deep_copy_logprobs = self.py_result.log_probs and self.sampling_config.beam_width <= 1
643-
need_deep_copy_generation_logits = self.py_result._generation_logits
645+
need_deep_copy_generation_logits = self.py_result._generation_logits is not None
644646
need_any_deep_copy = need_deep_copy_logprobs or need_deep_copy_generation_logits
645647
# Performs a deep copy of py_result._log_probs or py_result._generation_logits to eliminate race conditions
646648
# that may occur between IPC communication and the overriding of newly generated log_probs

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1757,11 +1757,10 @@ def _executor_loop_overlap(self):
17571757
# If set before, the response of a request may be incorrect, as it will
17581758
# use the wrong indices for generation logits when streaming is enabled.
17591759
for request in scheduled_batch.generation_requests:
1760-
if request.state != LlmRequestState.GENERATION_COMPLETE:
1761-
if not self.disable_overlap_scheduler and request.will_complete_next_iteration(
1762-
):
1763-
request.set_exclude_last_generation_logits(False)
1764-
request.state = LlmRequestState.GENERATION_TO_COMPLETE
1760+
if request.state != LlmRequestState.GENERATION_COMPLETE and request.will_complete_next_iteration(
1761+
):
1762+
request.set_exclude_last_generation_logits(False)
1763+
request.state = LlmRequestState.GENERATION_TO_COMPLETE
17651764

17661765
if can_queue:
17671766
if self.enable_iter_perf_stats:

tests/unittest/_torch/sampler/test_return_logits.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ def test_generation_with_return_logits(
179179
idx=idx,
180180
output=output,
181181
streaming=True)
182+
assert idx == sampling_params.max_tokens - 1
182183
else:
183184
for idx, output in enumerate(
184185
llm.generate(
@@ -197,3 +198,4 @@ def test_generation_with_return_logits(
197198
idx=idx,
198199
output=output,
199200
streaming=False)
201+
assert idx == len(prompts) - 1

0 commit comments

Comments
 (0)