Refactor deep copy logic for generation logits in LlmRequest.

stnie · stnie · commit 123a3bd6df85 · 2026-01-14T17:12:26.000Z
- Adjusted the condition for deep-copying generation logits so that only the necessary copies are made.
- Updated the comments to clarify how generation logits and their indices are copied.

Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -654,10 +654,16 @@ def create_response(self,
                 for log_prob in self.py_result.log_probs:
                     log_prob.clear()
 
-            # Perform a deep copy of py_result._generation_logits
+            # Perform copies of py_result._generation_logits
             if need_deep_copy_generation_logits:
-                py_result._generation_logits = deepcopy(
+                # shallow copy of generation_logits to avoid copying the logits tensor
+                py_result._generation_logits = copy(
                     self.py_result._generation_logits)
+                # deep copy the indices to avoid the race condition
+                # In streaming mode LogitsStorage only accesses either the last
+                # or second to last pair of indices. Therefore, copying only these two pairs is sufficient.
+                py_result._generation_logits._logits_indices = py_result._generation_logits._logits_indices[
+                    -2:]
         else:
             py_result = self.py_result