Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions 3rdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,15 @@ foreach(DEP_IDX RANGE ${DEP_COUNT_MINUS_ONE})
endif()

if(DEP_PATCH_FILE AND NOT DEP_PATCH_FILE STREQUAL "")
set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
list(
APPEND
FETCH_ARGS
PATCH_COMMAND
patch
-p1
--forward
--batch
-i
"${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
bash
-c
"patch -p1 --forward --batch --dry-run -i '${_patch_file}' && patch -p1 --forward --batch -i '${_patch_file}' || echo 'Patch already applied, skipping.'"
)
endif()

FetchContent_Declare(${FETCH_ARGS})
Expand Down
3 changes: 3 additions & 0 deletions cpp/include/tensorrt_llm/executor/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ enum class ContextChunkingPolicy
/// @brief Iterate through each context request in sequence and attempt to increase its chunk
/// count until the constraint is exceeded.
kEQUAL_PROGRESS = 1,

/// @brief Force every context request's chunk size to `unit_size`; the final chunk of a request
/// may be smaller, and a request that does not fit the token budget gets a chunk size of 0.
kFORCE_CHUNK = 2,
};

std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy);
Expand Down
54 changes: 50 additions & 4 deletions cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,16 +217,52 @@ void MicroBatchScheduler::setCtxRequestsChunkSize<MicroBatchScheduler::ContextCh
}
}

// kFORCE_CHUNK policy: assign chunk sizes to context requests.
//
// Each request is offered min(contextRemainingLength, chunkUnitSize) tokens; a request whose
// chunk would overflow ctxTokensCapacity is parked with a chunk size of 0 for this iteration.
//
// Intended for linear attention state caching; reusable KV-cache tokens are deliberately NOT
// discounted from the budget because reuse is not supported for this policy yet.
template <>
void MicroBatchScheduler::setCtxRequestsChunkSize<MicroBatchScheduler::ContextChunkingPolicy::kFORCE_CHUNK>(
    RequestVector& contextsToBeChunked, std::optional<SizeType32> ctxTokensCapacity, SizeType32 const chunkUnitSize,
    std::optional<SizeType32> const& maxContextLength)
{
    // A forced chunk can never be scheduled if the unit itself is larger than the longest
    // permitted context, so fail fast with a configuration error.
    if (maxContextLength && maxContextLength.value() < chunkUnitSize)
    {
        TLLM_THROW(
            "The forced chunk size (%d) exceeds the max context length (%d)", chunkUnitSize, maxContextLength.value());
    }
    SizeType32 consumedTokens{0};
    for (auto& llmReq : contextsToBeChunked)
    {
        auto const proposedChunk = std::min(llmReq->getContextRemainingLength(), chunkUnitSize);
        // Charge the chunk at face value against the capacity (no KV-cache reuse discount).
        bool const fitsBudget = !ctxTokensCapacity || consumedTokens + proposedChunk <= ctxTokensCapacity.value();
        llmReq->setContextChunkSize(fitsBudget ? proposedChunk : 0);
        if (fitsBudget)
        {
            consumedTokens += llmReq->getContextChunkSize();
        }
    }
}

// Entry point for chunk-size assignment. Resets all chunk sizes to zero, then
// dispatches to the appropriate policy-specific implementation:
//
// kEQUAL_PROGRESS — all requests advance together one chunkUnitSize at a time.
// kEQUAL_PROGRESS — all requests advance together one chunkUnitSize at a time.
// kFIRST_COME_FIRST_SERVED — requests are served greedily in order until the budget
// is exhausted.
// kFORCE_CHUNK — every request gets exactly min(remaining, chunkUnitSize)
// tokens; budget is charged at face value (no reuse discount).
//
// Both policies are compute-aware: tokens covered by the reusable KV-cache prefix are
// not charged against ctxTokensCapacity. See the individual template specialisations
// above for full details.
// EQUAL_PROGRESS and FIRST_COME_FIRST_SERVED are compute-aware: tokens covered by the
// reusable KV-cache prefix are not charged against ctxTokensCapacity.
// FORCE_CHUNK intentionally skips reuse accounting.
// See the individual template specialisations above for full details.
void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChunked,
ContextChunkingPolicy const ctxChunkPolicy, std::optional<SizeType32> ctxTokensCapacity,
SizeType32 const chunkUnitSize, std::optional<SizeType32> const& maxContextLength)
Expand All @@ -245,6 +281,10 @@ void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChu
setCtxRequestsChunkSize<MicroBatchScheduler::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED>(
contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
break;
case ContextChunkingPolicy::kFORCE_CHUNK:
setCtxRequestsChunkSize<MicroBatchScheduler::ContextChunkingPolicy::kFORCE_CHUNK>(
contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
break;
default: TLLM_THROW("The chunked scheduling type `NO_CHUNKING` cannot be performed.");
}

Expand Down Expand Up @@ -384,6 +424,12 @@ std::tuple<RequestVector, RequestVector> MicroBatchScheduler::operator()(Request
allContextRequestsFit = false;
}

// For FORCE_CHUNK policy, always re-chunk regardless of whether all contexts fit.
if (mCtxChunkConfig && mCtxChunkConfig.value().chunkingPolicy == ContextChunkingPolicy::kFORCE_CHUNK)
{
allContextRequestsFit = false;
}

// 2. If not all contexts fit into the batch, the chunk size should be adjusted accordingly.
if (!allContextRequestsFit)
{
Expand Down
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/executor/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy)
{
case ContextChunkingPolicy::kEQUAL_PROGRESS: os << "EQUAL_PROGRESS"; break;
case ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED: os << "FIRST_COME_FIRST_SERVED"; break;
case ContextChunkingPolicy::kFORCE_CHUNK: os << "FORCE_CHUNK"; break;
}
return os;
}
Expand Down
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/nanobind/executor/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ void initBindings(nb::module_& m)

nb::enum_<tle::ContextChunkingPolicy>(m, "ContextChunkingPolicy")
.value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS)
.value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED);
.value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED)
.value("FORCE_CHUNK", tle::ContextChunkingPolicy::kFORCE_CHUNK);

nb::enum_<tle::CommunicationType>(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI);

Expand Down
6 changes: 4 additions & 2 deletions cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,11 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::optional<torch:
if (out_hidden < args.hidden_size)
{
// out_tensor has unpadded hidden dim (e.g., nvfp4 with padding).
// Set valid_hidden_size so the finalize kernel writes only the needed columns.
// Set valid_hidden_size so the finalize kernel writes only the needed columns
// directly into out_tensor. Keep output_hidden_size at the full hidden_size so
// Gemm2 still computes at the padded width (its cubin config requires it).
args.valid_hidden_size = out_hidden;
args.output_hidden_size = tensorrt_llm::common::roundUp(out_hidden, static_cast<int64_t>(128));
args.output_hidden_size = args.hidden_size;
}
else
{
Expand Down
Loading
Loading