diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
index af2ccf9d693a..6cb42d012b8f 100644
--- a/3rdparty/CMakeLists.txt
+++ b/3rdparty/CMakeLists.txt
@@ -55,16 +55,15 @@ foreach(DEP_IDX RANGE ${DEP_COUNT_MINUS_ONE})
     endif()
 
     if(DEP_PATCH_FILE AND NOT DEP_PATCH_FILE STREQUAL "")
+        set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
         list(
             APPEND
             FETCH_ARGS
             PATCH_COMMAND
-            patch
-            -p1
-            --forward
-            --batch
-            -i
-            "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
+            bash
+            -c
+            "patch -p1 --forward --batch --dry-run -i '${_patch_file}' && patch -p1 --forward --batch -i '${_patch_file}' || echo 'Patch already applied, skipping.'"
+        )
     endif()
 
     FetchContent_Declare(${FETCH_ARGS})
diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h
index 89618dce540f..77f910455c57 100644
--- a/cpp/include/tensorrt_llm/executor/types.h
+++ b/cpp/include/tensorrt_llm/executor/types.h
@@ -243,6 +243,9 @@ enum class ContextChunkingPolicy
     /// @brief Iterate through each context request in sequence and attempt to increase its chunk
    /// count until the constraint is exceeded.
     kEQUAL_PROGRESS = 1,
+
+    /// @brief Force every context request to have a chunk size of `unit_size` or 0, unless it is the last chunk (which may be smaller).
+    kFORCE_CHUNK = 2,
 };
 
 std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy);
diff --git a/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp b/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
index 92d4589e0b66..40b760c3cb0a 100644
--- a/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
+++ b/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
@@ -217,16 +217,52 @@ void MicroBatchScheduler::setCtxRequestsChunkSize
+template <>
+void MicroBatchScheduler::setCtxRequestsChunkSize<ContextChunkingPolicy::kFORCE_CHUNK>(
+    RequestVector& contextsToBeChunked, std::optional<SizeType32> ctxTokensCapacity, SizeType32 const chunkUnitSize,
+    std::optional<SizeType32> const& maxContextLength)
+{
+    if (maxContextLength && maxContextLength.value() < chunkUnitSize)
+    {
+        TLLM_THROW(
+            "The forced chunk size (%d) exceeds the max context length (%d)", chunkUnitSize, maxContextLength.value());
+    }
+    SizeType32 totalTokens{0};
+    for (auto& llmReq : contextsToBeChunked)
+    {
+        SizeType32 const chunkSize = std::min(llmReq->getContextRemainingLength(), chunkUnitSize);
+        if (ctxTokensCapacity && totalTokens + chunkSize > ctxTokensCapacity.value())
+        {
+            llmReq->setContextChunkSize(0);
+        }
+        else
+        {
+            llmReq->setContextChunkSize(chunkSize);
+            totalTokens += llmReq->getContextChunkSize();
+        }
+    }
+}
+
 // Entry point for chunk-size assignment. Resets all chunk sizes to zero, then
 // dispatches to the appropriate policy-specific implementation:
 //
-// kEQUAL_PROGRESS — all requests advance together one chunkUnitSize at a time.
+// kEQUAL_PROGRESS          — all requests advance together one chunkUnitSize at a time.
 // kFIRST_COME_FIRST_SERVED — requests are served greedily in order until the budget
 // is exhausted.
+// kFORCE_CHUNK             — every request gets exactly min(remaining, chunkUnitSize)
+//                            tokens; budget is charged at face value (no reuse discount).
 //
-// Both policies are compute-aware: tokens covered by the reusable KV-cache prefix are
-// not charged against ctxTokensCapacity. See the individual template specialisations
-// above for full details.
+// EQUAL_PROGRESS and FIRST_COME_FIRST_SERVED are compute-aware: tokens covered by the
+// reusable KV-cache prefix are not charged against ctxTokensCapacity.
+// FORCE_CHUNK intentionally skips reuse accounting.
+// See the individual template specialisations above for full details.
 void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChunked,
     ContextChunkingPolicy const ctxChunkPolicy, std::optional<SizeType32> ctxTokensCapacity,
     SizeType32 const chunkUnitSize, std::optional<SizeType32> const& maxContextLength)
@@ -245,6 +281,10 @@ void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChu
         setCtxRequestsChunkSize<ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED>(
             contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
         break;
+    case ContextChunkingPolicy::kFORCE_CHUNK:
+        setCtxRequestsChunkSize<ContextChunkingPolicy::kFORCE_CHUNK>(
+            contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
+        break;
     default: TLLM_THROW("The chunked scheduling type `NO_CHUNKING` cannot be performed.");
     }
@@ -384,6 +424,12 @@ std::tuple<RequestVector, RequestVector> MicroBatchScheduler::operator()(Request
         allContextRequestsFit = false;
     }
 
+    // For FORCE_CHUNK policy, always re-chunk regardless of whether all contexts fit.
+    if (mCtxChunkConfig && mCtxChunkConfig.value().chunkingPolicy == ContextChunkingPolicy::kFORCE_CHUNK)
+    {
+        allContextRequestsFit = false;
+    }
+
     // 2. If not all contexts fit into the batch, the chunk size should be adjusted accordingly.
     if (!allContextRequestsFit)
     {
diff --git a/cpp/tensorrt_llm/executor/types.cpp b/cpp/tensorrt_llm/executor/types.cpp
index 86b1b3d38312..e07c759e1b4d 100644
--- a/cpp/tensorrt_llm/executor/types.cpp
+++ b/cpp/tensorrt_llm/executor/types.cpp
@@ -38,6 +38,7 @@ std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy)
     {
     case ContextChunkingPolicy::kEQUAL_PROGRESS: os << "EQUAL_PROGRESS"; break;
     case ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED: os << "FIRST_COME_FIRST_SERVED"; break;
+    case ContextChunkingPolicy::kFORCE_CHUNK: os << "FORCE_CHUNK"; break;
     }
     return os;
 }
diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
index 4f873e2ed1b2..78c90a86ca37 100644
--- a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
@@ -94,7 +94,8 @@ void initBindings(nb::module_& m)
 
     nb::enum_<tle::ContextChunkingPolicy>(m, "ContextChunkingPolicy")
         .value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS)
-        .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED);
+        .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED)
+        .value("FORCE_CHUNK", tle::ContextChunkingPolicy::kFORCE_CHUNK);
 
     nb::enum_<tle::CommunicationType>(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI);
 
diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
index 3440a59737fc..35486c3be937 100644
--- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
+++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
@@ -410,9 +410,11 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::optional