Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions 3rdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,15 @@ foreach(DEP_IDX RANGE ${DEP_COUNT_MINUS_ONE})
endif()

if(DEP_PATCH_FILE AND NOT DEP_PATCH_FILE STREQUAL "")
set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
list(
APPEND
FETCH_ARGS
PATCH_COMMAND
patch
-p1
--forward
--batch
-i
"${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
bash
-c
"patch -p1 --forward --batch --dry-run -i '${_patch_file}' && patch -p1 --forward --batch -i '${_patch_file}' || echo 'Patch already applied, skipping.'"
)
endif()

FetchContent_Declare(${FETCH_ARGS})
Expand Down
3 changes: 3 additions & 0 deletions cpp/include/tensorrt_llm/executor/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ enum class ContextChunkingPolicy
/// @brief Iterate through each context request in sequence and attempt to increase its chunk
/// count until the constraint is exceeded.
kEQUAL_PROGRESS = 1,

/// @brief Force every context request's chunk size to `unit_size`; the final chunk of a request
/// may be smaller, and a request that does not fit the token budget gets a chunk size of 0.
kFORCE_CHUNK = 2,
};

std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy);
Expand Down
54 changes: 50 additions & 4 deletions cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,16 +217,52 @@ void MicroBatchScheduler::setCtxRequestsChunkSize<MicroBatchScheduler::ContextCh
}
}

// kFORCE_CHUNK policy: assign chunk sizes to context requests.
//
// Each request is offered min(contextRemainingLength, chunkUnitSize) tokens; a request whose
// chunk would overflow ctxTokensCapacity is parked with a chunk size of 0 for this iteration.
//
// Intended for linear attention state caching; reusable KV-cache tokens are deliberately NOT
// discounted from the budget because reuse is not supported for this policy yet.
template <>
void MicroBatchScheduler::setCtxRequestsChunkSize<MicroBatchScheduler::ContextChunkingPolicy::kFORCE_CHUNK>(
    RequestVector& contextsToBeChunked, std::optional<SizeType32> ctxTokensCapacity, SizeType32 const chunkUnitSize,
    std::optional<SizeType32> const& maxContextLength)
{
    // A forced chunk can never be scheduled if the unit itself is larger than the longest
    // permitted context, so fail fast with a configuration error.
    if (maxContextLength && maxContextLength.value() < chunkUnitSize)
    {
        TLLM_THROW(
            "The forced chunk size (%d) exceeds the max context length (%d)", chunkUnitSize, maxContextLength.value());
    }
    SizeType32 consumedTokens{0};
    for (auto& llmReq : contextsToBeChunked)
    {
        auto const proposedChunk = std::min(llmReq->getContextRemainingLength(), chunkUnitSize);
        // Charge the chunk at face value against the capacity (no KV-cache reuse discount).
        bool const fitsBudget = !ctxTokensCapacity || consumedTokens + proposedChunk <= ctxTokensCapacity.value();
        llmReq->setContextChunkSize(fitsBudget ? proposedChunk : 0);
        if (fitsBudget)
        {
            consumedTokens += llmReq->getContextChunkSize();
        }
    }
}

// Entry point for chunk-size assignment. Resets all chunk sizes to zero, then
// dispatches to the appropriate policy-specific implementation:
//
// kEQUAL_PROGRESS — all requests advance together one chunkUnitSize at a time.
// kEQUAL_PROGRESS — all requests advance together one chunkUnitSize at a time.
// kFIRST_COME_FIRST_SERVED — requests are served greedily in order until the budget
// is exhausted.
// kFORCE_CHUNK — every request gets exactly min(remaining, chunkUnitSize)
// tokens; budget is charged at face value (no reuse discount).
//
// Both policies are compute-aware: tokens covered by the reusable KV-cache prefix are
// not charged against ctxTokensCapacity. See the individual template specialisations
// above for full details.
// EQUAL_PROGRESS and FIRST_COME_FIRST_SERVED are compute-aware: tokens covered by the
// reusable KV-cache prefix are not charged against ctxTokensCapacity.
// FORCE_CHUNK intentionally skips reuse accounting.
// See the individual template specialisations above for full details.
void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChunked,
ContextChunkingPolicy const ctxChunkPolicy, std::optional<SizeType32> ctxTokensCapacity,
SizeType32 const chunkUnitSize, std::optional<SizeType32> const& maxContextLength)
Expand All @@ -245,6 +281,10 @@ void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChu
setCtxRequestsChunkSize<MicroBatchScheduler::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED>(
contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
break;
case ContextChunkingPolicy::kFORCE_CHUNK:
setCtxRequestsChunkSize<MicroBatchScheduler::ContextChunkingPolicy::kFORCE_CHUNK>(
contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
break;
default: TLLM_THROW("The chunked scheduling type `NO_CHUNKING` cannot be performed.");
}

Expand Down Expand Up @@ -384,6 +424,12 @@ std::tuple<RequestVector, RequestVector> MicroBatchScheduler::operator()(Request
allContextRequestsFit = false;
}

// For FORCE_CHUNK policy, always re-chunk regardless of whether all contexts fit.
if (mCtxChunkConfig && mCtxChunkConfig.value().chunkingPolicy == ContextChunkingPolicy::kFORCE_CHUNK)
{
allContextRequestsFit = false;
}

// 2. If not all contexts fit into the batch, the chunk size should be adjusted accordingly.
if (!allContextRequestsFit)
{
Expand Down
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/executor/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy)
{
case ContextChunkingPolicy::kEQUAL_PROGRESS: os << "EQUAL_PROGRESS"; break;
case ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED: os << "FIRST_COME_FIRST_SERVED"; break;
case ContextChunkingPolicy::kFORCE_CHUNK: os << "FORCE_CHUNK"; break;
}
return os;
}
Expand Down
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/nanobind/executor/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ void initBindings(nb::module_& m)

nb::enum_<tle::ContextChunkingPolicy>(m, "ContextChunkingPolicy")
.value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS)
.value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED);
.value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED)
.value("FORCE_CHUNK", tle::ContextChunkingPolicy::kFORCE_CHUNK);

nb::enum_<tle::CommunicationType>(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI);

Expand Down
6 changes: 4 additions & 2 deletions cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,11 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::optional<torch:
if (out_hidden < args.hidden_size)
{
// out_tensor has unpadded hidden dim (e.g., nvfp4 with padding).
// Set valid_hidden_size so the finalize kernel writes only the needed columns.
// Set valid_hidden_size so the finalize kernel writes only the needed columns
// directly into out_tensor. Keep output_hidden_size at the full hidden_size so
// Gemm2 still computes at the padded width (its cubin config requires it).
args.valid_hidden_size = out_hidden;
args.output_hidden_size = tensorrt_llm::common::roundUp(out_hidden, static_cast<int64_t>(128));
args.output_hidden_size = args.hidden_size;
}
else
{
Expand Down
Loading
Loading