Skip to content

Commit 66d7711

Browse files
authored
[None][feat] Batch addSequence with two-phase claim and unified VSWA/non-reuse support (NVIDIA#13029)
Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
1 parent ea117de commit 66d7711

5 files changed

Lines changed: 1503 additions & 162 deletions

File tree

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,72 @@ class WindowBlockManager
768768
[[nodiscard]] SizeType32 addSequence(GenerationRequest& sequence, SizeType32 inputLength,
769769
SizeType32 numContextBlocks, LlmRequest& llmRequest, bool isEnableBlockReuse);
770770

771+
//! \brief Per-request block allocation statistics from batch addSequence.
772+
struct BatchSeqStats
773+
{
774+
SizeType32 prepopulatedLen{0};
775+
SizeType32 allocTotalDelta{0};
776+
SizeType32 allocNewDelta{0};
777+
SizeType32 reusedDelta{0};
778+
SizeType32 missedDelta{0};
779+
};
780+
781+
//! \brief Result of Phase 1 (claim-only) of batch addSequence.
782+
//! \details Holds matched blocks and prepared data so Phase 2 can proceed without
783+
//! re-traversing the radix tree.
784+
struct ClaimResult
785+
{
786+
struct ClaimedBlock
787+
{
788+
BlockPtr block;
789+
SizeType32 numMatchedTokens; //!< tokens matched in this block
790+
bool isPartialMatch;
791+
bool needsCopy; //!< partial match on block with refs or non-leaf (needs getFreeBlock + copy in Phase 2)
792+
bool isPlaceholder; //!< placeholder block (linear attention recurrent states)
793+
bool shouldReleaseCopySource{false}; //!< last copier releases the claimed source after copy
794+
};
795+
796+
std::vector<ClaimedBlock> claimedBlocks;
797+
SizeType32 totalMatchedTokens{0};
798+
SizeType32 latestMatchingNonPlaceholderBlockIdx{-1};
799+
SizeType32 numSharedContextBlocks{0};
800+
SizeType32 numContextBlocks{0};
801+
bool shareLastContextBlockAmongBeams{true};
802+
std::vector<BlockKey> blockKeys;
803+
std::vector<executor::RetentionPriorityAndDuration> perBlockRetentions;
804+
executor::KvCacheTransferMode mode{executor::KvCacheTransferMode::DRAM};
805+
std::string directory;
806+
};
807+
808+
//! \brief Tracks which request currently "owns" a partially-matched leaf block across
809+
//! the batch Phase 1 loop, so that at most one request reuses the block in-place
810+
//! while all others copy.
811+
struct PartialClaimTracker
812+
{
813+
struct Entry
814+
{
815+
size_t requestIdx; //!< index of the request that currently owns the reuse
816+
size_t claimedIdx; //!< index into that request's claimedBlocks vector
817+
bool fullyMatched; //!< true once any request fully matches this block
818+
};
819+
820+
//! Keyed by block ID.
821+
std::unordered_map<KVCacheBlock::IdType, Entry> map;
822+
};
823+
824+
//! \brief Batch add sequences with two-phase claim-then-onboard under a single lock.
825+
//! \details Phase 1 claims all matching blocks across all requests (protecting from eviction).
826+
//! Phase 2 onboards host blocks and allocates non-matching blocks.
827+
//! The mCachedBlocksRootMutex is held for the entire operation.
828+
//! \param sequences Per-request GenerationRequest references (parallel with other vectors).
829+
//! \param inputLengths Per-request effective input length.
830+
//! \param numContextBlocksVec Per-request number of context blocks.
831+
//! \param llmRequests Per-request LlmRequest references.
832+
//! \return Per-request BatchSeqStats (prepopulated length plus allocation/reuse/miss counter deltas).
833+
[[nodiscard]] std::vector<BatchSeqStats> addSequenceBatch(std::vector<GenerationRequest*> const& sequences,
834+
std::vector<SizeType32> const& inputLengths, std::vector<SizeType32> const& numContextBlocksVec,
835+
std::vector<std::reference_wrapper<LlmRequest>> const& llmRequests, bool isEnableBlockReuse);
836+
771837
//! \brief Allocate new block for each beam of the sequence.
772838
//! \details Might free cached blocks if no free blocks are available.
773839
void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams);
@@ -1087,6 +1153,25 @@ class WindowBlockManager
10871153
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "",
10881154
bool isEnableBlockReuse = false);
10891155

1156+
//! \brief Phase 1: Walk radix tree and claim matching blocks.
1157+
//! \details Caller must hold mCachedBlocksRootMutex.
1158+
//! Uses \p tracker to coordinate partial-match ownership across requests in
1159+
//! the same batch. \p claimResults is the full vector so that a previous
1160+
//! request's ClaimedBlock can be retroactively marked needsCopy.
1161+
[[nodiscard]] ClaimResult claimMatchingBlocks(GenerationRequest& sequence, SizeType32 inputLength,
1162+
SizeType32 numContextBlocks, LlmRequest& llmRequest, size_t requestIdx, PartialClaimTracker& tracker,
1163+
std::vector<ClaimResult>& claimResults);
1164+
1165+
//! \brief Build ClaimResult metadata without walking the radix tree.
1166+
//! \details Used for non-reuse path where all blocks are freshly allocated.
1167+
[[nodiscard]] ClaimResult buildClaimResultMetadata(
1168+
GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest);
1169+
1170+
//! \brief Phase 2: Onboard claimed host blocks and allocate non-matching blocks.
1171+
//! \details Caller must hold mCachedBlocksRootMutex.
1172+
[[nodiscard]] SizeType32 onboardAndAllocateBlocks(
1173+
GenerationRequest& sequence, LlmRequest& llmRequest, ClaimResult& claimResult, bool isEnableBlockReuse);
1174+
10901175
//! \brief Free block and all its descendants. This makes block a claimed leaf block.
10911176
void freeChildren(BlockPtr const& block);
10921177

@@ -1286,6 +1371,13 @@ class BlockManager
12861371
[[nodiscard]] SizeType32 addSequence(GenerationRequest& sequence, SizeType32 inputLength,
12871372
SizeType32 numContextBlocks, LlmRequest& llmRequest, SizeType32 windowSize, bool isEnableBlockReuse);
12881373

1374+
//! \brief Batch add sequences forwarding to WindowBlockManager::addSequenceBatch.
1375+
[[nodiscard]] std::vector<WindowBlockManager::BatchSeqStats> addSequenceBatch(
1376+
std::vector<GenerationRequest*> const& sequences, std::vector<SizeType32> const& inputLengths,
1377+
std::vector<SizeType32> const& numContextBlocksVec,
1378+
std::vector<std::reference_wrapper<LlmRequest>> const& llmRequests, SizeType32 windowSize,
1379+
bool isEnableBlockReuse);
1380+
12891381
void allocateBlock(GenerationRequest& sequence, SizeType32 windowSize);
12901382

12911383
//! \brief According to request's current position, copy data from the last full block to the next block (ignoring
@@ -1793,6 +1885,18 @@ class BaseKVCacheManager
17931885
OptionalRef<LlmRequest> llmRequest = std::nullopt)
17941886
= 0;
17951887

1888+
//! \brief Batch add sequences with two-phase claim-then-onboard strategy.
1889+
//! \details For each attention window, when block reuse is enabled, Phase 1 claims all matching
1890+
//! blocks across all requests (protecting them from eviction, with PartialClaimTracker coordinating partial-match ownership),
1891+
//! then Phase 2 onboards host blocks and allocates non-matching blocks. When block reuse
1892+
//! is disabled, buildClaimResultMetadata() prepares ClaimResult metadata without radix
1893+
//! tree traversal, and Phase 2 performs fresh allocation only. Supports variable sliding
1894+
//! window attention (VSWA) by iterating over all window sizes.
1895+
virtual void addSequenceBatch(
1896+
std::vector<std::tuple<LlmRequest::RequestIdType, SizeType32, SizeType32>> const& requestInfos,
1897+
std::vector<std::reference_wrapper<LlmRequest>> const& llmRequests)
1898+
= 0;
1899+
17961900
[[nodiscard]] virtual std::optional<KVCacheBlock::IdType> removeSequence(LlmRequest::RequestIdType requestId,
17971901
OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinOnRelease = false)
17981902
= 0;
@@ -2168,6 +2272,10 @@ class KVCacheManager : public BaseKVCacheManager
21682272
void addSequence(LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
21692273
OptionalRef<LlmRequest> llmRequest = std::nullopt) override;
21702274

2275+
void addSequenceBatch(
2276+
std::vector<std::tuple<LlmRequest::RequestIdType, SizeType32, SizeType32>> const& requestInfos,
2277+
std::vector<std::reference_wrapper<LlmRequest>> const& llmRequests) override;
2278+
21712279
[[nodiscard]] std::optional<KVCacheBlock::IdType> removeSequence(LlmRequest::RequestIdType requestId,
21722280
OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinOnRelease = false) override;
21732281

0 commit comments

Comments
 (0)