@@ -768,6 +768,72 @@ class WindowBlockManager
768768 [[nodiscard]] SizeType32 addSequence (GenerationRequest& sequence, SizeType32 inputLength,
769769 SizeType32 numContextBlocks, LlmRequest& llmRequest, bool isEnableBlockReuse);
770770
771+ // ! \brief Per-request block allocation statistics from batch addSequence.
772+ struct BatchSeqStats
773+ {
774+ SizeType32 prepopulatedLen{0 };
775+ SizeType32 allocTotalDelta{0 };
776+ SizeType32 allocNewDelta{0 };
777+ SizeType32 reusedDelta{0 };
778+ SizeType32 missedDelta{0 };
779+ };
780+
781+ // ! \brief Result of Phase 1 (claim-only) of batch addSequence.
782+ // ! \details Holds matched blocks and prepared data so Phase 2 can proceed without
783+ // ! re-traversing the radix tree.
784+ struct ClaimResult
785+ {
786+ struct ClaimedBlock
787+ {
788+ BlockPtr block;
789+ SizeType32 numMatchedTokens; // !< tokens matched in this block
790+ bool isPartialMatch;
791+ bool needsCopy; // !< partial match on block with refs or non-leaf (needs getFreeBlock + copy in Phase 2)
792+ bool isPlaceholder; // !< placeholder block (linear attention recurrent states)
793+ bool shouldReleaseCopySource{false }; // !< last copier releases the claimed source after copy
794+ };
795+
796+ std::vector<ClaimedBlock> claimedBlocks;
797+ SizeType32 totalMatchedTokens{0 };
798+ SizeType32 latestMatchingNonPlaceholderBlockIdx{-1 };
799+ SizeType32 numSharedContextBlocks{0 };
800+ SizeType32 numContextBlocks{0 };
801+ bool shareLastContextBlockAmongBeams{true };
802+ std::vector<BlockKey> blockKeys;
803+ std::vector<executor::RetentionPriorityAndDuration> perBlockRetentions;
804+ executor::KvCacheTransferMode mode{executor::KvCacheTransferMode::DRAM};
805+ std::string directory;
806+ };
807+
808+ // ! \brief Tracks which request currently "owns" a partially-matched leaf block across
809+ // ! the batch Phase 1 loop, so that at most one request reuses the block in-place
810+ // ! while all others copy.
811+ struct PartialClaimTracker
812+ {
813+ struct Entry
814+ {
815+ size_t requestIdx; // !< index of the request that currently owns the reuse
816+ size_t claimedIdx; // !< index into that request's claimedBlocks vector
817+ bool fullyMatched; // !< true once any request fully matches this block
818+ };
819+
820+ // ! Keyed by block ID.
821+ std::unordered_map<KVCacheBlock::IdType, Entry> map;
822+ };
823+
824+ // ! \brief Batch add sequences with two-phase claim-then-onboard under a single lock.
825+ // ! \details Phase 1 claims all matching blocks across all requests (protecting them from eviction).
826+ // ! Phase 2 onboards host blocks and allocates non-matching blocks.
827+ // ! The mCachedBlocksRootMutex is held for the entire operation.
828+ // ! \param sequences Per-request GenerationRequest pointers (parallel with the other vectors).
829+ // ! \param inputLengths Per-request effective input length.
830+ // ! \param numContextBlocksVec Per-request number of context blocks.
831+ // ! \param llmRequests Per-request LlmRequest references.
// ! \param isEnableBlockReuse Whether block reuse is enabled for this batch.
// ! NOTE(review): presumably selects between the claimMatchingBlocks and
// ! buildClaimResultMetadata paths in Phase 1 - confirm against the implementation.
832+ // ! \return Per-request BatchSeqStats (block allocation statistics, including prepopulatedLen).
833+ [[nodiscard]] std::vector<BatchSeqStats> addSequenceBatch (std::vector<GenerationRequest*> const & sequences,
834+ std::vector<SizeType32> const & inputLengths, std::vector<SizeType32> const & numContextBlocksVec,
835+ std::vector<std::reference_wrapper<LlmRequest>> const & llmRequests, bool isEnableBlockReuse);
836+
771837 // ! \brief Allocate new block for each beam of the sequence.
772838 // ! \details Might free cached blocks if no free blocks are available.
// ! \param sequence Sequence whose beams receive the new block.
// ! \param shareAmongBeams NOTE(review): presumably true means a single block is shared by
// ! all beams instead of one block per beam - confirm against the implementation.
773839 void allocateBlock (GenerationRequest& sequence, bool shareAmongBeams);
@@ -1087,6 +1153,25 @@ class WindowBlockManager
10871153 executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const & directory = " " ,
10881154 bool isEnableBlockReuse = false );
10891155
1156+ // ! \brief Phase 1: Walk radix tree and claim matching blocks.
1157+ // ! \details Caller must hold mCachedBlocksRootMutex.
1158+ // ! Uses \p tracker to coordinate partial-match ownership across requests in
1159+ // ! the same batch. \p claimResults is the full vector so that a previous
1160+ // ! request's ClaimedBlock can be retroactively marked needsCopy.
// ! \param sequence The request's GenerationRequest.
// ! \param inputLength Input length of the request.
// ! \param numContextBlocks Number of context blocks of the request.
// ! \param llmRequest The request's LlmRequest.
// ! \param requestIdx Index of this request within the batch. NOTE(review): presumably the
// ! same index space as PartialClaimTracker::Entry::requestIdx and \p claimResults - confirm.
// ! \param tracker Batch-wide partial-claim ownership state; taken by non-const reference.
// ! \param claimResults Claim results of the whole batch; taken by non-const reference.
// ! \return The ClaimResult built for this request.
1161+ [[nodiscard]] ClaimResult claimMatchingBlocks (GenerationRequest& sequence, SizeType32 inputLength,
1162+ SizeType32 numContextBlocks, LlmRequest& llmRequest, size_t requestIdx, PartialClaimTracker& tracker,
1163+ std::vector<ClaimResult>& claimResults);
1164+
1165+ // ! \brief Build ClaimResult metadata without walking the radix tree.
1166+ // ! \details Used for non-reuse path where all blocks are freshly allocated.
// ! Takes the same per-request arguments as claimMatchingBlocks, without the batch
// ! coordination arguments (requestIdx, tracker, claimResults).
// ! \return A ClaimResult for this request. NOTE(review): presumably with an empty
// ! claimedBlocks vector, since nothing is matched on this path - confirm.
1167+ [[nodiscard]] ClaimResult buildClaimResultMetadata (
1168+ GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest);
1169+
1170+ // ! \brief Phase 2: Onboard claimed host blocks and allocate non-matching blocks.
1171+ // ! \details Caller must hold mCachedBlocksRootMutex.
// ! \param sequence The request's GenerationRequest.
// ! \param llmRequest The request's LlmRequest.
// ! \param claimResult Phase 1 output for this request; taken by non-const reference, so it
// ! may be modified by this call.
// ! \param isEnableBlockReuse Whether block reuse is enabled.
// ! \return NOTE(review): the meaning of the returned SizeType32 is not stated here;
// ! presumably the prepopulated prompt length of the request - confirm.
1172+ [[nodiscard]] SizeType32 onboardAndAllocateBlocks (
1173+ GenerationRequest& sequence, LlmRequest& llmRequest, ClaimResult& claimResult, bool isEnableBlockReuse);
1174+
10901175 // ! \brief Free block and all its descendants. This makes block a claimed leaf block.
10911176 void freeChildren (BlockPtr const & block);
10921177
@@ -1286,6 +1371,13 @@ class BlockManager
12861371 [[nodiscard]] SizeType32 addSequence (GenerationRequest& sequence, SizeType32 inputLength,
12871372 SizeType32 numContextBlocks, LlmRequest& llmRequest, SizeType32 windowSize, bool isEnableBlockReuse);
12881373
1374+ // ! \brief Batch add sequences forwarding to WindowBlockManager::addSequenceBatch.
// ! \param sequences Per-request GenerationRequest pointers.
// ! \param inputLengths Per-request input lengths.
// ! \param numContextBlocksVec Per-request number of context blocks.
// ! \param llmRequests Per-request LlmRequest references.
// ! \param windowSize NOTE(review): presumably selects the WindowBlockManager that receives
// ! the call - confirm against the implementation.
// ! \param isEnableBlockReuse Whether block reuse is enabled.
// ! \return Per-request WindowBlockManager::BatchSeqStats.
1375+ [[nodiscard]] std::vector<WindowBlockManager::BatchSeqStats> addSequenceBatch (
1376+ std::vector<GenerationRequest*> const & sequences, std::vector<SizeType32> const & inputLengths,
1377+ std::vector<SizeType32> const & numContextBlocksVec,
1378+ std::vector<std::reference_wrapper<LlmRequest>> const & llmRequests, SizeType32 windowSize,
1379+ bool isEnableBlockReuse);
1380+
// ! \brief Allocate a block for \p sequence for the given \p windowSize.
// ! NOTE(review): presumably forwards to the WindowBlockManager for \p windowSize - confirm.
12891381 void allocateBlock (GenerationRequest& sequence, SizeType32 windowSize);
12901382
12911383 // ! \brief According to request's current position, copy data from the last full block to the next block (ignoring
@@ -1793,6 +1885,18 @@ class BaseKVCacheManager
17931885 OptionalRef<LlmRequest> llmRequest = std::nullopt )
17941886 = 0;
17951887
1888+ // ! \brief Batch add sequences with two-phase claim-then-onboard strategy.
1889+ // ! \details For each attention window, when block reuse is enabled, Phase 1 claims all matching
1890+ // ! blocks across all requests (protecting them from eviction; PartialClaimTracker coordinates
// ! which request reuses a partially-matched block in-place),
1891+ // ! then Phase 2 onboards host blocks and allocates non-matching blocks. When block reuse
1892+ // ! is disabled, buildClaimResultMetadata() prepares ClaimResult metadata without radix
1893+ // ! tree traversal, and Phase 2 performs fresh allocation only. Supports variable sliding
1894+ // ! window attention (VSWA) by iterating over all window sizes.
// ! \param requestInfos One tuple per request. NOTE(review): presumably
// ! (requestId, inputLength, beamWidth), mirroring the addSequence parameters - confirm.
// ! \param llmRequests Per-request LlmRequest references. NOTE(review): presumably parallel
// ! with \p requestInfos - confirm.
1895+ virtual void addSequenceBatch (
1896+ std::vector<std::tuple<LlmRequest::RequestIdType, SizeType32, SizeType32>> const & requestInfos,
1897+ std::vector<std::reference_wrapper<LlmRequest>> const & llmRequests)
1898+ = 0;
1899+
// ! \brief Remove the sequence identified by \p requestId (pure virtual).
// ! \param llmRequest Optional request reference; defaults to std::nullopt.
// ! \param pinOnRelease Defaults to false. NOTE(review): presumably keeps the released
// ! blocks pinned - confirm against the implementation.
// ! \return An optional block id. NOTE(review): which block it identifies is not stated here.
17961900 [[nodiscard]] virtual std::optional<KVCacheBlock::IdType> removeSequence (LlmRequest::RequestIdType requestId,
17971901 OptionalRef<LlmRequest const > llmRequest = std::nullopt , bool pinOnRelease = false )
17981902 = 0;
@@ -2168,6 +2272,10 @@ class KVCacheManager : public BaseKVCacheManager
21682272 void addSequence (LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
21692273 OptionalRef<LlmRequest> llmRequest = std::nullopt ) override ;
21702274
// ! \brief Batch add sequences; overrides BaseKVCacheManager::addSequenceBatch.
// ! \param requestInfos One tuple per request. NOTE(review): presumably
// ! (requestId, inputLength, beamWidth) - confirm.
// ! \param llmRequests Per-request LlmRequest references.
2275+ void addSequenceBatch (
2276+ std::vector<std::tuple<LlmRequest::RequestIdType, SizeType32, SizeType32>> const & requestInfos,
2277+ std::vector<std::reference_wrapper<LlmRequest>> const & llmRequests) override ;
2278+
// ! \brief Remove the sequence identified by \p requestId; overrides the base-class removeSequence.
// ! \return An optional block id. NOTE(review): which block it identifies is not stated here.
21712279 [[nodiscard]] std::optional<KVCacheBlock::IdType> removeSequence (LlmRequest::RequestIdType requestId,
21722280 OptionalRef<LlmRequest const > llmRequest = std::nullopt , bool pinOnRelease = false ) override ;
21732281
0 commit comments