 #include "kvCacheManagerV2Utils.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/envUtils.h"
+#include "tensorrt_llm/common/memoryUtils.h"
 #include <algorithm>
 #include <array>
 #include <cassert>
 #include <cuda_runtime.h>
+#include <vector>

 namespace tensorrt_llm::batch_manager::kv_cache_manager_v2
 {
 using Grain = uint4;
 constexpr uint32_t ctaSize = 128;
+constexpr uint32_t copyBlockCtaSize = 128;
+constexpr uint32_t copyBlocknbBufs = 2;
 constexpr uint32_t nbBufs = 4;
 constexpr uint32_t grainBytes = sizeof(Grain);

@@ -162,4 +167,139 @@ CUresult copyDeviceToDevice(std::vector<MMTask> const& tasks, ssize_t numBytes,
     return launchBatchedCopy(false, tasks, numBytes, stream);
 }

+// Gather block offsets for the sequences selected by copyIndex:
+//   dst_tensor[:, :num_seqs, 0, :] = src_tensor[:, copy_idx, 0, :]
+//   dst_tensor[:, :num_seqs, 1, :] = dst_tensor[:, :num_seqs, 0, :] + 1   (only when COPY_V_IDX)
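+// The kernel streams each row through shared memory with cp.async double buffering (copyBlocknbBufs stages),
+// overlapping the global load of iteration i with the store of the row fetched copyBlocknbBufs iterations earlier.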
+template <bool COPY_V_IDX = true>
+__global__ void copyBatchBlockOffsetsToDeviceKernel(SizeType32 const* __restrict__ srcPtr,
+    SizeType32* __restrict__ dstPtr, SizeType32 const maxNumSequences, SizeType32 numBlocksPerSeq,
+    SizeType32 const* __restrict__ copyIndex)
+{
+    constexpr uint32_t kvFactor = 2;
+    constexpr auto elemPerAccess = sizeof(PackedInt) / sizeof(SizeType32);
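+    // Each thread moves one PackedInt (elemPerAccess block offsets) per pipeline stage.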
+
+    __shared__ PackedInt data[copyBlocknbBufs][copyBlockCtaSize];
+
+    auto const iterPerSeq = divUp(numBlocksPerSeq * sizeof(SizeType32), sizeof(PackedInt) * copyBlockCtaSize);
+    auto const tid = threadIdx.x;
+    auto const poolIdx = blockIdx.x;
+    auto const seqIdx = blockIdx.y;
+    auto const seqDimStride = kvFactor * numBlocksPerSeq;
+    uint32_t const srcIdxBeg = tid * elemPerAccess + (poolIdx * maxNumSequences + copyIndex[seqIdx]) * seqDimStride;
+    uint32_t const dstIdxKBeg = tid * elemPerAccess + (poolIdx * maxNumSequences + seqIdx) * seqDimStride;
+    uint32_t const dstIdxVBeg = dstIdxKBeg + numBlocksPerSeq;
+
+    uint32_t const srcIdxEnd = (poolIdx * maxNumSequences + copyIndex[seqIdx]) * seqDimStride + numBlocksPerSeq;
+
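+    // Software pipeline over iterPerSeq + copyBlocknbBufs iterations: the first copyBlocknbBufs iterations only
+    // prefetch and the last copyBlocknbBufs iterations only drain; in between, each iteration stores the buffer
+    // filled copyBlocknbBufs iterations earlier while issuing the next asynchronous load.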
+    for (uint32_t i = 0; i < iterPerSeq + copyBlocknbBufs; i++)
+    {
+        uint32_t const idxBuf = i % copyBlocknbBufs;
+        if (i >= copyBlocknbBufs)
+        {
+            uint32_t const stIter = i - copyBlocknbBufs;
+            assert(idxBuf == (stIter % copyBlocknbBufs));
+            auto const offset = copyBlockCtaSize * stIter * elemPerAccess;
+            SizeType32 const srcIdx = srcIdxBeg + offset;
+            SizeType32 const dstIdxK = dstIdxKBeg + offset;
+            SizeType32 const dstIdxV = dstIdxVBeg + offset;
+            PackedInt const& src = data[idxBuf][tid];
+            PackedInt& dstK = *reinterpret_cast<PackedInt*>(dstPtr + dstIdxK);
+            PackedInt& dstV = *reinterpret_cast<PackedInt*>(dstPtr + dstIdxV);
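+            // Wait until at most copyBlocknbBufs - 1 cp.async groups remain in flight, which guarantees that the
+            // group that filled this buffer (issued copyBlocknbBufs iterations ago) has landed in shared memory.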
+            asm volatile("cp.async.wait_group %0;\n" ::"n"(copyBlocknbBufs - 1) : "memory");
+            if (srcIdx < srcIdxEnd)
+            {
+                dstK = src;
+                if (COPY_V_IDX)
+                {
+#pragma unroll
+                    for (uint32_t j = 0; j < elemPerAccess; j++)
+                    {
+                        dstV.unpacked[j] = src.unpacked[j] + 1;
+                    }
+                }
+            }
+        }
+        uint32_t const ldIter = i;
+        PackedInt* const dst = &data[idxBuf][tid];
+        uint32_t const srcIdx = srcIdxBeg + copyBlockCtaSize * ldIter * elemPerAccess;
+        PackedInt const* const src = reinterpret_cast<PackedInt const*>(srcPtr + srcIdx);
+        if (srcIdx < srcIdxEnd)
+        {
+            uint32_t const size = sizeof(PackedInt);
+            asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"l"(__cvta_generic_to_shared(dst)),
+                "l"(src), "n"(size), "r"(size)
+                : "memory");
+        }
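+        // A group is committed every iteration, even when no load was issued, so the wait_group accounting
+        // above stays in sync with the loop index.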
+        asm volatile("cp.async.commit_group;\n" : : : "memory");
+    }
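+    // On Hopper (SM90+), signal programmatic launch completion so a dependent grid launched with PDL
+    // can start before this kernel has fully exited.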
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif
+}
+
+// Host-side launcher
+void copyBatchBlockOffsetsToDevice(
+    ITensor const& input, ITensor& output, ITensor const& copyIndex, bool copyVIdx, CUstream stream) noexcept
+{
+    using namespace tensorrt_llm::runtime;
+
+    // Both input and output are laid out as [numPools, maxNumSequences, kvFactor, numBlocksPerSeq].
+    auto const* srcPtr = bufferCast<tk::KVCacheIndex::UnderlyingType const>(input);
+    auto* dstPtr = bufferCast<tk::KVCacheIndex::UnderlyingType>(output);
+    auto const* copyIndexPtr = bufferCast<SizeType32 const>(copyIndex);
+    auto const& srcShape = input.getShape();
+    auto const& dstShape = output.getShape();
+    auto const& copyIndexShape = copyIndex.getShape();
+
+    TLLM_CHECK(srcShape.nbDims == 4); // [numPools, maxNumSequences, kvFactor, numBlocksPerSeq]
+    TLLM_CHECK(dstShape.nbDims == 4); // [numPools, maxNumSequences, kvFactor, numBlocksPerSeq]
+
+    SizeType32 numPools = srcShape.d[0];
+    SizeType32 maxNumSequences = srcShape.d[1];
+    SizeType32 numBlocksPerSeq = srcShape.d[3];
+    SizeType32 numSeqs = copyIndexShape.d[0];
+
+    if (numSeqs == 0)
+    {
+        return;
+    }
+
+    TLLM_CHECK_WITH_INFO((numBlocksPerSeq * sizeof(SizeType32)) % sizeof(PackedInt) == 0,
+        "Not implemented case: numBlocksPerSeq * sizeof(SizeType32) = %zu must be a multiple of %zu.",
+        static_cast<size_t>(numBlocksPerSeq * sizeof(SizeType32)), static_cast<size_t>(sizeof(PackedInt)));
+
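+    // One CTA per (pool, sequence) pair; each CTA gathers one row of block offsets.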
+    dim3 gridDim(numPools, numSeqs, 1);
+    dim3 blockDim(copyBlockCtaSize);
+
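+    // The template flag selects whether the V offsets (K offset + 1) are written as well.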
+    if (copyVIdx)
+    {
+        copyBatchBlockOffsetsToDeviceKernel<true>
+            <<<gridDim, blockDim, 0, stream>>>(srcPtr, dstPtr, maxNumSequences, numBlocksPerSeq, copyIndexPtr);
+    }
+    else
+    {
+        copyBatchBlockOffsetsToDeviceKernel<false>
+            <<<gridDim, blockDim, 0, stream>>>(srcPtr, dstPtr, maxNumSequences, numBlocksPerSeq, copyIndexPtr);
+    }
+}
+
+IndexMapper::IndexMapper(SizeType32 maxBatchSize, SizeType32 maxBeamWidth)
+    : maxBatchSize_(maxBatchSize)
+    , maxBeamWidth_(maxBeamWidth)
+{
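+    // Every slot index in [0, maxBatchSize) starts out free.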
+    indexMap_.reserve(maxBatchSize);
+    for (SizeType32 i = 0; i < maxBatchSize; i++)
+    {
+        freeIndices_.insert(i);
+    }
+    // Allocate copyIndex_ as pinned (page-locked) host memory so it can be copied to the device asynchronously.
+    TLLM_CUDA_CHECK(cudaMallocHost(&copyIndex_, maxBatchSize * maxBeamWidth * sizeof(SizeType32)));
+}
+
+IndexMapper::~IndexMapper()
+{
+    indexMap_.clear();
+    freeIndices_.clear();
+    TLLM_CUDA_CHECK(cudaFreeHost(copyIndex_));
+}
+
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager_v2