Skip to content

[Bug]: BeamSearch with beam_width 128 Encountered an error in cudaMemcpy2DAsync #12071

@Doloxetine

Description

@Doloxetine

System Info

GPU: H20
Version: 1.2.0rc7
NVIDIA-SMI 535.161.08
Driver Version: 535.161.08
CUDA Version: 13.0

Who can help?

No response

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

cat > /workspace/build_qwen.py << 'EOF'
# Reproduction script: build a TensorRT-LLM engine for Qwen3-0.6B with
# beam search enabled at max_beam_width=128 (the width that triggers the
# cudaMemcpy2DAsync "invalid pitch argument" error at serve time).
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm import BuildConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.plugin import PluginConfig


def main():
    # Paged KV cache; block reuse disabled for beam search.
    kv_cache_config = KvCacheConfig(
        enable_block_reuse=False,
        free_gpu_memory_fraction=0.9,
        tokens_per_block=32,
    )

    # Plugin settings must match the KV cache block size above.
    plugin_config = PluginConfig()
    plugin_config.tokens_per_block = 32
    plugin_config.use_paged_context_fmha = True
    plugin_config.streamingllm = False
    plugin_config.paged_kv_cache = True

    build_config = BuildConfig(
        max_batch_size=2,
        max_beam_width=128,  # beam_width <= 50 works; 128 reproduces the crash
        max_seq_len=768,
        plugin_config=plugin_config,
    )

    llm = LLM(
        model="/workspace/checkpoints/qwen3-0.6B",
        dtype="float16",
        build_config=build_config,
        kv_cache_config=kv_cache_config,
    )

    llm.save("/workspace/checkpoints/qwen3-0.6B-engine-beam")
    print("Build completed!")


# NOTE: the issue paste lost the double underscores (markdown formatting);
# the guard must be __name__ == '__main__' for the script to run.
if __name__ == '__main__':
    main()
EOF

trtllm-serve serve --backend tensorrt --max_beam_width 128 --max_batch_size 8 --max_seq_len 768 --host 0.0.0.0 --port 8000 --free_gpu_memory_fraction 0.9 "/workspace/checkpoints/qwen3-0.6B-engine-beam"

Expected behavior

"POST /v1/chat/completions HTTP/1.1" 200 OK

actual behavior

[TensorRT-LLM][ERROR] Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpy2DAsync( dstPtr, copyPitch, srcPtr, copyPitch, copyWidth, copyHeight, cudaMemcpyHostToDevice, cudaStream.get()): invalid pitch argument (../tensorrt_llm/batch_manager/transformerBuffers.cpp:386)
1 0x7f9699eaa75b void tensorrt_llm::_v1::common::check(cudaError, char const*, char const*, int) + 139
2 0x7f965bf8c64f tensorrt_llm::batch_manager::TransformerBuffers::copyKvBlockOffsets(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const*, tensorrt_llm::runtime::BufferManager const&) + 831
3 0x7f965bf83d34 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 8420
4 0x7f965bf87ade tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 302
5 0x7f965bfa03e3 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::prepareBuffers[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 323
6 0x7f965bfa49ba tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 378
7 0x7f965bfa50d7 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 263
8 0x7f965bfb3edf tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2191
9 0x7f965c1148e9 tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 457
10 0x7f965c120e22 tensorrt_llm::executor::Executor::Impl::executionLoop() + 1426
11 0x7f97f5045db4 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xecdb4) [0x7f97f5045db4]
12 0x7f9a4e6c3aa4 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9caa4) [0x7f9a4e6c3aa4]
13 0x7f9a4e750c6c /usr/lib/x86_64-linux-gnu/libc.so.6(+0x129c6c) [0x7f9a4e750c6c]
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/llmapi/utils.py", line 40, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 474, in _handle_response
handler(response.error_msg)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/executor.py", line 280, in _handle_background_error
raise error
tensorrt_llm.executor.utils.RequestError: Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpy2DAsync( dstPtr, copyPitch, srcPtr, copyPitch, copyWidth, copyHeight, cudaMemcpyHostToDevice, cudaStream.get()): invalid pitch argument (../tensorrt_llm/batch_manager/transformerBuffers.cpp:386)
1 0x7f9699eaa75b void tensorrt_llm::_v1::common::check(cudaError, char const*, char const*, int) + 139
2 0x7f965bf8c64f tensorrt_llm::batch_manager::TransformerBuffers::copyKvBlockOffsets(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const*, tensorrt_llm::runtime::BufferManager const&) + 831
3 0x7f965bf83d34 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 8420
4 0x7f965bf87ade tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 302
5 0x7f965bfa03e3 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::prepareBuffers[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 323
6 0x7f965bfa49ba tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 378
7 0x7f965bfa50d7 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 263
8 0x7f965bfb3edf tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2191
9 0x7f965c1148e9 tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 457
10 0x7f965c120e22 tensorrt_llm::executor::Executor::Impl::executionLoop() + 1426
11 0x7f97f5045db4 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xecdb4) [0x7f97f5045db4]
12 0x7f9a4e6c3aa4 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9caa4) [0x7f9a4e6c3aa4]
13 0x7f9a4e750c6c /usr/lib/x86_64-linux-gnu/libc.so.6(+0x129c6c) [0x7f9a4e750c6c]
[03/10/2026-03:31:39] [TRT-LLM] [E] Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/serve/openai_server.py", line 597, in openai_chat
response = await create_chat_response(promise, postproc_params, disaggregated_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/serve/openai_server.py", line 512, in create_chat_response
await promise.aresult()
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 792, in aresult
await self._aresult_step()
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 770, in _aresult_step
self._handle_response(response)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 627, in _handle_response
GenerationResultBase._handle_response(self, response)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/llmapi/utils.py", line 44, in wrapper
raise e
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/llmapi/utils.py", line 40, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 474, in _handle_response
handler(response.error_msg)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/executor.py", line 280, in _handle_background_error
raise error

additional notes

beam_width == 50 is ok

Before submitting a new issue...

  • Make sure you already searched for relevant issues, and checked the documentation and examples for answers to frequently asked questions.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions