-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Description
System Info
GPU: H20
Version: 1.2.0rc7
NVIDIA-SMI 535.161.08
Driver Version: 535.161.08
CUDA Version: 13.0
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the
examplesfolder (such as GLUE/SQuAD, ...) - My own task or dataset (give details below)
Reproduction
cat > /workspace/build_qwen.py << 'EOF'
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm import BuildConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.plugin import PluginConfig
def main():
    """Build and save a TensorRT-LLM engine for Qwen3-0.6B with beam-search support.

    Configures the KV cache, plugin options, and build limits, then builds the
    engine from the HF checkpoint and serializes it to disk.
    """
    # Block reuse must stay off here; tokens_per_block matches the plugin config below.
    kv_cache_config = KvCacheConfig(
        enable_block_reuse=False,
        free_gpu_memory_fraction=0.9,
        tokens_per_block=32,
    )

    # Paged KV cache + paged context FMHA are required for the paged attention path.
    plugin_config = PluginConfig()
    plugin_config.tokens_per_block = 32
    plugin_config.use_paged_context_fmha = True
    plugin_config.streamingllm = False
    plugin_config.paged_kv_cache = True

    # NOTE(review): max_beam_width=128 is the value that triggers the reported
    # cudaMemcpy2DAsync "invalid pitch" failure at serve time (beam_width == 50 is ok).
    build_config = BuildConfig(
        max_batch_size=2,
        max_beam_width=128,
        max_seq_len=768,
        plugin_config=plugin_config,
    )

    llm = LLM(
        model="/workspace/checkpoints/qwen3-0.6B",
        dtype="float16",
        build_config=build_config,
        kv_cache_config=kv_cache_config,
    )
    # Serialize the built engine so trtllm-serve can load it directly.
    llm.save("/workspace/checkpoints/qwen3-0.6B-engine-beam")
    print("Build completed!")


# Bug fix: the pasted issue had `if name == 'main':` (markdown ate the dunder
# underscores), so main() was never invoked when running the script.
if __name__ == '__main__':
    main()
EOF
trtllm-serve serve --backend tensorrt --max_beam_width 128 --max_batch_size 8 --max_seq_len 768 --host 0.0.0.0 --port 8000 --free_gpu_memory_fraction 0.9 "/workspace/checkpoints/qwen3-0.6B-engine-beam"
Expected behavior
"POST /v1/chat/completions HTTP/1.1" 200 OK
actual behavior
[TensorRT-LLM][ERROR] Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpy2DAsync( dstPtr, copyPitch, srcPtr, copyPitch, copyWidth, copyHeight, cudaMemcpyHostToDevice, cudaStream.get()): invalid pitch argument (../tensorrt_llm/batch_manager/transformerBuffers.cpp:386)
1 0x7f9699eaa75b void tensorrt_llm::_v1::common::check(cudaError, char const*, char const*, int) + 139
2 0x7f965bf8c64f tensorrt_llm::batch_manager::TransformerBuffers::copyKvBlockOffsets(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const*, tensorrt_llm::runtime::BufferManager const&) + 831
3 0x7f965bf83d34 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 8420
4 0x7f965bf87ade tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 302
5 0x7f965bfa03e3 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::prepareBuffers[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 323
6 0x7f965bfa49ba tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 378
7 0x7f965bfa50d7 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 263
8 0x7f965bfb3edf tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2191
9 0x7f965c1148e9 tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 457
10 0x7f965c120e22 tensorrt_llm::executor::Executor::Impl::executionLoop() + 1426
11 0x7f97f5045db4 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xecdb4) [0x7f97f5045db4]
12 0x7f9a4e6c3aa4 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9caa4) [0x7f9a4e6c3aa4]
13 0x7f9a4e750c6c /usr/lib/x86_64-linux-gnu/libc.so.6(+0x129c6c) [0x7f9a4e750c6c]
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/llmapi/utils.py", line 40, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 474, in _handle_response
handler(response.error_msg)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/executor.py", line 280, in _handle_background_error
raise error
tensorrt_llm.executor.utils.RequestError: Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpy2DAsync( dstPtr, copyPitch, srcPtr, copyPitch, copyWidth, copyHeight, cudaMemcpyHostToDevice, cudaStream.get()): invalid pitch argument (../tensorrt_llm/batch_manager/transformerBuffers.cpp:386)
1 0x7f9699eaa75b void tensorrt_llm::_v1::common::check(cudaError, char const*, char const*, int) + 139
2 0x7f965bf8c64f tensorrt_llm::batch_manager::TransformerBuffers::copyKvBlockOffsets(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager const, tensorrt_llm::runtime::BufferManager const&) + 831
3 0x7f965bf83d34 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 8420
4 0x7f965bf87ade tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::runtime::decoder::DecoderState const&, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> >, std::less, std::allocator<std::pair<unsigned long const, std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, bool, bool, tensorrt_llm::_v1::common::OptionalRef<tensorrt_llm::runtime::ITensor const>) + 302
5 0x7f965bfa03e3 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::prepareBuffers[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 323
6 0x7f965bfa49ba tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 378
7 0x7f965bfa50d7 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 263
8 0x7f965bfb3edf tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2191
9 0x7f965c1148e9 tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 457
10 0x7f965c120e22 tensorrt_llm::executor::Executor::Impl::executionLoop() + 1426
11 0x7f97f5045db4 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xecdb4) [0x7f97f5045db4]
12 0x7f9a4e6c3aa4 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9caa4) [0x7f9a4e6c3aa4]
13 0x7f9a4e750c6c /usr/lib/x86_64-linux-gnu/libc.so.6(+0x129c6c) [0x7f9a4e750c6c]
[03/10/2026-03:31:39] [TRT-LLM] [E] Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/serve/openai_server.py", line 597, in openai_chat
response = await create_chat_response(promise, postproc_params, disaggregated_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/serve/openai_server.py", line 512, in create_chat_response
await promise.aresult()
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 792, in aresult
await self._aresult_step()
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 770, in _aresult_step
self._handle_response(response)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 627, in _handle_response
GenerationResultBase._handle_response(self, response)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/llmapi/utils.py", line 44, in wrapper
raise e
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/llmapi/utils.py", line 40, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py", line 474, in _handle_response
handler(response.error_msg)
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/executor.py", line 280, in _handle_background_error
raise error
additional notes
beam_width == 50 is ok
Before submitting a new issue...
- Make sure you already searched for relevant issues, and checked the documentation and examples for answers to frequently asked questions.