启动命令:CUDA_VISIBLE_DEVICES=0,1,2,3 grpst start server.mar --inference_conf=conf/inference_deepseek-r1-distill-qwen.yml --mpi_np=4
报错信息:
[TensorRT-LLM][ERROR] tensorrt_llm::common::TllmException: [TensorRT-LLM][ERROR] Assertion failed: len <= remaining_buffer_size (/workspace/tensorrt_llm/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp:34)
1 0x7fdd6f903370 tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 95
2 0x7fdd106eb24c tensorrt_llm::kernels::jit::CubinObj::CubinObj(void const*, unsigned long) + 412
3 0x7fdd107017d4 tensorrt_llm::kernels::jit::CubinObjRegistryTemplate<tensorrt_llm::kernels::XQAKernelFullHashKey, tensorrt_llm::kernels::XQAKernelFullHasher>::CubinObjRegistryTemplate(void const*, unsigned long) + 292
4 0x7fdd10701132 tensorrt_llm::kernels::DecoderXQARunner::Resource::Resource(void const*, unsigned long) + 50
5 0x7fdd0405e149 tensorrt_llm::plugins::GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const*, unsigned long) + 1193
6 0x7fdd04095232 tensorrt_llm::plugins::GPTAttentionPlugin::GPTAttentionPlugin(void const*, unsigned long) + 18
7 0x7fdd040952b2 tensorrt_llm::plugins::GPTAttentionPluginCreator::deserializePlugin(char const*, void const*, unsigned long) + 50
8 0x7fdcf5aa6b5b /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11deb5b) [0x7fdcf5aa6b5b]
9 0x7fdcf5aa345e /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11db45e) [0x7fdcf5aa345e]
10 0x7fdcf59d62b7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110e2b7) [0x7fdcf59d62b7]
11 0x7fdcf59d4e6a /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110ce6a) [0x7fdcf59d4e6a]
12 0x7fdcf59eca77 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1124a77) [0x7fdcf59eca77]
13 0x7fdcf59f05b6 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11285b6) [0x7fdcf59f05b6]
14 0x7fdcf59f0b06 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1128b06) [0x7fdcf59f0b06]
15 0x7fdcf5a27fc7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x115ffc7) [0x7fdcf5a27fc7]
16 0x7fdcf5a28bd8 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160bd8) [0x7fdcf5a28bd8]
17 0x7fdcf5a28cdb /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160cdb) [0x7fdcf5a28cdb]
18 0x7fdd1252f275 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(tensorrt_llm::runtime::RawEngine const&, nvinfer1::ILogger*, float, bool) + 1413
19 0x7fdd1298d428 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer1::ILogger>, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::runtime::RawEngine const&, bool, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 1304
20 0x7fdd1291151e tensorrt_llm::batch_manager::TrtGptModelFactory::create(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::batch_manager::TrtGptModelType, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 526
21 0x7fdd12a28029 tensorrt_llm::executor::Executor::Impl::createModel(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::executor::ExecutorConfig const&) + 185
22 0x7fdd12a286bd tensorrt_llm::executor::Executor::Impl::loadModel(std::optional<std::filesystem::__cxx11::path> const&, std::optional<std::basic_string_view<unsigned char, std::char_traits<unsigned char> > > const&, tensorrt_llm::runtime::GptJsonConfig const&, tensorrt_llm::executor::ExecutorConfig const&, bool, std::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorrt_llm::executor::Tensor, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tensorrt_llm::executor::Tensor> > > > const&) + 1229
23 0x7fdd12a2990a tensorrt_llm::executor::Executor::Impl::Impl(std::filesystem::__cxx11::path const&, std::optional<std::filesystem::__cxx11::path> const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 2474
24 0x7fdd12a0f757 tensorrt_llm::executor::Executor::Executor(std::filesystem::__cxx11::path const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 87
25 0x7fdd6f977bfa std::__detail::_MakeUniq<tensorrt_llm::executor::Executor>::__single_object std::make_unique<tensorrt_llm::executor::Executor, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&, tensorrt_llm::executor::ModelType&, tensorrt_llm::executor::ExecutorConfig&>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&, tensorrt_llm::executor::ModelType&, tensorrt_llm::executor::ExecutorConfig&) + 154
26 0x7fdd6f974966 netease::grps::TrtLlmModelInstance::TrtLlmModelInstance(netease::grps::TrtLlmModelState*, netease::grps::LLMStyler*, netease::grps::MultiInstanceTokenizer*, netease::grps::VIT*) + 3542
27 0x7fdd73427eae netease::grps::TrtllmInferer::Load() + 4286
28 0x55b7cafefe2e ./bin/grps_server(+0xf2e2e) [0x55b7cafefe2e]
29 0x55b7caff1e2c ./bin/grps_server(+0xf4e2c) [0x55b7caff1e2c]
30 0x55b7caf82a7a ./bin/grps_server(+0x85a7a) [0x55b7caf82a7a]
31 0x7fdd6ff9a1ca /usr/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca) [0x7fdd6ff9a1ca]
32 0x7fdd6ff9a28b __libc_start_main + 139
33 0x55b7caf87865 ./bin/grps_server(+0x8a865) [0x55b7caf87865]
该模型(engine)在 TensorRT-LLM 中单独测试没有问题,但通过 grps 启动时出现上述报错,麻烦帮忙看一下。
启动命令:CUDA_VISIBLE_DEVICES=0,1,2,3 grpst start server.mar --inference_conf=conf/inference_deepseek-r1-distill-qwen.yml --mpi_np=4
报错信息:
[TensorRT-LLM][ERROR] tensorrt_llm::common::TllmException: [TensorRT-LLM][ERROR] Assertion failed: len <= remaining_buffer_size (/workspace/tensorrt_llm/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp:34)
1 0x7fdd6f903370 tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 95
2 0x7fdd106eb24c tensorrt_llm::kernels::jit::CubinObj::CubinObj(void const*, unsigned long) + 412
3 0x7fdd107017d4 tensorrt_llm::kernels::jit::CubinObjRegistryTemplate<tensorrt_llm::kernels::XQAKernelFullHashKey, tensorrt_llm::kernels::XQAKernelFullHasher>::CubinObjRegistryTemplate(void const*, unsigned long) + 292
4 0x7fdd10701132 tensorrt_llm::kernels::DecoderXQARunner::Resource::Resource(void const*, unsigned long) + 50
5 0x7fdd0405e149 tensorrt_llm::plugins::GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const*, unsigned long) + 1193
6 0x7fdd04095232 tensorrt_llm::plugins::GPTAttentionPlugin::GPTAttentionPlugin(void const*, unsigned long) + 18
7 0x7fdd040952b2 tensorrt_llm::plugins::GPTAttentionPluginCreator::deserializePlugin(char const*, void const*, unsigned long) + 50
8 0x7fdcf5aa6b5b /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11deb5b) [0x7fdcf5aa6b5b]
9 0x7fdcf5aa345e /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11db45e) [0x7fdcf5aa345e]
10 0x7fdcf59d62b7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110e2b7) [0x7fdcf59d62b7]
11 0x7fdcf59d4e6a /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110ce6a) [0x7fdcf59d4e6a]
12 0x7fdcf59eca77 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1124a77) [0x7fdcf59eca77]
13 0x7fdcf59f05b6 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11285b6) [0x7fdcf59f05b6]
14 0x7fdcf59f0b06 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1128b06) [0x7fdcf59f0b06]
15 0x7fdcf5a27fc7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x115ffc7) [0x7fdcf5a27fc7]
16 0x7fdcf5a28bd8 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160bd8) [0x7fdcf5a28bd8]
17 0x7fdcf5a28cdb /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160cdb) [0x7fdcf5a28cdb]
18 0x7fdd1252f275 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(tensorrt_llm::runtime::RawEngine const&, nvinfer1::ILogger*, float, bool) + 1413
19 0x7fdd1298d428 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer1::ILogger>, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::runtime::RawEngine const&, bool, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 1304
20 0x7fdd1291151e tensorrt_llm::batch_manager::TrtGptModelFactory::create(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::batch_manager::TrtGptModelType, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 526
21 0x7fdd12a28029 tensorrt_llm::executor::Executor::Impl::createModel(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::executor::ExecutorConfig const&) + 185
22 0x7fdd12a286bd tensorrt_llm::executor::Executor::Impl::loadModel(std::optional<std::filesystem::__cxx11::path> const&, std::optional<std::basic_string_view<unsigned char, std::char_traits<unsigned char> > > const&, tensorrt_llm::runtime::GptJsonConfig const&, tensorrt_llm::executor::ExecutorConfig const&, bool, std::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorrt_llm::executor::Tensor, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tensorrt_llm::executor::Tensor> > > > const&) + 1229
23 0x7fdd12a2990a tensorrt_llm::executor::Executor::Impl::Impl(std::filesystem::__cxx11::path const&, std::optional<std::filesystem::__cxx11::path> const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 2474
24 0x7fdd12a0f757 tensorrt_llm::executor::Executor::Executor(std::filesystem::__cxx11::path const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 87
25 0x7fdd6f977bfa std::__detail::_MakeUniq<tensorrt_llm::executor::Executor>::__single_object std::make_unique<tensorrt_llm::executor::Executor, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&, tensorrt_llm::executor::ModelType&, tensorrt_llm::executor::ExecutorConfig&>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&, tensorrt_llm::executor::ModelType&, tensorrt_llm::executor::ExecutorConfig&) + 154
26 0x7fdd6f974966 netease::grps::TrtLlmModelInstance::TrtLlmModelInstance(netease::grps::TrtLlmModelState*, netease::grps::LLMStyler*, netease::grps::MultiInstanceTokenizer*, netease::grps::VIT*) + 3542
27 0x7fdd73427eae netease::grps::TrtllmInferer::Load() + 4286
28 0x55b7cafefe2e ./bin/grps_server(+0xf2e2e) [0x55b7cafefe2e]
29 0x55b7caff1e2c ./bin/grps_server(+0xf4e2c) [0x55b7caff1e2c]
30 0x55b7caf82a7a ./bin/grps_server(+0x85a7a) [0x55b7caf82a7a]
31 0x7fdd6ff9a1ca /usr/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca) [0x7fdd6ff9a1ca]
32 0x7fdd6ff9a28b __libc_start_main + 139
33 0x55b7caf87865 ./bin/grps_server(+0x8a865) [0x55b7caf87865]
该模型(engine)在 TensorRT-LLM 中单独测试没有问题,但通过 grps 启动时出现上述报错,麻烦帮忙看一下。