4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -2726,8 +2726,8 @@ static-analysis-files: &static_analysis_files |
tests/unittest/_torch/sampler/test_beam_search_util.py |
)$

# Global exclude pattern for vendored third-party code
exclude: '(^triton_kernels/|trtllmGenKernels/fmha/cubin/kernelMetaInfo\.h$|cubin\.cpp$|cubin\.h$)'
# Global exclude: vendored code + trtllm-gen FMHA artifacts (cubin pointers, export headers, cuda_ptx)
exclude: '(^triton_kernels/|trtllmGenKernels/fmha/cubin/kernelMetaInfo\.h$|cubin\.cpp$|cubin\.h$|trtllmGenKernels/fmha/trtllmGen_fmha_export/|trtllmGenKernels/fmha/cuda_ptx/)'

default_install_hook_types: [pre-commit, commit-msg]
repos:
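The new exclude entries keep pre-commit hooks away from the freshly vendored trtllmGen_fmha_export/ and cuda_ptx/ trees. As a quick sanity check of how the pattern behaves under regex-search semantics, here is a minimal standalone sketch; the sample paths are hypothetical illustrations, not files taken from this PR:

```cpp
// Sketch: test sample paths against the updated pre-commit exclude pattern.
// pre-commit uses Python re.search; std::regex_search is close enough here.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
    std::regex const exclude(
        R"((^triton_kernels/|trtllmGenKernels/fmha/cubin/kernelMetaInfo\.h$|cubin\.cpp$|cubin\.h$|trtllmGenKernels/fmha/trtllmGen_fmha_export/|trtllmGenKernels/fmha/cuda_ptx/))");
    std::vector<std::string> const samples{
        "cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cuda_ptx/foo.h",   // excluded (new)
        "cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/trtllmGen_fmha_export/KernelParams.h", // excluded (new)
        "cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp",                     // still checked
    };
    for (auto const& path : samples)
    {
        std::cout << path << " -> "
                  << (std::regex_search(path, exclude) ? "excluded" : "checked") << "\n";
    }
}
```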
33 changes: 29 additions & 4 deletions cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu
@@ -4,7 +4,7 @@
* and https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_update.cu
* Copyright (c) 2024, Tri Dao.
*
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -349,20 +349,45 @@ void causal_conv1d_fwd_launch(ConvParamsBase& params, cudaStream_t stream)
});
}

template <int kWidth, typename input_t, typename weight_t>
void causal_conv1d_fwd_dispatch(ConvParamsBase& params, cudaStream_t stream)
{
bool const isVarlen = params.query_start_loc_ptr != nullptr;
constexpr int kNarrowThreads = 64;
constexpr int kWideThreads = 128;
constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
constexpr int kShortSeqThreshold = kNarrowThreads * kNElts;
// Varlen prefill launches one block per sequence/channel pair, so the per-sequence
// work is usually much smaller than params.seqlen suggests. That path also disables
// the wide vector-load specialization, so the 128-thread kernel tends to overprovision
// threads for many short chunks. Prefer the narrower launch for varlen and for short
// fixed-length inputs; keep the wider launch for long dense sequences.
bool const preferNarrowKernel = isVarlen || params.seqlen <= kShortSeqThreshold;

if (preferNarrowKernel)
{
causal_conv1d_fwd_launch<kNarrowThreads, kWidth, input_t, weight_t>(params, stream);
}
else
{
causal_conv1d_fwd_launch<kWideThreads, kWidth, input_t, weight_t>(params, stream);
}
}

template <typename input_t, typename weight_t>
void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream)
{
if (params.width == 2)
{
causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream);
causal_conv1d_fwd_dispatch<2, input_t, weight_t>(params, stream);
}
else if (params.width == 3)
{
causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream);
causal_conv1d_fwd_dispatch<3, input_t, weight_t>(params, stream);
}
else if (params.width == 4)
{
causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream);
causal_conv1d_fwd_dispatch<4, input_t, weight_t>(params, stream);
}
}

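The short-sequence threshold in the new dispatch is pure arithmetic on the element width. A minimal standalone sketch of that arithmetic follows; the template and the 16-bit stand-in type are illustrative, not the real kernel types:

```cpp
// Standalone sketch of kShortSeqThreshold from causal_conv1d_fwd_dispatch.
// `short` stands in for fp16/bf16 inputs; only sizeof(input_t) matters here.
#include <cstdio>

template <typename input_t>
constexpr int shortSeqThreshold()
{
    constexpr int kNarrowThreads = 64;
    constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8; // 4-byte -> 4, 2-byte -> 8
    return kNarrowThreads * kNElts;
}

int main()
{
    std::printf("fp32 threshold: %d tokens\n", shortSeqThreshold<float>()); // 256
    std::printf("fp16 threshold: %d tokens\n", shortSeqThreshold<short>()); // 512
    // Sequences at or below the threshold (and all varlen prefill batches)
    // take the 64-thread kernel; longer dense sequences keep 128 threads.
}
```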
18 changes: 15 additions & 3 deletions cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -111,11 +111,22 @@ bool FmhaDispatcher::isSupported()
tllmRunnerParams.mKernelType = FmhaKernelType::Context;
tllmRunnerParams.mTileScheduler = TileScheduler::Persistent;
tllmRunnerParams.mMultiCtasKvMode = false;
tllmRunnerParams.mNumHeadsQ = mFixedParams.numQHeads;
tllmRunnerParams.mNumHeadsKv = mFixedParams.numKvHeads;
tllmRunnerParams.mHeadDimQkNope = mFixedParams.headSizeQkNope;
tllmRunnerParams.mBatchSize = 1;
tllmRunnerParams.mMaxSeqLenQ = 1;
tllmRunnerParams.mMaxSeqLenKv = 1;
tllmRunnerParams.mMaxSeqLenCacheKv = 1;
tllmRunnerParams.mSumOfSeqLensQ = 1;
tllmRunnerParams.mSumOfSeqLensKv = 1;
tllmRunnerParams.mMaxNumPagesPerSeqKv = 1;
// Assume same headDim for Qk and V here.
tllmRunnerParams.mHeadDimQk = mFixedParams.headSize;
tllmRunnerParams.mHeadDimV = mFixedParams.headSizeV;
tllmRunnerParams.mNumTokensPerPage = mFixedParams.numTokensPerBlock;
tllmRunnerParams.mNumTokensPerPage = (qkvLayout == QkvLayout::PagedKv) ? mFixedParams.numTokensPerBlock : 0;
tllmRunnerParams.mNumHeadsQPerKv = mFixedParams.numQHeads / mFixedParams.numKvHeads;
tllmRunnerParams.mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
// Set the chunked attention size and sliding window size to INT_MAX to disable them when checking if
// the kernel is supported.
tllmRunnerParams.mChunkedAttentionSize = INT_MAX;
@@ -221,10 +232,11 @@ void FmhaDispatcher::run(MHARunnerParams runnerParams)
tllmRunnerParams.mSumOfSeqLensQ = runnerParams.totalQSeqLen;
tllmRunnerParams.mSumOfSeqLensKv = runnerParams.totalKvSeqLen;
tllmRunnerParams.mMaxNumPagesPerSeqKv = maxBlocksPerSeq;
tllmRunnerParams.mNumTokensPerPage = numTokensPerBlock;
tllmRunnerParams.mNumTokensPerPage = (qkvLayout == QkvLayout::PagedKv) ? numTokensPerBlock : 0;
tllmRunnerParams.mScaleQ = mFixedParams.qScaling;
// Set it to INT_MAX as the kv cache pageOffsets will ensure that there is no out-of-bounds access.
tllmRunnerParams.mNumPagesInMemPool = INT_MAX;
tllmRunnerParams.mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
tllmRunnerParams.mSfStartTokenIdx = 0;
// For mla chunked prefill
tllmRunnerParams.softmaxStatsPtr = reinterpret_cast<float2*>(runnerParams.softmaxStatsPtr);
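Both hunks change mNumTokensPerPage from an unconditional assignment to one gated on the paged-KV layout. A simplified stand-in showing the intended behavior follows; the enum and struct are stand-ins mirroring the diff rather than the real runner API, and the assumption that zero marks a non-paged layout is mine:

```cpp
// Illustrative sketch of the layout-dependent page-size assignment.
#include <cassert>

enum class QkvLayout { ContiguousKv, PagedKv };

struct RunnerParams { int mNumTokensPerPage = 0; };

void setPageSize(RunnerParams& p, QkvLayout layout, int numTokensPerBlock)
{
    // The page size is only meaningful for a paged KV cache; presumably a
    // zero value keeps non-paged layouts from carrying a stale page size.
    p.mNumTokensPerPage = (layout == QkvLayout::PagedKv) ? numTokensPerBlock : 0;
}

int main()
{
    RunnerParams p;
    setPageSize(p, QkvLayout::ContiguousKv, 64);
    assert(p.mNumTokensPerPage == 0);
    setPageSize(p, QkvLayout::PagedKv, 64);
    assert(p.mNumTokensPerPage == 64);
}
```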
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/kernels/indexerTopK.cu
@@ -21,6 +21,7 @@
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/kernels/heuristicTopKDecode.h"
#include "tensorrt_llm/kernels/noAuxTcKernels.h"
#include <cfloat>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

67 changes: 67 additions & 0 deletions cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt
@@ -15,15 +15,82 @@
# the License.
#

set(cutlass_source_dir ${CMAKE_BINARY_DIR}/_deps/cutlass-src)
# Point to the targets include directory, which contains both cuda.h and cccl/.
# Prefer CUDAToolkit from parent; otherwise derive from CMAKE_CUDA_COMPILER.
if(DEFINED CUDAToolkit_INCLUDE_DIRS AND CUDAToolkit_INCLUDE_DIRS)
list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_TARGETS_INCLUDE_DIR)
else()
get_filename_component(CUDA_BIN_PATH ${CMAKE_CUDA_COMPILER} DIRECTORY)
get_filename_component(CUDA_TOOLKIT_ROOT ${CUDA_BIN_PATH} DIRECTORY)
set(cudaTargetsArch ${CMAKE_SYSTEM_PROCESSOR})
if(cudaTargetsArch STREQUAL "aarch64" OR cudaTargetsArch STREQUAL "arm64")
set(cudaTargetsArch sbsa)
endif()
set(CUDA_TARGETS_INCLUDE_DIR
"${CUDA_TOOLKIT_ROOT}/targets/${cudaTargetsArch}-linux/include")
endif()
file(CREATE_LINK ${CUDA_TARGETS_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cuda
SYMBOLIC)
file(CREATE_LINK ${cutlass_source_dir} ${CMAKE_CURRENT_BINARY_DIR}/cutlass
SYMBOLIC)
# Create parent directory for symbolic link
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/trtllm)
file(CREATE_LINK ${CMAKE_CURRENT_SOURCE_DIR}/trtllmGen_fmha_export/trtllm/dev
${CMAKE_CURRENT_BINARY_DIR}/trtllm/dev SYMBOLIC)
file(CREATE_LINK ${CMAKE_CURRENT_SOURCE_DIR}/cuda_ptx
${CMAKE_CURRENT_BINARY_DIR}/cuda_ptx SYMBOLIC)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/trtllmGen_fmha_export/KernelParams.h
${CMAKE_CURRENT_BINARY_DIR}/KernelParams.h COPYONLY)
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/trtllmGen_fmha_export/KernelParamsDecl.h
${CMAKE_CURRENT_BINARY_DIR}/KernelParamsDecl.h COPYONLY)

file(GLOB_RECURSE SRC_CPP *.cpp)
file(GLOB_RECURSE SRC_CU *.cu)

filter_source_cuda_architectures(
SOURCE_LIST SRC_CPP
ARCHS 100 103 100f
TARGET trtllm_gen_fmha_interface)
target_include_directories(
trtllm_gen_fmha_interface
INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/trtllmGen_fmha_export)

add_library(trtllm_gen_fmha OBJECT ${SRC_CPP} ${SRC_CU})
set_property(TARGET trtllm_gen_fmha PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET trtllm_gen_fmha PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(trtllm_gen_fmha PUBLIC trtllm_gen_fmha_interface)
target_include_directories(
trtllm_gen_fmha PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/trtllmGen_fmha_export)
set(TRTLLM_FMHA_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}")
target_compile_definitions(
trtllm_gen_fmha
PRIVATE TRTLLM_FMHA_BUILD_DIR="${TRTLLM_FMHA_BUILD_DIR}" TLLM_PUBLIC_RELEASE
TLLM_GEN_EXPORT_INTERFACE TLLM_FMHA_TRTLLM_COMPAT)
target_compile_definitions(
trtllm_gen_fmha_interface INTERFACE TLLM_GEN_EXPORT_INTERFACE
TLLM_FMHA_TRTLLM_COMPAT)

# Link the TrtLlmGen FMHA static library
set(TRTLLM_GEN_FMHA_LIB
${CMAKE_CURRENT_SOURCE_DIR}/lib/${TARGET_ARCH}/libTrtLlmGenFmhaLib.a)
if(NOT EXISTS ${TRTLLM_GEN_FMHA_LIB})
message(
FATAL_ERROR
"TrtLlmGen FMHA library not found: ${TRTLLM_GEN_FMHA_LIB}. "
"Please ensure the pre-built archive exists under lib/${TARGET_ARCH}/.")
endif()
target_link_libraries(trtllm_gen_fmha PUBLIC ${TRTLLM_GEN_FMHA_LIB})

# Link the TrtLlmGen core library (contains GenLog::getInstance()
# implementation)
set(TRTLLM_GEN_CORE_LIB
${CMAKE_CURRENT_SOURCE_DIR}/lib/${TARGET_ARCH}/libTrtLlmGen.a)
if(NOT EXISTS ${TRTLLM_GEN_CORE_LIB})
message(
FATAL_ERROR
"TrtLlmGen core library not found: ${TRTLLM_GEN_CORE_LIB}. "
"Please ensure the pre-built archive exists under lib/${TARGET_ARCH}/.")
endif()
target_link_libraries(trtllm_gen_fmha PUBLIC ${TRTLLM_GEN_CORE_LIB})
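The fallback branch above derives the CUDA targets include directory from the compiler location when CUDAToolkit_INCLUDE_DIRS is unavailable. A hypothetical C++ rendering of that path arithmetic, mirroring the CMake logic (the paths in main are examples only):

```cpp
// Sketch of the CMake fallback: nvcc lives in <toolkit>/bin, so strip two
// path components to get the toolkit root, then map the arch to its
// targets/<arch>-linux/include directory (arm64 headers ship under "sbsa").
#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

fs::path targetsIncludeDir(fs::path const& cudaCompiler, std::string arch)
{
    fs::path const toolkitRoot = cudaCompiler.parent_path().parent_path();
    if (arch == "aarch64" || arch == "arm64")
    {
        arch = "sbsa";
    }
    return toolkitRoot / "targets" / (arch + "-linux") / "include";
}

int main()
{
    std::cout << targetsIncludeDir("/usr/local/cuda/bin/nvcc", "x86_64") << "\n";
    // -> "/usr/local/cuda/targets/x86_64-linux/include"
    std::cout << targetsIncludeDir("/usr/local/cuda/bin/nvcc", "aarch64") << "\n";
    // -> "/usr/local/cuda/targets/sbsa-linux/include"
}
```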
32 changed Git LFS files not shown.