diff --git a/cpp/kernels/xqa/defines.h b/cpp/kernels/xqa/defines.h index efc5c4ec6522..b369b4304571 100644 --- a/cpp/kernels/xqa/defines.h +++ b/cpp/kernels/xqa/defines.h @@ -129,6 +129,18 @@ static_assert(SPEC_DEC, "SPEC_Q_SEQ_LEN should only be used when SPEC_DEC is ena #define SLIDING_WINDOW 0 #endif +#ifndef SKIP_SOFTMAX_ATTN +#define SKIP_SOFTMAX_ATTN 0 +#endif + +#ifndef SKIP_SOFTMAX_ATTN_BLOCK_STATS +#define SKIP_SOFTMAX_ATTN_BLOCK_STATS 0 +#endif + +#ifndef SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE +#define SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE 1 +#endif + // 0 - no PDL // 1 - naive PDL // 2 - aggressive PDL (implemented only in mha_sm90.cu for now) diff --git a/cpp/kernels/xqa/gmma.cuh b/cpp/kernels/xqa/gmma.cuh index f5f29c73e778..7f5a843865c1 100644 --- a/cpp/kernels/xqa/gmma.cuh +++ b/cpp/kernels/xqa/gmma.cuh @@ -106,6 +106,7 @@ __device__ inline MatDesc makeMatDesc(void const* data, uint32_t dimKByteOffset, asm volatile("trap;\n"); return 0; }(); + assert(__cvta_generic_to_shared(data) % baseAlign == 0); uint32_t const baseOffset = ((patternAddr % baseAlign == 0) ? 0U : ((patternAddr >> 0x7) & 0x7)); return MatDesc{ /*addr=*/MatDesc::encode(__cvta_generic_to_shared(data)), diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu index 330364ee88f1..5881a93fc4fe 100644 --- a/cpp/kernels/xqa/mha.cu +++ b/cpp/kernels/xqa/mha.cu @@ -2734,6 +2734,25 @@ static constexpr auto kernel_mha = kernel_mha_impl; #endif #ifndef GENERATE_CUBIN +uint32_t computeNbSubSeqPerSeqMHA(cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen) +{ + if (!allowMultiBlockMode) + { + return 1; + } + auto const env = std::getenv("XQA_NB_SUB_SEQ"); + if (env != nullptr) + { + int32_t const val = std::stoi(env); + if (val > 0) + { + return val; + } + } + return std::min( + std::max(1U, prop.multiProcessorCount / (batchSize * nbKHeads)), divUp(maxSeqLen, ctaTile.x)); +} + void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SLIDING_WINDOW uint32_t slidingWinSize, @@ -2771,6 +2790,13 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, // int8/fp8 KV cache. #if SPEC_DEC SpecDecParams const& specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + float const skipSoftmaxThresholdScaleFactor, // for compatibility with mha_sm90.cu only +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + uint32_t* __restrict__ skippedBlockCount, // for compatibility with mha_sm90.cu only + uint32_t* __restrict__ totalBlockCount, // for compatibility with mha_sm90.cu only +#endif #endif uint32_t* semaphores, void* scratch, cudaStream_t stream) { @@ -2793,24 +2819,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, uint32_t const nbQHeads = nbKHeads * headGrpSize; // const uint32_t nbSubSeqPerSeq = allowMultiBlockMode ? 
DBG_NB_CTAS_PER_SEQ : 1; - uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t - { - if (!allowMultiBlockMode) - { - return 1; - } - auto const env = std::getenv("XQA_NB_SUB_SEQ"); - if (env != nullptr) - { - int32_t const val = std::stoi(env); - if (val > 0) - { - return val; - } - } - return std::min( - std::max(1U, prop.multiProcessorCount / (batchSize * nbKHeads)), divUp(maxSeqLen, ctaTile.x)); - }(); + uint32_t const nbSubSeqPerSeq = computeNbSubSeqPerSeqMHA(prop, batchSize, nbKHeads, maxSeqLen); // gridDim.z == batchSize && gridDim.y == nbKHeads && gridDim.x == nbSubSeqPerSeq #if SPEC_DEC const uint32_t nbTokenBlocksPerGrp = divUp(qSeqLen * headGrpSize, rowsPerBlock); diff --git a/cpp/kernels/xqa/mha.h b/cpp/kernels/xqa/mha.h index a40a5e6c0d06..2c7ef50a8353 100644 --- a/cpp/kernels/xqa/mha.h +++ b/cpp/kernels/xqa/mha.h @@ -90,6 +90,9 @@ struct BeamSearchParams // match trt-llm API. }; +uint32_t computeNbSubSeqPerSeqMHA( + cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen); + void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads, #if SLIDING_WINDOW uint32_t slidingWinSize, @@ -127,9 +130,18 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads, // int8/fp8 KV cache. #if SPEC_DEC SpecDecParams const& specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + float const skipSoftmaxThresholdScaleFactor, +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount, +#endif #endif uint32_t* semaphores, void* scratch, cudaStream_t stream); +uint32_t computeNbSubSeqPerSeqHopperF8MHA( + cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen); + void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SLIDING_WINDOW uint32_t slidingWinSize, @@ -167,6 +179,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, // int8/fp8 KV cache. #if SPEC_DEC SpecDecParams const& specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + float const skipSoftmaxThresholdScaleFactor, +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount, +#endif #endif uint32_t* semaphores, void* scratch, cudaStream_t stream); diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index 457b10689124..dc21872ac696 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -49,6 +49,10 @@ static_assert(specDecQLen * headGrpSize <= 32, "SPEC_Q_SEQ_LEN macro value is to #define SWAP_AB (!SPEC_DEC) #endif +#if SKIP_SOFTMAX_ATTN +static_assert(SWAP_AB && USE_PAGED_KV_CACHE && !SPEC_DEC && BEAM_WIDTH == 1, "SKIP_SOFTMAX_ATTN is not supported."); +#endif + #define IS_SUPPORTED_F16_CASE (CACHE_ELEM_ENUM == 0 && !SPEC_DEC && SWAP_AB && !USE_INPUT_KV && !LOW_PREC_OUTPUT) inline constexpr bool swapAB = SWAP_AB; @@ -138,26 +142,38 @@ using PaddedOutHead = PaddedInputHead; struct alignas(128) SharedMem { + using QBuffer = Vec, nbQParts>; using KBuffer = Array2D; - static constexpr uint32_t nbKBuf = 2; - KBuffer k[nbKBuf]; // as is loaded from global mem. using XBuffer = Vec, nbXParts>; - static constexpr uint32_t nbXBuf - = 2 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 
1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens)); using VBuffer = Vec, cacheHeadNbParts>; #if !SWAP_AB using VTBuffer = Array2D; #endif - static constexpr uint32_t nbVBuf = 2; #if CACHE_ELEM_ENUM == 0 using OutSwizzleBuf = Array2D; #elif CACHE_ELEM_ENUM == 2 using OutSwizzleBuf = Array2D, 4>, ctaNbQHeads, exactDiv(headElems, 4 * 4)>; #endif + +#if SKIP_SOFTMAX_ATTN + static constexpr uint32_t nbKBuf = 2; + static constexpr uint32_t nbVBuf = 3; // @fixme: skip_softmax_attn: for skip softmax attn, an extra VBuffer is used + static constexpr uint32_t nbXBuf + = 3 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens)); +#else + static constexpr uint32_t nbKBuf = 2; + static constexpr uint32_t nbVBuf = 2; + static constexpr uint32_t nbXBuf + = 2 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens)); +#endif static_assert(nbXBuf == nbVBuf); + // note: buffers used for GMMA may have additional alignment requirements + KBuffer k[nbKBuf]; // as is loaded from global mem. + QBuffer q; // For gmma math. Conversion done if needed. + union ReusedXVOutSwizzleBuf { struct XV @@ -196,9 +212,6 @@ struct alignas(128) SharedMem return reusedXVOutSwizzleBuf[i].outSwizzle; } - using QBuffer = Vec, nbQParts>; - QBuffer q; // For gmma math. Conversion done if needed. - // @fixme: move these into reusedXVOutSwizzleBuf #if SWAP_AB ShmQWiseVec xColMax[nbXBuf]; @@ -220,6 +233,11 @@ struct alignas(128) SharedMem Vec pages[2]; // one for K and one for V #endif +#if SKIP_SOFTMAX_ATTN + uint32_t skipSoftmaxVotesGemm0ToV[nbXBuf]; // guarded by skipSoftmaxXBar + uint32_t skipSoftmaxVotesGemm0ToGemm1[nbXBuf]; // guarded by xBar +#endif + // mem barriers CtaBarrierPair qBar; @@ -229,6 +247,9 @@ struct alignas(128) SharedMem CtaBarrierPair vtBar[nbVBuf]; #endif CtaBarrierPair xBar[nbXBuf]; +#if SKIP_SOFTMAX_ATTN + CtaBarrierPair skipSoftmaxXBar[nbXBuf]; // for V to wait for X to be ready +#endif // used internally in the gemm0 warp group // @fixme: use separate arrive and wait for all usage @@ -425,8 +446,13 @@ __device__ void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, #endif #if SWAP_AB +#if SKIP_SOFTMAX_ATTN +__device__ RegColWiseVec computeWarpGrpColMax_sync(CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src, + float skipSoftmaxThreshold, uint32_t* smemSkipVote, bool maybeSkip); +#else __device__ RegColWiseVec computeWarpGrpColMax_sync( CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src); +#endif __device__ void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32_t validRowBeg, uint32_t validRowEnd); __device__ void warpGrpOnlineSoftmax(Gemm0Acc& acc, RegColWiseVec const& colMax); __device__ RegColWiseVec computeWarpColSum(Gemm0Acc& src); @@ -675,6 +701,12 @@ CUBIN_EXPORT __global__ #endif #if SPEC_DEC SpecDecParams const specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + float const skipSoftmaxThresholdScaleFactor, +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount, +#endif #endif uint32_t* __restrict__ const semaphores = nullptr, // [nbReq][nbKHeads][divUp(specDecParams.qSeqLen, inputTokensPerCta)] @@ -753,6 +785,10 @@ CUBIN_EXPORT __global__ uint32_t const nbSubSeq = isMultiBlockMode ? 
mha::min(nbTilesInUse / multiBlockMinNbTilesPerCta, maxNbSubSeq) : 1; static_assert(multiBlockMinNbTiles >= multiBlockMinNbTilesPerCta * 2); assert(isMultiBlockMode == (nbSubSeq > 1)); +#if SKIP_SOFTMAX_ATTN + bool const disableSkipForShortSeq = (cacheSeqLen < skipSoftmaxThresholdScaleFactor); + float const skipSoftmaxThreshold = disableSkipForShortSeq ? 0.0f : skipSoftmaxThresholdScaleFactor / cacheSeqLen; +#endif if (idxSubSeq >= nbSubSeq) { return; @@ -776,21 +812,34 @@ CUBIN_EXPORT __global__ assert(dynamicSmemSize() >= sizeof(SharedMem)); SharedMem& smem = *reinterpret_cast(&smemByteBuf[0]); - constexpr uint32_t nbBuffers = 2; - static_assert(nbBuffers == SharedMem::nbKBuf && nbBuffers == SharedMem::nbVBuf && nbBuffers == SharedMem::nbXBuf); - if (wid < nbBuffers) + constexpr uint32_t maxNbBuffers = (SharedMem::nbXBuf > SharedMem::nbVBuf) ? SharedMem::nbXBuf : SharedMem::nbVBuf; + static_assert( + maxNbBuffers >= SharedMem::nbKBuf && maxNbBuffers >= SharedMem::nbVBuf && maxNbBuffers >= SharedMem::nbXBuf); + if (wid < maxNbBuffers) { if (warpElectSync()) { - smem.kBar[wid].initialize(gemm0NbThrds, gemm0NbThrds + warp_size); - smem.vBar[wid].initialize(gemm1NbThrds, gemm1NbThrds + warp_size); + if (wid < SharedMem::nbKBuf) + { + smem.kBar[wid].initialize(gemm0NbThrds, gemm0NbThrds + warp_size); + } + if (wid < SharedMem::nbXBuf) + { +#if SKIP_SOFTMAX_ATTN + smem.skipSoftmaxXBar[wid].initialize(gemm0NbThrds + warp_size, gemm0NbThrds + warp_size); + smem.vBar[wid].initialize(gemm1NbThrds + warp_size, gemm1NbThrds + warp_size); +#else + smem.vBar[wid].initialize(gemm1NbThrds, gemm1NbThrds + warp_size); +#endif + #if !SWAP_AB - smem.vtBar[wid].initialize(gemm1NbThrds * 2, gemm1NbThrds * 2); + smem.vtBar[wid].initialize(gemm1NbThrds * 2, gemm1NbThrds * 2); #endif - smem.xBar[wid].initialize(gemm0NbThrds + gemm1NbThrds, gemm0NbThrds + gemm1NbThrds); + smem.xBar[wid].initialize(gemm0NbThrds + gemm1NbThrds, gemm0NbThrds + gemm1NbThrds); + } } } - else if (wid == nbBuffers) + else if (wid == maxNbBuffers) { if (warpElectSync()) { @@ -819,6 +868,10 @@ CUBIN_EXPORT __global__ SpecDec const specDec{specDecParams, idxReq, idxInputSubSeq, cacheSeqLen}; #endif +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + uint32_t localSkippedBlockCount = 0; +#endif + // QK gemm constexpr uint32_t nbGmmaInstM = exactDiv(gemm0CtaTileNbTokens, gmma::instM); using Acc = GmmaAcc; @@ -940,10 +993,39 @@ CUBIN_EXPORT __global__ } } #endif + + uint32_t const idxXBuf = idxIter % SharedMem::nbXBuf; + auto& xBar = smem.xBar[idxXBuf]; // update colMax in shared mem and get a register copy #if SWAP_AB +#if SKIP_SOFTMAX_ATTN + auto& skipSoftmaxXBar = smem.skipSoftmaxXBar[idxXBuf]; + skipSoftmaxXBar.consumed.arrive_and_wait(); + + bool const maybeSkip = !disableSkipForShortSeq && idxIter != 0; + RegColWiseVec const colMax = computeWarpGrpColMax_sync(smem.gemm0WarpGrpBar, smem.gemm0CurrentSeqMax, acc, + skipSoftmaxThreshold, &smem.skipSoftmaxVotesGemm0ToV[idxXBuf], maybeSkip); + bool const shouldSkipSoftmaxAttn = static_cast(smem.skipSoftmaxVotesGemm0ToV[idxXBuf]); + unused(skipSoftmaxXBar.produced.arrive()); + warpGrpOnlineSoftmax(acc, colMax); + if (shouldSkipSoftmaxAttn) + { + xBar.consumed.arrive_and_wait(); + if (threadIdx.x == 0) + { + smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 1U; +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + localSkippedBlockCount++; +#endif + } + asm volatile("fence.proxy.async.shared::cta;\n"); // maybe not used + unused(xBar.produced.arrive()); + continue; + } +#else RegColWiseVec const colMax = 
computeWarpGrpColMax_sync(smem.gemm0WarpGrpBar, smem.gemm0CurrentSeqMax, acc); warpGrpOnlineSoftmax(acc, colMax); +#endif #else RegRowWiseVec const rowMax = computeWarpGrpRowMax_sync(warpRank, smem.gemm0CurrentSeqMax, acc); warpGrpOnlineSoftmax(acc, rowMax); @@ -959,8 +1041,6 @@ CUBIN_EXPORT __global__ // map 1 to fp8_max before conversion to fp8 acc = acc * kE4M3_MAX; - uint32_t const idxXBuf = idxIter % SharedMem::nbXBuf; - auto& xBar = smem.xBar[idxXBuf]; // @fixme: for fp16/bf16, try not to transpose acc here, and leave it to the next GEMM. #if SWAP_AB storeGemm0AccToShm(warpRank, laneId(), smem.xBuf(idxXBuf), xBar.consumed, acc); @@ -989,13 +1069,25 @@ CUBIN_EXPORT __global__ storeShmRowWiseVec(warpRank, smem.xRowMax[idxXBuf], rowMax); storeShmRowWiseVec(warpRank, smem.xRowSum[idxXBuf], rowSum); #endif - +#if SKIP_SOFTMAX_ATTN + if (threadIdx.x == 0) + { + smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 0; + } +#endif __syncwarp(); // the release semantics of arrive does not work for async consumers like gmma. additional fence is // needed. asm volatile("fence.proxy.async.shared::cta;\n"); unused(xBar.produced.arrive()); } +#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS + if (threadIdx.x == 0 && skippedBlockCount != nullptr && totalBlockCount != nullptr) + { + atomicAdd(skippedBlockCount, localSkippedBlockCount); + atomicAdd(totalBlockCount, nbIters); + } +#endif unused(smem.qBar.consumed.arrive()); } else if (warpIdx.z == 1) @@ -1043,216 +1135,231 @@ CUBIN_EXPORT __global__ uint32_t idxVTile = idxVTileInit + idxIter * nbSubSeq; auto const idxVBuf = idxIter % SharedMem::nbVBuf; auto const idxXBuf = idxVBuf; + auto& xBar = smem.xBar[idxXBuf]; auto& vBar = smem.vBar[idxVBuf]; - arrive_tx_and_wait(vBar.produced, exactDiv(sizeof(SharedMem::VBuffer), gemm1NbThrds)); auto const& vBuf = smem.vBuf(idxVBuf); + xBar.produced.arrive_and_wait(); +#if SKIP_SOFTMAX_ATTN + bool shouldSkipSoftmaxAttn = smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf]; // guarded by xBar + if (shouldSkipSoftmaxAttn) + { + vBar.produced.arrive_and_wait(); + } +#endif + +#if SKIP_SOFTMAX_ATTN + if (!shouldSkipSoftmaxAttn) // skip XVGemm +#endif + { + arrive_tx_and_wait(vBar.produced, exactDiv(sizeof(SharedMem::VBuffer), gemm1NbThrds)); #if !SWAP_AB - CtaBarrierPair& vtBar = smem.vtBar[idxVBuf]; - auto& vtBuf = smem.vtBuf(idxVBuf); - vtBar.consumed.arrive_and_wait(); - transposeVTile(warpRank, laneId(), vtBuf, vBuf); - vBar.consumed.arrive(); - vtBar.produced.arrive(); + CtaBarrierPair& vtBar = smem.vtBar[idxVBuf]; + auto& vtBuf = smem.vtBuf(idxVBuf); + vtBar.consumed.arrive_and_wait(); + transposeVTile(warpRank, laneId(), vtBuf, vBuf); + vBar.consumed.arrive(); + vtBar.produced.arrive(); #endif - auto& xBar = smem.xBar[idxXBuf]; - xBar.produced.arrive_and_wait(); #if !defined(NDEBUG) && DBG_PRINT #if SWAP_AB - if (threadIdx.x == 0) - { - printf("colMax:\n"); - for (int i = 0; i < ctaNbQHeads; i++) - { - printf("%f, ", smem.xColMax[idxXBuf][i]); - } - printf("\n"); - printf("colSum:\n"); - for (int n = 0; n < 4; n++) + if (threadIdx.x == 0) { + printf("colMax:\n"); for (int i = 0; i < ctaNbQHeads; i++) { - printf("%f, ", smem.xColSum[idxXBuf][n][i]); + printf("%f, ", smem.xColMax[idxXBuf][i]); } printf("\n"); - } - printf("\n"); - printf("X:\n"); - for (int i = 0; i < ctaNbQHeads; i++) - { - for (int j = 0; j < gemm0CtaTileNbTokens; j++) + printf("colSum:\n"); + for (int n = 0; n < 4; n++) { - auto const& elemsPerXPart = (cacheElemsPerGrain * grainsPerXPart); - auto const e = reinterpret_cast&>( - smem.xBuf(idxXBuf)[j / 
elemsPerXPart].template at( - i, j % elemsPerXPart / cacheElemsPerGrain))[j % cacheElemsPerGrain]; - printf("%.2f, ", float(e)); - if (j % 16 == 15) + for (int i = 0; i < ctaNbQHeads; i++) { - printf("| "); + printf("%f, ", smem.xColSum[idxXBuf][n][i]); } + printf("\n"); + } + printf("\n"); + printf("X:\n"); + for (int i = 0; i < ctaNbQHeads; i++) + { + for (int j = 0; j < gemm0CtaTileNbTokens; j++) + { + auto const& elemsPerXPart = (cacheElemsPerGrain * grainsPerXPart); + auto const e = reinterpret_cast&>( + smem.xBuf(idxXBuf)[j / elemsPerXPart].template at( + i, j % elemsPerXPart / cacheElemsPerGrain))[j % cacheElemsPerGrain]; + printf("%.2f, ", float(e)); + if (j % 16 == 15) + { + printf("| "); + } + } + printf("\n\n"); } - printf("\n\n"); } - } - smem.gemm1WarpGrpBar.arrive_and_wait(); + smem.gemm1WarpGrpBar.arrive_and_wait(); #else - if (blockIdx.y == 1 && threadIdx.x == 0) - { - printf("rowMax:\n"); - for (int i = 0; i < ctaNbQHeads; i++) + if (blockIdx.y == 1 && threadIdx.x == 0) { - printf("%f, ", smem.xRowMax[idxXBuf][i]); - } - printf("\n"); - printf("rowSum:\n"); - for (int i = 0; i < ctaNbQHeads; i++) - { - printf("%f, ", smem.xRowSum[idxXBuf][i]); + printf("rowMax:\n"); + for (int i = 0; i < ctaNbQHeads; i++) + { + printf("%f, ", smem.xRowMax[idxXBuf][i]); + } + printf("\n"); + printf("rowSum:\n"); + for (int i = 0; i < ctaNbQHeads; i++) + { + printf("%f, ", smem.xRowSum[idxXBuf][i]); + } + printf("\n"); } - printf("\n"); - } - smem.gemm1WarpGrpBar.arrive_and_wait(); + smem.gemm1WarpGrpBar.arrive_and_wait(); #endif #endif #if SWAP_AB - // @fixme: if first tile, no need to rescale acc. For persistent CTA, just re-initialize acc instead. - rescaleGemm1AccForNewColMax_sync(warpRank, smem.xColMax[idxXBuf], smem.xColSum[idxXBuf], - smem.gemm1AccColMax, acc, smem.gemm1AccColSum, smem.gemm1WarpGrpBar); + // @fixme: if first tile, no need to rescale acc. For persistent CTA, just re-initialize acc instead. + rescaleGemm1AccForNewColMax_sync(warpRank, smem.xColMax[idxXBuf], smem.xColSum[idxXBuf], + smem.gemm1AccColMax, acc, smem.gemm1AccColSum, smem.gemm1WarpGrpBar); #else - rescaleGemm1AccForNewRowMax_sync( - warpRank, smem.xRowMax[idxXBuf], smem.xRowSum[idxXBuf], smem.gemm1AccColMax, acc, smem.gemm1AccColSum); + rescaleGemm1AccForNewRowMax_sync(warpRank, smem.xRowMax[idxXBuf], smem.xRowSum[idxXBuf], + smem.gemm1AccColMax, acc, smem.gemm1AccColSum); #endif - auto& xBuf = smem.xBuf(idxXBuf); + auto& xBuf = smem.xBuf(idxXBuf); - auto const descXBase = gmma::makeMatDesc(nullptr, 0, SharedMem::XBuffer::Elem::rowBytes * 8, - gmma::getSwizzleMode(SharedMem::XBuffer::Elem{})) - .raw(); + auto const descXBase = gmma::makeMatDesc(nullptr, 0, SharedMem::XBuffer::Elem::rowBytes * 8, + gmma::getSwizzleMode(SharedMem::XBuffer::Elem{})) + .raw(); #if CACHE_ELEM_ENUM == 0 - auto const descVBase = gmma::makeMatDesc(nullptr, 0, SharedMem::VBuffer::Elem::rowBytes * 8, - gmma::getSwizzleMode(SharedMem::VBuffer::Elem{})) - .raw(); + auto const descVBase = gmma::makeMatDesc(nullptr, 0, SharedMem::VBuffer::Elem::rowBytes * 8, + gmma::getSwizzleMode(SharedMem::VBuffer::Elem{})) + .raw(); #endif #if SWAP_AB //@fixme: to reduce code size, we can disable unroll and use double-buffer for LDSM in loadVTileTransposed. 
#pragma unroll - for (uint32_t idxInstK = 0; idxInstK < gemm1NbGmmaInstK; idxInstK++) - { + for (uint32_t idxInstK = 0; idxInstK < gemm1NbGmmaInstK; idxInstK++) + { #if CACHE_ELEM_ENUM == 2 - Vec const fragA - = loadVTileTransposed(warpRank, laneId(), vBuf, idxInstK); + Vec const fragA + = loadVTileTransposed(warpRank, laneId(), vBuf, idxInstK); #if !defined(NDEBUG) && DBG_PRINT - if (threadIdx.x == 0) - { - printf("fragA:\nidxInstK == %u\n", idxInstK); - } - smem.gemm1WarpGrpBar.arrive_and_wait(); - for (int m = 0; m < 2; m++) - { - for (int w = 0; w < 4; w++) + if (threadIdx.x == 0) { - if (warpRank == w) + printf("fragA:\nidxInstK == %u\n", idxInstK); + } + smem.gemm1WarpGrpBar.arrive_and_wait(); + for (int m = 0; m < 2; m++) + { + for (int w = 0; w < 4; w++) { - if (laneId() == 0) - { - printf(" warpRank = %u\n", warpRank); - } - __syncwarp(); - for (int a = 0; a < 2; a++) + if (warpRank == w) { - for (int b = 0; b < 8; b++) + if (laneId() == 0) { - for (int c = 0; c < 2; c++) + printf(" warpRank = %u\n", warpRank); + } + __syncwarp(); + for (int a = 0; a < 2; a++) + { + for (int b = 0; b < 8; b++) { - for (int d = 0; d < 4; d++) + for (int c = 0; c < 2; c++) { - if (laneId() == b * 4 + d) + for (int d = 0; d < 4; d++) { - for (int e = 0; e < 4; e++) + if (laneId() == b * 4 + d) { - auto const& elem4 = reinterpret_cast<__nv_fp8_e4m3 const(&)[4]>( - fragA[m](0, c)(a, 0)); - printf("%.2f, ", float(elem4[e])); + for (int e = 0; e < 4; e++) + { + auto const& elem4 = reinterpret_cast<__nv_fp8_e4m3 const(&)[4]>( + fragA[m](0, c)(a, 0)); + printf("%.2f, ", float(elem4[e])); + } } + __syncwarp(); } - __syncwarp(); } + if (laneId() == 0) + { + printf("\n"); + } + __syncwarp(); } - if (laneId() == 0) + if (laneId() == 0 && a == 0) { - printf("\n"); + printf("----------------------\n"); } __syncwarp(); } - if (laneId() == 0 && a == 0) - { - printf("----------------------\n"); - } - __syncwarp(); } + smem.gemm1WarpGrpBar.arrive_and_wait(); } - smem.gemm1WarpGrpBar.arrive_and_wait(); } - } #endif #endif - BoundedVal const kOffsetInGrains{grainsPerInstK * idxInstK}; - auto const descX = addAddr(descXBase, - &xBuf[kOffsetInGrains.template divBy().get()]( - 0, kOffsetInGrains.template mod().get())); + BoundedVal const kOffsetInGrains{grainsPerInstK * idxInstK}; + auto const descX = addAddr(descXBase, + &xBuf[kOffsetInGrains.template divBy().get()]( + 0, kOffsetInGrains.template mod().get())); #if CACHE_ELEM_ENUM == 2 - gmma::fence(); + gmma::fence(); #endif #pragma unroll - for (uint32_t idxInstM = 0; idxInstM < gemm1NbGmmaInstM; idxInstM++) - { + for (uint32_t idxInstM = 0; idxInstM < gemm1NbGmmaInstM; idxInstM++) + { #if CACHE_ELEM_ENUM == 0 - auto const descV - = addAddr(descVBase, &vBuf[idxInstM](kOffsetInGrains.get() * cacheElemsPerGrain, 0)); - gmma::mma_async_shmA( - reinterpret_cast(acc(idxInstM, 0)), - descV, descX, true); + auto const descV + = addAddr(descVBase, &vBuf[idxInstM](kOffsetInGrains.get() * cacheElemsPerGrain, 0)); + gmma::mma_async_shmA( + reinterpret_cast(acc(idxInstM, 0)), + descV, descX, true); #elif CACHE_ELEM_ENUM == 2 - gmma::mma_async_regA( - reinterpret_cast(acc(idxInstM, 0)), - reinterpret_cast(fragA[idxInstM]), descX, true); + gmma::mma_async_regA( + reinterpret_cast(acc(idxInstM, 0)), + reinterpret_cast(fragA[idxInstM]), descX, true); #endif + } + gmma::commit_group(); + //@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of + // gmma. 
+ gmma::wait_group<0>(); } - gmma::commit_group(); - //@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of - // gmma. - gmma::wait_group<0>(); - } #else - auto const descVTBase = gmma::makeMatDesc( - nullptr, 0, SharedMem::VTBuffer::rowBytes * 8, gmma::getSwizzleMode(SharedMem::VTBuffer{})) - .raw(); - vtBar.produced.arrive_and_wait(); + auto const descVTBase = gmma::makeMatDesc( + nullptr, 0, SharedMem::VTBuffer::rowBytes * 8, gmma::getSwizzleMode(SharedMem::VTBuffer{})) + .raw(); + vtBar.produced.arrive_and_wait(); // if (idxIter == 1 && threadIdx.x == 0) { // printf("vtBuf:\n"); // dbg::printArray2D<__nv_fp8_e4m3, true>(vtBuf); // } #pragma unroll - for (uint32_t m = 0; m < Gemm1Acc::rows; m++) - { -#pragma unroll - for (uint32_t k = 0; k < gemm1NbGmmaInstK; k++) + for (uint32_t m = 0; m < Gemm1Acc::rows; m++) { - BoundedVal const kOffsetInGrains{grainsPerInstK * k}; - auto const descX = addAddr(descXBase, - &xBuf[kOffsetInGrains.template divBy().get()]( - gmma::instM * m, kOffsetInGrains.template mod().get())); - auto const descVT = addAddr( - descVTBase, &vtBuf(0, kOffsetInGrains.template mod().get())); - gmma::mma_async_shmA( - reinterpret_cast(acc(m, 0)), descX, - descVT, true); +#pragma unroll + for (uint32_t k = 0; k < gemm1NbGmmaInstK; k++) + { + BoundedVal const kOffsetInGrains{grainsPerInstK * k}; + auto const descX = addAddr(descXBase, + &xBuf[kOffsetInGrains.template divBy().get()]( + gmma::instM * m, kOffsetInGrains.template mod().get())); + auto const descVT = addAddr( + descVTBase, &vtBuf(0, kOffsetInGrains.template mod().get())); + gmma::mma_async_shmA( + reinterpret_cast(acc(m, 0)), descX, + descVT, true); + } } - } - gmma::commit_group(); - //@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of gmma. - gmma::wait_group<0>(); + gmma::commit_group(); + //@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of + // gmma. + gmma::wait_group<0>(); #endif + } + if (idxIter == nbIters - 1) { // gmma::wait_group should have already synchronized threads, so this may be unnecessary. 
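A note on the criterion driving the skip branches above: gemm0 votes to skip a whole KV tile when, for every query head, the tile's local column max stays more than ln(threshold) below the running max, which bounds every softmax weight the tile could contribute by the threshold. A minimal scalar sketch of the per-element test (illustrative only; the helper name is ours, not kernel code):

```cpp
#include <cmath>

// Scalar restatement of the comparison in computeWarpGrpColMax_sync:
// a tile element votes "skip" when exp(localMax - runningMax) < threshold,
// i.e. localMax - runningMax < ln(threshold). A threshold of 0 yields
// ln(0) == -inf, so the comparison is always false and nothing is skipped.
inline bool votesSkip(float localMax, float runningMax, float threshold)
{
    return (localMax - runningMax) < std::log(threshold);
}
```

The tile is only skipped if every thread of the gemm0 warp group agrees, via the atomicAnd on the shared vote word further below.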
@@ -1471,8 +1578,24 @@ CUBIN_EXPORT __global__ tensorMap #endif }; +#if SKIP_SOFTMAX_ATTN + for (auto& b : smem.skipSoftmaxXBar) + { + unused(b.consumed.arrive()); + } +#endif for (uint32_t idxIter = 0; idxIter < nbIters; idxIter++) { + uint32_t const idxVBuf = idxIter % SharedMem::nbVBuf; + auto& vBar = smem.vBar[idxVBuf]; +#if SKIP_SOFTMAX_ATTN + uint32_t idxXBuf = idxIter % SharedMem::nbXBuf; + auto& skipSoftmaxXBar = smem.skipSoftmaxXBar[idxXBuf]; + skipSoftmaxXBar.produced.arrive_and_wait(); + bool shouldSkipSoftmaxAttn = smem.skipSoftmaxVotesGemm0ToV[idxXBuf]; + skipSoftmaxXBar.consumed.arrive(); +#endif + uint32_t const idxVTile = idxVTileInit + idxIter * nbSubSeq; vTileLoader.loadPages(idxVTile); #if USE_INPUT_KV || ENABLE_PDL == 2 @@ -1506,8 +1629,20 @@ } #endif - uint32_t const idxVBuf = idxIter % SharedMem::nbVBuf; - auto& vBar = smem.vBar[idxVBuf]; +#if SKIP_SOFTMAX_ATTN + if (shouldSkipSoftmaxAttn) + { + vBar.consumed.arrive_and_wait(); + // Compared to non-skip softmax attn, we need to increase the vBar.produced count to avoid a race + // condition where vBar.consumed is arrived again without a wait. Without skip softmax attn, XVGemm + // waits for tx_count, so its progress cannot go ahead of the vload warp. With skip softmax attn, the + // XVGemm WG may go ahead of the vload warp, as the previous vBar only has XVGemm WG threads and a + // tx_count (now = 0). It may then arrive vBar.consumed before it is arrive_and_wait-ed. + vBar.produced.arrive(); + continue; + } +#endif + vBar.consumed.arrive_and_wait(); if (warpElectSync()) { @@ -1517,6 +1652,9 @@ vTileLoader.loadData(smem.vBuf(idxVBuf)[idxPart], idxVTile, idxPart, vBar.produced); } } +#if SKIP_SOFTMAX_ATTN + vBar.produced.arrive(); +#endif __syncwarp(); } } @@ -1992,9 +2130,23 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, #endif // SPEC_DEC // smemColMax is persistent across multiple iterations +#if SKIP_SOFTMAX_ATTN +__device__ inline RegColWiseVec computeWarpGrpColMax_sync(CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, + Gemm0Acc const& src, float skipSoftmaxThreshold, uint32_t* smemSkipVote, bool maybeSkip) +#else __device__ inline RegColWiseVec computeWarpGrpColMax_sync( CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src) +#endif { +#if SKIP_SOFTMAX_ATTN + if (threadIdx.x == 0) + { + *smemSkipVote = maybeSkip ?
1U : 0U; // will sync before vote + } + float const lnThreshold + = log(skipSoftmaxThreshold); // this can be -inf, but should be safe as we only use it for comparison +#endif + auto colMax = RegColWiseVec::filled(Vec::filled(safeInitRowMax)); #pragma unroll for (uint32_t n = 0; n < src.cols; n++) { @@ -2029,6 +2181,25 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync( } uint32_t const lane = laneId(); +#if SKIP_SOFTMAX_ATTN + auto prevOrCurrentMax = RegColWiseVec(); +#if SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE + if (lane < 4) + { +#pragma unroll + for (uint32_t n = 0; n < src.cols; n++) + { +#pragma unroll + for (uint32_t j = 0; j < 2; j++) + { + prevOrCurrentMax[n][j] = smemColMax[8 * n + 2 * lane + j]; + } + } + } + warpGrpBar.arrive_and_wait(); +#endif +#endif + if (lane < 4) { #pragma unroll @@ -2037,12 +2208,27 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync( #pragma unroll for (uint32_t j = 0; j < 2; j++) { +#if SKIP_SOFTMAX_ATTN && !SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE + // prevOrCurrentMax <= actual smemColMax (after updates from all 4 warps are done), but always >= + // smemColMax(Prev), the smemColMax value *before* this tile is computed. + // When determining whether to skip, it is safe to use prevOrCurrentMax: 1) if all 4 warps' local max < + // smemColMax(Prev), then prevOrCurrentMax == smemColMax(Prev) and the result is not affected; 2) if some + // local max > smemColMax(Prev), then prevOrCurrentMax > smemColMax(Prev) and some warps may incorrectly + // vote skip, but at least one warp whose local max is larger will not skip, so the tile is not skipped. + // This saves some sync and checks, but has an issue when threshold > 1. + prevOrCurrentMax[n][j] = atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]); +#else atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]); +#endif } } } warpGrpBar.arrive_and_wait(); + uint32_t const idxInQuad = lane % 4; +#if SKIP_SOFTMAX_ATTN + bool localShouldSkip = true; +#endif #pragma unroll for (uint32_t n = 0; n < src.cols; n++) { #pragma unroll for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++) { +#if SKIP_SOFTMAX_ATTN + if (lane < 4 && 8 * n + 2 * idxInQuad + j < headGrpSize) + { + localShouldSkip &= (colMax[n][j] - prevOrCurrentMax[n][j]) < lnThreshold; + } +#endif assert(colMax[n][j] <= smemColMax[8 * n + 2 * idxInQuad + j]); colMax[n][j] = smemColMax[8 * n + 2 * idxInQuad + j]; } } + +#if SKIP_SOFTMAX_ATTN + atomicAnd(smemSkipVote, static_cast<uint32_t>(localShouldSkip)); // this will be translated to redux and voteu +#endif + warpGrpBar.arrive_and_wait(); return colMax; } @@ -2199,7 +2396,7 @@ __device__ inline void storeGemm0AccToShm( uint32_t const idxOctInsideHalf = idxInHalf / 8; uint32_t const idxRowInsideOct = lane % 8; uint32_t const warpBaseC = 16 * warpRank; - auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> std::pair<uint32_t, uint32_t> + auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> mha::pair<uint32_t, uint32_t> { uint32_t const accR = idxAccCoreMat / Gemm0Acc::cols; uint32_t const accC = idxAccCoreMat % Gemm0Acc::cols; @@ -3231,6 +3428,24 @@ __device__ inline void storeRotatedPairsForQ(SharedMem::QBuffer& dst, } #ifndef GENERATE_CUBIN +uint32_t computeNbSubSeqPerSeqHopperF8MHA( + cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen) +{ + auto const env = std::getenv("XQA_NB_SUB_SEQ"); + if (env != nullptr) + { + int32_t const val = std::stoi(env); + if (val > 0) + { + return val; + } + } + float const
factor = 0.25f; + return mha::min( + mha::max(1U, (uint32_t) round(prop.multiProcessorCount * 3 / (batchSize * nbKHeads) * factor)), + divUp(maxSeqLen, gemm0CtaTileNbTokens)); +} + void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SLIDING_WINDOW uint32_t slidingWinSize, @@ -3268,6 +3483,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, // int8/fp8 KV cache. #if SPEC_DEC SpecDecParams const& specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + float const skipSoftmaxThresholdScaleFactor, +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount, +#endif #endif uint32_t* semaphores, void* scratch, cudaStream_t stream) { @@ -3286,22 +3507,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, uint32_t const nbVHeads = nbKHeads; uint32_t const nbQHeads = nbKHeads * headGrpSize; uint32_t const nbQKVHeads = nbQHeads + nbKHeads + nbVHeads; - uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t - { - auto const env = std::getenv("XQA_NB_SUB_SEQ"); - if (env != nullptr) - { - int32_t const val = std::stoi(env); - if (val > 0) - { - return val; - } - } - float const factor = 0.25f; - return mha::min( - mha::max(1U, (uint32_t) round(prop.multiProcessorCount * 3 / (batchSize * nbKHeads) * factor)), - divUp(maxSeqLen, gemm0CtaTileNbTokens)); - }(); + uint32_t const nbSubSeqPerSeq = computeNbSubSeqPerSeqHopperF8MHA(prop, batchSize, nbKHeads, maxSeqLen); #if SPEC_DEC uint32_t const qSeqLen = specDecParams.qSeqLen; #else @@ -3371,6 +3577,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #endif #if SPEC_DEC specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + skipSoftmaxThresholdScaleFactor, +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + skippedBlockCount, totalBlockCount, +#endif #endif semaphores, scratch); #else diff --git a/cpp/kernels/xqa/mha_stdheaders.cuh b/cpp/kernels/xqa/mha_stdheaders.cuh index 5d22d2e018bf..8f4c252c62d2 100644 --- a/cpp/kernels/xqa/mha_stdheaders.cuh +++ b/cpp/kernels/xqa/mha_stdheaders.cuh @@ -1272,6 +1272,19 @@ using is_void = is_same, void>; template inline constexpr bool is_void_v = is_void::value; #endif + +#ifndef GENERATE_CUBIN +template +using pair = std::pair; +#else +template +struct pair +{ + T1 first; + T2 second; +}; +#endif + } // namespace mha #if GENERATE_CUBIN diff --git a/cpp/kernels/xqa/test/refAttention.cpp b/cpp/kernels/xqa/test/refAttention.cpp index 303678518f5f..cc218f4cbd3c 100644 --- a/cpp/kernels/xqa/test/refAttention.cpp +++ b/cpp/kernels/xqa/test/refAttention.cpp @@ -50,7 +50,8 @@ using Vector = Matrix; template Eigen::Matrix refFlashAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor, + uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum) { uint32_t const nbTiles = divUp(seqLen, tileSize); auto gemm1Acc = Eigen::Matrix::Zero().eval(); @@ -61,6 +62,16 @@ Eigen::Matrix refFlashAt float const qkScale = qScale * kvScale / sqrtf(validElemsPerHead); uint32_t const seqBeg = (seqLen < slidingWinSize ? 0 : seqLen - slidingWinSize); uint32_t const idxTileBeg = seqBeg / tileSize; + + uint32_t const nbSubSeq = (multiBlockNum > 0 && nbTiles >= 2) ? 
mha::min(nbTiles, multiBlockNum) : 1; + std::vector> skipRowMaxs(nbSubSeq); + for (uint32_t i = 0; i < nbSubSeq; i++) + { + skipRowMaxs[i].fill(-INFINITY); + } + bool const disableSkipForShortSeq = (seqLen < skipSoftmaxThresholdScaleFactor); + float const skipSoftmaxThreshold = disableSkipForShortSeq ? 0.0f : skipSoftmaxThresholdScaleFactor / seqLen; + for (uint32_t idxTile = idxTileBeg; idxTile < nbTiles; idxTile++) { Eigen::Matrix gemm0Acc; @@ -88,7 +99,22 @@ Eigen::Matrix refFlashAt } } - Eigen::Vector const tileRowMax = gemm0Acc.rowwise().maxCoeff().cwiseMax(rowMax).eval(); + Eigen::Vector const localRowMax = gemm0Acc.rowwise().maxCoeff().eval(); + Eigen::Vector const tileRowMax = localRowMax.cwiseMax(rowMax).eval(); + auto const prevSkipRowMax = skipRowMaxs[idxTile % nbSubSeq]; + skipRowMaxs[idxTile % nbSubSeq] = localRowMax.cwiseMax(skipRowMaxs[idxTile % nbSubSeq]).eval(); + + if (!disableSkipForShortSeq && skipSoftmaxThreshold > 0) + { + *totalBlockCount += 1; + auto const skipSoftmaxMask = ((localRowMax - prevSkipRowMax).array() < std::log(skipSoftmaxThreshold)); + bool const skipBlock = skipSoftmaxMask.all() && ((idxTile - idxTileBeg) >= nbSubSeq); + if (skipBlock) + { + *skippedBlockCount += 1; + continue; + } + } Eigen::Matrix tileX = (gemm0Acc.colwise() - tileRowMax).array().exp().eval(); @@ -138,7 +164,8 @@ Eigen::Matrix refFlashAt template Eigen::Matrix \ refFlashAttention(IOHead const* q, \ CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, \ - float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) + float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, \ + float skipSoftmaxThreshold, uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum) INSTANTIATE_refFlashAttention(CacheElem, 64, false, false); INSTANTIATE_refFlashAttention(CacheElem, 64, false, true); diff --git a/cpp/kernels/xqa/test/refAttention.h b/cpp/kernels/xqa/test/refAttention.h index 4f1e673adaad..a8dd32bab65f 100644 --- a/cpp/kernels/xqa/test/refAttention.h +++ b/cpp/kernels/xqa/test/refAttention.h @@ -88,7 +88,8 @@ struct CacheSeq template Eigen::Matrix refFlashAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks); + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor, + uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum); template #if SPEC_DEC diff --git a/cpp/kernels/xqa/test/test.cpp b/cpp/kernels/xqa/test/test.cpp index 76e94616ce2c..9702d4bf6100 100644 --- a/cpp/kernels/xqa/test/test.cpp +++ b/cpp/kernels/xqa/test/test.cpp @@ -150,7 +150,8 @@ template #endif #endif void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, bool verbose = false, - bool saveData = false, bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30) + bool saveData = false, bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30, + float skipSoftmaxThresholdScaleFactor = 0.0f) { #if IS_MLA if (nbKHeads != 1) @@ -224,6 +225,12 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, seqLen = (16U << 20) / gmemCacheHeadBytes; // 32MB per K+V head. 
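Both the kernel (mha_sm90.cu) and the reference above derive the effective per-sequence threshold from the scale factor in the same way; a minimal restatement (hedged sketch, the helper name is ours):

```cpp
#include <cstdint>

// The scale factor is interpreted as threshold * seqLen: longer sequences get
// proportionally tighter thresholds, and sequences shorter than the scale
// factor (where the threshold would exceed 1) disable skipping entirely.
inline float effectiveSkipThreshold(float scaleFactor, uint32_t seqLen)
{
    bool const disableForShortSeq = (seqLen < scaleFactor);
    return disableForShortSeq ? 0.0f : scaleFactor / seqLen;
}
// e.g. scaleFactor 1280: seqLen 4096 -> 0.3125; seqLen 1000 -> 0 (disabled).
// The 501.0f / 500.f RefCheck cases below probe this cutoff at seqLen == 500,
// where scaleFactor 500 gives threshold exactly 1 (lnThreshold == 0), the edge
// case guarded by SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE.
```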
} ctxLen = std::min(ctxLen, seqLen); + uint32_t skippedBlockCount = 0; + uint32_t totalBlockCount = 0; + if (skipSoftmaxThresholdScaleFactor > 0) + { + assert(useQGMMA); + } float const kScale = cacheElemSize == 2 ? 1.f : 1 / 4.f; float const vScale = kScale; float const qScale = 1.f; @@ -329,6 +336,17 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, auto const rcpOutScale = ManagedMemBuf(1); auto const seqLenList = ManagedMemBuf(batchSize); auto const ctxLenList = ManagedMemBuf(batchSize); +#if SKIP_SOFTMAX_ATTN +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + auto const kernelSkippedBlockCount = ManagedMemBuf(1); + auto const kernelTotalBlockCount = ManagedMemBuf(1); + kernelSkippedBlockCount[0] = 0; + kernelTotalBlockCount[0] = 0; +#endif +#else + EXPECT_EQ(skipSoftmaxThresholdScaleFactor, 0.0f) + << "Got non-zero skipSoftmaxThresholdScaleFactor while SKIP_SOFTMAX_ATTN is not enabled."; +#endif #if USE_PAGED_KV_CACHE auto const pageListBuf = ManagedMemBuf(pageListBytes); #if PAGED_KV_CACHE_LAYOUT == 1 @@ -726,6 +744,11 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, maxSeqLen, &seqLenList[0][0], batchSize, kvCacheScale.get(), semaphores.get(), scratch, stream); }; #else + auto multiBlockNum = [&]() + { + auto const calcFunc = useQGMMA ? &computeNbSubSeqPerSeqHopperF8MHA : &computeNbSubSeqPerSeqMHA; + return calcFunc(prop, batchSize, nbKHeads, maxSeqLen); + }(); auto runKernel = [&]() { auto const launchFunc = useQGMMA ? &launchHopperF8MHA : &launchMHA; @@ -776,6 +799,12 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, batchSize, kvCacheScale.get(), #if SPEC_DEC specDecParams, +#endif +#if SKIP_SOFTMAX_ATTN + skipSoftmaxThresholdScaleFactor, +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + kernelSkippedBlockCount.get(), kernelTotalBlockCount.get(), +#endif #endif semaphores.get(), scratch, stream); checkCuda(cudaGetLastError()); @@ -813,6 +842,10 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, checkCuda(cudaEventRecord(toc, stream)); prefetchToDevice(cudaCpuDeviceId); checkCuda(cudaStreamSynchronize(stream)); +#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS + kernelSkippedBlockCount[0] /= nbIters; + kernelTotalBlockCount[0] /= nbIters; +#endif if (testPerf) { float ms; @@ -849,6 +882,15 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, = totalNbCacheLoadBytes + inputBytes + outputBytes; // we ignore page indices and beam search indices. float const dramSolTime = totalTraffic / bandwidth * 1E3f; float const dramSolRatio = dramSolTime / ms; +#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS + size_t const totalNbCacheLoadWithSkip = gmemCacheHeadBytes + * (nbKHeads + nbVHeads * (1 - 1.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0])) + * nbLoadedCacheTokens; + float const totalTrafficWithSkip + = totalNbCacheLoadWithSkip + inputBytes + outputBytes; // we ignore page indices and beam search indices.
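The skip-adjusted traffic model just above keeps K traffic unchanged (gemm0 must still score every tile) and scales only the V share by the measured skip ratio; condensed (hedged sketch with standalone names):

```cpp
#include <cstddef>
#include <cstdint>

// DRAM bytes of KV-cache loads when a fraction of tiles is skipped: K heads
// are always loaded, V heads only for tiles that were not skipped.
inline double cacheLoadBytesWithSkip(double headBytes, uint32_t nbKHeads, uint32_t nbVHeads,
    std::size_t nbLoadedCacheTokens, uint32_t skippedBlocks, uint32_t totalBlocks)
{
    double const vKeepRatio = 1.0 - double(skippedBlocks) / double(totalBlocks);
    return headBytes * (double(nbKHeads) + double(nbVHeads) * vKeepRatio) * double(nbLoadedCacheTokens);
}
```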
+ float const dramSolTimeWithSkip = totalTrafficWithSkip / bandwidth * 1E3f; + float const dramSolRatioWithSkip = dramSolTimeWithSkip / ms; +#endif if (verbose) { printf("done\n"); @@ -863,7 +905,13 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, } float const tops = headGrpSize * qSeqLen * float(seqLen) * (validElemsPerKHead + validElemsPerVHead) * 2 * nbKHeads * batchSize / (ms * 1E-3F) * 1E-12F; +#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS + printf("kernel skippedBlockCount: %d/%d (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0], + kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]); + printf("dramSolRatioWithSkip: %f%% (%f ms, TOPS = %f)\n", dramSolRatioWithSkip * 100, ms, tops); +#else printf("dramSolRatio: %f%% (%f ms, TOPS = %f)\n", dramSolRatio * 100, ms, tops); +#endif } if (refCheck) { @@ -1084,8 +1132,8 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, if (useQGMMA) { refOutput = refFlashAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, - vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, - refAttentionSinks); + vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, refAttentionSinks, + skipSoftmaxThresholdScaleFactor, &skippedBlockCount, &totalBlockCount, multiBlockNum); // refOutput = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, // vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); } @@ -1132,6 +1180,14 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, #endif } } +#if SKIP_SOFTMAX_ATTN + printf("host skippedBlockCount: %d/%d (%.2f%%)\n", skippedBlockCount, totalBlockCount, + totalBlockCount == 0 ? 0.0f : 100.0f * skippedBlockCount / totalBlockCount); +#if SKIP_SOFTMAX_ATTN_BLOCK_STATS + printf("kernel skippedBlockCount: %d/%d (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0], + kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]); +#endif +#endif if (saveData) { fout_refOutput.close(); @@ -1253,6 +1309,14 @@ TEST(RefCheck, llama_V2_70b) #if SLIDING_WINDOW runTest<2>(2, 4096, false, true, false, false, false, ~0, 256); runTest<2>(2, 400, false, true, false, false, false, ~0U, 256); +#endif +#if SKIP_SOFTMAX_ATTN + runTest<1>(32, 2048, false, true, false, false, false, ~0U, 1U << 30, 0.f); + runTest<4>(32, 1538, false, true, false, false, false, ~0U, 1U << 30, 1280.f); + runTest<2>(32, 4096, false, true, false, false, false, ~0U, 1U << 30, 125.f); + runTest<4>(32, 300, false, true, false, false, false, ~0U, 1U << 30, 80.f); + runTest<4>(32, 500, false, true, false, false, false, ~0U, 1U << 30, 501.0f); + runTest<4>(32, 500, false, true, false, false, false, ~0U, 1U << 30, 500.f); #endif runTest<8>(120, 367, false, true); runTest<8>(1792, 2048, false, true); diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index 339da7c527a1..32a9332a0191 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -298,6 +298,11 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.use_sparse_attention = useTllmGenSparseAttention(); // Skip softmax threshold. 
xqaParams.skip_softmax_threshold_scale_factor = mSkipSoftmaxThresholdScaleFactorDecode; +#ifdef SKIP_SOFTMAX_STAT + // Statistics of skip-softmax, pointers of device memory for output + xqaParams.skip_softmax_total_blocks = mSkipSoftmaxTotalBlocks; + xqaParams.skip_softmax_skipped_blocks = mSkipSoftmaxSkippedBlocks; +#endif // Cross attention parameters. xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp index 33587d796198..9571737f04a2 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp @@ -105,7 +105,8 @@ CubinObj CompileEngine::compile() const // scratch in this case. /*use_input_kv=*/applyRoPEInXqaKernel, /*rope_style=*/ropeStyle, - /*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree}; + /*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree, + /*use_skip_softmax_attn=*/mXqaParams.skip_softmax_threshold_scale_factor != 0}; if (context.kernel_type == TLLM_XQA_JIT_MLA) { auto const& c = context; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index 90dda051a07e..877a780072c4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -232,6 +232,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& jit::CubinObj const* const cubinObj = mResource->getCubinObjRegistry()->getCubin(key); TLLM_CHECK(cubinObj != nullptr && cubinObj->isInitialized()); bool const isSpecDec = xqaParams.multi_query_tokens; + bool const isSkipSoftmax = xqaParams.skip_softmax_threshold_scale_factor != 0; bool const isHMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kAMPERE_WARP_SPECIALIZED); bool const isGMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kHOPPER_WARP_SPECIALIZED); bool const isMLAKernel = (cubinObj->getKernelType() == XQAKernelType::kSM120_MLA); @@ -378,7 +379,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& .mask = reinterpret_cast(xqaParams.spec_decoding_packed_mask)}; }; - constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 16; + constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 19; uint32_t idxNextParam = 0; void* kernelParams[kMAX_NB_KERNEL_PARAMS]; auto appendParam = [&](auto* p) mutable @@ -514,6 +515,16 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& appendParam(&specDecParams); specDecBlocks = divUp(specDecParams.qSeqLen, 64 / num_q_heads_over_kv); } + if (isSkipSoftmax) + { + TLLM_CHECK_WITH_INFO(isGMMAKernel, "skip softmax is only supported for GMMA kernel for now."); + TLLM_CHECK_WITH_INFO(!isSpecDec, "skip softmax is not supported with spec dec for now."); + appendParam(&xqaParams.skip_softmax_threshold_scale_factor); +#ifdef SKIP_SOFTMAX_STAT + appendParam(&xqaParams.skip_softmax_total_blocks); + appendParam(&xqaParams.skip_softmax_skipped_blocks); +#endif + } appendParam(&launchParams.semaphores); appendParam(&launchParams.scratch); kernelParams[idxNextParam] = nullptr; // one extra nullptr at end as guard. 
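Why kMAX_NB_KERNEL_PARAMS grows from 16 to 19 above: the skip-softmax path appends up to three extra kernel arguments (the threshold scale factor, plus the two stat pointers under SKIP_SOFTMAX_STAT). A condensed sketch of the append-with-guard marshaling pattern (illustrative, simplified from runImpl; not the actual implementation):

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t kMaxNbKernelParams = 19; // 16 base slots + 1 threshold + 2 stat pointers

// Fixed-capacity argument array for a cuLaunchKernel-style launch, with one
// trailing nullptr as a guard, mirroring the appendParam lambda above.
struct KernelArgs
{
    void* params[kMaxNbKernelParams] = {};
    uint32_t n = 0;

    void append(void* p)
    {
        assert(n + 1 < kMaxNbKernelParams); // keep room for the nullptr guard
        params[n++] = p;
    }

    void finish()
    {
        params[n] = nullptr; // one extra nullptr at end as guard
    }
};
```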
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp index 26fadd21cc05..f6f73dab2e6a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp @@ -96,10 +96,16 @@ bool supportConfigQGMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlu { return false; } - if (xqaParams.kv_cache_data_type != DATA_TYPE_E4M3) + if (!contains({DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_E4M3}, xqaParams.kv_cache_data_type)) { return false; } + bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0; + if (!is_skip_softmax && xqaParams.kv_cache_data_type != DATA_TYPE_E4M3) + { + // Only use hopper kernel with fp16/bf16 kv cache data type when skip softmax is enabled + return false; + } if (xqaParams.beam_width != 1) { return false; @@ -168,6 +174,11 @@ bool supportConfigHMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlug { return false; } + bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0; + if (is_skip_softmax) + { + return false; + } return true; } @@ -201,6 +212,11 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi { return false; } + bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0; + if (is_skip_softmax) + { + return false; + } return true; } diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h index ab9e93f0d4a9..b132e769188b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h @@ -66,6 +66,7 @@ extern "C" bool is_spec_dec_tree = true; // useful only when multi_query_tokens, should be true unless using linear tree in spec-dec. + bool use_skip_softmax_attn; } tllmXqaJitContext; // tllmXqaJitProgram is an opaque handle for a program. diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp index 96481d8474f4..384b29e31385 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp @@ -215,6 +215,10 @@ tllmXqaJitStatus getMacroFlags(tllmXqaJitContext const* context, std::vectoruse_input_kv ? "1" : "0"; macros["ROPE_STYLE"] = std::to_string(int(context->rope_style)); macros["IS_SPEC_DEC_TREE"] = context->is_spec_dec_tree ? "1" : "0"; + macros["SKIP_SOFTMAX_ATTN"] = context->use_skip_softmax_attn ? "1" : "0"; +#ifdef SKIP_SOFTMAX_STAT + macros["SKIP_SOFTMAX_ATTN_BLOCK_STATS"] = context->use_skip_softmax_attn ? "1" : "0"; +#endif // Without these macros, NVRTC uses precompiled headers for cuda_fp16.h etc. // Linking might fail due to ABI incompatibility. 
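The net effect of the supportConfig changes above: skip-softmax requests are routed exclusively to the Hopper QGMMA kernel (supportConfigHMMA and supportConfigMLA both reject them), and the QGMMA kernel accepts fp16/bf16 KV caches only when skip-softmax is enabled. A condensed restatement (hedged sketch; the enum values and contains() helper follow kernelUtils.cpp, the function name is ours):

```cpp
// Gating for the QGMMA (Hopper warp-specialized) kernel's KV-cache data type.
bool qgmmaAcceptsKvCacheType(bool isSkipSoftmax, Data_type kvCacheDataType)
{
    if (!contains({DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_E4M3}, kvCacheDataType))
    {
        return false;
    }
    // Without skip-softmax, preserve the previous behaviour: e4m3 KV cache only.
    return isSkipSoftmax || kvCacheDataType == DATA_TYPE_E4M3;
}
```

This pairs with getDataTypeFromXqaParams in tensorMapUtils.cpp below, which now emits FLOAT16/BFLOAT16 tensor maps instead of unconditionally using UINT8.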
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 7bd7c32e5e68..822483560cfd 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -493,6 +493,10 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo { SUPPORT_RETURN_FALSE("streaming-llm"); } + if (xqaParams.skip_softmax_threshold_scale_factor != 0) + { + SUPPORT_RETURN_FALSE("skip_softmax_threshold_scale_factor"); + } // OPTIMIZE: For the standard generation-phase MHA, there are still extra limitations. // NOTE: Medusa mode = Multi_query_tokens > 1. diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp index e4b642a11e49..250e850a3ac8 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp @@ -64,6 +64,21 @@ CUtensorMapSwizzle getSwizzleMode(uint32_t partBytes) } }; +CUtensorMapDataType_enum getDataTypeFromXqaParams(XQAParams const& xqaParams) +{ + if (xqaParams.kv_cache_data_type == DATA_TYPE_BF16) + { + return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; + } + else if (xqaParams.kv_cache_data_type == DATA_TYPE_FP16) + { + return CU_TENSOR_MAP_DATA_TYPE_FLOAT16; + } + TLLM_CHECK(xqaParams.kv_cache_data_type == DATA_TYPE_E4M3 || xqaParams.kv_cache_data_type == DATA_TYPE_E5M2 + || xqaParams.kv_cache_data_type == DATA_TYPE_INT8); + return CU_TENSOR_MAP_DATA_TYPE_UINT8; +} + CUtensorMap makeTensorMapForQ(std::shared_ptr const& driver, void const* addr, CUtensorMapDataType_enum dataType, uint32_t headElems, uint32_t totalNbHeads, uint32_t partElems, uint32_t boxHeads) { @@ -131,24 +146,26 @@ CUtensorMap makeTensorMapForHopperXqaKVCache( if constexpr (std::is_same_v) { uint32_t const headElems = xqaParams.head_size; - uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8); + CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams); + uint32_t const elemBytes = getElemBytes(dataType); TLLM_CHECK(headElems <= 256); uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256); uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes; - return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8, - xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems); + return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size, + xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems); } else { static_assert(std::is_same_v); uint32_t const headElems = xqaParams.head_size; - uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8); + CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams); + uint32_t const elemBytes = getElemBytes(dataType); TLLM_CHECK(headElems <= 256); uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 
128 : 256); uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes; - return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, CU_TENSOR_MAP_DATA_TYPE_UINT8, - xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width, - xqaParams.batch_size, partElems); + return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, dataType, xqaParams.head_size, + xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width, xqaParams.batch_size, + partElems); } } @@ -161,11 +178,12 @@ template CUtensorMap makeTensorMapForXqaMlaKVCache(std::shared_ptr const& driver, XQAParams const& xqaParams, KVCacheBuffer const& kv_cache_buffer, bool forK) { + CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams); uint32_t const partElems = (forK ? 64 : 128); if constexpr (std::is_same_v) { - return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8, - xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems); + return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size, + xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems); } else { @@ -183,7 +201,7 @@ CUtensorMap makeTensorMapForXqaMlaQ( std::shared_ptr const& driver, XQAParams const& xqaParams, void const* q) { uint32_t const partElems = 64; - return makeTensorMapForQ(driver, q, CU_TENSOR_MAP_DATA_TYPE_UINT8, xqaParams.head_size, + return makeTensorMapForQ(driver, q, getDataTypeFromXqaParams(xqaParams), xqaParams.head_size, xqaParams.num_q_heads * xqaParams.total_num_input_tokens, partElems, xqaParams.num_q_heads); } } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index 406bf54b1ffd..ce2b77aa9279 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -119,7 +119,12 @@ struct XQAParams bool use_sparse_attention = false; // Skip softmax threshold. - float skip_softmax_threshold_scale_factor = 0.0f; + float skip_softmax_threshold_scale_factor = 0; + +#ifdef SKIP_SOFTMAX_STAT + uint32_t* skip_softmax_total_blocks = nullptr; + uint32_t* skip_softmax_skipped_blocks = nullptr; +#endif cudaStream_t stream = 0; // layer index @@ -199,6 +204,10 @@ struct XQAParams << "sparse_params: " << sparse_params.toString() << std::endl << "use_sparse_attention :" << (use_sparse_attention ? 
"true" : "false") << std ::endl << "skip_softmax_threshold_scale_factor :" << skip_softmax_threshold_scale_factor << std ::endl +#ifdef SKIP_SOFTMAX_STAT + << "skip_softmax_total_blocks :" << skip_softmax_total_blocks << std ::endl + << "skip_softmax_skipped_blocks :" << skip_softmax_skipped_blocks << std ::endl +#endif << "stream :" << stream; return ss.str(); diff --git a/security_scanning/cpp/kernels/fmha_v2/poetry.lock b/security_scanning/cpp/kernels/fmha_v2/poetry.lock index 7cecb7d14cd0..c5e0ba403a74 100644 --- a/security_scanning/cpp/kernels/fmha_v2/poetry.lock +++ b/security_scanning/cpp/kernels/fmha_v2/poetry.lock @@ -150,53 +150,58 @@ testing = ["filelock"] [[package]] name = "tomli" -version = "2.3.0" +version = "2.4.0" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" files = [ - {file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"}, - {file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"}, - {file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"}, - {file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"}, - {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"}, - {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"}, - {file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"}, - {file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"}, - {file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"}, - {file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"}, - {file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"}, - {file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"}, - {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"}, - {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"}, - {file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"}, - {file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"}, - {file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"}, - {file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"}, - {file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"}, - {file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"}, - {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"}, - {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"}, - {file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"}, - {file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"}, - {file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"}, - {file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"}, - {file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"}, - {file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"}, - {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"}, - {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"}, - {file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"}, - {file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"}, - {file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"}, - {file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"}, - {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"}, - {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"}, - {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"}, - {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"}, - {file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"}, - {file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"}, - {file = "tomli-2.3.0-py3-none-any.whl", hash = 
"sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"}, - {file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"}, + {file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"}, + {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"}, + {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"}, + {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"}, + {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"}, + {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"}, + {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"}, + {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"}, + {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"}, + {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"}, + {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"}, + {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"}, + {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"}, + {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"}, + {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"}, + {file = "tomli-2.4.0-py3-none-any.whl", hash = 
"sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"}, + {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"}, ] [[package]] diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index f012c4737ff2..d776030daacd 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -1119,53 +1119,58 @@ test = ["pytest"] [[package]] name = "tomli" -version = "2.3.0" +version = "2.4.0" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" files = [ - {file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"}, - {file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"}, - {file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"}, - {file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"}, - {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"}, - {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"}, - {file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"}, - {file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"}, - {file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"}, - {file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"}, - {file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"}, - {file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"}, - {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"}, - {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"}, - {file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"}, - {file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"}, - {file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"}, - {file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"}, - {file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"}, - {file = 
"tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"}, - {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"}, - {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"}, - {file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"}, - {file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"}, - {file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"}, - {file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"}, - {file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"}, - {file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"}, - {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"}, - {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"}, - {file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"}, - {file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"}, - {file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"}, - {file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"}, - {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"}, - {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"}, - {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"}, - {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"}, - {file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"}, - {file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"}, - {file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"}, - {file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"}, + {file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"}, + {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"}, + {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"}, + {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"}, + {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"}, + {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"}, + {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"}, + {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"}, + {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"}, + {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"}, + {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"}, + {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"}, + {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"}, + {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"}, + {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"}, + {file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"}, + {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock 
b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock index 29e89c7f1c74..1bb438e2f236 100644 --- a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock +++ b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock @@ -42,6 +42,40 @@ files = [ {file = "av-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:273a3e32de64819e4a1cd96341824299fe06f70c46f2288b5dc4173944f0fd62"}, {file = "av-16.1.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:640f57b93f927fba8689f6966c956737ee95388a91bd0b8c8b5e0481f73513d6"}, {file = "av-16.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ae3fb658eec00852ebd7412fdc141f17f3ddce8afee2d2e1cf366263ad2a3b35"}, + {file = "av-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ee558d9c02a142eebcbe55578a6d817fedfde42ff5676275504e16d07a7f86"}, + {file = "av-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7ae547f6d5fa31763f73900d43901e8c5fa6367bb9a9840978d57b5a7ae14ed2"}, + {file = "av-16.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8cf065f9d438e1921dc31fc7aa045790b58aee71736897866420d80b5450f62a"}, + {file = "av-16.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a345877a9d3cc0f08e2bc4ec163ee83176864b92587afb9d08dff50f37a9a829"}, + {file = "av-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:f49243b1d27c91cd8c66fdba90a674e344eb8eb917264f36117bf2b6879118fd"}, + {file = "av-16.1.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:ce2a1b3d8bf619f6c47a9f28cfa7518ff75ddd516c234a4ee351037b05e6a587"}, + {file = "av-16.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:408dbe6a2573ca58a855eb8cd854112b33ea598651902c36709f5f84c991ed8e"}, + {file = "av-16.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:57f657f86652a160a8a01887aaab82282f9e629abf94c780bbdbb01595d6f0f7"}, + {file = "av-16.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:adbad2b355c2ee4552cac59762809d791bda90586d134a33c6f13727fb86cb3a"}, + {file = "av-16.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f42e1a68ec2aebd21f7eb6895be69efa6aa27eec1670536876399725bbda4b99"}, + {file = "av-16.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58fe47aeaef0f100c40ec8a5de9abbd37f118d3ca03829a1009cf288e9aef67c"}, + {file = "av-16.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:565093ebc93b2f4b76782589564869dadfa83af5b852edebedd8fee746457d06"}, + {file = "av-16.1.0-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:574081a24edb98343fd9f473e21ae155bf61443d4ec9d7708987fa597d6b04b2"}, + {file = "av-16.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:9ab00ea29c25ebf2ea1d1e928d7babb3532d562481c5d96c0829212b70756ad0"}, + {file = "av-16.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a84a91188c1071f238a9523fd42dbe567fb2e2607b22b779851b2ce0eac1b560"}, + {file = "av-16.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c2cd0de4dd022a7225ff224fde8e7971496d700be41c50adaaa26c07bb50bf97"}, + {file = "av-16.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0816143530624a5a93bc5494f8c6eeaf77549b9366709c2ac8566c1e9bff6df5"}, + {file = "av-16.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e3a28053af29644696d0c007e897d19b1197585834660a54773e12a40b16974c"}, + {file = "av-16.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e3e67144a202b95ed299d165232533989390a9ea3119d37eccec697dc6dbb0c"}, + {file = "av-16.1.0-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:39a634d8e5a87e78ea80772774bfd20c0721f0d633837ff185f36c9d14ffede4"}, + {file = 
"av-16.1.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:0ba32fb9e9300948a7fa9f8a3fc686e6f7f77599a665c71eb2118fdfd2c743f9"}, + {file = "av-16.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:ca04d17815182d34ce3edc53cbda78a4f36e956c0fd73e3bab249872a831c4d7"}, + {file = "av-16.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ee0e8de2e124a9ef53c955fe2add6ee7c56cc8fd83318265549e44057db77142"}, + {file = "av-16.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22bf77a2f658827043a1e184b479c3bf25c4c43ab32353677df2d119f080e28f"}, + {file = "av-16.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2dd419d262e6a71cab206d80bbf28e0a10d0f227b671cdf5e854c028faa2d043"}, + {file = "av-16.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:53585986fd431cd436f290fba662cfb44d9494fbc2949a183de00acc5b33fa88"}, + {file = "av-16.1.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:76f5ed8495cf41e1209a5775d3699dc63fdc1740b94a095e2485f13586593205"}, + {file = "av-16.1.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:8d55397190f12a1a3ae7538be58c356cceb2bf50df1b33523817587748ce89e5"}, + {file = "av-16.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:9d51d9037437218261b4bbf9df78a95e216f83d7774fbfe8d289230b5b2e28e2"}, + {file = "av-16.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0ce07a89c15644407f49d942111ca046e323bbab0a9078ff43ee57c9b4a50dad"}, + {file = "av-16.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:cac0c074892ea97113b53556ff41c99562db7b9f09f098adac1f08318c2acad5"}, + {file = "av-16.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7dec3dcbc35a187ce450f65a2e0dda820d5a9e6553eea8344a1459af11c98649"}, + {file = "av-16.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6f90dc082ff2068ddbe77618400b44d698d25d9c4edac57459e250c16b33d700"}, + {file = "av-16.1.0.tar.gz", hash = "sha256:a094b4fd87a3721dacf02794d3d2c82b8d712c85b9534437e82a8a978c175ffd"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index c550fe1d89dd..747b7605a8a1 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "3c65ec3c556d610a017d11fb968c6576c5b3b493", - "timestamp": "2026-01-11T02:39:21Z" + "commit_hash": "c0e25e54181528c8e0818e2e9bc22fe5a889b8cc", + "timestamp": "2026-01-12T02:39:25Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index c65cb8d26e08..412f60179d65 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -3213,6 +3213,28 @@ files = [ {file = "nvidia_cusparselt_cu12-0.7.1-py3-none-win_amd64.whl", hash = "sha256:f67fbb5831940ec829c9117b7f33807db9f9678dc2a617fbe781cac17b4e1075"}, ] +[[package]] +name = "nvidia-cutlass-dsl" +version = "4.3.4" +description = "NVIDIA CUTLASS Python DSL" +optional = false +python-versions = ">=3.10" +files = [ + {file = "nvidia_cutlass_dsl-4.3.4-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:118508bc84f2a55ec7af3affd379bb713edf837d593218329909db67b518e700"}, + {file = "nvidia_cutlass_dsl-4.3.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3fdf0603ab7ec1bf6a499fbf72cff65e73b597d6e1359286808317c69aeb7c3d"}, + {file = "nvidia_cutlass_dsl-4.3.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c5bd21ed877da171f115123a12aae4a920035fc47eb57c807f9fba9f3df97cf4"}, + {file = "nvidia_cutlass_dsl-4.3.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:671936f1df909e7de377d0cc00cb4287a3458c013d34947600423e9deb827e41"}, + {file = 
"nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:57693d87677919572ab9eefa386b3f39e8e888bc4a9db7ab8730a97e8dbe06b4"}, + {file = "nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a48fbff859e44dd548f8f26819d97d0595acea70e3b057c91dfdb47929015c72"}, + {file = "nvidia_cutlass_dsl-4.3.4-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:36bde25160f461f393beba81868ef9e54d5ba2e0e7666ed3e44b6dbf788af493"}, + {file = "nvidia_cutlass_dsl-4.3.4-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:be127f0f087028fa498f50a994c49f95b2c6a518e11e2567bc3d71528bf0a504"}, +] + +[package.dependencies] +cuda-python = ">=12.8" +numpy = "*" +typing-extensions = "*" + [[package]] name = "nvidia-ml-py" version = "13.590.44" @@ -5518,53 +5540,58 @@ testing = ["datasets", "numpy", "pytest", "pytest-asyncio", "requests", "ruff", [[package]] name = "tomli" -version = "2.3.0" +version = "2.4.0" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" files = [ - {file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"}, - {file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"}, - {file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"}, - {file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"}, - {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"}, - {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"}, - {file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"}, - {file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"}, - {file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"}, - {file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"}, - {file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"}, - {file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"}, - {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"}, - {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"}, - {file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"}, - {file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"}, - {file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"}, - {file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"}, - {file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"}, - {file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"}, - {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"}, - {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"}, - {file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"}, - {file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"}, - {file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"}, - {file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"}, - {file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"}, - {file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"}, - {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"}, - {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"}, - {file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"}, - {file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"}, - {file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"}, - {file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"}, - {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"}, - {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"}, - {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"}, - {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"}, - {file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"}, - {file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = 
"sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"}, - {file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"}, - {file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"}, + {file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"}, + {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"}, + {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"}, + {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"}, + {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"}, + {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"}, + {file = 
"tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"}, + {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"}, + {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"}, + {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"}, + {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"}, + {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"}, + {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"}, + {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"}, + {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"}, + {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = 
"sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"}, + {file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"}, + {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"}, ] [[package]] @@ -6317,4 +6344,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "7c2c53dee07ff38461334f55fc46cbcb05770c5c4c6f95f797eb9f9800e76f60" +content-hash = "f17eedd404a2af6728d14710809ea47ad34bc6672c035073bad9e6c709131a08" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 05956ca82131..d7101b9d45f8 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -72,6 +72,7 @@ triton = "3.5.0" tiktoken = "^0.12.0" blobfile = "^3.1.0" openai-harmony = "0.0.4" +nvidia-cutlass-dsl = "4.3.4" plotly = "^6.5.1" numexpr = "<2.14.0" partial-json-parser = "^0.2.1.1.post7" diff --git a/tests/integration/defs/perf/base_perf_pytorch.csv b/tests/integration/defs/perf/base_perf_pytorch.csv index a0da249c2a23..43ef4b2c75d6 100644 --- a/tests/integration/defs/perf/base_perf_pytorch.csv +++ b/tests/integration/defs/perf/base_perf_pytorch.csv @@ -3,15 +3,7 @@ network_name,perf_case_name,test_name,threshold,absolute_threshold,metric_type,p "llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,76.45, "llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,9785.75, "llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,55.64, -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,171845.02,H100_PCIe 
-"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.17,H100_PCIe -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,48.09,H100_PCIe -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,6155.59,H100_PCIe -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,139897.82,H100_NVL -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,69.59,H100_NVL -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,58.63,H100_NVL -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,7504.07,H100_NVL 
-"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,125068.76,H100 -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.09,H100 -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,65.50,H100 -"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,8384.00,H100 +"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_inference_time[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",0.1,50,INFERENCE_TIME,1359184.5059,H100_PCIe +"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_kv_cache_size[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,50,KV_CACHE_SIZE,10.92,H100_PCIe +"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_seq_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,10,SEQ_THROUGHPUT,0.3767,H100_PCIe 
+"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_token_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,10,TOKEN_THROUGHPUT,385.7372,H100_PCIe diff --git a/tests/integration/test_lists/test-db/l0_perf.yml b/tests/integration/test_lists/test-db/l0_perf.yml index 4b5d50fb4fa6..a915f9a90803 100644 --- a/tests/integration/test_lists/test-db/l0_perf.yml +++ b/tests/integration/test_lists/test-db/l0_perf.yml @@ -42,5 +42,4 @@ l0_perf: stage: pre_merge backend: pytorch tests: - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-_autodeploy-float16-input_output_len:128,128-reqs:8192] - perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-input_output_len:1024,1024-reqs:512] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1352c282ae77..eb177e30714a 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -393,3 +393,4 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (http accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892) accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839) +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918)