Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CODING_GUIDELINES.md
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ char* const errStr = getErrorStr(status); // const pointer to mutable char
Code should adhere to [PEP 8](https://peps.python.org/pep-0008/#fn-hi), unless otherwise noted.

#### Python Standard
1. The code developed for TensorRT-LLM should conform to Python 3.8+.
1. The code developed for TensorRT-LLM should conform to Python 3.10+.

#### Formatting

Expand Down
8 changes: 1 addition & 7 deletions benchmarks/cpp/disaggServerBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ class DisaggExecutorServer
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse,
benchmarkParams.maxTokensInPagedKvCache, benchmarkParams.maxAttentionWindowVec,
benchmarkParams.sinkTokenLength, benchmarkParams.freeGpuMemoryFractions.at(in),
benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
benchmarkParams.kvHostCacheSize);
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode,
benchmarkParams.cudaGraphCacheSize);
Expand Down Expand Up @@ -1213,8 +1213,6 @@ int main(int argc, char* argv[])
options.add_options()("kv_host_cache_bytes",
"Size of secondary memory pool used for offloading kv cache blocks (in bytes).",
cxxopts::value<size_t>()->default_value("0"));
options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse",
cxxopts::value<bool>()->default_value("true"));
options.add_options()(
"max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType32>());
options.add_options()("gpu_weights_percent",
Expand Down Expand Up @@ -1482,10 +1480,6 @@ int main(int argc, char* argv[])
TLLM_CHECK_WITH_INFO(
benchmarkParams.kvHostCacheSize == false, "Currently disaggServer don't support kv_host_cache!");

// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as<bool>();
TLLM_CHECK_WITH_INFO(
benchmarkParams.kvOnboardBlocks == true, "Currently disaggServer don't support kv_onboard_blocks =false!");
// Argument: Medusa choices for the Medusa speculative decoding.
if (result.count("medusa_choices"))
{
Expand Down
7 changes: 1 addition & 6 deletions benchmarks/cpp/gptManagerBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ class ExecutorServer

texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize,
benchmarkParams.crossKvCacheFraction);
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
std::nullopt, benchmarkParams.loraHostCacheSize);
Expand Down Expand Up @@ -1133,8 +1133,6 @@ int main(int argc, char* argv[])
options.add_options()("kv_host_cache_bytes",
"Size of secondary memory pool used for offloading kv cache blocks (in bytes).",
cxxopts::value<size_t>()->default_value("0"));
options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse",
cxxopts::value<bool>()->default_value("true"));
options.add_options()(
"max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType32>());

Expand Down Expand Up @@ -1355,9 +1353,6 @@ int main(int argc, char* argv[])
// Argument: How many KV cache blocks (as fraction of number of GPU kv cache blocks).
benchmarkParams.kvHostCacheSize = result["kv_host_cache_bytes"].as<size_t>();

// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as<bool>();

// Argument: Medusa choices for the Medusa speculative decoding.
if (result.count("medusa_choices"))
{
Expand Down
1 change: 0 additions & 1 deletion benchmarks/cpp/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ struct BenchmarkParams

// KV cache block offloading
size_t kvHostCacheSize{0};
bool kvOnboardBlocks{true};

// Weights offloading
float gpuWeightsPercent{1.0};
Expand Down
16 changes: 7 additions & 9 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,7 @@ class WindowBlockManager
std::vector<SizeType32> const& managedLayers, std::vector<SizeType32> const& numKvHeadsPerLayer,
SizeType32 sizePerHead, SizeType32 tokensPerBlock, bool isSWA, SizeType32 blocksInPrimaryPool,
SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
radix_block_tree::UnifiedBlockTree& lookupTree, std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr,
Expand Down Expand Up @@ -1132,8 +1132,6 @@ class WindowBlockManager
// getPoolLayerIdx
std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool;

// Whether offloaded blocks should be onboarded before reuse.
bool mOnboardBlocks;
// Buffer manager
runtime::BufferManager mBufferManager;

Expand Down Expand Up @@ -1241,7 +1239,7 @@ class BlockManager
CudaStreamPtr stream, SizeType32 maxSequenceLength, SizeType32 maxBeamWidth,
std::vector<SizeType32> const& maxAttentionWindowVec,
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
SizeType32 sinkBubbleLength, CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnPartialReuse = true,
Expand Down Expand Up @@ -1985,7 +1983,7 @@ class KVCacheManager : public BaseKVCacheManager
std::vector<SizeType32> const& maxAttentionWindowVec,
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnpartialReuse = true,
Expand All @@ -1999,7 +1997,7 @@ class KVCacheManager : public BaseKVCacheManager
std::vector<SizeType32> const& maxAttentionWindowVec,
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnpartialReuse = true,
Expand All @@ -2013,7 +2011,7 @@ class KVCacheManager : public BaseKVCacheManager
std::vector<SizeType32> const& maxAttentionWindowVec,
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = true,
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnpartialReuse = true,
Expand All @@ -2027,8 +2025,8 @@ class KVCacheManager : public BaseKVCacheManager
std::vector<SizeType32> const& maxAttentionWindowVec,
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true,
bool copyOnpartialReuse = true, bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true, bool copyOnpartialReuse = true,
bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
SizeType32 indexerKCacheIndexHeadDim = 0,
std::optional<LinearAttentionMetadata> linearAttentionMetadata = std::nullopt);

Expand Down
7 changes: 1 addition & 6 deletions cpp/include/tensorrt_llm/executor/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -1039,7 +1039,7 @@ class KvCacheConfig
std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
std::optional<size_t> const& hostCacheSize = std::nullopt,
std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
Expand All @@ -1056,7 +1056,6 @@ class KvCacheConfig
[[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
[[nodiscard]] std::optional<FloatType> getCrossKvCacheFraction() const;
[[nodiscard]] std::optional<size_t> getHostCacheSize() const;
[[nodiscard]] bool getOnboardBlocks() const;
[[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
[[nodiscard]] size_t getEventBufferMaxSize() const;
[[nodiscard]] bool getUseUvm() const;
Expand All @@ -1072,7 +1071,6 @@ class KvCacheConfig
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
void setHostCacheSize(size_t hostCacheSize);
void setOnboardBlocks(bool onboardBlocks);
void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
void setEventBufferMaxSize(size_t eventBufferMaxSize);
void setUseUvm(bool useUvm);
Expand Down Expand Up @@ -1116,9 +1114,6 @@ class KvCacheConfig
/// Having a secondary memory pool increases KV cache block reuse potential.
std::optional<size_t> mHostCacheSize;

/// @brief Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
bool mOnboardBlocks;

/// @brief Only blocks with priority > mSecondaryOfflineMinPriority can be offloaded to secondary memory.
std::optional<RetentionPriority> mSecondaryOffloadMinPriority;

Expand Down
Loading
Loading