Reset pipeline cache usage statistics on each generate call (openvinotoolkit#1961)

vshampor · web-flow · commit f1eee93746f8 · 2025-03-26T12:32:18.000+04:00
Without this reset, the average and max cache usage statistics persist for
the entire lifetime of the pipeline, which is inflexible if we want to check
the per-`generate` dynamics of cache utilization.
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -42,12 +42,12 @@ struct PipelineMetrics {
     float cache_usage = 0.0;
 
     /**
-    * Max KV cache usage during the lifetime of the pipeline in %
+    * Max KV cache usage during the last .generate() call in %
     */
     float max_cache_usage = 0.0;
 
     /**
-    * Running average of the KV cache usage during the lifetime of the pipeline, with max window size of 1000 steps
+    * Running average of the KV cache usage during the last .generate() call, with max window size of 1000 internal model inferences
     */
     float avg_cache_usage = 0.0;
 
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
@@ -367,6 +367,7 @@ std::vector<EncodedGenerationResult>
 ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
                                                              const std::vector<GenerationConfig>& sampling_params,
                                                              const StreamerVariant& streamer) {
+    _reset_cache_usage_statistics();
     ManualTimer generate_timer("generate()");
     generate_timer.start();
 
@@ -511,6 +512,12 @@ float ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_current_running_a
     return std::accumulate(m_previous_step_cache_usages.begin(), m_previous_step_cache_usages.end(), 0.0) / m_previous_step_cache_usages.size();
 }
 
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::_reset_cache_usage_statistics() {  // invoked as the first statement of generate()
+    m_previous_step_cache_usages.clear();  // drop the recorded per-step usages that feed the running average
+    m_pipeline_metrics.max_cache_usage = 0.0;  // restart the max cache usage metric from zero
+    m_pipeline_metrics.avg_cache_usage = 0.0;  // restart the reported average cache usage metric from zero
+}
+
 void ContinuousBatchingPipeline::ContinuousBatchingImpl::drop_requests() {
     for (const std::shared_ptr<ov::genai::SequenceGroup> request : m_requests) {
         for (const auto& sequence: request->get_sequences()) {
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
@@ -92,6 +92,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     void _maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
 
     void _register_step_cache_usage(float step_cache_usage);
+    void _reset_cache_usage_statistics();
     float _get_current_running_average_cache_usage() const;
     void _compute_cache_rotation_data(const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output);