Skip to content

Commit 367f6e2

Browse files
Rajeev975 authored and meta-codesync[bot] committed
feat(expr-eval): Adaptive per-function CPU sampling for Velox expression evaluation (facebookincubator#17002)
Summary: Pull Request resolved: facebookincubator#17002 This is similar to facebookincubator#16646 Previous landing was reverted because the test asserted that slow_add must be in kAlwaysTrack state and plus must be in kSampling state. Replaced absolute state assertions with a relative comparison: slow_add sampling rate must be ≤ plus sampling rate. This is robust because both functions share the same timerOverheadNanos_ per ExprSet, so measurement noise affects both equally and cannot flip the relative ordering. Differential Revision: D99126870
1 parent 77087c5 commit 367f6e2

File tree

7 files changed

+968
-9
lines changed

7 files changed

+968
-9
lines changed

velox/core/QueryConfig.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,25 @@ class QueryConfig {
8787
static constexpr const char* kExprTrackCpuUsageForFunctions =
8888
"expression.track_cpu_usage_for_functions";
8989

90+
/// Enables adaptive per-function CPU usage sampling. When enabled, each
91+
/// function is calibrated over the first 6 batches (1 warmup + 5
92+
/// calibration) to measure the overhead of CPU tracking (clock_gettime).
93+
/// Functions where tracking overhead exceeds
94+
/// kExprAdaptiveCpuSamplingMaxOverheadPct are automatically sampled at a
95+
/// rate proportional to their overhead. Functions with low overhead are
96+
/// always tracked. Disabled by default.
97+
static constexpr const char* kExprAdaptiveCpuSampling =
98+
"expression.adaptive_cpu_sampling";
99+
100+
/// Maximum acceptable overhead percentage for CPU tracking per function.
101+
/// Used with kExprAdaptiveCpuSampling. Functions whose CPU tracking overhead
102+
/// exceeds this threshold are sampled at a rate of
103+
/// ceil(overhead_pct / max_overhead_pct). For example, with max_overhead=1.0,
104+
/// a function with 70% tracking overhead is sampled every 70th batch.
105+
/// Default: 1.0 (1% overhead target).
106+
static constexpr const char* kExprAdaptiveCpuSamplingMaxOverheadPct =
107+
"expression.adaptive_cpu_sampling_max_overhead_pct";
108+
90109
/// Controls whether non-deterministic expressions are deduplicated during
91110
/// compilation. This is intended for testing and debugging purposes. By
92111
/// default, this is set to true to preserve standard behavior. If set to
@@ -1387,6 +1406,14 @@ class QueryConfig {
13871406
return get<std::string>(kExprTrackCpuUsageForFunctions, "");
13881407
}
13891408

1409+
bool exprAdaptiveCpuSampling() const {
1410+
return get<bool>(kExprAdaptiveCpuSampling, false);
1411+
}
1412+
1413+
double exprAdaptiveCpuSamplingMaxOverheadPct() const {
1414+
return get<double>(kExprAdaptiveCpuSamplingMaxOverheadPct, 1.0);
1415+
}
1416+
13901417
bool exprDedupNonDeterministic() const {
13911418
return get<bool>(kExprDedupNonDeterministic, true);
13921419
}

velox/docs/configs.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,22 @@ Expression Evaluation Configuration
286286
``expression.track_cpu_usage`` is set to false. Function names are case-insensitive and will be normalized
287287
to lowercase. This allows fine-grained control over CPU tracking overhead when only specific functions need to
288288
be monitored.
289+
* - expression.adaptive_cpu_sampling
290+
- boolean
291+
- false
292+
- Enables adaptive per-function CPU usage sampling. Each function is calibrated over 6 batches (1 warmup + 5
293+
calibration) to measure the overhead of CPU tracking (clock_gettime) relative to the function's execution time.
294+
The timer overhead is measured once per ExprSet and shared across all functions. Functions where tracking overhead
295+
is acceptable are always tracked; functions where overhead exceeds ``expression.adaptive_cpu_sampling_max_overhead_pct``
296+
are sampled at a rate proportional to their overhead. Sampled timing stats are extrapolated to approximate
297+
full-population values.
298+
* - expression.adaptive_cpu_sampling_max_overhead_pct
299+
- float
300+
- 1.0
301+
- Maximum acceptable CPU tracking overhead percentage per function, used with ``expression.adaptive_cpu_sampling``.
302+
Functions whose tracking overhead exceeds this threshold are sampled at a rate of
303+
ceil(overhead_pct / max_overhead_pct). For example, with max_overhead=1.0, a function with 70% tracking overhead
304+
is sampled every 70th batch, bounding its effective overhead to ~1%. Must be greater than 0.
289305
* - legacy_cast
290306
- bool
291307
- false

velox/expression/EvalCtx.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,34 @@ class EvalCtx {
539539
return execCtx_->optimizationParams().dictionaryMemoizationEnabled;
540540
}
541541

542+
/// Returns true if adaptive per-function CPU sampling is enabled.
543+
bool adaptiveCpuSamplingEnabled() const {
544+
return adaptiveCpuSamplingEnabled_;
545+
}
546+
547+
void setAdaptiveCpuSamplingEnabled(bool enabled) {
548+
adaptiveCpuSamplingEnabled_ = enabled;
549+
}
550+
551+
/// Returns the maximum acceptable overhead pct for adaptive sampling.
552+
double adaptiveCpuSamplingMaxOverheadPct() const {
553+
return adaptiveCpuSamplingMaxOverheadPct_;
554+
}
555+
556+
void setAdaptiveCpuSamplingMaxOverheadPct(double pct) {
557+
adaptiveCpuSamplingMaxOverheadPct_ = pct;
558+
}
559+
560+
/// Returns the measured CpuWallTimer overhead in nanoseconds (per
561+
/// invocation). Measured once per ExprSet and shared across all Expr nodes.
562+
uint64_t timerOverheadNanos() const {
563+
return timerOverheadNanos_;
564+
}
565+
566+
void setTimerOverheadNanos(uint64_t nanos) {
567+
timerOverheadNanos_ = nanos;
568+
}
569+
542570
/// Returns the maximum number of distinct inputs to cache results for in a
543571
/// given shared subexpression.
544572
uint32_t maxSharedSubexprResultsCached() const {
@@ -610,6 +638,15 @@ class EvalCtx {
610638
// If 'captureErrorDetails()' is false, stores flags indicating which rows had
611639
// errors without storing actual exceptions.
612640
EvalErrorsPtr errors_;
641+
642+
// Whether adaptive per-function CPU sampling is enabled.
643+
bool adaptiveCpuSamplingEnabled_{false};
644+
645+
// Maximum acceptable overhead percentage for adaptive CPU sampling.
646+
double adaptiveCpuSamplingMaxOverheadPct_{1.0};
647+
648+
// Measured CpuWallTimer overhead (nanos per invocation), shared across Exprs.
649+
uint64_t timerOverheadNanos_{0};
613650
};
614651

615652
/// Utility wrapper struct that is used to temporarily reset the value of the

velox/expression/Expr.cpp

Lines changed: 150 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <boost/lexical_cast.hpp>
1717
#include <boost/uuid/uuid_generators.hpp>
1818
#include <boost/uuid/uuid_io.hpp>
19+
#include <cmath>
1920

2021
#include "velox/common/base/Exceptions.h"
2122
#include "velox/common/base/Fs.h"
@@ -1587,13 +1588,94 @@ bool Expr::applyFunctionWithPeeling(
15871588
return true;
15881589
}
15891590

1591+
std::unique_ptr<CpuWallTimer> Expr::cpuWallTimer(const EvalCtx& context) {
1592+
// 1. Compile-time tracking (set via trackCpuUsage_) always wins.
1593+
if (trackCpuUsage_) {
1594+
return std::make_unique<CpuWallTimer>(stats_.timing);
1595+
}
1596+
1597+
// 2. Adaptive per-function sampling.
1598+
if (context.adaptiveCpuSamplingEnabled()) {
1599+
switch (adaptiveState_) {
1600+
case AdaptiveCpuSamplingState::kWarmup:
1601+
// Warmup batch: just run the function, no timing.
1602+
return nullptr;
1603+
case AdaptiveCpuSamplingState::kCalibrating: {
1604+
// Measure function execution time (without CpuWallTimer).
1605+
// Timer overhead is measured once per ExprSet and shared via EvalCtx.
1606+
calibrationStopWatch_.emplace();
1607+
return nullptr;
1608+
}
1609+
case AdaptiveCpuSamplingState::kAlwaysTrack:
1610+
return std::make_unique<CpuWallTimer>(stats_.timing);
1611+
case AdaptiveCpuSamplingState::kSampling:
1612+
if (++adaptiveSamplingCounter_ % adaptiveSamplingRate_ == 0) {
1613+
return std::make_unique<CpuWallTimer>(stats_.timing);
1614+
}
1615+
return nullptr;
1616+
}
1617+
}
1618+
1619+
return nullptr;
1620+
}
1621+
1622+
void Expr::finalizeAdaptiveCalibration(
1623+
double maxOverheadPct,
1624+
uint64_t timerOverheadNanos) {
1625+
switch (adaptiveState_) {
1626+
case AdaptiveCpuSamplingState::kWarmup: {
1627+
adaptiveState_ = AdaptiveCpuSamplingState::kCalibrating;
1628+
break;
1629+
}
1630+
case AdaptiveCpuSamplingState::kCalibrating: {
1631+
calibrationFunctionWallNanos_ +=
1632+
calibrationStopWatch_->elapsed().wallNanos;
1633+
calibrationStopWatch_.reset();
1634+
1635+
if (++calibrationBatchCount_ < kCalibrationBatches) {
1636+
break;
1637+
}
1638+
1639+
// Use the shared timer overhead measurement, scaled by calibration
1640+
// batch count. The overhead per invocation is a platform constant
1641+
// measured once per ExprSet.
1642+
auto totalTimerOverhead = timerOverheadNanos * calibrationBatchCount_;
1643+
1644+
if (calibrationFunctionWallNanos_ > 0 && maxOverheadPct > 0) {
1645+
double overheadPct = 100.0 * static_cast<double>(totalTimerOverhead) /
1646+
static_cast<double>(calibrationFunctionWallNanos_);
1647+
1648+
if (overheadPct > maxOverheadPct) {
1649+
adaptiveSamplingRate_ =
1650+
static_cast<uint32_t>(std::ceil(overheadPct / maxOverheadPct));
1651+
// Start counter at rate-1 so the first post-calibration batch is
1652+
// always timed (++counter hits rate, which passes % rate == 0).
1653+
adaptiveSamplingCounter_ = adaptiveSamplingRate_ - 1;
1654+
adaptiveState_ = AdaptiveCpuSamplingState::kSampling;
1655+
} else {
1656+
adaptiveState_ = AdaptiveCpuSamplingState::kAlwaysTrack;
1657+
}
1658+
} else {
1659+
// Function ~0ns — timer dominates. Aggressive sampling.
1660+
adaptiveSamplingRate_ = 100;
1661+
adaptiveSamplingCounter_ = adaptiveSamplingRate_ - 1;
1662+
adaptiveState_ = AdaptiveCpuSamplingState::kSampling;
1663+
}
1664+
break;
1665+
}
1666+
default:
1667+
VELOX_UNREACHABLE(
1668+
"Unexpected adaptive sampling state in finalizeAdaptiveCalibration");
1669+
}
1670+
}
1671+
15901672
void Expr::applyFunction(
15911673
const SelectivityVector& rows,
15921674
EvalCtx& context,
15931675
VectorPtr& result) {
15941676
stats_.numProcessedVectors += 1;
15951677
stats_.numProcessedRows += rows.countSelected();
1596-
auto timer = cpuWallTimer();
1678+
auto timer = cpuWallTimer(context);
15971679

15981680
computeIsAsciiForInputs(vectorFunction_.get(), inputValues_, rows);
15991681
auto isAscii = type()->isVarchar()
@@ -1633,6 +1715,14 @@ void Expr::applyFunction(
16331715
result->asUnchecked<SimpleVector<StringView>>()->setIsAscii(
16341716
isAscii.value(), rows);
16351717
}
1718+
1719+
// Only do Adaptive Calibration if the adaptive sampling is on and we are in
1720+
// warmup or calibrating state.
1721+
if (context.adaptiveCpuSamplingEnabled() && isCalibrating()) {
1722+
finalizeAdaptiveCalibration(
1723+
context.adaptiveCpuSamplingMaxOverheadPct(),
1724+
context.timerOverheadNanos());
1725+
}
16361726
}
16371727

16381728
void Expr::evalSpecialFormWithStats(
@@ -1641,9 +1731,17 @@ void Expr::evalSpecialFormWithStats(
16411731
VectorPtr& result) {
16421732
stats_.numProcessedVectors += 1;
16431733
stats_.numProcessedRows += rows.countSelected();
1644-
auto timer = cpuWallTimer();
1734+
auto timer = cpuWallTimer(context);
16451735

16461736
evalSpecialForm(rows, context, result);
1737+
1738+
// Only do Adaptive Calibration if the adaptive sampling is on and we are in
1739+
// warmup or calibrating state.
1740+
if (context.adaptiveCpuSamplingEnabled() && isCalibrating()) {
1741+
finalizeAdaptiveCalibration(
1742+
context.adaptiveCpuSamplingMaxOverheadPct(),
1743+
context.timerOverheadNanos());
1744+
}
16471745
}
16481746

16491747
namespace {
@@ -1873,7 +1971,14 @@ ExprSet::ExprSet(
18731971
core::ExecCtx* execCtx,
18741972
bool enableConstantFolding,
18751973
bool lazyDereference)
1876-
: execCtx_(execCtx), lazyDereference_(lazyDereference) {
1974+
: execCtx_(execCtx),
1975+
lazyDereference_(lazyDereference),
1976+
adaptiveCpuSampling_(
1977+
execCtx->queryCtx()->queryConfig().exprAdaptiveCpuSampling()),
1978+
adaptiveCpuSamplingMaxOverheadPct_(
1979+
execCtx->queryCtx()
1980+
->queryConfig()
1981+
.exprAdaptiveCpuSamplingMaxOverheadPct()) {
18771982
exprs_ = compileExpressions(sources, execCtx, this, enableConstantFolding);
18781983
if (lazyDereference_) {
18791984
validateLazyDereference(exprs_);
@@ -1886,6 +1991,24 @@ ExprSet::ExprSet(
18861991
}
18871992

18881993
namespace {
1994+
1995+
/// If the expression is in adaptive sampling mode, extrapolate timing stats
1996+
/// to approximate full-population values. Otherwise, return raw stats.
1997+
exec::ExprStats adjustStats(const exec::Expr& expr) {
1998+
if (expr.isAdaptiveSampling() && expr.stats().timing.count > 0) {
1999+
exec::ExprStats adjusted = expr.stats();
2000+
double ratio = static_cast<double>(adjusted.numProcessedVectors) /
2001+
static_cast<double>(adjusted.timing.count);
2002+
adjusted.timing.cpuNanos = static_cast<uint64_t>(
2003+
static_cast<double>(adjusted.timing.cpuNanos) * ratio);
2004+
adjusted.timing.wallNanos = static_cast<uint64_t>(
2005+
static_cast<double>(adjusted.timing.wallNanos) * ratio);
2006+
adjusted.timing.count = adjusted.numProcessedVectors;
2007+
return adjusted;
2008+
}
2009+
return expr.stats();
2010+
}
2011+
18892012
void addStats(
18902013
const exec::Expr& expr,
18912014
std::unordered_map<std::string, exec::ExprStats>& stats,
@@ -1904,7 +2027,7 @@ void addStats(
19042027
bool emptyStats =
19052028
!expr.stats().numProcessedRows && !expr.stats().defaultNullRowsSkipped;
19062029
if (!emptyStats && !excludeSplFormExpr) {
1907-
stats[expr.name()].add(expr.stats());
2030+
stats[expr.name()].add(adjustStats(expr));
19082031
}
19092032

19102033
for (const auto& input : expr.inputs()) {
@@ -2016,6 +2139,24 @@ void printInputAndExprs(
20162139
}
20172140
} // namespace
20182141

2142+
void ExprSet::initializeAdaptiveCpuSampling(EvalCtx& context) {
2143+
context.setAdaptiveCpuSamplingEnabled(true);
2144+
context.setAdaptiveCpuSamplingMaxOverheadPct(
2145+
adaptiveCpuSamplingMaxOverheadPct_);
2146+
2147+
// Measure CpuWallTimer overhead once per ExprSet (platform constant).
2148+
if (!timerOverheadMeasured_) {
2149+
CpuWallTiming dummyTiming;
2150+
DeltaCpuWallTimeStopWatch overheadWatch;
2151+
{
2152+
auto dummy = std::make_unique<CpuWallTimer>(dummyTiming);
2153+
}
2154+
timerOverheadNanos_ = overheadWatch.elapsed().wallNanos;
2155+
timerOverheadMeasured_ = true;
2156+
}
2157+
context.setTimerOverheadNanos(timerOverheadNanos_);
2158+
}
2159+
20192160
void ExprSet::eval(
20202161
int32_t begin,
20212162
int32_t end,
@@ -2029,6 +2170,11 @@ void ExprSet::eval(
20292170
clearSharedSubexprs();
20302171
}
20312172

2173+
// Apply adaptive per-function CPU sampling if configured.
2174+
if (adaptiveCpuSampling_) {
2175+
initializeAdaptiveCpuSampling(context);
2176+
}
2177+
20322178
if (!lazyDereference_) {
20332179
// Make sure LazyVectors, referenced by multiple expressions, are loaded for
20342180
// all the "rows".

0 commit comments

Comments (0)