1616#include < boost/lexical_cast.hpp>
1717#include < boost/uuid/uuid_generators.hpp>
1818#include < boost/uuid/uuid_io.hpp>
19+ #include < cmath>
1920
2021#include " velox/common/base/Exceptions.h"
2122#include " velox/common/base/Fs.h"
@@ -1587,13 +1588,94 @@ bool Expr::applyFunctionWithPeeling(
15871588 return true ;
15881589}
15891590
1591+ std::unique_ptr<CpuWallTimer> Expr::cpuWallTimer (const EvalCtx& context) {
1592+ // 1. Compile-time tracking (set via trackCpuUsage_) always wins.
1593+ if (trackCpuUsage_) {
1594+ return std::make_unique<CpuWallTimer>(stats_.timing );
1595+ }
1596+
1597+ // 2. Adaptive per-function sampling.
1598+ if (context.adaptiveCpuSamplingEnabled ()) {
1599+ switch (adaptiveState_) {
1600+ case AdaptiveCpuSamplingState::kWarmup :
1601+ // Warmup batch: just run the function, no timing.
1602+ return nullptr ;
1603+ case AdaptiveCpuSamplingState::kCalibrating : {
1604+ // Measure function execution time (without CpuWallTimer).
1605+ // Timer overhead is measured once per ExprSet and shared via EvalCtx.
1606+ calibrationStopWatch_.emplace ();
1607+ return nullptr ;
1608+ }
1609+ case AdaptiveCpuSamplingState::kAlwaysTrack :
1610+ return std::make_unique<CpuWallTimer>(stats_.timing );
1611+ case AdaptiveCpuSamplingState::kSampling :
1612+ if (++adaptiveSamplingCounter_ % adaptiveSamplingRate_ == 0 ) {
1613+ return std::make_unique<CpuWallTimer>(stats_.timing );
1614+ }
1615+ return nullptr ;
1616+ }
1617+ }
1618+
1619+ return nullptr ;
1620+ }
1621+
1622+ void Expr::finalizeAdaptiveCalibration (
1623+ double maxOverheadPct,
1624+ uint64_t timerOverheadNanos) {
1625+ switch (adaptiveState_) {
1626+ case AdaptiveCpuSamplingState::kWarmup : {
1627+ adaptiveState_ = AdaptiveCpuSamplingState::kCalibrating ;
1628+ break ;
1629+ }
1630+ case AdaptiveCpuSamplingState::kCalibrating : {
1631+ calibrationFunctionWallNanos_ +=
1632+ calibrationStopWatch_->elapsed ().wallNanos ;
1633+ calibrationStopWatch_.reset ();
1634+
1635+ if (++calibrationBatchCount_ < kCalibrationBatches ) {
1636+ break ;
1637+ }
1638+
1639+ // Use the shared timer overhead measurement, scaled by calibration
1640+ // batch count. The overhead per invocation is a platform constant
1641+ // measured once per ExprSet.
1642+ auto totalTimerOverhead = timerOverheadNanos * calibrationBatchCount_;
1643+
1644+ if (calibrationFunctionWallNanos_ > 0 && maxOverheadPct > 0 ) {
1645+ double overheadPct = 100.0 * static_cast <double >(totalTimerOverhead) /
1646+ static_cast <double >(calibrationFunctionWallNanos_);
1647+
1648+ if (overheadPct > maxOverheadPct) {
1649+ adaptiveSamplingRate_ =
1650+ static_cast <uint32_t >(std::ceil (overheadPct / maxOverheadPct));
1651+ // Start counter at rate-1 so the first post-calibration batch is
1652+ // always timed (++counter hits rate, which passes % rate == 0).
1653+ adaptiveSamplingCounter_ = adaptiveSamplingRate_ - 1 ;
1654+ adaptiveState_ = AdaptiveCpuSamplingState::kSampling ;
1655+ } else {
1656+ adaptiveState_ = AdaptiveCpuSamplingState::kAlwaysTrack ;
1657+ }
1658+ } else {
1659+ // Function ~0ns — timer dominates. Aggressive sampling.
1660+ adaptiveSamplingRate_ = 100 ;
1661+ adaptiveSamplingCounter_ = adaptiveSamplingRate_ - 1 ;
1662+ adaptiveState_ = AdaptiveCpuSamplingState::kSampling ;
1663+ }
1664+ break ;
1665+ }
1666+ default :
1667+ VELOX_UNREACHABLE (
1668+ " Unexpected adaptive sampling state in finalizeAdaptiveCalibration" );
1669+ }
1670+ }
1671+
15901672void Expr::applyFunction (
15911673 const SelectivityVector& rows,
15921674 EvalCtx& context,
15931675 VectorPtr& result) {
15941676 stats_.numProcessedVectors += 1 ;
15951677 stats_.numProcessedRows += rows.countSelected ();
1596- auto timer = cpuWallTimer ();
1678+ auto timer = cpuWallTimer (context );
15971679
15981680 computeIsAsciiForInputs (vectorFunction_.get (), inputValues_, rows);
15991681 auto isAscii = type ()->isVarchar ()
@@ -1633,6 +1715,14 @@ void Expr::applyFunction(
16331715 result->asUnchecked <SimpleVector<StringView>>()->setIsAscii (
16341716 isAscii.value (), rows);
16351717 }
1718+
1719+ // Only do Adaptive Calibration if the adaptive sampling is on and we are in
1720+ // warmup or calibrating state.
1721+ if (context.adaptiveCpuSamplingEnabled () && isCalibrating ()) {
1722+ finalizeAdaptiveCalibration (
1723+ context.adaptiveCpuSamplingMaxOverheadPct (),
1724+ context.timerOverheadNanos ());
1725+ }
16361726}
16371727
16381728void Expr::evalSpecialFormWithStats (
@@ -1641,9 +1731,17 @@ void Expr::evalSpecialFormWithStats(
16411731 VectorPtr& result) {
16421732 stats_.numProcessedVectors += 1 ;
16431733 stats_.numProcessedRows += rows.countSelected ();
1644- auto timer = cpuWallTimer ();
1734+ auto timer = cpuWallTimer (context );
16451735
16461736 evalSpecialForm (rows, context, result);
1737+
1738+ // Only do Adaptive Calibration if the adaptive sampling is on and we are in
1739+ // warmup or calibrating state.
1740+ if (context.adaptiveCpuSamplingEnabled () && isCalibrating ()) {
1741+ finalizeAdaptiveCalibration (
1742+ context.adaptiveCpuSamplingMaxOverheadPct (),
1743+ context.timerOverheadNanos ());
1744+ }
16471745}
16481746
16491747namespace {
@@ -1873,7 +1971,14 @@ ExprSet::ExprSet(
18731971 core::ExecCtx* execCtx,
18741972 bool enableConstantFolding,
18751973 bool lazyDereference)
1876- : execCtx_(execCtx), lazyDereference_(lazyDereference) {
1974+ : execCtx_(execCtx),
1975+ lazyDereference_ (lazyDereference),
1976+ adaptiveCpuSampling_(
1977+ execCtx->queryCtx ()->queryConfig().exprAdaptiveCpuSampling()),
1978+ adaptiveCpuSamplingMaxOverheadPct_(
1979+ execCtx->queryCtx ()
1980+ ->queryConfig()
1981+ .exprAdaptiveCpuSamplingMaxOverheadPct()) {
18771982 exprs_ = compileExpressions (sources, execCtx, this , enableConstantFolding);
18781983 if (lazyDereference_) {
18791984 validateLazyDereference (exprs_);
@@ -1886,6 +1991,24 @@ ExprSet::ExprSet(
18861991}
18871992
18881993namespace {
1994+
1995+ // / If the expression is in adaptive sampling mode, extrapolate timing stats
1996+ // / to approximate full-population values. Otherwise, return raw stats.
1997+ exec::ExprStats adjustStats (const exec::Expr& expr) {
1998+ if (expr.isAdaptiveSampling () && expr.stats ().timing .count > 0 ) {
1999+ exec::ExprStats adjusted = expr.stats ();
2000+ double ratio = static_cast <double >(adjusted.numProcessedVectors ) /
2001+ static_cast <double >(adjusted.timing .count );
2002+ adjusted.timing .cpuNanos = static_cast <uint64_t >(
2003+ static_cast <double >(adjusted.timing .cpuNanos ) * ratio);
2004+ adjusted.timing .wallNanos = static_cast <uint64_t >(
2005+ static_cast <double >(adjusted.timing .wallNanos ) * ratio);
2006+ adjusted.timing .count = adjusted.numProcessedVectors ;
2007+ return adjusted;
2008+ }
2009+ return expr.stats ();
2010+ }
2011+
18892012void addStats (
18902013 const exec::Expr& expr,
18912014 std::unordered_map<std::string, exec::ExprStats>& stats,
@@ -1904,7 +2027,7 @@ void addStats(
19042027 bool emptyStats =
19052028 !expr.stats ().numProcessedRows && !expr.stats ().defaultNullRowsSkipped ;
19062029 if (!emptyStats && !excludeSplFormExpr) {
1907- stats[expr.name ()].add (expr. stats ( ));
2030+ stats[expr.name ()].add (adjustStats (expr ));
19082031 }
19092032
19102033 for (const auto & input : expr.inputs ()) {
@@ -2016,6 +2139,24 @@ void printInputAndExprs(
20162139}
20172140} // namespace
20182141
2142+ void ExprSet::initializeAdaptiveCpuSampling (EvalCtx& context) {
2143+ context.setAdaptiveCpuSamplingEnabled (true );
2144+ context.setAdaptiveCpuSamplingMaxOverheadPct (
2145+ adaptiveCpuSamplingMaxOverheadPct_);
2146+
2147+ // Measure CpuWallTimer overhead once per ExprSet (platform constant).
2148+ if (!timerOverheadMeasured_) {
2149+ CpuWallTiming dummyTiming;
2150+ DeltaCpuWallTimeStopWatch overheadWatch;
2151+ {
2152+ auto dummy = std::make_unique<CpuWallTimer>(dummyTiming);
2153+ }
2154+ timerOverheadNanos_ = overheadWatch.elapsed ().wallNanos ;
2155+ timerOverheadMeasured_ = true ;
2156+ }
2157+ context.setTimerOverheadNanos (timerOverheadNanos_);
2158+ }
2159+
20192160void ExprSet::eval (
20202161 int32_t begin,
20212162 int32_t end,
@@ -2029,6 +2170,11 @@ void ExprSet::eval(
20292170 clearSharedSubexprs ();
20302171 }
20312172
2173+ // Apply adaptive per-function CPU sampling if configured.
2174+ if (adaptiveCpuSampling_) {
2175+ initializeAdaptiveCpuSampling (context);
2176+ }
2177+
20322178 if (!lazyDereference_) {
20332179 // Make sure LazyVectors, referenced by multiple expressions, are loaded for
20342180 // all the "rows".
0 commit comments