@@ -238,7 +238,9 @@ class PerformanceTest : public ::testing::Test
238238 );
239239
240240 BenchmarkResult result;
241- result.avg_ms = std::accumulate (times.begin (), times.end (), 0.0 ) / times.size ();
241+ // Use trimmed data for avg as well, to keep the same statistical basis
242+ // as std_dev/min/max/mid which are all computed on filtered_times.
243+ result.avg_ms = std::accumulate (filtered_times.begin (), filtered_times.end (), 0.0 ) / filtered_times.size ();
242244 result.min_ms = filtered_times.front ();
243245 result.max_ms = filtered_times.back ();
244246 result.iterations = run_count;
@@ -335,7 +337,9 @@ class PerformanceTest : public ::testing::Test
335337 << " , total: " << (result.total_time_ms / 1000.0 ) << " s)" << std::endl;
336338 } else {
337339 const auto & baseline = baseline_data[key];
338- double diff_percent = ((result.mid_ms - baseline.mid_ms ) / baseline.mid_ms ) * 100.0 ;
340+ // Use the trimmed mean (avg_ms) consistently for diff, direction and
341+ // the t-test below, so the whole decision path shares one statistic.
342+ double diff_percent = ((result.avg_ms - baseline.avg_ms ) / baseline.avg_ms ) * 100.0 ;
339343
340344 // Statistical comparison
341345 double t_stat, p_value;
@@ -348,23 +352,23 @@ class PerformanceTest : public ::testing::Test
348352 // No statistically significant difference
349353 std::cout << " [No Significant Change] " << test_name << " : "
350354 << std::setprecision (6 )
351- << " median : " << result.mid_ms << " ms "
352- << " (baseline: " << baseline.mid_ms << " ms, "
355+ << " avg : " << result.avg_ms << " ms "
356+ << " (baseline: " << baseline.avg_ms << " ms, "
353357 << " diff: " << diff_percent << " %, "
354358 << " p=" << p_value << " , not significant)" << std::endl;
355359 } else if (!exceeds_threshold) {
356360 // Statistically significant but below threshold - acceptable
357361 std::cout << " [Acceptable Change] " << test_name << " : "
358362 << std::setprecision (6 )
359- << " median : " << result.mid_ms << " ms "
360- << " (baseline: " << baseline.mid_ms << " ms, "
363+ << " avg : " << result.avg_ms << " ms "
364+ << " (baseline: " << baseline.avg_ms << " ms, "
361365 << " diff: " << diff_percent << " %, "
362366 << " below " << PERF_REGRESSION_THRESHOLD << " % threshold)" << std::endl;
363- } else if (result.mid_ms < baseline.mid_ms ) {
367+ } else if (result.avg_ms < baseline.avg_ms ) {
364368 // Statistically significant improvement above threshold
365369 std::cout << " [Performance Improve " << std::abs (diff_percent) << " %] " << test_name << " : "
366- << std::setprecision (6 ) << " median : " << result.mid_ms << " ms "
367- << " (baseline: " << baseline.mid_ms << " ms, "
370+ << std::setprecision (6 ) << " avg : " << result.avg_ms << " ms "
371+ << " (baseline: " << baseline.avg_ms << " ms, "
368372 << " t=" << t_stat << " , p=" << p_value << " )" << std::endl;
369373 } else {
370374 // Statistically significant regression above threshold - FAIL
0 commit comments