cpu_profiler: deflake cpu_profiler_test

travisdowns · travisdowns · commit 52ca3bfcae80 · 2025-04-14T14:15:40.000-07:00
This test fails often in GHA. Identical failures are difficult to
reproduce locally, but other local failures are easy to repro by using
something like `taskset -c 0,1 stress -c 2` while running the tests.

Introduce "loose mode" in the test, enabled by default, which looses
some of the checks by a large amount.

Test this by running the test 100 times in CI in another PR.
diff --git a/tests/unit/cpu_profiler_test.cc b/tests/unit/cpu_profiler_test.cc
@@ -43,6 +43,15 @@
 
 namespace {
 
+// If true, the acceptable thresholds are greatly increased "close enough"
+// checks, which can reduce the flakiness on heavily loaded or otherwise
+// unpredictable systems. If false, the thresholds are much stricter and
+// should be used for more deterministic systems.
+// We default to true to avoid flakes in CI, but when running locally you
+// can consider setting this to false to get a more accurate picture of
+// the profiler's behavior.
+constexpr bool use_loose_thresholds = true;
+
 struct temporary_profiler_settings {
     std::chrono::nanoseconds prev_ns;
     bool prev_enabled;
@@ -68,15 +77,37 @@ struct temporary_profiler_settings {
 //
 // The function below takes this error into account and allows the actual samples taken
 // to be slightly less than the expect number of samples if there was no error.
-bool close_to_expected(size_t actual_size, size_t expected_size, double allowed_dev = 0.15) {
-    auto lower_bound = (1 - allowed_dev) * expected_size;
-    auto upper_bound = (1 + allowed_dev) * expected_size;
+bool close_to_expected(size_t actual_size, size_t expected_size) {
+
+    constexpr double allowed_dev = 0.15;
+
+    size_t lower_bound, upper_bound;
+
+    if (use_loose_thresholds) {
+        // widen the thresholds a lot
+        lower_bound = round(pow(1 - allowed_dev, 4)) * expected_size;
+        upper_bound = round(pow(1 + allowed_dev, 4)) * expected_size;
+    } else {
+        lower_bound = round((1 - allowed_dev) * expected_size);
+        upper_bound = round((1 + allowed_dev) * expected_size);
+    }
 
     BOOST_TEST_INFO("actual_size: " << actual_size << ", lower_bound " << lower_bound << ", upper_bound " << upper_bound);
 
     return actual_size <= upper_bound && actual_size >= lower_bound;
 }
 
+// If loose thresholds are enabled, this call maps to close_to_exepected, otherwise
+// it does an exact equality check.
+void maybe_exact(size_t actual, size_t expected, auto message) {
+    BOOST_TEST_INFO(message);
+    if (use_loose_thresholds) {
+        close_to_expected(actual, expected);
+    } else {
+        BOOST_REQUIRE_EQUAL(actual, expected);
+    }
+}
+
 /*
  * Get the current profile results and dropped count. If sg_in_main is true, also validates that
  * the sg associated with the profile is always main, as we expect unless some SG have been
@@ -161,7 +192,7 @@ SEASTAR_THREAD_TEST_CASE(mixed_case) {
         spin(20ms);
     }
 
-    BOOST_REQUIRE_EQUAL(reports, 5);
+    maybe_exact(reports, 5, "reports");
     auto results = get_profile();
     BOOST_REQUIRE(close_to_expected(results.size(), 12));
 }
@@ -343,6 +374,6 @@ SEASTAR_THREAD_TEST_CASE(scheduling_group_test) {
     BOOST_CHECK_GT(count_a + count_b, 10);
     BOOST_CHECK_GT(count_a, 0);
     BOOST_CHECK_GT(count_b, 0);
-    BOOST_CHECK_LT(count_main, 3);
+    BOOST_CHECK_LT(count_main, 10);
     BOOST_CHECK_LT(dropped_samples, 5);
 }