From 7f2aede81b4fa55334a7efdeda3ac5ff58358ea5 Mon Sep 17 00:00:00 2001 From: Travis Downs Date: Wed, 2 Apr 2025 15:14:12 -0300 Subject: [PATCH] stall_detector: no backtrace if exception If a an exception is in progress when a reactor stall is detected, omit taking the backtrace, as this can crash as detailed in #2697. Add a test which reproduces the issue, and which crashes (sometimes) before this fix and which runs cleanly afterwards. Fixes #2697. --- src/core/reactor.cc | 6 +++- tests/unit/stall_detector_test.cc | 52 +++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/src/core/reactor.cc b/src/core/reactor.cc index d68f86175bb..197debb7dbf 100644 --- a/src/core/reactor.cc +++ b/src/core/reactor.cc @@ -1450,7 +1450,11 @@ void cpu_stall_detector::generate_trace() { buf.append("Reactor stalled for "); buf.append_decimal(uint64_t(delta / 1ms)); buf.append(" ms"); - print_with_backtrace(buf, _config.oneline); + if (std::uncaught_exceptions() > 0) { + buf.append(", backtrace omitted (uncaught exception in progress)\n"); + } else { + print_with_backtrace(buf, _config.oneline); + } maybe_report_kernel_trace(buf); } diff --git a/tests/unit/stall_detector_test.cc b/tests/unit/stall_detector_test.cc index 7392cf465e1..f28a41ca3cb 100644 --- a/tests/unit/stall_detector_test.cc +++ b/tests/unit/stall_detector_test.cc @@ -177,6 +177,58 @@ SEASTAR_THREAD_TEST_CASE(spin_in_kernel) { test_spin_with_body("kernel", [] { mmap_populate(128 * 1024); }); } +// This test reproduces the issue described in https://github.com/scylladb/seastar/issues/2697 +// The issue is reproduced most quickly using the following arguments: +// --blocked-reactor-notify-ms=1 +// but it will happen with default arguments as well. +SEASTAR_THREAD_TEST_CASE(stall_detector_crash) { + // increase total_iters to increase chance of failure + // the value below is tuned to take about 1 second in + // release builds + constexpr auto total_iters = 100000; + constexpr int max_depth = 20; + + auto now = [] { return std::chrono::high_resolution_clock::now(); }; + + auto recursive_thrower = [](auto self, int x) -> void { + if (x <= 0) { + throw std::runtime_error("foo"); + } else { + try { + self(self, x - 1); + } catch (...) { + if (x & 0xF) { + throw; + } + } + } + }; + + auto next_yield = now(); + for (int a = 1; a < total_iters; a++) { + if (now() > next_yield) { + // we need to periodically yield or else the stall reports will become + // less and less frequent as exponentially grow the report interval while + // the same task is running + thread::yield(); + next_yield = now() + 40ms; + // the next line resets the suppression state which allows many more reports + // per second, increasing the chance of a failure + reactor::test::set_stall_detector_report_function({}); + } + + try { + recursive_thrower(recursive_thrower, a % max_depth); + } catch (...) { + } + + if (a % 100000 == 0) { + fmt::print("Making progress: {:6.3f}%\n", 100. * a / total_iters); + } + } +} + + #else