reactor: Disable hot polling if wakeup granularity is too high

StephanDollberg · StephanDollberg · commit 08f8e624366e · 2025-04-15T09:24:20.000+01:00
We are seeing an issue where random write performance degrades massively (100x). This is caused by the Linux dio kernel thread/workqueue being starved, and hence aio write completions aren't being served in a timely manner. This doesn't happen using "default" Linux settings but only if `/proc/sys/kernel/sched_wakeup_granularity_ns` (or `/sys/kernel/debug/sched/wakeup_granularity_ns` on newer kernels) is raised. Specifically, this effect can be observed on RHEL-8, as the `tuned` version that ships with it sets this value to 15000000, but it can be reproduced on any other system by just bumping that value. This patch tries to detect this being the case and, if so, it will warn and disable hot polling (both `--poll-aio` and `--idle-poll-time-us`), which gives back the majority of the performance. Note that because this setting has moved to debug fs on newer kernels, which requires root rights to read, it's actually not very likely that we will be able to detect it on these. However, RHEL8 uses an older kernel and is likely the major offender to run into this bug (we have had multiple customers run into this). Ref scylladb#2696 (cherry picked from commit 099cf61)
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
@@ -3834,6 +3834,30 @@ static bool kernel_supports_aio_fsync() {
     return internal::kernel_uname().whitelisted({"4.18"});
 }
 
+static std::tuple<std::filesystem::path, uint64_t> wakeup_granularity() {
+    // Returns {path, value} for the first knob that reads as non-zero, else {"", 0}.
+    auto try_read = [] (auto path) -> uint64_t {
+        try {
+            return read_first_line_as<uint64_t>(path);
+        } catch (...) {
+            return 0; // best effort: any failure to read the file counts as "not found"
+        }
+    };
+    auto legacy_path = "/proc/sys/kernel/sched_wakeup_granularity_ns";
+    if (auto val = try_read(legacy_path); val) {
+        return {legacy_path, val};
+    }
+    // This will in practice almost always fail because debug fs requires root
+    // perms to read so we are out of luck
+    auto debug_fs_path = "/sys/kernel/debug/sched/wakeup_granularity_ns";
+    // Probe the debugfs location itself; the legacy path was already tried above.
+    if (auto val = try_read(debug_fs_path); val) {
+        return {debug_fs_path, val};
+    }
+
+    return {"", 0};
+}
+
 static program_options::selection_value<network_stack_factory> create_network_stacks_option(reactor_options& zis) {
     using value_type = program_options::selection_value<network_stack_factory>;
     value_type::candidates candidates;
@@ -4535,6 +4559,22 @@ void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_
         .no_poll_aio = !reactor_opts.poll_aio.get_value() || (reactor_opts.poll_aio.defaulted() && reactor_opts.overprovisioned),
     };
 
+    // Disable hot polling if sched wakeup granularity is too high
+    // dio thread will be starved otherwise
+    // see https://github.com/scylladb/seastar/issues/2696
+    if (!reactor_cfg.no_poll_aio || reactor_cfg.max_poll_time != 0us) {
+        auto [wakeup_file, granularity] = wakeup_granularity();
+        // 15M is chosen as it's what tuned sets. Though you probably already
+        // see an adverse effect earlier.
+        if (granularity >= 15000000) {
+            reactor_cfg.no_poll_aio = true;
+            reactor_cfg.max_poll_time = 0us;
+            seastar_logger.warn(
+                "Setting --poll-aio=0 and --idle-poll-time-us=0 due to too high sched_wakeup_granularity of {} in {}",
+                granularity, wakeup_file.string());
+        }
+    }
+
     aio_nowait_supported = reactor_opts.linux_aio_nowait.get_value();
     std::mutex mtx;