smp: prefault: add seastar::join_memory_prefault method

tomershafir · tomershafir · commit 8b3dc8c4b383 · 2025-04-09T19:24:35.000+03:00
Currently, the memory prefault logic is internal and Seastar doesn't provide much control to users. To improve the situation, I suggest providing a barrier for the prefault work. This allows users to:

* Prefer predictable low latency and high throughput from the start of request serving, at the cost of a startup delay, depending on machine characteristics and application-specific requirements. For example, a fixed-capacity on-prem DB setup, where a slower startup can be tolerated. From the users' perspective, inconsistency (like spikes in latency during startup) generally cannot be tolerated.
* Similarly, improve user scheduling decisions, like running less critical tasks while prefault works.
* Reliably test the prefault logic, improving reliability and users' trust in Seastar.

This patch adds the memory_prefaulter class as a friend of the smp class, and passes an smp context to the prefaulter. The prefaulter calls the smp context upon completion using a new broadcast method, which sends a completion event to all the reactor threads. A new promise member on the reactor class makes it possible to return a per-reactor future that represents the prefault completion state. This way, the mechanism is eventually consistent on all the reactors. The interface is a free function in the seastar namespace.
diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh
@@ -399,6 +399,9 @@ private:
 
     friend void handle_signal(int signo, noncopyable_function<void ()>&& handler, bool once);
 
+    promise<> _memory_prefault_promise;
+    friend future<> join_memory_prefault();
+    
     uint64_t pending_task_count() const;
     void run_tasks(task_queue& tq);
     bool have_more_tasks() const;
@@ -721,6 +724,15 @@ inline int hrtimer_signal() {
     return SIGRTMIN;
 }
 
+/// Waits until memory prefault background work is finished.
+///
+/// Can only be called on shard 0 for initialization logic.
+/// On other shards, the future will hang forever.
+///
+/// \return a future that becomes ready when the memory prefault background
+///         work is finished, or after it's detected that memory prefault
+///         is not initialized.
+future<> join_memory_prefault();
 
 extern logger seastar_logger;
 
diff --git a/include/seastar/core/smp.hh b/include/seastar/core/smp.hh
@@ -325,8 +325,10 @@ class smp : public std::enable_shared_from_this<smp> {
     static thread_local std::thread::id _tmain;
     bool _using_dpdk = false;
 
+    friend class internal::memory_prefaulter;
 private:
     void setup_prefaulter(const seastar::resource::resources& res, seastar::memory::internal::numa_layout layout);
+    future<> submit_memory_prefault_completion();
 public:
     explicit smp(alien::instance& alien);
     ~smp();
diff --git a/src/core/prefault.hh b/src/core/prefault.hh
@@ -30,13 +30,11 @@
 #include <seastar/core/task.hh>
 #include <seastar/core/memory.hh>
 
-namespace seastar::alien {
+namespace seastar {
 
-class instance;
+class smp;
 
-};
-
-namespace seastar::internal {
+namespace internal {
 
 // Responsible for pre-faulting in memory so soft page fault latency doesn't impact applications
 class memory_prefaulter {
@@ -46,13 +44,16 @@ class memory_prefaulter {
     std::unordered_map<unsigned, std::vector<memory::internal::memory_range>> _layout_by_node_id;
     std::atomic<unsigned> _active_threads = 0;
 public:
-    explicit memory_prefaulter(alien::instance& alien, const resource::resources& res, memory::internal::numa_layout layout);
+    // The memory_prefaulter instance is destroyed only after the smp context, so the reference is valid
+    explicit memory_prefaulter(smp& smp_context, const resource::resources& res, memory::internal::numa_layout layout);
     ~memory_prefaulter();
 private:
     void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size, std::optional<size_t> huge_page_size_opt);
     void join_threads() noexcept;
+    void alien_on_complete(smp& smp_context);
 };
 
+}
 
 }
 
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
@@ -4288,6 +4288,15 @@ unsigned smp::adjust_max_networking_aio_io_control_blocks(unsigned network_iocbs
     return network_iocbs;
 }
 
+future<> join_memory_prefault() {
+    return engine()._memory_prefault_promise.get_future();
+}
+
+future<> 
+smp::submit_memory_prefault_completion() {
+    return smp::submit_to(0, [] { engine()._memory_prefault_promise.set_value(); });
+}
+
 void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_opts)
 {
     bool use_transparent_hugepages = !reactor_opts.overprovisioned;
@@ -4674,6 +4683,9 @@ void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_
 
     if (smp_opts.lock_memory && smp_opts.lock_memory.get_value() && layout && !layout->ranges.empty()) {
         smp::setup_prefaulter(resources, std::move(*layout));
+    } else {
+        // If memory prefault is not initialized, submit completion immediately 
+        run_in_background(submit_memory_prefault_completion());
     }
 }
 
diff --git a/src/core/smp.cc b/src/core/smp.cc
@@ -35,6 +35,7 @@ module seastar;
 #else
 #include <seastar/core/smp.hh>
 #include <seastar/core/alien.hh>
+#include <seastar/core/internal/run_in_background.hh>
 #include <seastar/core/resource.hh>
 #include <seastar/core/loop.hh>
 #include <seastar/core/semaphore.hh>
@@ -176,7 +177,7 @@ void
 smp::setup_prefaulter(const seastar::resource::resources& res, seastar::memory::internal::numa_layout layout) {
     // Stack guards mprotect() random pages, so the prefaulter will hard-fault.
 #ifndef SEASTAR_THREAD_STACK_GUARDS
-    _prefaulter = std::make_unique<internal::memory_prefaulter>(_alien, res, std::move(layout));
+    _prefaulter = std::make_unique<internal::memory_prefaulter>(*this, res, std::move(layout));
 #endif
 }
 
@@ -200,7 +201,7 @@ get_huge_page_size() {
     return std::nullopt;
 }
 
-internal::memory_prefaulter::memory_prefaulter(alien::instance& alien, const resource::resources& res, memory::internal::numa_layout layout) {
+internal::memory_prefaulter::memory_prefaulter(smp& smp_context, const resource::resources& res, memory::internal::numa_layout layout) {
     for (auto& range : layout.ranges) {
         _layout_by_node_id[range.numa_node_id].push_back(std::move(range));
     }
@@ -219,11 +220,11 @@ internal::memory_prefaulter::memory_prefaulter(alien::instance& alien, const res
             }
             a.set(cpuset);
         }
-        _worker_threads.emplace_back(a, [this, &alien, &ranges, page_size, huge_page_size_opt] {
+        _worker_threads.emplace_back(a, [this, &smp_context, &ranges, page_size, huge_page_size_opt] {
             ++_active_threads;
             work(ranges, page_size, huge_page_size_opt);
             if (!--_active_threads) {
-                run_on(alien, 0, [this] () noexcept { join_threads(); });
+                alien_on_complete(smp_context);
             }
         });
     }
@@ -238,6 +239,14 @@ internal::memory_prefaulter::join_threads() noexcept {
     _layout_by_node_id.clear();
 }
 
+void 
+internal::memory_prefaulter::alien_on_complete(smp& smp_context) {
+    run_on(smp_context._alien, 0, [this, &smp_context] () noexcept {
+        join_threads();
+        run_in_background(smp_context.submit_memory_prefault_completion());
+    });
+}
+
 internal::memory_prefaulter::~memory_prefaulter() {
     _stop_request.store(true, std::memory_order_relaxed);
     join_threads();

Original file line number	Diff line number	Diff line change
`@@ -4288,6 +4288,15 @@ unsigned smp::adjust_max_networking_aio_io_control_blocks(unsigned network_iocbs`
`4288`	`4288`	`return network_iocbs;`
`4289`	`4289`	`}`
`4290`	`4290`
	`4291`	`+future<> join_memory_prefault() {`
	`4292`	`+ return engine()._memory_prefault_promise.get_future();`
	`4293`	`+}`
	`4294`	`+`
	`4295`	`+future<>`
	`4296`	`+smp::submit_memory_prefault_completion() {`
	`4297`	`+ return smp::submit_to(0, [] { engine()._memory_prefault_promise.set_value(); });`
	`4298`	`+}`
	`4299`	`+`
`4291`	`4300`	`void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_opts)`
`4292`	`4301`	`{`
`4293`	`4302`	`bool use_transparent_hugepages = !reactor_opts.overprovisioned;`
`@@ -4674,6 +4683,9 @@ void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_`
`4674`	`4683`
`4675`	`4684`	`if (smp_opts.lock_memory && smp_opts.lock_memory.get_value() && layout && !layout->ranges.empty()) {`
`4676`	`4685`	`smp::setup_prefaulter(resources, std::move(*layout));`
	`4686`	`+ } else {`
	`4687`	`+ // If memory prefault is not initialized, submit completion immediately`
	`4688`	`+ run_in_background(submit_memory_prefault_completion());`
`4677`	`4689`	`}`
`4678`	`4690`	`}`
`4679`	`4691`