Skip to content

Commit 8b3dc8c

Browse files
committed
smp: prefault: add seastar::join_memory_prefault method
Currently, the memory prefault logic is internal and Seastar doesn't give users much control over it. To improve the situation, I suggest providing a barrier for the prefault work. This allows users to: * Prefer predictable low latency and high throughput from the start of request serving, at the cost of a startup delay, depending on machine characteristics and application-specific requirements. For example, a fixed-capacity on-prem db setup, where slower startup can be tolerated. From the users' perspective, inconsistency (like spikes in latency during startup) generally cannot be tolerated. * Similarly, improve user scheduling decisions, like running less critical tasks while the prefault works. * Reliably test the prefault logic, improving reliability and users' trust in Seastar. This patch adds the memory_prefaulter class as a friend of the smp class, and passes an smp context to the prefaulter. The prefaulter calls the smp context upon completion using a new broadcast method, which sends a completion event to all the reactor threads. A new promise member on the reactor class makes it possible to return a per-reactor future that represents the prefault completion state. This way, the mechanism is eventually consistent on all the reactors. The interface is a free function in the seastar namespace.
1 parent 05a39bc commit 8b3dc8c

File tree

5 files changed

+46
-10
lines changed

5 files changed

+46
-10
lines changed

include/seastar/core/reactor.hh

+12
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,9 @@ private:
399399

400400
friend void handle_signal(int signo, noncopyable_function<void ()>&& handler, bool once);
401401

402+
promise<> _memory_prefault_promise;
403+
friend future<> join_memory_prefault();
404+
402405
uint64_t pending_task_count() const;
403406
void run_tasks(task_queue& tq);
404407
bool have_more_tasks() const;
@@ -721,6 +724,15 @@ inline int hrtimer_signal() {
721724
return SIGRTMIN;
722725
}
723726

727+
/// Waits until memory prefault background work is finished.
728+
///
729+
/// Can only be called on shard 0 for initialization logic.
730+
/// On other shards, the future will hang forever.
731+
///
732+
/// \return a future that becomes ready when the memory prefault background
733+
/// work is finished, or after it's detected that memory prefault
734+
/// is not initialized.
735+
future<> join_memory_prefault();
724736

725737
extern logger seastar_logger;
726738

include/seastar/core/smp.hh

+2
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,10 @@ class smp : public std::enable_shared_from_this<smp> {
325325
static thread_local std::thread::id _tmain;
326326
bool _using_dpdk = false;
327327

328+
friend class internal::memory_prefaulter;
328329
private:
329330
void setup_prefaulter(const seastar::resource::resources& res, seastar::memory::internal::numa_layout layout);
331+
future<> submit_memory_prefault_completion();
330332
public:
331333
explicit smp(alien::instance& alien);
332334
~smp();

src/core/prefault.hh

+7-6
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,11 @@
3030
#include <seastar/core/task.hh>
3131
#include <seastar/core/memory.hh>
3232

33-
namespace seastar::alien {
33+
namespace seastar {
3434

35-
class instance;
35+
class smp;
3636

37-
};
38-
39-
namespace seastar::internal {
37+
namespace internal {
4038

4139
// Responsible for pre-faulting in memory so soft page fault latency doesn't impact applications
4240
class memory_prefaulter {
@@ -46,13 +44,16 @@ class memory_prefaulter {
4644
std::unordered_map<unsigned, std::vector<memory::internal::memory_range>> _layout_by_node_id;
4745
std::atomic<unsigned> _active_threads = 0;
4846
public:
49-
explicit memory_prefaulter(alien::instance& alien, const resource::resources& res, memory::internal::numa_layout layout);
47+
// The memory_prefaulter instance is destroyed only after the smp context, so the reference is valid
48+
explicit memory_prefaulter(smp& smp_context, const resource::resources& res, memory::internal::numa_layout layout);
5049
~memory_prefaulter();
5150
private:
5251
void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size, std::optional<size_t> huge_page_size_opt);
5352
void join_threads() noexcept;
53+
void alien_on_complete(smp& smp_context);
5454
};
5555

56+
}
5657

5758
}
5859

src/core/reactor.cc

+12
Original file line numberDiff line numberDiff line change
@@ -4288,6 +4288,15 @@ unsigned smp::adjust_max_networking_aio_io_control_blocks(unsigned network_iocbs
42884288
return network_iocbs;
42894289
}
42904290

4291+
/// Returns the future obtained from `_memory_prefault_promise` of the reactor
/// returned by engine().
///
/// NOTE(review): every call invokes get_future() on the same promise.
/// Presumably a Seastar promise hands out its future only once, so confirm
/// that callers invoke this at most once per reactor.
future<> join_memory_prefault() {
4292+
return engine()._memory_prefault_promise.get_future();
4293+
}
4294+
4295+
// Uses smp::submit_to with target 0 (presumably shard 0) to run a task that
// calls set_value() on `_memory_prefault_promise` of the reactor returned by
// engine() in that task. Returns the future produced by smp::submit_to.
//
// NOTE(review): set_value() runs on every submission; presumably a promise
// may be fulfilled only once, so confirm this is submitted at most once.
future<>
4296+
smp::submit_memory_prefault_completion() {
4297+
return smp::submit_to(0, [] { engine()._memory_prefault_promise.set_value(); });
4298+
}
4299+
42914300
void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_opts)
42924301
{
42934302
bool use_transparent_hugepages = !reactor_opts.overprovisioned;
@@ -4674,6 +4683,9 @@ void smp::configure(const smp_options& smp_opts, const reactor_options& reactor_
46744683

46754684
if (smp_opts.lock_memory && smp_opts.lock_memory.get_value() && layout && !layout->ranges.empty()) {
46764685
smp::setup_prefaulter(resources, std::move(*layout));
4686+
} else {
4687+
// If memory prefault is not initialized, submit completion immediately
4688+
run_in_background(submit_memory_prefault_completion());
46774689
}
46784690
}
46794691

src/core/smp.cc

+13-4
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ module seastar;
3535
#else
3636
#include <seastar/core/smp.hh>
3737
#include <seastar/core/alien.hh>
38+
#include <seastar/core/internal/run_in_background.hh>
3839
#include <seastar/core/resource.hh>
3940
#include <seastar/core/loop.hh>
4041
#include <seastar/core/semaphore.hh>
@@ -176,7 +177,7 @@ void
176177
smp::setup_prefaulter(const seastar::resource::resources& res, seastar::memory::internal::numa_layout layout) {
// Constructs the memory_prefaulter and stores it in _prefaulter, passing
// `res` and the moved-from `layout`. The construction is compiled out when
// SEASTAR_THREAD_STACK_GUARDS is defined.
// NOTE(review): when SEASTAR_THREAD_STACK_GUARDS is defined, no prefaulter is
// created and nothing in this function submits the prefault completion;
// confirm that join_memory_prefault() cannot wait forever in such builds.
177178
// Stack guards mprotect() random pages, so the prefaulter will hard-fault.
178179
#ifndef SEASTAR_THREAD_STACK_GUARDS
179-
_prefaulter = std::make_unique<internal::memory_prefaulter>(_alien, res, std::move(layout));
180+
_prefaulter = std::make_unique<internal::memory_prefaulter>(*this, res, std::move(layout));
180181
#endif
181182
}
182183

@@ -200,7 +201,7 @@ get_huge_page_size() {
200201
return std::nullopt;
201202
}
202203

203-
internal::memory_prefaulter::memory_prefaulter(alien::instance& alien, const resource::resources& res, memory::internal::numa_layout layout) {
204+
internal::memory_prefaulter::memory_prefaulter(smp& smp_context, const resource::resources& res, memory::internal::numa_layout layout) {
204205
for (auto& range : layout.ranges) {
205206
_layout_by_node_id[range.numa_node_id].push_back(std::move(range));
206207
}
@@ -219,11 +220,11 @@ internal::memory_prefaulter::memory_prefaulter(alien::instance& alien, const res
219220
}
220221
a.set(cpuset);
221222
}
222-
_worker_threads.emplace_back(a, [this, &alien, &ranges, page_size, huge_page_size_opt] {
223+
_worker_threads.emplace_back(a, [this, &smp_context, &ranges, page_size, huge_page_size_opt] {
223224
++_active_threads;
224225
work(ranges, page_size, huge_page_size_opt);
225226
if (!--_active_threads) {
226-
run_on(alien, 0, [this] () noexcept { join_threads(); });
227+
alien_on_complete(smp_context);
227228
}
228229
});
229230
}
@@ -238,6 +239,14 @@ internal::memory_prefaulter::join_threads() noexcept {
238239
_layout_by_node_id.clear();
239240
}
240241

242+
// Completion hook: hands a callback to run_on() using smp_context._alien and
// target 0 (presumably shard 0). The callback first calls join_threads() and
// then starts smp_context.submit_memory_prefault_completion() via
// run_in_background(), without waiting for its result here.
//
// NOTE(review): the callback captures `this` and `smp_context` by reference;
// confirm both are still alive when the callback eventually runs.
void
243+
internal::memory_prefaulter::alien_on_complete(smp& smp_context) {
244+
run_on(smp_context._alien, 0, [this, &smp_context] () noexcept {
245+
join_threads();
246+
run_in_background(smp_context.submit_memory_prefault_completion());
247+
});
248+
}
249+
241250
internal::memory_prefaulter::~memory_prefaulter() {
242251
_stop_request.store(true, std::memory_order_relaxed);
243252
join_threads();

0 commit comments

Comments
 (0)