Skip to content

Commit fd50f8c

Browse files
committed
scx_p2dq: Add NUMA aware load balancing
Add NUMA aware pick 2 load balancing. The load balancer first tries to maintain NUMA locality before attempting to pick two across NUMA nodes. This reduces NUMA migrations on large systems. Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
1 parent ac58df7 commit fd50f8c

File tree

5 files changed

+225
-20
lines changed

5 files changed

+225
-20
lines changed

scheds/rust/scx_p2dq/src/bpf/intf.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ enum stat_idx {
9393
P2DQ_STAT_EAS_LITTLE_SELECT,
9494
P2DQ_STAT_EAS_BIG_SELECT,
9595
P2DQ_STAT_EAS_FALLBACK,
96+
P2DQ_STAT_PICK2_SAME_NUMA,
97+
P2DQ_STAT_PICK2_CROSS_NUMA,
98+
P2DQ_STAT_PICK2_NUMA_FALLBACK,
9699
P2DQ_NR_STATS,
97100
};
98101

scheds/rust/scx_p2dq/src/bpf/main.bpf.c

Lines changed: 178 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ const volatile struct {
9696

9797
bool dispatch_lb_interactive;
9898
bool dispatch_pick2_disable;
99+
bool dispatch_pick2_numa_aware;
99100
bool eager_load_balance;
100101
bool max_dsq_pick2;
101102
bool wakeup_llc_migrations;
@@ -110,6 +111,7 @@ const volatile struct {
110111

111112
.dispatch_lb_interactive = false,
112113
.dispatch_pick2_disable = false,
114+
.dispatch_pick2_numa_aware = true,
113115
.eager_load_balance = true,
114116
.max_dsq_pick2 = false,
115117
.wakeup_llc_migrations = false,
@@ -206,6 +208,12 @@ u32 llcs_per_node[MAX_NUMA_NODES];
206208
/* Global DHQ counter for unique indexing */
207209
u32 global_dhq_count = 0;
208210

211+
/* NUMA-aware pick2 load balancing data structures */
212+
/* Maps NUMA node ID to list of LLC IDs in that node */
213+
u64 node_llc_ids[MAX_NUMA_NODES][MAX_LLCS];
214+
/* Count of LLCs per NUMA node (for random selection) */
215+
u32 node_llc_counts[MAX_NUMA_NODES];
216+
209217
u64 min_slice_ns = 500;
210218

211219
private(A) struct bpf_cpumask __kptr *all_cpumask;
@@ -963,6 +971,127 @@ static struct llc_ctx *rand_llc_ctx(void)
963971
return lookup_llc_ctx(bpf_get_prandom_u32() % topo_config.nr_llcs);
964972
}
965973

974+
/*
975+
* Returns a random LLC context from the specified NUMA node.
976+
* Returns NULL if node_id is invalid or has no LLCs.
977+
*/
978+
static struct llc_ctx *rand_llc_ctx_from_node(u32 node_id)
979+
{
980+
u32 llc_count, llc_index;
981+
u64 llc_id;
982+
983+
if (node_id >= MAX_NUMA_NODES)
984+
return NULL;
985+
986+
llc_count = node_llc_counts[node_id];
987+
if (llc_count == 0 || llc_count > MAX_LLCS)
988+
return NULL;
989+
990+
/* Random selection within this node's LLCs */
991+
llc_index = bpf_get_prandom_u32() % llc_count;
992+
993+
/* Verifier bounds check */
994+
if (llc_index >= MAX_LLCS)
995+
return NULL;
996+
997+
llc_id = node_llc_ids[node_id][llc_index];
998+
return lookup_llc_ctx(llc_id);
999+
}
1000+
1001+
/*
1002+
* Pick two different LLCs from the same NUMA node.
1003+
* Returns 0 on success, -EINVAL if can't pick 2 different LLCs.
1004+
*/
1005+
static __always_inline int pick_two_same_node(u32 node_id,
1006+
struct llc_ctx **left,
1007+
struct llc_ctx **right)
1008+
{
1009+
u32 llc_count, attempts = 0;
1010+
u32 left_index, right_index;
1011+
u64 llc_id;
1012+
1013+
if (node_id >= MAX_NUMA_NODES)
1014+
return -EINVAL;
1015+
1016+
llc_count = node_llc_counts[node_id];
1017+
1018+
/* Need at least 2 LLCs in this node */
1019+
if (llc_count < 2)
1020+
return -EINVAL;
1021+
1022+
*left = rand_llc_ctx_from_node(node_id);
1023+
if (!*left)
1024+
return -EINVAL;
1025+
1026+
/* Try to pick a different LLC, with bounded loop for verifier */
1027+
#pragma unroll
1028+
for (attempts = 0; attempts < 8; attempts++) {
1029+
*right = rand_llc_ctx_from_node(node_id);
1030+
if (*right && (*right)->id != (*left)->id) {
1031+
stat_inc(P2DQ_STAT_PICK2_SAME_NUMA);
1032+
return 0;
1033+
}
1034+
}
1035+
1036+
/* Fallback: deterministically pick a different LLC */
1037+
left_index = (*left)->index;
1038+
right_index = (left_index + 1) % llc_count;
1039+
1040+
if (right_index >= MAX_LLCS)
1041+
return -EINVAL;
1042+
1043+
llc_id = node_llc_ids[node_id][right_index];
1044+
*right = lookup_llc_ctx(llc_id);
1045+
1046+
if (*right && (*right)->id != (*left)->id) {
1047+
stat_inc(P2DQ_STAT_PICK2_SAME_NUMA);
1048+
return 0;
1049+
}
1050+
1051+
return -EINVAL;
1052+
}
1053+
1054+
/*
1055+
* Pick two LLCs from different NUMA nodes for cross-node balancing.
1056+
*/
1057+
static __always_inline int pick_two_cross_node(u32 current_node_id,
1058+
struct llc_ctx **left,
1059+
struct llc_ctx **right)
1060+
{
1061+
u32 other_node, attempts = 0;
1062+
1063+
/* Pick one from current node */
1064+
*left = rand_llc_ctx_from_node(current_node_id);
1065+
if (!*left)
1066+
return -EINVAL;
1067+
1068+
/* Try to pick from a different node */
1069+
#pragma unroll
1070+
for (attempts = 0; attempts < 8; attempts++) {
1071+
other_node = bpf_get_prandom_u32() % topo_config.nr_nodes;
1072+
if (other_node >= MAX_NUMA_NODES)
1073+
continue;
1074+
1075+
if (other_node != current_node_id &&
1076+
node_llc_counts[other_node] > 0) {
1077+
*right = rand_llc_ctx_from_node(other_node);
1078+
if (*right && (*right)->id != (*left)->id) {
1079+
stat_inc(P2DQ_STAT_PICK2_CROSS_NUMA);
1080+
return 0;
1081+
}
1082+
}
1083+
}
1084+
1085+
/* Fallback to any random LLC */
1086+
*right = rand_llc_ctx();
1087+
if (*right && (*right)->id != (*left)->id) {
1088+
stat_inc(P2DQ_STAT_PICK2_CROSS_NUMA);
1089+
return 0;
1090+
}
1091+
1092+
return -EINVAL;
1093+
}
1094+
9661095
static bool keep_running(struct cpu_ctx *cpuc, struct llc_ctx *llcx,
9671096
struct task_struct *p)
9681097
{
@@ -2538,18 +2667,34 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx,
25382667
}
25392668

25402669
/*
2541-
* For pick two load balancing we randomly choose two LLCs. We then
2542-
* first try to consume from the LLC with the largest load. If we are
2543-
* unable to consume from the first LLC then the second LLC is consumed
2544-
* from. This yields better work conservation on machines with a large
2545-
* number of LLCs.
2670+
* NUMA-aware pick-2: prefer same-NUMA LLCs, then try cross-NUMA,
2671+
* finally fallback to random selection.
25462672
*/
2673+
if (lb_config.dispatch_pick2_numa_aware && topo_config.nr_nodes > 1) {
2674+
u32 current_node_id = cpu_node_ids[cpu];
2675+
int ret;
2676+
2677+
if (node_llc_counts[current_node_id] >= 2) {
2678+
ret = pick_two_same_node(current_node_id, &left, &right);
2679+
if (ret == 0 && left && right)
2680+
goto llc_selected;
2681+
}
2682+
2683+
ret = pick_two_cross_node(current_node_id, &left, &right);
2684+
if (ret == 0 && left && right)
2685+
goto llc_selected;
2686+
2687+
stat_inc(P2DQ_STAT_PICK2_NUMA_FALLBACK);
2688+
}
2689+
2690+
/* Legacy random selection */
25472691
left = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[0]) : rand_llc_ctx();
25482692
right = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[1]) : rand_llc_ctx();
25492693

25502694
if (!left || !right)
25512695
return -EINVAL;
25522696

2697+
/* Handle collision (same LLC picked twice) */
25532698
if (left->id == right->id) {
25542699
i = llc_get_load(cur_llcx) % topo_config.nr_llcs;
25552700
i &= 0x3; // verifier
@@ -2559,6 +2704,8 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx,
25592704
return -EINVAL;
25602705
}
25612706

2707+
llc_selected:
2708+
25622709

25632710
if (llc_get_load(right) > llc_get_load(left)) {
25642711
first = right;
@@ -3542,9 +3689,35 @@ static s32 p2dq_init_impl()
35423689

35433690
// First we initialize LLCs because DSQs are created at the LLC level.
35443691
bpf_for(i, 0, topo_config.nr_llcs) {
3692+
u32 llc_id, node_id, node_llc_idx;
3693+
35453694
ret = init_llc(i);
35463695
if (ret)
35473696
return ret;
3697+
3698+
/* Build NUMA-to-LLC mapping for NUMA-aware pick2 */
3699+
if (i >= MAX_LLCS)
3700+
continue;
3701+
3702+
llc_id = llc_ids[i];
3703+
llcx = lookup_llc_ctx(llc_id);
3704+
if (!llcx)
3705+
continue;
3706+
3707+
node_id = llcx->node_id;
3708+
if (node_id >= MAX_NUMA_NODES) {
3709+
scx_bpf_error("LLC %u has invalid node_id %u", llc_id, node_id);
3710+
return -EINVAL;
3711+
}
3712+
3713+
node_llc_idx = node_llc_counts[node_id];
3714+
if (node_llc_idx >= MAX_LLCS) {
3715+
scx_bpf_error("Node %u has too many LLCs", node_id);
3716+
return -EINVAL;
3717+
}
3718+
3719+
node_llc_ids[node_id][node_llc_idx] = llc_id;
3720+
node_llc_counts[node_id]++;
35483721
}
35493722

35503723
bpf_for(i, 0, topo_config.nr_nodes) {

scheds/rust/scx_p2dq/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,10 @@ pub struct SchedulerOpts {
185185
#[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
186186
pub dispatch_pick2_disable: bool,
187187

188+
/// Disables NUMA-aware pick2 load balancing.
189+
#[clap(long, action = clap::ArgAction::SetTrue)]
190+
pub disable_pick2_numa_aware: bool,
191+
188192
/// Enables pick2 load balancing on the dispatch path when LLC utilization is under the
189193
/// specified utilization.
190194
#[clap(long, default_value = "75", value_parser = clap::value_parser!(u64).range(0..100))]
@@ -432,6 +436,7 @@ macro_rules! init_open_skel {
432436
rodata.lb_config.max_dsq_pick2 = MaybeUninit::new(opts.max_dsq_pick2);
433437
rodata.lb_config.eager_load_balance = MaybeUninit::new(!opts.eager_load_balance);
434438
rodata.lb_config.dispatch_pick2_disable = MaybeUninit::new(opts.dispatch_pick2_disable);
439+
rodata.lb_config.dispatch_pick2_numa_aware = MaybeUninit::new(!opts.disable_pick2_numa_aware);
435440
rodata.lb_config.dispatch_lb_busy = opts.dispatch_lb_busy;
436441
rodata.lb_config.dispatch_lb_interactive =
437442
MaybeUninit::new(opts.dispatch_lb_interactive);

scheds/rust/scx_p2dq/src/main.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ use bpf_intf::stat_idx_P2DQ_STAT_IDLE;
6161
use bpf_intf::stat_idx_P2DQ_STAT_KEEP;
6262
use bpf_intf::stat_idx_P2DQ_STAT_LLC_MIGRATION;
6363
use bpf_intf::stat_idx_P2DQ_STAT_NODE_MIGRATION;
64+
use bpf_intf::stat_idx_P2DQ_STAT_PICK2_CROSS_NUMA;
65+
use bpf_intf::stat_idx_P2DQ_STAT_PICK2_NUMA_FALLBACK;
66+
use bpf_intf::stat_idx_P2DQ_STAT_PICK2_SAME_NUMA;
6467
use bpf_intf::stat_idx_P2DQ_STAT_SELECT_PICK2;
6568
use bpf_intf::stat_idx_P2DQ_STAT_THERMAL_AVOID;
6669
use bpf_intf::stat_idx_P2DQ_STAT_THERMAL_KICK;
@@ -284,6 +287,9 @@ impl<'a> Scheduler<'a> {
284287
eas_little_select: stats[stat_idx_P2DQ_STAT_EAS_LITTLE_SELECT as usize],
285288
eas_big_select: stats[stat_idx_P2DQ_STAT_EAS_BIG_SELECT as usize],
286289
eas_fallback: stats[stat_idx_P2DQ_STAT_EAS_FALLBACK as usize],
290+
pick2_same_numa: stats[stat_idx_P2DQ_STAT_PICK2_SAME_NUMA as usize],
291+
pick2_cross_numa: stats[stat_idx_P2DQ_STAT_PICK2_CROSS_NUMA as usize],
292+
pick2_numa_fallback: stats[stat_idx_P2DQ_STAT_PICK2_NUMA_FALLBACK as usize],
287293
}
288294
}
289295

scheds/rust/scx_p2dq/src/stats.rs

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,12 @@ pub struct Metrics {
9191
pub eas_big_select: u64,
9292
#[stat(desc = "Number of times EAS fell back to non-preferred core type")]
9393
pub eas_fallback: u64,
94+
#[stat(desc = "Number of times pick2 selected LLCs from same NUMA node")]
95+
pub pick2_same_numa: u64,
96+
#[stat(desc = "Number of times pick2 selected LLCs across NUMA nodes")]
97+
pub pick2_cross_numa: u64,
98+
#[stat(desc = "Number of times pick2 NUMA-aware selection fell back to random")]
99+
pub pick2_numa_fallback: u64,
94100
}
95101

96102
impl Metrics {
@@ -111,38 +117,47 @@ impl Metrics {
111117
self.enq_mig,
112118
)?;
113119

114-
// Build the stats line conditionally based on thermal tracking availability
115120
let mut stats_line = format!(
116-
"\twake prev/llc/mig {}/{}/{}\n\tpick2 select/dispatch {}/{}\n\tmigrations llc/node: {}/{}\n\tfork balance/same {}/{}\n\texec balance/same {}/{}",
117-
self.wake_prev,
118-
self.wake_llc,
119-
self.wake_mig,
120-
self.select_pick2,
121-
self.dispatch_pick2,
122-
self.llc_migrations,
123-
self.node_migrations,
124-
self.fork_balance,
125-
self.fork_same_llc,
126-
self.exec_balance,
127-
self.exec_same_llc,
121+
"\twake prev/llc/mig {}/{}/{}",
122+
self.wake_prev, self.wake_llc, self.wake_mig,
128123
);
129124

130-
// Only show thermal stats if thermal tracking is enabled
125+
if crate::TOPO.all_llcs.len() > 1 {
126+
stats_line.push_str(&format!(
127+
"\n\tpick2 select/dispatch {}/{}\n\tmigrations llc/node: {}/{}\n\tfork balance/same {}/{}\n\texec balance/same {}/{}",
128+
self.select_pick2,
129+
self.dispatch_pick2,
130+
self.llc_migrations,
131+
self.node_migrations,
132+
self.fork_balance,
133+
self.fork_same_llc,
134+
self.exec_balance,
135+
self.exec_same_llc,
136+
));
137+
}
138+
131139
if is_thermal_tracking_enabled() {
132140
stats_line.push_str(&format!(
133141
"\n\tthermal kick/avoid {}/{}",
134142
self.thermal_kick, self.thermal_avoid,
135143
));
136144
}
137145

138-
// Only show EAS stats if energy-aware scheduling is enabled
139146
if is_eas_enabled() {
140147
stats_line.push_str(&format!(
141148
"\n\tEAS little/big/fallback {}/{}/{}",
142149
self.eas_little_select, self.eas_big_select, self.eas_fallback,
143150
));
144151
}
145152

153+
let numa_total = self.pick2_same_numa + self.pick2_cross_numa + self.pick2_numa_fallback;
154+
if numa_total > 0 {
155+
stats_line.push_str(&format!(
156+
"\n\tpick2 same_numa/cross_numa/fallback {}/{}/{}",
157+
self.pick2_same_numa, self.pick2_cross_numa, self.pick2_numa_fallback,
158+
));
159+
}
160+
146161
writeln!(w, "{}", stats_line)?;
147162
Ok(())
148163
}
@@ -176,6 +191,9 @@ impl Metrics {
176191
eas_little_select: self.eas_little_select - rhs.eas_little_select,
177192
eas_big_select: self.eas_big_select - rhs.eas_big_select,
178193
eas_fallback: self.eas_fallback - rhs.eas_fallback,
194+
pick2_same_numa: self.pick2_same_numa - rhs.pick2_same_numa,
195+
pick2_cross_numa: self.pick2_cross_numa - rhs.pick2_cross_numa,
196+
pick2_numa_fallback: self.pick2_numa_fallback - rhs.pick2_numa_fallback,
179197
}
180198
}
181199
}

0 commit comments

Comments
 (0)