@@ -96,6 +96,7 @@ const volatile struct {
9696
9797 bool dispatch_lb_interactive ;
9898 bool dispatch_pick2_disable ;
99+ bool dispatch_pick2_numa_aware ;
99100 bool eager_load_balance ;
100101 bool max_dsq_pick2 ;
101102 bool wakeup_llc_migrations ;
@@ -110,6 +111,7 @@ const volatile struct {
110111
111112 .dispatch_lb_interactive = false,
112113 .dispatch_pick2_disable = false,
114+ .dispatch_pick2_numa_aware = true,
113115 .eager_load_balance = true,
114116 .max_dsq_pick2 = false,
115117 .wakeup_llc_migrations = false,
@@ -206,6 +208,12 @@ u32 llcs_per_node[MAX_NUMA_NODES];
206208/* Global DHQ counter for unique indexing */
207209u32 global_dhq_count = 0 ;
208210
211+ /* NUMA-aware pick2 load balancing data structures */
212+ /* Per-node LLC table: node_llc_ids[n][k] is the ID of the k-th LLC recorded for NUMA node n; only the first node_llc_counts[n] slots of a row are filled in. */
213+ u64 node_llc_ids [MAX_NUMA_NODES ][MAX_LLCS ];
214+ /* Number of LLC IDs recorded per NUMA node; also the modulus used when picking a random LLC within a node. */
215+ u32 node_llc_counts [MAX_NUMA_NODES ];
216+
209217u64 min_slice_ns = 500 ;
210218
211219private (A ) struct bpf_cpumask __kptr * all_cpumask ;
@@ -963,6 +971,127 @@ static struct llc_ctx *rand_llc_ctx(void)
963971 return lookup_llc_ctx (bpf_get_prandom_u32 () % topo_config .nr_llcs );
964972}
965973
974+ /*
975+ * Returns a random LLC context from the specified NUMA node.
976+ * Returns NULL if node_id is invalid or has no LLCs.
977+ */
978+ static struct llc_ctx * rand_llc_ctx_from_node (u32 node_id )
979+ {
980+ u32 llc_count , llc_index ;
981+ u64 llc_id ;
982+
983+ if (node_id >= MAX_NUMA_NODES )
984+ return NULL ;
985+
986+ llc_count = node_llc_counts [node_id ];
987+ if (llc_count == 0 || llc_count > MAX_LLCS )
988+ return NULL ;
989+
990+ /* Random selection within this node's LLCs */
991+ llc_index = bpf_get_prandom_u32 () % llc_count ;
992+
993+ /* Verifier bounds check */
994+ if (llc_index >= MAX_LLCS )
995+ return NULL ;
996+
997+ llc_id = node_llc_ids [node_id ][llc_index ];
998+ return lookup_llc_ctx (llc_id );
999+ }
1000+
1001+ /*
1002+ * Pick two different LLCs from the same NUMA node.
1003+ * Returns 0 on success, -EINVAL if can't pick 2 different LLCs.
1004+ */
1005+ static __always_inline int pick_two_same_node (u32 node_id ,
1006+ struct llc_ctx * * left ,
1007+ struct llc_ctx * * right )
1008+ {
1009+ u32 llc_count , attempts = 0 ;
1010+ u32 left_index , right_index ;
1011+ u64 llc_id ;
1012+
1013+ if (node_id >= MAX_NUMA_NODES )
1014+ return - EINVAL ;
1015+
1016+ llc_count = node_llc_counts [node_id ];
1017+
1018+ /* Need at least 2 LLCs in this node */
1019+ if (llc_count < 2 )
1020+ return - EINVAL ;
1021+
1022+ * left = rand_llc_ctx_from_node (node_id );
1023+ if (!* left )
1024+ return - EINVAL ;
1025+
1026+ /* Try to pick a different LLC, with bounded loop for verifier */
1027+ #pragma unroll
1028+ for (attempts = 0 ; attempts < 8 ; attempts ++ ) {
1029+ * right = rand_llc_ctx_from_node (node_id );
1030+ if (* right && (* right )-> id != (* left )-> id ) {
1031+ stat_inc (P2DQ_STAT_PICK2_SAME_NUMA );
1032+ return 0 ;
1033+ }
1034+ }
1035+
1036+ /* Fallback: deterministically pick a different LLC */
1037+ left_index = (* left )-> index ;
1038+ right_index = (left_index + 1 ) % llc_count ;
1039+
1040+ if (right_index >= MAX_LLCS )
1041+ return - EINVAL ;
1042+
1043+ llc_id = node_llc_ids [node_id ][right_index ];
1044+ * right = lookup_llc_ctx (llc_id );
1045+
1046+ if (* right && (* right )-> id != (* left )-> id ) {
1047+ stat_inc (P2DQ_STAT_PICK2_SAME_NUMA );
1048+ return 0 ;
1049+ }
1050+
1051+ return - EINVAL ;
1052+ }
1053+
1054+ /*
1055+ * Pick two LLCs from different NUMA nodes for cross-node balancing.
1056+ */
1057+ static __always_inline int pick_two_cross_node (u32 current_node_id ,
1058+ struct llc_ctx * * left ,
1059+ struct llc_ctx * * right )
1060+ {
1061+ u32 other_node , attempts = 0 ;
1062+
1063+ /* Pick one from current node */
1064+ * left = rand_llc_ctx_from_node (current_node_id );
1065+ if (!* left )
1066+ return - EINVAL ;
1067+
1068+ /* Try to pick from a different node */
1069+ #pragma unroll
1070+ for (attempts = 0 ; attempts < 8 ; attempts ++ ) {
1071+ other_node = bpf_get_prandom_u32 () % topo_config .nr_nodes ;
1072+ if (other_node >= MAX_NUMA_NODES )
1073+ continue ;
1074+
1075+ if (other_node != current_node_id &&
1076+ node_llc_counts [other_node ] > 0 ) {
1077+ * right = rand_llc_ctx_from_node (other_node );
1078+ if (* right && (* right )-> id != (* left )-> id ) {
1079+ stat_inc (P2DQ_STAT_PICK2_CROSS_NUMA );
1080+ return 0 ;
1081+ }
1082+ }
1083+ }
1084+
1085+ /* Fallback to any random LLC */
1086+ * right = rand_llc_ctx ();
1087+ if (* right && (* right )-> id != (* left )-> id ) {
1088+ stat_inc (P2DQ_STAT_PICK2_CROSS_NUMA );
1089+ return 0 ;
1090+ }
1091+
1092+ return - EINVAL ;
1093+ }
1094+
9661095static bool keep_running (struct cpu_ctx * cpuc , struct llc_ctx * llcx ,
9671096 struct task_struct * p )
9681097{
@@ -2538,18 +2667,34 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx,
25382667 }
25392668
25402669 /*
2541- * For pick two load balancing we randomly choose two LLCs. We then
2542- * first try to consume from the LLC with the largest load. If we are
2543- * unable to consume from the first LLC then the second LLC is consumed
2544- * from. This yields better work conservation on machines with a large
2545- * number of LLCs.
2670+ * NUMA-aware pick-2: prefer same-NUMA LLCs, then try cross-NUMA,
2671+ * finally fallback to random selection.
25462672 */
2673+ if (lb_config .dispatch_pick2_numa_aware && topo_config .nr_nodes > 1 ) {
2674+ u32 current_node_id = cpu_node_ids [cpu ];
2675+ int ret ;
2676+
2677+ if (node_llc_counts [current_node_id ] >= 2 ) {
2678+ ret = pick_two_same_node (current_node_id , & left , & right );
2679+ if (ret == 0 && left && right )
2680+ goto llc_selected ;
2681+ }
2682+
2683+ ret = pick_two_cross_node (current_node_id , & left , & right );
2684+ if (ret == 0 && left && right )
2685+ goto llc_selected ;
2686+
2687+ stat_inc (P2DQ_STAT_PICK2_NUMA_FALLBACK );
2688+ }
2689+
2690+ /* Legacy random selection */
25472691 left = topo_config .nr_llcs == 2 ? lookup_llc_ctx (llc_ids [0 ]) : rand_llc_ctx ();
25482692 right = topo_config .nr_llcs == 2 ? lookup_llc_ctx (llc_ids [1 ]) : rand_llc_ctx ();
25492693
25502694 if (!left || !right )
25512695 return - EINVAL ;
25522696
2697+ /* Handle collision (same LLC picked twice) */
25532698 if (left -> id == right -> id ) {
25542699 i = llc_get_load (cur_llcx ) % topo_config .nr_llcs ;
25552700 i &= 0x3 ; // verifier
@@ -2559,6 +2704,8 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx,
25592704 return - EINVAL ;
25602705 }
25612706
2707+ llc_selected :
2708+
25622709
25632710 if (llc_get_load (right ) > llc_get_load (left )) {
25642711 first = right ;
@@ -3542,9 +3689,35 @@ static s32 p2dq_init_impl()
35423689
35433690 // First we initialize LLCs because DSQs are created at the LLC level.
35443691 bpf_for (i , 0 , topo_config .nr_llcs ) {
3692+ u32 llc_id , node_id , node_llc_idx ;
3693+
35453694 ret = init_llc (i );
35463695 if (ret )
35473696 return ret ;
3697+
3698+ /* Build NUMA-to-LLC mapping for NUMA-aware pick2 */
3699+ if (i >= MAX_LLCS )
3700+ continue ;
3701+
3702+ llc_id = llc_ids [i ];
3703+ llcx = lookup_llc_ctx (llc_id );
3704+ if (!llcx )
3705+ continue ;
3706+
3707+ node_id = llcx -> node_id ;
3708+ if (node_id >= MAX_NUMA_NODES ) {
3709+ scx_bpf_error ("LLC %u has invalid node_id %u" , llc_id , node_id );
3710+ return - EINVAL ;
3711+ }
3712+
3713+ node_llc_idx = node_llc_counts [node_id ];
3714+ if (node_llc_idx >= MAX_LLCS ) {
3715+ scx_bpf_error ("Node %u has too many LLCs" , node_id );
3716+ return - EINVAL ;
3717+ }
3718+
3719+ node_llc_ids [node_id ][node_llc_idx ] = llc_id ;
3720+ node_llc_counts [node_id ]++ ;
35483721 }
35493722
35503723 bpf_for (i , 0 , topo_config .nr_nodes ) {
0 commit comments