Merge pull request #3123 from hodgesds/p2dq-lat-prio

hodgesds · web-flow · commit 08068a7a0721 · 2025-12-02T20:41:48.000Z
scx_p2dq: Add conservative wakeup preemption for latency-critical tasks
diff --git a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c
@@ -151,6 +151,15 @@ const volatile struct {
 	.pelt_enabled = true,
 };
 
+/* Latency priority and preemption configuration */
+const volatile struct {
+	bool latency_priority_enabled;
+	bool wakeup_preemption_enabled;
+} latency_config = {
+	.latency_priority_enabled = false,
+	.wakeup_preemption_enabled = false,
+};
+
 const volatile u32 debug = 2;
 const u32 zero_u32 = 0;
 extern const volatile u32 nr_cpu_ids;
@@ -1177,6 +1186,39 @@ static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, u64 wake_fl
 	else
 		cpu = pick_idle_cpu(p, taskc, prev_cpu, wake_flags, &is_idle);
 
+	// Wakeup preemption for extremely latency-critical tasks
+	// Only attempt if: no idle CPU found AND task has very high priority
+	if (!is_idle && latency_config.wakeup_preemption_enabled) {
+		struct cpu_ctx *prev_cpuc;
+
+		// Only preempt for truly latency-critical tasks (scx.weight >= 2847, equivalent to nice <= -15)
+		// and only if prev_cpu is in the task's allowed cpumask and its cpu_ctx can be looked up
+		if (p->scx.weight >= 2847 &&
+		    bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
+		    (prev_cpuc = lookup_cpu_ctx(prev_cpu))) {
+
+			// Don't preempt a CPU whose cpu_ctx is flagged interactive - that work needs low latency too
+			if (cpu_ctx_test_flag(prev_cpuc, CPU_CTX_F_INTERACTIVE)) {
+				goto skip_preempt;
+			}
+
+			// Only preempt if incoming task has higher priority than running task
+			// This ensures we only preempt lower priority work
+			if (p->scx.weight <= prev_cpuc->running_weight) {
+				goto skip_preempt;
+			}
+
+			// Steer the task back to prev_cpu for better cache affinity.
+			// This does not bypass normal queueing - vtime ordering still
+			// applies; this block only changes the selected target CPU.
+			cpu = prev_cpu;
+			trace("PREEMPT_TARGET [%d][%s] weight=%u > running_weight=%u on cpu=%d",
+			      p->pid, p->comm, p->scx.weight, prev_cpuc->running_weight, prev_cpu);
+		}
+	}
+
+skip_preempt:
+
 	if (likely(is_idle)) {
 		stat_inc(P2DQ_STAT_IDLE);
 		// Only direct dispatch non-affinitized tasks
@@ -1665,6 +1707,7 @@ static int p2dq_running_impl(struct task_struct *p)
 		cpu_ctx_clear_flag(cpuc, CPU_CTX_F_INTERACTIVE);
 
 	cpuc->dsq_index = taskc->dsq_index;
+	cpuc->running_weight = p->scx.weight;
 
 	if (p->scx.weight < 100)
 		cpu_ctx_set_flag(cpuc, CPU_CTX_F_NICE_TASK);
@@ -2466,6 +2509,7 @@ static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args
 
 	taskc->llc_id = cpuc->llc_id;
 	taskc->node_id = cpuc->node_id;
+	taskc->pid = p->pid;
 
 	// Adjust starting index based on niceness
 	if (p->scx.weight == 100) {
@@ -2500,8 +2544,6 @@ static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args
 	else
 		taskc->dsq_id = cpuc->llc_dsq;
 
-	taskc->pid = p->pid;
-
 	return 0;
 }
 
@@ -3025,6 +3067,7 @@ void BPF_STRUCT_OPS(p2dq_running, struct task_struct *p)
 void BPF_STRUCT_OPS(p2dq_enqueue, struct task_struct *p __arg_trusted, u64 enq_flags)
 {
 	struct enqueue_promise pro;
+
 	async_p2dq_enqueue(&pro, p, enq_flags);
 	complete_p2dq_enqueue(&pro, p);
 }
diff --git a/scheds/rust/scx_p2dq/src/bpf/types.h b/scheds/rust/scx_p2dq/src/bpf/types.h
@@ -53,6 +53,7 @@ struct cpu_ctx {
 	u64				mig_dsq;
 	u64				llc_dsq;
 	u64				max_load_dsq;
+	u32				running_weight;  /* Weight of currently running task */
 
 	scx_atq_t			*mig_atq;
 	scx_dhq_t			*mig_dhq;
diff --git a/scheds/rust/scx_p2dq/src/lib.rs b/scheds/rust/scx_p2dq/src/lib.rs
@@ -321,6 +321,14 @@ pub struct SchedulerOpts {
     #[clap(long, default_value_t = true, action = clap::ArgAction::Set)]
     pub enable_pelt: bool,
 
+    /// Enable latency priority system (uses task nice value)
+    #[clap(long, action = clap::ArgAction::SetTrue)]
+    pub latency_priority: bool,
+
+    /// Enable wakeup preemption for latency-critical tasks
+    #[clap(long, action = clap::ArgAction::SetTrue)]
+    pub wakeup_preemption: bool,
+
     #[clap(flatten, next_help_heading = "Topology Options")]
     pub topo: TopologyArgs,
 }
@@ -449,6 +457,10 @@ macro_rules! init_open_skel {
             rodata.p2dq_config.keep_running_enabled = MaybeUninit::new(opts.keep_running);
             rodata.p2dq_config.pelt_enabled = MaybeUninit::new(opts.enable_pelt);
 
+            // Latency priority config
+            rodata.latency_config.latency_priority_enabled = MaybeUninit::new(opts.latency_priority);
+            rodata.latency_config.wakeup_preemption_enabled = MaybeUninit::new(opts.wakeup_preemption);
+
             rodata.debug = verbose as u32;
             rodata.nr_cpu_ids = *NR_CPU_IDS as u32;