feat(scheduler): instrumentation to locate the wedge in spiceai/spiceai#10832

phillipleblanc · phillipleblanc · commit 6b96f6a937bf · 2026-05-14T21:27:26.000+09:00
Adds opt-in diagnostic instrumentation aimed at identifying which call
path is wedging the QueryStageScheduler event loop in production. Pure
logging; no behavior change in the happy path.

  1. ballista/core/src/lock_tracing.rs

     Helpers traced_write / traced_read wrap tokio::sync::RwLock
     acquisitions to log:
       - acquire_ms if >= 100ms (real contention)
       - hold_ms   if >= 500ms (long critical sections)
     Each call site provides a static label so a slow-acquire or
     slow-hold message in the log points directly at the offending
     function.

  2. Lock instrumentation applied at every execution_graph
     read/write across task_manager.rs (update_job,
     update_task_statuses helpers, abort_job, succeed_job,
     get_job_status, get_job_execution_graph,
     get_available_task_count, executor_lost, the metrics
     read in total_pending_tasks) and cluster/mod.rs
     (bind_task_bias, bind_task_round_robin,
     bind_task_consistent_hash).

  3. ballista/core/src/event_loop.rs

     EventLoop::run now stopwatches each on_receive call and warns
     if a handler runs >= 1s. The event label is logged via Debug so
     a wedged handler is identified by event variant.

     EventSender::post_event stopwatches the send().await and warns
     if it parks >= 100ms (mpsc channel full and consumer parked).
     Requires E: Debug on EventLoop::run and ::start; the only
     production event type (QueryStageSchedulerEvent) already
     hand-rolls Debug.

  4. ballista/scheduler/src/scheduler_server/mod.rs

     Adds start_diagnostic_dump_loop that, every 30s, calls a new
     TaskManager::diagnostic_snapshot which try_reads each active
     job graph and logs one line per stage:
       diagnostic_dump: job_id=X stage_id=Y variant=Running \
         partitions=96 assigned=64 unassigned=32 job_status=Running(...)
     The unassigned count is the single most diagnostic number for
     #10832: in production we see partitions stuck bound to no
     executor while the cluster has live slots. A graph that fails
     try_read (write lock held) is logged as graph_locked=true.

The four signals together let a single production run identify:
  - whether the wedge is in an event handler (slow on_receive)
  - or in the event channel (slow post_event)
  - or in a specific lock acquire / hold (slow rwlock_*)
  - and the stage-level shape of the wedged graph (diagnostic_dump)

Verified: cargo build, cargo test -p ballista-core -p ballista-scheduler
(79 passed, 1 ignored), cargo clippy -D warnings, cargo fmt --check
all clean.
diff --git a/ballista/core/src/event_loop.rs b/ballista/core/src/event_loop.rs
@@ -17,15 +17,27 @@
 
 //! Event loop infrastructure for asynchronous message processing.
 
+use std::fmt::Debug;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::{Duration, Instant};
 
 use async_trait::async_trait;
-use log::{error, info};
+use log::{error, info, warn};
 use tokio::sync::mpsc;
 
 use crate::error::{BallistaError, Result};
 
+/// Threshold at or above which an `on_receive` duration is logged at warn.
+/// Background tasks (queued events) are expected to be sub-second; anything
+/// over a second is diagnostic.
+const SLOW_ON_RECEIVE_THRESHOLD: Duration = Duration::from_secs(1);
+
+/// Threshold at or above which an `EventSender::post_event` await duration
+/// is logged at warn. Indicates the underlying mpsc channel is full and the
+/// sender is parked waiting on the consumer.
+const SLOW_POST_EVENT_THRESHOLD: Duration = Duration::from_millis(100);
+
 /// Trait defining actions to be performed in response to events in an event loop.
 #[async_trait]
 pub trait EventAction<E>: Send + Sync {
@@ -75,7 +87,10 @@ impl<E: Send + 'static> EventLoop<E> {
         }
     }
 
-    fn run(&self, mut rx_event: mpsc::Receiver<E>) {
+    fn run(&self, mut rx_event: mpsc::Receiver<E>)
+    where
+        E: Debug,
+    {
         assert!(
             self.tx_event.is_some(),
             "The event sender should be initialized first!"
@@ -88,7 +103,21 @@ impl<E: Send + 'static> EventLoop<E> {
             info!("Starting the event loop {name}");
             while !stopped.load(Ordering::SeqCst) {
                 if let Some(event) = rx_event.recv().await {
-                    if let Err(e) = action.on_receive(event, &tx_event, &rx_event).await {
+                    // Diagnostic: time every on_receive call and warn once a
+                    // slow handler returns. A handler that never returns logs
+                    // nothing here. See spiceai/spiceai#10832.
+                    let started = Instant::now();
+                    let event_label = format!("{event:?}");
+                    let result = action.on_receive(event, &tx_event, &rx_event).await;
+                    let elapsed = started.elapsed();
+                    if elapsed >= SLOW_ON_RECEIVE_THRESHOLD {
+                        warn!(
+                            "slow on_receive in event_loop={name}: \
+                             event={event_label:.200} elapsed_ms={}",
+                            elapsed.as_millis()
+                        );
+                    }
+                    if let Err(e) = result {
                         error!("Fail to process event due to {e}");
                         action.on_error(e);
                     }
@@ -102,7 +131,10 @@ impl<E: Send + 'static> EventLoop<E> {
     }
 
     /// Starts the event loop, spawning a background task to process events.
-    pub fn start(&mut self) -> Result<()> {
+    pub fn start(&mut self) -> Result<()>
+    where
+        E: Debug,
+    {
         if self.stopped.load(Ordering::SeqCst) {
             return Err(BallistaError::General(format!(
                 "{} has already been stopped",
@@ -151,9 +183,20 @@ impl<E> EventSender<E> {
 
     /// Posts an event to the event loop asynchronously.
     pub async fn post_event(&self, event: E) -> Result<()> {
-        self.tx_event
-            .send(event)
-            .await
+        // Diagnostic: a slow `send().await` here means the mpsc channel was
+        // full. `capacity()` is the number of free slots left after the
+        // send, not the channel's maximum. See spiceai/spiceai#10832.
+        let started = Instant::now();
+        let result = self.tx_event.send(event).await;
+        let elapsed = started.elapsed();
+        if elapsed >= SLOW_POST_EVENT_THRESHOLD {
+            warn!(
+                "slow post_event: elapsed_ms={} available_capacity={}",
+                elapsed.as_millis(),
+                self.tx_event.capacity(),
+            );
+        }
+        result
             .map_err(|e| BallistaError::General(format!("Fail to send event due to {e}")))
     }
 }
diff --git a/ballista/core/src/lib.rs b/ballista/core/src/lib.rs
@@ -47,6 +47,9 @@ pub mod event_loop;
 pub mod execution_plans;
 /// Extension traits and utilities for DataFusion integration.
 pub mod extension;
+/// Diagnostic timing wrappers for `tokio::sync::RwLock`. See
+/// [`lock_tracing`] for the motivating issue.
+pub mod lock_tracing;
 /// Object store configuration and utilities for distributed file access.
 pub mod object_store;
 /// Query planning utilities for distributed execution.
diff --git a/ballista/core/src/lock_tracing.rs b/ballista/core/src/lock_tracing.rs
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Diagnostic instrumentation for tokio `RwLock` hot paths in the scheduler.
+//!
+//! Wraps `RwLock::write().await` and `RwLock::read().await` with timing
+//! around acquire (time-to-acquire) and hold (time-from-acquire-to-drop).
+//! Both thresholds surface as `warn!` logs so a stuck or slow path is
+//! visible without overwhelming the log in healthy operation.
+//!
+//! Added while investigating spiceai/spiceai#10832 — the scheduler's
+//! `QueryStageScheduler` event loop wedges mid-query, and three call paths
+//! share the per-job `execution_graph` write lock. The pattern matches a
+//! leaked or never-released `RwLockWriteGuard`. Instrumenting these
+//! acquisitions identifies (a) which call site holds the lock, (b) for how
+//! long, and (c) whether the contention is at acquire-time or hold-time.
+
+use std::ops::{Deref, DerefMut};
+use std::time::{Duration, Instant};
+
+use log::warn;
+use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+
+/// Threshold for slow acquisitions. Healthy paths acquire in microseconds;
+/// over 100ms means real contention.
+const SLOW_ACQUIRE_THRESHOLD: Duration = Duration::from_millis(100);
+
+/// Threshold for slow holds. A graph write lock held for over 500ms is
+/// suspicious — the critical sections under it are in-memory bookkeeping
+/// plus DashMap operations.
+const SLOW_HOLD_THRESHOLD: Duration = Duration::from_millis(500);
+
+/// Acquires a write lock on `lock`, warning if the acquisition is slow;
+/// the returned guard warns on drop if the lock was held too long.
+///
+/// `label` should identify the call site, e.g. `"task_manager::update_job"`.
+pub async fn traced_write<'a, T>(
+    lock: &'a RwLock<T>,
+    label: &'static str,
+) -> TracedWriteGuard<'a, T> {
+    let acquire_start = Instant::now();
+    let guard = lock.write().await;
+    let acquire = acquire_start.elapsed();
+    if acquire >= SLOW_ACQUIRE_THRESHOLD {
+        warn!(
+            "slow rwlock_write acquire: label={label} acquire_ms={}",
+            acquire.as_millis()
+        );
+    }
+    TracedWriteGuard {
+        guard,
+        held_since: Instant::now(),
+        label,
+    }
+}
+
+/// Acquires a read lock on `lock`, warning if the acquisition is slow;
+/// the returned guard warns on drop if the lock was held too long.
+pub async fn traced_read<'a, T>(
+    lock: &'a RwLock<T>,
+    label: &'static str,
+) -> TracedReadGuard<'a, T> {
+    let acquire_start = Instant::now();
+    let guard = lock.read().await;
+    let acquire = acquire_start.elapsed();
+    if acquire >= SLOW_ACQUIRE_THRESHOLD {
+        warn!(
+            "slow rwlock_read acquire: label={label} acquire_ms={}",
+            acquire.as_millis()
+        );
+    }
+    TracedReadGuard {
+        guard,
+        held_since: Instant::now(),
+        label,
+    }
+}
+
+/// Write guard that warns on drop if the lock was held for at least
+/// `SLOW_HOLD_THRESHOLD`; a guard that is never dropped logs nothing.
+pub struct TracedWriteGuard<'a, T> {
+    guard: RwLockWriteGuard<'a, T>,
+    held_since: Instant,
+    label: &'static str,
+}
+
+impl<T> Deref for TracedWriteGuard<'_, T> {
+    type Target = T;
+    fn deref(&self) -> &T {
+        &self.guard
+    }
+}
+
+impl<T> DerefMut for TracedWriteGuard<'_, T> {
+    fn deref_mut(&mut self) -> &mut T {
+        &mut self.guard
+    }
+}
+
+impl<T> Drop for TracedWriteGuard<'_, T> {
+    fn drop(&mut self) {
+        let held = self.held_since.elapsed();
+        if held >= SLOW_HOLD_THRESHOLD {
+            warn!(
+                "slow rwlock_write hold: label={} hold_ms={}",
+                self.label,
+                held.as_millis()
+            );
+        }
+    }
+}
+
+/// Read guard that warns on drop if the lock was held for at least
+/// `SLOW_HOLD_THRESHOLD`; a guard that is never dropped logs nothing.
+pub struct TracedReadGuard<'a, T> {
+    guard: RwLockReadGuard<'a, T>,
+    held_since: Instant,
+    label: &'static str,
+}
+
+impl<T> Deref for TracedReadGuard<'_, T> {
+    type Target = T;
+    fn deref(&self) -> &T {
+        &self.guard
+    }
+}
+
+impl<T> Drop for TracedReadGuard<'_, T> {
+    fn drop(&mut self) {
+        let held = self.held_since.elapsed();
+        if held >= SLOW_HOLD_THRESHOLD {
+            warn!(
+                "slow rwlock_read hold: label={} hold_ms={}",
+                self.label,
+                held.as_millis()
+            );
+        }
+    }
+}
diff --git a/ballista/scheduler/src/cluster/mod.rs b/ballista/scheduler/src/cluster/mod.rs
@@ -32,6 +32,7 @@ use log::debug;
 
 use ballista_core::consistent_hash::ConsistentHash;
 use ballista_core::error::Result;
+use ballista_core::lock_tracing::traced_write;
 use ballista_core::serde::protobuf::{
     AvailableTaskSlots, ExecutorHeartbeat, JobStatus, job_status,
 };
@@ -456,7 +457,7 @@ pub(crate) async fn bind_task_bias(
             debug!("Job {job_id} is not in running status and will be skipped");
             continue;
         }
-        let mut graph = job_info.execution_graph.write().await;
+        let mut graph = traced_write(&job_info.execution_graph, "bind_task_bias").await;
 
         let session_id = graph.session_id().to_string();
         let mut black_list = vec![];
@@ -558,7 +559,8 @@ pub(crate) async fn bind_task_round_robin(
             debug!("Job {job_id} is not in running status and will be skipped");
             continue;
         }
-        let mut graph = job_info.execution_graph.write().await;
+        let mut graph =
+            traced_write(&job_info.execution_graph, "bind_task_round_robin").await;
 
         let session_id = graph.session_id().to_string();
         let mut black_list = vec![];
@@ -714,7 +716,8 @@ pub(crate) async fn bind_task_consistent_hash(
             debug!("Job {job_id} is not in running status and will be skipped");
             continue;
         }
-        let mut graph = job_info.execution_graph.write().await;
+        let mut graph =
+            traced_write(&job_info.execution_graph, "bind_task_consistent_hash").await;
         let session_id = graph.session_id().to_string();
         let mut black_list = vec![];
         while let Some((running_stage, task_id_gen)) =
diff --git a/ballista/scheduler/src/scheduler_server/mod.rs b/ballista/scheduler/src/scheduler_server/mod.rs
@@ -35,7 +35,7 @@ use crate::cluster::BallistaCluster;
 use crate::config::SchedulerConfig;
 use crate::metrics::SchedulerMetricsCollector;
 use ballista_core::serde::scheduler::{ExecutorData, ExecutorMetadata};
-use log::{debug, error, warn};
+use log::{debug, error, info, warn};
 
 use crate::scheduler_server::event::QueryStageSchedulerEvent;
 use crate::scheduler_server::query_stage_scheduler::QueryStageScheduler;
@@ -191,6 +191,7 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> SchedulerServer<T
         self.query_stage_event_loop.start()?;
         self.expire_dead_executors()?;
         self.start_pending_tasks_metrics_loop();
+        self.start_diagnostic_dump_loop();
 
         Ok(())
     }
@@ -393,6 +394,33 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> SchedulerServer<T
         Ok(())
     }
 
+    /// Spawns a background task that periodically dumps a snapshot of every
+    /// active job's per-stage state, plus alive-executor and running-job counts.
+    ///
+    /// Added for spiceai/spiceai#10832: the production wedge appears as a
+    /// stage with unassigned partitions that never decrease (32 of 96
+    /// partitions stuck bound to no executor while the cluster has live
+    /// slots). This dump prints `unassigned` per Running stage every 30s
+    /// so the wedge becomes visible in scheduler logs without requiring
+    /// tokio-console or external tooling.
+    ///
+    /// Uses `try_read` on each graph so a held write lock is itself
+    /// diagnostic — the loop logs `graph_locked=true` rather than parking.
+    fn start_diagnostic_dump_loop(&self) {
+        let state = self.state.clone();
+        tokio::task::spawn(async move {
+            const INTERVAL: Duration = Duration::from_secs(30);
+            // Sleep first to avoid an empty-state dump at startup.
+            loop {
+                tokio::time::sleep(INTERVAL).await;
+                state.task_manager.diagnostic_snapshot();
+                let alive = state.executor_manager.get_alive_executors().len();
+                let running = state.task_manager.running_job_number();
+                info!("diagnostic_dump: alive_executors={alive} running_jobs={running}");
+            }
+        });
+    }
+
     /// Spawns a background task that periodically updates the pending tasks metric.
     ///
     /// This metric requires iterating over all active jobs and acquiring read locks,
diff --git a/ballista/scheduler/src/state/task_manager.rs b/ballista/scheduler/src/state/task_manager.rs