Skip to content

Commit abd22d4

Browse files
committed
Merge branch 'main' into grpc-job
2 parents 8da124d + 93d3394 commit abd22d4

9 files changed

Lines changed: 371 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ members = [
55
"components/spider-derive",
66
"components/spider-execution-manager",
77
"components/spider-proto-rust",
8+
"components/spider-scheduler",
89
"components/spider-storage",
910
"components/spider-task-executor",
1011
"components/spider-tdl",
@@ -22,6 +23,7 @@ default-members = [
2223
"components/spider-derive",
2324
"components/spider-execution-manager",
2425
"components/spider-proto-rust",
26+
"components/spider-scheduler",
2527
"components/spider-storage",
2628
"components/spider-task-executor",
2729
"components/spider-tdl",
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[package]
2+
name = "spider-scheduler"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[lib]
7+
name = "spider_scheduler"
8+
path = "src/lib.rs"
9+
10+
[dependencies]
11+
async-trait = "0.1.89"
12+
spider-core = { path = "../spider-core" }
13+
thiserror = "2.0.18"
14+
tokio-util = "0.7.18"
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//! The abstract core of a Spider scheduler.
2+
3+
use async_trait::async_trait;
4+
5+
use crate::{
6+
dispatch_queue::DispatchQueueSink,
7+
error::SchedulerError,
8+
storage_client::SchedulerStorageClient,
9+
};
10+
11+
/// An abstracted core for a scheduling algorithm.
12+
///
13+
/// A core owns its decision loop: it polls the inbound queue through a [`SchedulerStorageClient`],
14+
/// applies its algorithm (reading storage as needed for placement), and writes assignments to a
15+
/// [`DispatchQueueSink`]. Modeling the algorithm as a trait lets different scheduling strategies
16+
/// share the same runtime entry point.
17+
#[async_trait]
18+
pub trait SchedulerCore: Send {
19+
/// The storage client used by the core to poll and read for placement decisions.
20+
type StorageClient: SchedulerStorageClient;
21+
22+
/// The dispatch sink the core writes assignments to.
23+
type Sink: DispatchQueueSink;
24+
25+
/// Runs the scheduling loop until `cancellation_token` is triggered.
26+
///
27+
/// The core polls the inbound queue through `storage_client`, applies its scheduling algorithm,
28+
/// and writes assignments to `sink`, repeating until `cancellation_token` is fired, at which
29+
/// point it returns.
30+
///
31+
/// # Parameters
32+
///
33+
/// * `storage_client` - The storage client used to poll the inbound queue and read state for
34+
/// placement.
35+
/// * `sink` - The dispatch sink that assignments are written to.
36+
/// * `cancellation_token` - The token to signal the scheduling loop to stop.
37+
///
38+
/// # Errors
39+
///
40+
/// Returns a [`SchedulerError`] instance indicating an irrecoverable error.
41+
async fn run(
42+
&mut self,
43+
storage_client: Self::StorageClient,
44+
sink: Self::Sink,
45+
cancellation_token: tokio_util::sync::CancellationToken,
46+
) -> Result<(), SchedulerError>;
47+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//! The dispatching queue that decouples the scheduler core's placement decisions from the
2+
//! execution-manager-facing service.
3+
4+
use std::time::Duration;
5+
6+
use async_trait::async_trait;
7+
use spider_core::types::id::SessionId;
8+
9+
use crate::{error::SchedulerError, types::TaskAssignment};
10+
11+
/// The writer side of the dispatching queue used by the scheduler core.
12+
#[async_trait]
13+
pub trait DispatchQueueSink: Send + Sync + Clone {
14+
/// Enqueues a task assignment for execution managers to consume.
15+
///
16+
/// # Parameters
17+
///
18+
/// * `assignment` - The task assignment to enqueue.
19+
///
20+
/// # Errors
21+
///
22+
/// Returns an error if:
23+
///
24+
/// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed.
25+
async fn enqueue(&self, assignment: TaskAssignment) -> Result<(), SchedulerError>;
26+
27+
/// Bumps the session ID and invalidates all queued task assignments.
28+
///
29+
/// # Parameters
30+
///
31+
/// * `new_session_id` - The new session ID. Must be greater than the current session ID.
32+
///
33+
/// # Errors
34+
///
35+
/// Returns an error if:
36+
///
37+
/// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed.
38+
/// * [`SchedulerError::InvalidSessionId`] if the new session ID is not greater than the current
39+
/// session ID.
40+
async fn bump_session_id(&self, new_session_id: SessionId) -> Result<(), SchedulerError>;
41+
42+
/// # Returns
43+
///
44+
/// The current size of the dispatch queue.
45+
fn size(&self) -> usize;
46+
}
47+
48+
/// The reader side of the dispatching queue, drained by the execution-manager-facing service.
49+
#[async_trait]
50+
pub trait DispatchQueueSource: Send + Sync + Clone {
51+
/// Dequeues the next task assignment for an execution manager to execute.
52+
///
53+
/// # Parameters
54+
///
55+
/// * `wait_time` - The maximum amount of time to wait for a task assignment.
56+
///
57+
/// # Returns
58+
///
59+
/// `None` if no task assignment is available within the specified wait time, or a tuple
60+
/// containing:
61+
///
62+
/// * The storage session associated with the assignment.
63+
/// * The next task assignment ready to execute.
64+
///
65+
/// # Errors
66+
///
67+
/// Returns an error if:
68+
///
69+
/// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed.
70+
async fn dequeue(
71+
&self,
72+
wait_time: Duration,
73+
) -> Result<Option<(SessionId, TaskAssignment)>, SchedulerError>;
74+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
//! The error types used in this crate.
2+
3+
use spider_core::types::id::{JobId, SessionId};
4+
5+
/// Errors returned by [`crate::storage_client::SchedulerStorageClient`] operations.
6+
#[derive(Debug, thiserror::Error)]
7+
pub enum StorageClientError {
8+
/// The inbound queue is closed and can no longer yield ready entries.
9+
#[error("inbound queue is closed")]
10+
InboundClosed,
11+
12+
/// No job with the requested identifier exists.
13+
#[error("job not found: {0:?}")]
14+
JobNotFound(JobId),
15+
}
16+
17+
/// Errors returned by the scheduler runtime and its components.
18+
#[derive(Debug, thiserror::Error)]
19+
pub enum SchedulerError {
20+
/// Forwarded from the storage client.
21+
#[error(transparent)]
22+
Storage(#[from] StorageClientError),
23+
24+
/// The dispatching queue is closed and can no longer accept assignments.
25+
#[error("dispatching queue is closed")]
26+
DispatchQueueClosed,
27+
28+
/// The session ID is invalid.
29+
#[error("invalid session ID: {0:?}")]
30+
InvalidSessionId(SessionId),
31+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
//! Trait and type abstractions for the Spider scheduler.
2+
//!
3+
//! The scheduler is the serial decision maker that turns ready tasks discovered by the storage
4+
//! layer into assignments for execution managers. It owns placement and ordering policy, not
5+
//! dependency resolution: storage decides *what* is ready, and the scheduler decides *in what
6+
//! order* and *with what throttling* ready tasks are offered to the fleet.
7+
//!
8+
//! The crate defines three trait seams wired into a single pipeline — a storage client that polls
9+
//! the ready queue, a core that makes serial decisions, and a dispatching queue that fans those
10+
//! decisions out to execution managers:
11+
//!
12+
//! ```text
13+
//! storage ── authoritative ready queue (owned by the storage layer, not this crate)
14+
//! │
15+
//! │ poll_ready / poll_commit_ready / poll_cleanup_ready (SchedulerStorageClient)
16+
//! ▼
17+
//! ┌───────────────────┐
18+
//! │ SchedulerCore │ serial loop: poll → decide → enqueue
19+
//! └───────────────────┘
20+
//! │
21+
//! │ enqueue (DispatchQueueSink — writer side)
22+
//! ▼
23+
//! ┌───────────────────┐
24+
//! │ dispatch queue │ bounded SPMC; a full queue back-pressures the core
25+
//! └───────────────────┘
26+
//! │
27+
//! │ dequeue (DispatchQueueSource — reader side)
28+
//! ▼
29+
//! ┌───────────────────┐
30+
//! │ scheduler service │ ──▶ execution managers (concurrent fan-out)
31+
//! └───────────────────┘
32+
//! ```
33+
34+
pub mod core;
35+
pub mod dispatch_queue;
36+
pub mod error;
37+
pub mod storage_client;
38+
pub mod types;
39+
40+
pub use crate::{
41+
core::SchedulerCore,
42+
dispatch_queue::{DispatchQueueSink, DispatchQueueSource},
43+
error::{SchedulerError, StorageClientError},
44+
storage_client::SchedulerStorageClient,
45+
types::{InboundEntry, TaskAssignment},
46+
};
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
//! The scheduler's view of the storage layer, abstracting inbound polling and placement-time reads.
2+
3+
use std::time::Duration;
4+
5+
use async_trait::async_trait;
6+
use spider_core::{
7+
job::JobState,
8+
types::id::{JobId, SessionId},
9+
};
10+
11+
use crate::{error::StorageClientError, types::InboundEntry};
12+
13+
/// The scheduler's view of the storage layer.
14+
///
15+
/// Abstracts the storage-owned inbound queue and the read-only queries a scheduling algorithm
16+
/// needs to make placement decisions. Modeled as a trait so the scheduler runtime can be driven by
17+
/// a real storage client in production or a mock in tests.
18+
#[async_trait]
19+
pub trait SchedulerStorageClient: Send + Sync + Clone {
20+
/// Polls the regular-task lane of the storage-owned inbound queue for ready tasks.
21+
///
22+
/// # Parameters
23+
///
24+
/// * `max_items` - The maximum number of entries to return from a single poll.
25+
/// * `wait` - The maximum duration to block waiting for ready entries on the storage side.
26+
///
27+
/// # Returns
28+
///
29+
/// A tuple on success, containing:
30+
///
31+
/// * The storage session the poll was served under.
32+
/// * The ready regular tasks drained from the lane.
33+
///
34+
/// # Errors
35+
///
36+
/// Returns an error if:
37+
///
38+
/// * [`StorageClientError::InboundClosed`] if the regular-task lane is closed and can no longer
39+
/// yield entries.
40+
async fn poll_ready(
41+
&self,
42+
max_items: usize,
43+
wait: Duration,
44+
) -> Result<(SessionId, Vec<InboundEntry>), StorageClientError>;
45+
46+
/// Polls the commit-task lane of the storage-owned inbound queue for ready tasks.
47+
///
48+
/// # Parameters
49+
///
50+
/// * `max_items` - The maximum number of entries to return from a single poll.
51+
/// * `wait` - The maximum duration to block waiting for ready entries on the storage side.
52+
///
53+
/// # Returns
54+
///
55+
/// A tuple on success, containing:
56+
///
57+
/// * The storage session the poll was served under.
58+
/// * The ready commit tasks drained from the lane.
59+
///
60+
/// # Errors
61+
///
62+
/// Returns an error if:
63+
///
64+
/// * [`StorageClientError::InboundClosed`] if the commit-task lane is closed and can no longer
65+
/// yield entries.
66+
async fn poll_commit_ready(
67+
&self,
68+
max_items: usize,
69+
wait: Duration,
70+
) -> Result<(SessionId, Vec<InboundEntry>), StorageClientError>;
71+
72+
/// Polls the cleanup-task lane of the storage-owned inbound queue for ready tasks.
73+
///
74+
/// # Parameters
75+
///
76+
/// * `max_items` - The maximum number of entries to return from a single poll.
77+
/// * `wait` - The maximum duration to block waiting for ready entries on the storage side.
78+
///
79+
/// # Returns
80+
///
81+
/// A tuple on success, containing:
82+
///
83+
/// * The storage session the poll was served under.
84+
/// * The ready cleanup tasks drained from the lane.
85+
///
86+
/// # Errors
87+
///
88+
/// Returns an error if:
89+
///
90+
/// * [`StorageClientError::InboundClosed`] if the cleanup-task lane is closed and can no longer
91+
/// yield entries.
92+
async fn poll_cleanup_ready(
93+
&self,
94+
max_items: usize,
95+
wait: Duration,
96+
) -> Result<(SessionId, Vec<InboundEntry>), StorageClientError>;
97+
98+
/// Reads the current state of a job.
99+
///
100+
/// # Parameters
101+
///
102+
/// * `job_id` - The identifier of the job to query.
103+
///
104+
/// # Returns
105+
///
106+
/// The job's current [`JobState`] on success.
107+
///
108+
/// # Errors
109+
///
110+
/// Returns an error if:
111+
///
112+
/// * [`StorageClientError::JobNotFound`] if no job with the given identifier exists.
113+
async fn job_state(&self, job_id: JobId) -> Result<JobState, StorageClientError>;
114+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//! The data types the scheduler exchanges with the storage layer and execution managers.
2+
3+
use spider_core::types::id::{JobId, ResourceGroupId, TaskId};
4+
5+
/// A ready task drained from the storage-owned inbound queue.
6+
///
7+
/// The storage client flattens storage's three ready lanes (regular, commit, and cleanup tasks)
8+
/// into this uniform entry, resolving each to its [`TaskId`] so the scheduler core can treat every
9+
/// ready task identically.
10+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11+
pub struct InboundEntry {
12+
/// The resource group that owns the job.
13+
pub resource_group_id: ResourceGroupId,
14+
15+
/// The job the task belongs to.
16+
pub job_id: JobId,
17+
18+
/// The ready task.
19+
pub task_id: TaskId,
20+
}
21+
22+
/// A task placement decision written by the scheduler core to the dispatching queue.
23+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
24+
pub struct TaskAssignment {
25+
/// The resource group that owns the job.
26+
pub resource_group_id: ResourceGroupId,
27+
28+
/// The job the task belongs to.
29+
pub job_id: JobId,
30+
31+
/// The task to dispatch.
32+
pub task_id: TaskId,
33+
}

0 commit comments

Comments
 (0)