Eventual-Inc
diff --git a/‎daft/runners/__init__.py‎
Lines changed: 35 additions & 0 deletions b/‎daft/runners/__init__.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎daft/runners/flotilla.py‎
Lines changed: 5 additions & 0 deletions b/‎daft/runners/flotilla.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/distributed/ray.md‎
Lines changed: 22 additions & 0 deletions b/‎docs/distributed/ray.md‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎src/daft-distributed/src/python/ray/worker.rs‎
Lines changed: 58 additions & 1 deletion b/‎src/daft-distributed/src/python/ray/worker.rs‎
Lines changed: 58 additions & 1 deletion
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import os
+
 from typing import TYPE_CHECKING
 from daft.daft import get_runner as _get_runner_internal
 from daft.daft import get_or_create_runner as _get_or_create_runner
@@ -66,20 +68,53 @@ def set_runner_ray(
     address: str | None = None,
     noop_if_initialized: bool = False,
     force_client_mode: bool = False,
+    *,
+    downscale_enabled: bool | None = None,
+    downscale_idle_seconds: int | None = None,
+    min_survivor_workers: int | None = None,
+    pending_release_exclude_seconds: int | None = None,
 ) -> Runner[PartitionT]:
     """Configure Daft to execute dataframes using the Ray distributed computing framework.
 
     Args:
         address: Ray cluster address to connect to. If None, connects to or starts a local Ray instance.
         noop_if_initialized: If True, skip initialization if Ray is already running.
         force_client_mode: If True, forces Ray to run in client mode.
+        downscale_enabled: Enable/disable retiring idle Ray workers (scale-in). If not provided,
+            falls back to the ``DAFT_AUTOSCALING_DOWNSCALE_ENABLED`` environment variable (default: False).
+        downscale_idle_seconds: Minimum number of seconds a worker must be idle before it becomes eligible
+            for retirement. If not provided, falls back to ``DAFT_AUTOSCALING_DOWNSCALE_IDLE_SECONDS``
+            (default: 60).
+        min_survivor_workers: Minimum number of Ray workers to keep alive even if they are idle.
+            If not provided, falls back to ``DAFT_AUTOSCALING_MIN_SURVIVOR_WORKERS`` (default: 1).
+        pending_release_exclude_seconds: Grace period (TTL) for recently-released worker IDs during
+            worker discovery, to prevent the autoscaler from immediately respawning them. If not
+            provided, falls back to ``DAFT_AUTOSCALING_PENDING_RELEASE_EXCLUDE_SECONDS`` (default: 120).
 
     Returns:
         Runner[PartitionT]: A runner object with the Ray runner's configurations.
 
     Note:
         Can also be configured via environment variable: DAFT_RUNNER=ray
     """
+    # Allow programmatic configuration of autoscaling/downscaling behavior via `daft.set_runner_ray`.
+    # These settings are still backed by environment variables so they can propagate to the Rust
+    # scheduler/worker-manager components without threading configuration throughout the stack.
+    if downscale_enabled is not None:
+        os.environ["DAFT_AUTOSCALING_DOWNSCALE_ENABLED"] = "1" if downscale_enabled else "0"
+    if downscale_idle_seconds is not None:
+        if downscale_idle_seconds < 0:
+            raise ValueError("downscale_idle_seconds must be >= 0")
+        os.environ["DAFT_AUTOSCALING_DOWNSCALE_IDLE_SECONDS"] = str(downscale_idle_seconds)
+    if min_survivor_workers is not None:
+        if min_survivor_workers < 0:
+            raise ValueError("min_survivor_workers must be >= 0")
+        os.environ["DAFT_AUTOSCALING_MIN_SURVIVOR_WORKERS"] = str(min_survivor_workers)
+    if pending_release_exclude_seconds is not None:
+        if pending_release_exclude_seconds < 0:
+            raise ValueError("pending_release_exclude_seconds must be >= 0")
+        os.environ["DAFT_AUTOSCALING_PENDING_RELEASE_EXCLUDE_SECONDS"] = str(pending_release_exclude_seconds)
+
     return _set_runner_ray(
         address=address,
         noop_if_initialized=noop_if_initialized,
 
@@ -515,6 +515,11 @@ def try_autoscale(bundles: list[dict[str, int]]) -> None:
     )
 
 
+def clear_autoscaling_requests() -> None:
+    # Clear any previously requested resources by the Ray autoscaler.
+    try_autoscale(bundles=[])
+
+
 @ray.remote(num_cpus=0)
 class RemoteFlotillaRunner:
     def __init__(
 
@@ -145,3 +145,25 @@ ray job submit \
     The runtime env parameter specifies that Daft should be installed on the Ray workers. Alternative methods of including Daft in the worker dependencies can be found [here](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html).
 
 For more information about Ray jobs, see [Ray docs -> Ray Jobs Overview](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html).
+
+### Autoscaling and downscaling
+
+When Daft runs on a Ray cluster managed by the Ray autoscaler (including KubeRay), it can send scale-up requests based on pending task demand. Ray's autoscaler request API is sticky: without additional coordination, the autoscaler may keep previously requested capacity even when the workload becomes idle.
+
+Daft can optionally retire idle Flotilla workers (scale-in) and clear outstanding autoscaler requests to make it easier for Ray to scale the cluster back down. This feature is **opt-in**.
+
+You can enable it via `set_runner_ray`:
+
+```python
+import daft
+
+daft.set_runner_ray(
+    address="ray://<head_node_host>:10001",
+    downscale_enabled=True,
+    downscale_idle_seconds=60,
+    min_survivor_workers=1,
+    pending_release_exclude_seconds=120,
+)
+```
+
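+Or via environment variables (useful for Ray Jobs / KubeRay manifests): `DAFT_AUTOSCALING_DOWNSCALE_ENABLED` (default: false), `DAFT_AUTOSCALING_DOWNSCALE_IDLE_SECONDS` (default: 60), `DAFT_AUTOSCALING_MIN_SURVIVOR_WORKERS` (default: 1), and `DAFT_AUTOSCALING_PENDING_RELEASE_EXCLUDE_SECONDS` (default: 120). Arguments passed explicitly to `set_runner_ray` take precedence: each one is written to the corresponding environment variable in the calling process, overriding any value that was already set.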
@@ -1,4 +1,8 @@
-use std::{collections::HashMap, sync::Arc};
+use std::{
+    collections::HashMap,
+    sync::Arc,
+    time::{Duration, Instant},
+};
 
 use common_error::DaftResult;
 use pyo3::prelude::*;
@@ -11,6 +15,15 @@ use crate::scheduling::{
 
 type ActiveTaskDetails = HashMap<TaskContext, TaskDetails>;
 
+/// Lifecycle marker recorded on a `RaySwordfishWorker` (held in its `state` field).
+///
+/// The value is bookkeeping only: it is overwritten through `set_state` and no
+/// transition validation is performed.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum ActorState {
+    /// Initial state assigned when the worker is constructed.
+    Ready,
+    /// Set when inserting a task leaves exactly one entry in the active-task map.
+    Busy,
+    /// Set by `mark_task_finished` once no active tasks remain.
+    Idle,
+    /// Set by `release` immediately before the worker's `shutdown` is invoked.
+    Releasing,
+    /// Set by `release` after `shutdown` has returned.
+    Released,
+}
+
 #[pyclass(module = "daft.daft", name = "RaySwordfishWorker", from_py_object)]
 #[derive(Debug, Clone)]
 pub(crate) struct RaySwordfishWorker {
@@ -21,6 +34,8 @@ pub(crate) struct RaySwordfishWorker {
     num_gpus: f64,
     active_task_details: ActiveTaskDetails,
     ip_address: String,
+    last_task_finished_at: Instant,
+    state: ActorState,
 }
 
 #[pymethods]
@@ -42,6 +57,8 @@ impl RaySwordfishWorker {
             total_memory_bytes,
             active_task_details: Default::default(),
             ip_address,
+            last_task_finished_at: Instant::now(),
+            state: ActorState::Ready,
         }
     }
 }
@@ -51,8 +68,16 @@ impl RaySwordfishWorker {
         self.total_memory_bytes
     }
 
+    /// Overwrite the worker's recorded `ActorState` with `state`.
+    ///
+    /// The previous value is discarded and no transition check is applied.
+    pub fn set_state(&mut self, state: ActorState) {
+        self.state = state;
+    }
+
     pub fn mark_task_finished(&mut self, task_context: &TaskContext) {
+        // Drop the finished task from the in-flight map. The result of `remove`
+        // is ignored, so a `task_context` that was never tracked is accepted silently.
         self.active_task_details.remove(task_context);
+        // Refreshed on every call, even while other tasks are still in flight;
+        // `idle_duration` only consults it once the map is empty.
+        // NOTE(review): this is also refreshed when `task_context` was not tracked,
+        // which restarts the idle clock -- confirm that is intended.
+        self.last_task_finished_at = Instant::now();
+        // Flip to `Idle` only when the last in-flight task has been removed.
+        if self.active_task_details.is_empty() {
+            self.set_state(ActorState::Idle);
+        }
     }
 
     pub fn submit_tasks(
@@ -75,6 +100,9 @@ impl RaySwordfishWorker {
 
             self.active_task_details
                 .insert(task_context.clone(), task_details);
+            if self.active_task_details.len() == 1 {
+                self.set_state(ActorState::Busy);
+            }
 
             let ray_task_result_handle = RayTaskResultHandle::new(
                 task_context,
@@ -89,12 +117,41 @@ impl RaySwordfishWorker {
         Ok(task_handles)
     }
 
+    /// Returns `true` when `active_task_details` holds no entries.
+    ///
+    /// Only the in-flight task map is inspected; the `state` field is not read.
+    pub fn is_idle(&self) -> bool {
+        self.active_task_details.is_empty()
+    }
+
+    /// Time elapsed from `last_task_finished_at` up to `now` for an idle worker.
+    ///
+    /// Yields a zero `Duration` while the worker still has active tasks, and also
+    /// when `now` is earlier than `last_task_finished_at` (the subtraction
+    /// saturates instead of panicking). Because `last_task_finished_at` is
+    /// initialised at construction, a worker that never ran a task is measured
+    /// from its creation time.
+    pub fn idle_duration(&self, now: Instant) -> Duration {
+        // Guard clause: a worker with in-flight tasks has accumulated no idle time.
+        if !self.is_idle() {
+            return Duration::ZERO;
+        }
+        now.saturating_duration_since(self.last_task_finished_at)
+    }
+
     #[allow(dead_code)]
+    /// Invoke the zero-argument `shutdown` method on `ray_worker_handle`.
+    ///
+    /// The return value of the Python call is discarded.
+    ///
+    /// # Panics
+    ///
+    /// Panics with "Failed to shutdown RaySwordfishWorker" if the Python call
+    /// returns an error.
     pub fn shutdown(&self, py: Python<'_>) {
         self.ray_worker_handle
             .call_method0(py, pyo3::intern!(py, "shutdown"))
             .expect("Failed to shutdown RaySwordfishWorker");
     }
+
+    /// Retire this worker by shutting down its actor, provided it has no
+    /// in-flight tasks.
+    ///
+    /// The call does nothing (apart from logging a warning) while tasks are
+    /// still active, and does nothing once the worker has already reached
+    /// `ActorState::Released`, so it is safe to invoke repeatedly.
+    ///
+    /// On success the state moves `Releasing` -> `Released`.
+    ///
+    /// # Panics
+    ///
+    /// `shutdown` panics when the Python-side call fails; in that case the state
+    /// is left at `ActorState::Releasing`. That state is deliberately not treated
+    /// as terminal here, so a later `release` call can retry the shutdown.
+    pub fn release(&mut self, py: Python<'_>) {
+        let inflight = self.active_task_details.len();
+        if inflight > 0 {
+            tracing::warn!(
+                target: "ray_swordfish_worker",
+                worker_id = %self.worker_id,
+                inflight_tasks = inflight,
+                "Cannot release worker because it has active tasks."
+            );
+            return;
+        }
+
+        // Idempotence guard: the actor was already shut down by an earlier call.
+        // Sending `shutdown` again would be redundant, and because `shutdown`
+        // panics on any error it could abort the caller.
+        if matches!(self.state, ActorState::Released) {
+            return;
+        }
+
+        self.set_state(ActorState::Releasing);
+        self.shutdown(py);
+        self.set_state(ActorState::Released);
+    }
 }
 
 impl Worker for RaySwordfishWorker {