Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
9dc337d
research: iris tmpfs workdir and du monitoring findings
rjpower Mar 15, 2026
ede2d1a
clean up research artifact
rjpower Mar 15, 2026
df11570
Add prepare_workdir/cleanup_workdir to ContainerRuntime protocol
rjpower Mar 15, 2026
e0b6324
Wire prepare_workdir/cleanup_workdir into task lifecycle, replace du …
rjpower Mar 15, 2026
bbd6888
Add tests for tmpfs workdir management and disk_usage monitoring
rjpower Mar 15, 2026
a925138
fix(iris): cleanup_workdir warns instead of raising, document disk_us…
rjpower Mar 15, 2026
2850e03
refactor(iris): replace prepare/cleanup_workdir stubs with declarativ…
rjpower Mar 15, 2026
2db0bdd
fix(iris): grant SYS_ADMIN to worker container for tmpfs mounts
rjpower Mar 15, 2026
2c568c5
fix(iris): serialize dashboard build across xdist workers with filelock
rjpower Mar 15, 2026
87fbe9e
refactor(iris): replace mount tuples + WorkdirSpec with unified Mount…
rjpower Mar 15, 2026
1ef8d92
fix(iris): read workdir_host_path from config in DockerRuntime.create…
rjpower Mar 15, 2026
14a3b59
iris: replace filelock with fcntl in e2e conftest
github-actions[bot] Mar 15, 2026
af32041
iris: cleanup tmpfs on staging failure, keep failed umounts tracked
github-actions[bot] Mar 15, 2026
bf98e46
iris: consolidate get_fast_io_dir, always mount tmpfs for Docker work…
github-actions[bot] Mar 16, 2026
ef46478
fix(iris): serialize tmpfs mount/unmount to prevent race under high c…
rjpower Mar 16, 2026
21cce8c
fix(iris): move tmpfs mounting from stage_bundle to resolve_mounts
rjpower Mar 16, 2026
ab30adb
fix(iris): simplify tmpfs to use /dev/shm directly, fix mount-before-…
rjpower Mar 16, 2026
4d08420
fix(iris): shared mount propagation for cache bind, move fcntl to mod…
rjpower Mar 16, 2026
af45265
fix(iris): wait for controller VM deletion before clearing remote state
rjpower Mar 16, 2026
867ad72
fix(iris): make controller VM deletion synchronous
rjpower Mar 16, 2026
447112b
fix(iris): use unique device name for tmpfs mounts to avoid mount(8) …
rjpower Mar 16, 2026
b505e02
fix(iris): remove tmpfs mount machinery, use /dev/shm as cache_dir di…
rjpower Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/iris/src/iris/cluster/platform/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,10 @@ def replace_var(match: re.Match) -> str:
fi

# Start worker container without restart policy first (fail fast during bootstrap)
# SYS_ADMIN: required for mounting tmpfs workdirs (disk quota enforcement)
sudo docker run -d --name iris-worker \\
--network=host \\
--cap-add SYS_ADMIN \\
--ulimit core=0:0 \\
-v {{ cache_dir }}:{{ cache_dir }} \\
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enable shared propagation on worker cache bind mount

DockerRuntime now mounts tmpfs workdirs inside the worker process (_mount_tmpfs in lib/iris/src/iris/cluster/runtime/docker.py), but the worker creates task containers via the host daemon through /var/run/docker.sock; with this plain cache bind (-v {{ cache_dir }}:{{ cache_dir }}) and no shared propagation, those inner mounts are not propagated to the host namespace. In the default bootstrap path, bundle staging and generated scripts can be written to the worker-only tmpfs while the task container sees the underlying host directory, leading to missing files under /app at runtime.

Useful? React with 👍 / 👎.

-v /dev/shm/iris:/dev/shm/iris \\
Expand Down
57 changes: 55 additions & 2 deletions lib/iris/src/iris/cluster/runtime/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import os
import re
import shlex
import shutil
import subprocess
import sys
import threading
import time
import uuid
Expand All @@ -42,6 +44,7 @@
ContainerStats,
ContainerStatus,
ImageInfo,
WorkdirSpec,
)
from iris.cluster.worker.worker_types import LogLine, TaskLogs
from iris.rpc import cluster_pb2
Expand Down Expand Up @@ -383,6 +386,15 @@ def stats(self) -> ContainerStats:
return ContainerStats(memory_mb=0, cpu_percent=0, process_count=0, available=False)
return self._docker_stats(self._run_container_id)

def disk_usage_mb(self) -> int:
"""Return used space in MB on the filesystem containing the workdir."""
for host_path, container_path, _mode in self.config.mounts:
if container_path == self.config.workdir:
path = Path(host_path)
if path.exists():
return int(shutil.disk_usage(path).used / (1024 * 1024))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Compute Docker disk metrics from workdir contents

disk_usage_mb() now returns shutil.disk_usage(path).used, which is bytes used by the entire backing filesystem, not by this task's workdir tree. For Docker tasks where /app is just a subdirectory (common when resources.disk_bytes is unset, or when many task dirs share /dev/shm/iris), tasks will report host/tmpfs-wide usage instead of per-task usage, making resource_usage.disk_mb misleading for debugging and capacity analysis.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rjpower this is an issue?

return 0

def profile(self, duration_seconds: int, profile_type: "cluster_pb2.ProfileType") -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""
container_id = self._run_container_id
Expand Down Expand Up @@ -482,11 +494,16 @@ def _profile_memory(
self._docker_rm_files(container_id, [trace_path, output_path])

def cleanup(self) -> None:
    """Remove the run container and clean up resources (including tmpfs mounts).

    Removes the Docker container (if one was started), untracks it from the
    owning runtime, then releases any tmpfs backing storage bound to the
    container workdir.
    """
    container_id = self._run_container_id
    if container_id:
        self._docker_remove(container_id)
        self.runtime.untrack_container(container_id)
        self._run_container_id = None
    # Release any tmpfs backing storage for the workdir: the first mount
    # whose container path equals the workdir is its host backing path.
    backing_host_path = next(
        (
            host_path
            for host_path, container_path, _mode in self.config.mounts
            if container_path == self.config.workdir
        ),
        None,
    )
    if backing_host_path is not None:
        self.runtime.release_tmpfs(Path(backing_host_path))

# -------------------------------------------------------------------------
# Docker CLI helpers
Expand Down Expand Up @@ -715,6 +732,7 @@ class DockerRuntime:
def __init__(self) -> None:
self._handles: list[DockerContainerHandle] = []
self._created_containers: set[str] = set()
self._tmpfs_mounts: set[Path] = set()
# Serializes `docker pull` per image tag so that concurrent task threads
# don't each trigger docker-credential-gcloud against the metadata server,
# which causes sporadic "no active account" errors under load.
Expand Down Expand Up @@ -781,12 +799,47 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""Stage bundle and workdir files on worker-local filesystem."""
"""Provision backing storage, then stage bundle and workdir files."""
if workdir_spec and workdir_spec.disk_bytes > 0:
self._mount_tmpfs(workdir, workdir_spec.disk_bytes)
if bundle_id:
bundle_store.extract_bundle_to(bundle_id, workdir)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Unmount tmpfs if bundle staging raises

This mounts tmpfs before extract_bundle_to/write_workdir_files, but there is no rollback if either staging call throws. In the current task flow, tmpfs unmounting happens in DockerContainerHandle.cleanup(), which is only reachable after create_container; staging failures occur earlier, so the mount remains and rmtree later hits a busy mountpoint. Repeated task failures with disk_bytes set will leak RAM-backed mounts on the worker.

Useful? React with 👍 / 👎.

bundle_store.write_workdir_files(workdir, workdir_files)

def _mount_tmpfs(self, workdir: Path, disk_bytes: int) -> None:
if sys.platform != "linux":
raise RuntimeError("Docker workdir disk limits require Linux tmpfs mounts")
workdir.mkdir(parents=True, exist_ok=True)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: _setup() already calls workdir.mkdir(parents=True, exist_ok=True) on line 473 of task_attempt.py before calling prepare_workdir. This second mkdir is redundant.

if os.path.ismount(workdir):
logger.info("Workdir %s is already a mountpoint; reusing", workdir)
return
result = subprocess.run(
["mount", "-t", "tmpfs", "-o", f"size={disk_bytes},nodev,nosuid", "tmpfs", str(workdir)],
capture_output=True,
Comment thread
rjpower marked this conversation as resolved.
Outdated
text=True,
check=False,
)
if result.returncode != 0:
raise RuntimeError(f"Failed to mount tmpfs workdir {workdir}: {result.stderr.strip()}")
self._tmpfs_mounts.add(workdir)
logger.info("Mounted tmpfs workdir %s with size=%d bytes", workdir, disk_bytes)

def release_tmpfs(self, workdir: Path) -> None:
    """Unmount a tmpfs workdir if it was mounted by this runtime.

    Best-effort: a failed ``umount`` is logged as a warning, and the path is
    kept in ``self._tmpfs_mounts`` so the leaked RAM-backed mount stays
    visible and a later cleanup pass can retry, instead of silently
    untracking a still-live mount.

    Args:
        workdir: Host path that may have been tmpfs-mounted by this runtime.
    """
    if workdir not in self._tmpfs_mounts:
        return
    if not os.path.ismount(workdir):
        # Already unmounted (or the mount never took effect); just untrack.
        self._tmpfs_mounts.discard(workdir)
        return
    result = subprocess.run(["umount", str(workdir)], capture_output=True, text=True, check=False)
    if result.returncode != 0:
        # Keep the path tracked: discarding here would hide the leak and
        # make the busy mountpoint unrecoverable by later cleanup.
        logger.warning("Failed to unmount tmpfs workdir %s: %s", workdir, result.stderr.strip())
        return
    logger.info("Unmounted tmpfs workdir %s", workdir)
    self._tmpfs_mounts.discard(workdir)
Comment thread
rjpower marked this conversation as resolved.
Outdated

def track_container(self, container_id: str) -> None:
    """Track a container ID for cleanup.

    IDs accumulate in ``self._created_containers`` so the runtime can later
    identify containers it created; entries are removed via
    ``untrack_container``.
    """
    self._created_containers.add(container_id)
Expand Down
14 changes: 12 additions & 2 deletions lib/iris/src/iris/cluster/runtime/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
ContainerPhase,
ContainerStats,
ContainerStatus,
WorkdirSpec,
)
from iris.cluster.worker.worker_types import LogLine
from iris.rpc import cluster_pb2
Expand Down Expand Up @@ -239,7 +240,11 @@ def run(self) -> None:
)

mounts.append({"name": "workdir", "mountPath": self.config.workdir, "readOnly": False})
volumes.append({"name": "workdir", "emptyDir": {}})
empty_dir_spec: dict[str, str] = {}
disk_bytes = self.config.get_disk_bytes()
if disk_bytes:
empty_dir_spec["sizeLimit"] = f"{disk_bytes}"
volumes.append({"name": "workdir", "emptyDir": empty_dir_spec})

workdir_files = dict(self.config.entrypoint.workdir_files)
if workdir_files:
Expand Down Expand Up @@ -493,6 +498,10 @@ def stats(self) -> ContainerStats:
available=True,
)

def disk_usage_mb(self) -> int:
    """K8s workdir lives inside the pod; disk usage isn't observable from the worker.

    Always returns 0 — the workdir is an ``emptyDir`` volume materialized on
    the node running the pod, not on the worker that holds this handle.
    """
    return 0

def profile(self, duration_seconds: int, profile_type: cluster_pb2.ProfileType) -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""
if not self._pod_name:
Expand Down Expand Up @@ -639,9 +648,10 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""No-op: Kubernetes task Pods materialize bundle/workdir in-pod."""
del bundle_id, workdir, workdir_files, bundle_store
del bundle_id, workdir, workdir_files, bundle_store, workdir_spec

def list_containers(self) -> list[KubernetesContainerHandle]:
return list(self._handles)
Expand Down
12 changes: 12 additions & 0 deletions lib/iris/src/iris/cluster/runtime/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import logging
import os
import select
import shutil
import signal
import subprocess
import sys
Expand Down Expand Up @@ -49,6 +50,7 @@
ContainerStats,
ContainerStatus,
RuntimeLogReader,
WorkdirSpec,
)
from iris.cluster.worker.worker_types import LogLine
from iris.managed_thread import ManagedThread, get_thread_container
Expand Down Expand Up @@ -484,6 +486,14 @@ def stats(self) -> ContainerStats:
available=memory_mb is not None,
)

def disk_usage_mb(self) -> int:
"""Return used space in MB on the filesystem containing the workdir."""
mount_map = {cp: hp for hp, cp, _ in self.config.mounts}
host_workdir = mount_map.get(self.config.workdir)
if host_workdir and Path(host_workdir).exists():
return int(shutil.disk_usage(host_workdir).used / (1024 * 1024))
Comment thread
rjpower marked this conversation as resolved.
Outdated
return 0

def profile(self, duration_seconds: int, profile_type: cluster_pb2.ProfileType) -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""

Expand Down Expand Up @@ -610,8 +620,10 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""Stage bundle and workdir files on worker-local filesystem."""
del workdir_spec # Process runtime has no backing storage to provision
if bundle_id:
bundle_store.extract_bundle_to(bundle_id, workdir)
bundle_store.write_workdir_files(workdir, workdir_files)
Expand Down
31 changes: 27 additions & 4 deletions lib/iris/src/iris/cluster/runtime/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,20 @@ def get_disk_bytes(self) -> int | None:
return self.resources.disk_bytes


@dataclass(frozen=True)
class WorkdirSpec:
    """Declares the workdir configuration a task wants.

    The runtime interprets this according to its capabilities:
    - Docker: mounts tmpfs with size limit when disk_bytes > 0
    - K8s: sets emptyDir sizeLimit in the pod spec
    - Process: no enforcement
    """

    # Requested disk budget in bytes; 0 (the default) requests no enforcement.
    disk_bytes: int = 0
    # Whether the backing storage should be a RAM-backed tmpfs mount
    # (honored by the Docker runtime; other runtimes ignore it).
    tmpfs: bool = False


@dataclass
class ContainerResult:
container_id: str
Expand Down Expand Up @@ -208,6 +222,14 @@ def stats(self) -> ContainerStats:
"""Get resource usage statistics."""
...

def disk_usage_mb(self) -> int:
    """Return disk usage in MB for this container's workdir.

    Docker/Process: shutil.disk_usage on the host workdir path.
    K8s: 0 (workdir lives inside the pod, not on the worker node).

    NOTE(review): shutil.disk_usage reports usage of the entire backing
    filesystem, so the value reflects per-task usage only when the workdir
    is a dedicated mount (e.g. a tmpfs sized via WorkdirSpec.disk_bytes);
    otherwise it includes everything else on that filesystem.
    """
    ...

def profile(self, duration_seconds: int, profile_type: cluster_pb2.ProfileType) -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump.

Expand Down Expand Up @@ -250,12 +272,13 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""Materialize task bundle/workdir files for this runtime.
"""Provision the workdir and materialize task bundle/workdir files.

Runtimes that execute from worker-local paths (docker/process)
stage the bundle into ``workdir`` directly. Kubernetes runtime may no-op
and materialize inside the task Pod instead.
The runtime sets up any backing storage described by *workdir_spec*
(e.g. Docker mounts tmpfs) before extracting the bundle. Kubernetes
may no-op and materialize inside the task Pod instead.
"""
...

Expand Down
21 changes: 0 additions & 21 deletions lib/iris/src/iris/cluster/worker/env_probe.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,27 +186,6 @@ def _get_disk_bytes() -> int:
return 100 * 1024**3 # Default 100GB


def collect_workdir_size_mb(workdir: Path) -> int:
    """Calculate workdir size in MB using ``du -sm``.

    Best-effort: returns 0 when the directory does not exist, when ``du``
    fails, or when its output cannot be parsed (e.g. files vanishing during
    the scan can produce partial/garbled output).

    Args:
        workdir: Directory whose recursive size should be measured.

    Returns:
        Size in whole megabytes, or 0 on any failure.
    """
    if not workdir.exists():
        return 0

    result = subprocess.run(
        ["du", "-sm", str(workdir)],
        capture_output=True,
        text=True,
        check=False,
    )

    if result.returncode != 0:
        return 0

    # du -sm output format: "SIZE\tPATH"
    output = result.stdout.strip()
    size_str = output.split("\t")[0]

    try:
        return int(size_str)
    except ValueError:
        # Unexpected du output (locale text, empty stdout) — treat as unknown.
        return 0


def _build_worker_attributes(
*,
accelerator_type: int,
Expand Down
15 changes: 11 additions & 4 deletions lib/iris/src/iris/cluster/worker/task_attempt.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@
ContainerPhase,
ContainerRuntime,
RuntimeLogReader,
WorkdirSpec,
)
from iris.cluster.types import (
JobName,
TaskAttempt as TaskAttemptIdentity,
is_task_finished,
)
from iris.cluster.bundle import BundleStore
from iris.cluster.worker.env_probe import collect_workdir_size_mb
from iris.cluster.worker.port_allocator import PortAllocator
from iris.cluster.log_store import LogCursor, LogStore, task_log_key
from iris.logging import parse_log_level, str_to_log_level
Expand Down Expand Up @@ -87,6 +87,7 @@ def _format_exit_error(exit_code: int | None, oom_killed: bool = False) -> str:
# /dev/shm/iris into the worker container so this path is available on GCE VMs.
_TMPFS_DIR = Path("/dev/shm/iris")
_TMPFS_MIN_FREE_BYTES = 1 * 1024 * 1024 * 1024 # 1 GB
_DISK_CHECK_INTERVAL_SECONDS = 60.0


def get_fast_io_dir(cache_dir: Path) -> Path:
Expand Down Expand Up @@ -549,11 +550,14 @@ def _download_bundle(self) -> None:
# to BundleStore.extract_bundle_to if long downloads become a problem.)

assert self.workdir is not None
disk_bytes = self.request.resources.disk_bytes if self.request.HasField("resources") else 0
workdir_spec = WorkdirSpec(disk_bytes=disk_bytes, tmpfs=disk_bytes > 0) if disk_bytes > 0 else None
self._runtime.stage_bundle(
bundle_id=self.request.bundle_id,
workdir=self.workdir,
workdir_files=dict(self.request.entrypoint.workdir_files),
bundle_store=self._bundle_store,
workdir_spec=workdir_spec,
)

logger.info(
Expand Down Expand Up @@ -711,6 +715,7 @@ def _monitor_loop(
log_reader: RuntimeLogReader,
deadline: Deadline | None,
) -> None:
last_disk_check = 0.0
while True:
if rule := chaos("worker.task_monitor"):
time.sleep(rule.delay_seconds)
Expand Down Expand Up @@ -794,8 +799,10 @@ def _monitor_loop(
if stats.memory_mb > self.peak_memory_mb:
self.peak_memory_mb = stats.memory_mb

if self.workdir:
self.disk_mb = collect_workdir_size_mb(self.workdir)
now = time.monotonic()
if now - last_disk_check >= _DISK_CHECK_INTERVAL_SECONDS:
self.disk_mb = handle.disk_usage_mb()
last_disk_check = now
Comment on lines +768 to +770
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Refresh disk usage when tasks stop

This throttles disk sampling to every 60 seconds, but _monitor_loop exits as soon as status.phase == STOPPED before taking another sample. Because last_disk_check starts at 0, most short-lived tasks only record an early startup sample, so disk_mb can stay stale/near-zero even when the task writes significant data later in execution; this makes per-task disk telemetry misleading for the common sub-minute job case.

Useful? React with 👍 / 👎.

except Exception:
logger.debug("Stats collection failed for task %s", self.task_id, exc_info=True)

Expand Down Expand Up @@ -845,7 +852,7 @@ def _cleanup(self) -> None:
except Exception as e:
logger.warning("Failed to release ports for task %s: %s", self.task_id, e)

# Remove working directory
# Remove working directory (handle.cleanup() already released backing storage)
if self.workdir and self.workdir.exists():
try:
shutil.rmtree(self.workdir)
Expand Down
Loading
Loading