Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
9dc337d
research: iris tmpfs workdir and du monitoring findings
rjpower Mar 15, 2026
ede2d1a
clean up research artifact
rjpower Mar 15, 2026
df11570
Add prepare_workdir/cleanup_workdir to ContainerRuntime protocol
rjpower Mar 15, 2026
e0b6324
Wire prepare_workdir/cleanup_workdir into task lifecycle, replace du …
rjpower Mar 15, 2026
bbd6888
Add tests for tmpfs workdir management and disk_usage monitoring
rjpower Mar 15, 2026
a925138
fix(iris): cleanup_workdir warns instead of raising, document disk_us…
rjpower Mar 15, 2026
2850e03
refactor(iris): replace prepare/cleanup_workdir stubs with declarativ…
rjpower Mar 15, 2026
2db0bdd
fix(iris): grant SYS_ADMIN to worker container for tmpfs mounts
rjpower Mar 15, 2026
2c568c5
fix(iris): serialize dashboard build across xdist workers with filelock
rjpower Mar 15, 2026
87fbe9e
refactor(iris): replace mount tuples + WorkdirSpec with unified Mount…
rjpower Mar 15, 2026
1ef8d92
fix(iris): read workdir_host_path from config in DockerRuntime.create…
rjpower Mar 15, 2026
14a3b59
iris: replace filelock with fcntl in e2e conftest
github-actions[bot] Mar 15, 2026
af32041
iris: cleanup tmpfs on staging failure, keep failed umounts tracked
github-actions[bot] Mar 15, 2026
bf98e46
iris: consolidate get_fast_io_dir, always mount tmpfs for Docker work…
github-actions[bot] Mar 16, 2026
ef46478
fix(iris): serialize tmpfs mount/unmount to prevent race under high c…
rjpower Mar 16, 2026
21cce8c
fix(iris): move tmpfs mounting from stage_bundle to resolve_mounts
rjpower Mar 16, 2026
ab30adb
fix(iris): simplify tmpfs to use /dev/shm directly, fix mount-before-…
rjpower Mar 16, 2026
4d08420
fix(iris): shared mount propagation for cache bind, move fcntl to mod…
rjpower Mar 16, 2026
af45265
fix(iris): wait for controller VM deletion before clearing remote state
rjpower Mar 16, 2026
867ad72
fix(iris): make controller VM deletion synchronous
rjpower Mar 16, 2026
447112b
fix(iris): use unique device name for tmpfs mounts to avoid mount(8) …
rjpower Mar 16, 2026
b505e02
fix(iris): remove tmpfs mount machinery, use /dev/shm as cache_dir di…
rjpower Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/iris/src/iris/cluster/platform/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,10 @@ def replace_var(match: re.Match) -> str:
fi

# Start worker container without restart policy first (fail fast during bootstrap)
# SYS_ADMIN: required for mounting tmpfs workdirs (disk quota enforcement)
sudo docker run -d --name iris-worker \\
--network=host \\
--cap-add SYS_ADMIN \\
--ulimit core=0:0 \\
-v {{ cache_dir }}:{{ cache_dir }} \\
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enable shared propagation on worker cache bind mount

DockerRuntime now mounts tmpfs workdirs inside the worker process (_mount_tmpfs in lib/iris/src/iris/cluster/runtime/docker.py), but the worker creates task containers via the host daemon through /var/run/docker.sock; with this plain cache bind (-v {{ cache_dir }}:{{ cache_dir }}) and no shared propagation, those inner mounts are not propagated to the host namespace. In the default bootstrap path, bundle staging and generated scripts can be written to the worker-only tmpfs while the task container sees the underlying host directory, leading to missing files under /app at runtime.

Useful? React with 👍 / 👎.

-v /dev/shm/iris:/dev/shm/iris \\
Expand Down
57 changes: 55 additions & 2 deletions lib/iris/src/iris/cluster/runtime/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import os
import re
import shlex
import shutil
import subprocess
import sys
import threading
import time
import uuid
Expand All @@ -42,6 +44,7 @@
ContainerStats,
ContainerStatus,
ImageInfo,
WorkdirSpec,
)
from iris.cluster.worker.worker_types import LogLine, TaskLogs
from iris.rpc import cluster_pb2
Expand Down Expand Up @@ -383,6 +386,15 @@ def stats(self) -> ContainerStats:
return ContainerStats(memory_mb=0, cpu_percent=0, process_count=0, available=False)
return self._docker_stats(self._run_container_id)

def disk_usage_mb(self) -> int:
"""Return used space in MB on the filesystem containing the workdir."""
for host_path, container_path, _mode in self.config.mounts:
if container_path == self.config.workdir:
path = Path(host_path)
if path.exists():
return int(shutil.disk_usage(path).used / (1024 * 1024))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Compute Docker disk metrics from workdir contents

disk_usage_mb() now returns shutil.disk_usage(path).used, which is bytes used by the entire backing filesystem, not by this task's workdir tree. For Docker tasks where /app is just a subdirectory (common when resources.disk_bytes is unset, or when many task dirs share /dev/shm/iris), tasks will report host/tmpfs-wide usage instead of per-task usage, making resource_usage.disk_mb misleading for debugging and capacity analysis.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rjpower this is an issue?

return 0

def profile(self, duration_seconds: int, profile_type: "cluster_pb2.ProfileType") -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""
container_id = self._run_container_id
Expand Down Expand Up @@ -482,11 +494,16 @@ def _profile_memory(
self._docker_rm_files(container_id, [trace_path, output_path])

def cleanup(self) -> None:
    """Remove the run container and clean up resources (including tmpfs mounts).

    Removes the Docker container (if one was started), untracks it from the
    owning runtime, then releases any tmpfs backing storage bound to the
    container workdir.
    """
    container_id = self._run_container_id
    if container_id:
        self._docker_remove(container_id)
        self.runtime.untrack_container(container_id)
        self._run_container_id = None
    # Release any tmpfs backing storage for the workdir: the first mount
    # whose container path equals the workdir is its host backing path.
    backing_host_path = next(
        (
            host_path
            for host_path, container_path, _mode in self.config.mounts
            if container_path == self.config.workdir
        ),
        None,
    )
    if backing_host_path is not None:
        self.runtime.release_tmpfs(Path(backing_host_path))

# -------------------------------------------------------------------------
# Docker CLI helpers
Expand Down Expand Up @@ -715,6 +732,7 @@ class DockerRuntime:
def __init__(self) -> None:
self._handles: list[DockerContainerHandle] = []
self._created_containers: set[str] = set()
self._tmpfs_mounts: set[Path] = set()
# Serializes `docker pull` per image tag so that concurrent task threads
# don't each trigger docker-credential-gcloud against the metadata server,
# which causes sporadic "no active account" errors under load.
Expand Down Expand Up @@ -781,12 +799,47 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""Stage bundle and workdir files on worker-local filesystem."""
"""Provision backing storage, then stage bundle and workdir files."""
if workdir_spec and workdir_spec.disk_bytes > 0:
self._mount_tmpfs(workdir, workdir_spec.disk_bytes)
if bundle_id:
bundle_store.extract_bundle_to(bundle_id, workdir)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Unmount tmpfs if bundle staging raises

This mounts tmpfs before extract_bundle_to/write_workdir_files, but there is no rollback if either staging call throws. In the current task flow, tmpfs unmounting happens in DockerContainerHandle.cleanup(), which is only reachable after create_container; staging failures occur earlier, so the mount remains and rmtree later hits a busy mountpoint. Repeated task failures with disk_bytes set will leak RAM-backed mounts on the worker.

Useful? React with 👍 / 👎.

bundle_store.write_workdir_files(workdir, workdir_files)

def _mount_tmpfs(self, workdir: Path, disk_bytes: int) -> None:
if sys.platform != "linux":
raise RuntimeError("Docker workdir disk limits require Linux tmpfs mounts")
workdir.mkdir(parents=True, exist_ok=True)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: _setup() already calls workdir.mkdir(parents=True, exist_ok=True) on line 473 of task_attempt.py before calling prepare_workdir. This second mkdir is redundant.

if os.path.ismount(workdir):
logger.info("Workdir %s is already a mountpoint; reusing", workdir)
return
result = subprocess.run(
["mount", "-t", "tmpfs", "-o", f"size={disk_bytes},nodev,nosuid", "tmpfs", str(workdir)],
capture_output=True,
Comment thread
rjpower marked this conversation as resolved.
Outdated
text=True,
check=False,
)
if result.returncode != 0:
raise RuntimeError(f"Failed to mount tmpfs workdir {workdir}: {result.stderr.strip()}")
self._tmpfs_mounts.add(workdir)
logger.info("Mounted tmpfs workdir %s with size=%d bytes", workdir, disk_bytes)

def release_tmpfs(self, workdir: Path) -> None:
    """Unmount a tmpfs workdir if it was mounted by this runtime.

    Best-effort: a failed ``umount`` is logged as a warning, and the path is
    kept in ``self._tmpfs_mounts`` so the leaked RAM-backed mount stays
    visible and a later cleanup pass can retry, instead of silently
    untracking a still-live mount.

    Args:
        workdir: Host path that may have been tmpfs-mounted by this runtime.
    """
    if workdir not in self._tmpfs_mounts:
        return
    if not os.path.ismount(workdir):
        # Already unmounted (or the mount never took effect); just untrack.
        self._tmpfs_mounts.discard(workdir)
        return
    result = subprocess.run(["umount", str(workdir)], capture_output=True, text=True, check=False)
    if result.returncode != 0:
        # Keep the path tracked: discarding here would hide the leak and
        # make the busy mountpoint unrecoverable by later cleanup.
        logger.warning("Failed to unmount tmpfs workdir %s: %s", workdir, result.stderr.strip())
        return
    logger.info("Unmounted tmpfs workdir %s", workdir)
    self._tmpfs_mounts.discard(workdir)
Comment thread
rjpower marked this conversation as resolved.
Outdated

def track_container(self, container_id: str) -> None:
    """Track a container ID for cleanup.

    IDs accumulate in ``self._created_containers`` so the runtime can later
    identify containers it created; entries are removed via
    ``untrack_container``.
    """
    self._created_containers.add(container_id)
Expand Down
14 changes: 12 additions & 2 deletions lib/iris/src/iris/cluster/runtime/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
ContainerPhase,
ContainerStats,
ContainerStatus,
WorkdirSpec,
)
from iris.cluster.worker.worker_types import LogLine
from iris.rpc import cluster_pb2
Expand Down Expand Up @@ -239,7 +240,11 @@ def run(self) -> None:
)

mounts.append({"name": "workdir", "mountPath": self.config.workdir, "readOnly": False})
volumes.append({"name": "workdir", "emptyDir": {}})
empty_dir_spec: dict[str, str] = {}
disk_bytes = self.config.get_disk_bytes()
if disk_bytes:
empty_dir_spec["sizeLimit"] = f"{disk_bytes}"
volumes.append({"name": "workdir", "emptyDir": empty_dir_spec})

workdir_files = dict(self.config.entrypoint.workdir_files)
if workdir_files:
Expand Down Expand Up @@ -493,6 +498,10 @@ def stats(self) -> ContainerStats:
available=True,
)

def disk_usage_mb(self) -> int:
    """K8s workdir lives inside the pod; disk usage isn't observable from the worker.

    Always returns 0 — the workdir is an ``emptyDir`` volume materialized on
    the node running the pod, not on the worker that holds this handle.
    """
    return 0

def profile(self, duration_seconds: int, profile_type: cluster_pb2.ProfileType) -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""
if not self._pod_name:
Expand Down Expand Up @@ -639,9 +648,10 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""No-op: Kubernetes task Pods materialize bundle/workdir in-pod."""
del bundle_id, workdir, workdir_files, bundle_store
del bundle_id, workdir, workdir_files, bundle_store, workdir_spec

def list_containers(self) -> list[KubernetesContainerHandle]:
return list(self._handles)
Expand Down
12 changes: 12 additions & 0 deletions lib/iris/src/iris/cluster/runtime/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import logging
import os
import select
import shutil
import signal
import subprocess
import sys
Expand Down Expand Up @@ -49,6 +50,7 @@
ContainerStats,
ContainerStatus,
RuntimeLogReader,
WorkdirSpec,
)
from iris.cluster.worker.worker_types import LogLine
from iris.managed_thread import ManagedThread, get_thread_container
Expand Down Expand Up @@ -484,6 +486,14 @@ def stats(self) -> ContainerStats:
available=memory_mb is not None,
)

def disk_usage_mb(self) -> int:
"""Return used space in MB on the filesystem containing the workdir."""
mount_map = {cp: hp for hp, cp, _ in self.config.mounts}
host_workdir = mount_map.get(self.config.workdir)
if host_workdir and Path(host_workdir).exists():
return int(shutil.disk_usage(host_workdir).used / (1024 * 1024))
Comment thread
rjpower marked this conversation as resolved.
Outdated
return 0

def profile(self, duration_seconds: int, profile_type: cluster_pb2.ProfileType) -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""

Expand Down Expand Up @@ -610,8 +620,10 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""Stage bundle and workdir files on worker-local filesystem."""
del workdir_spec # Process runtime has no backing storage to provision
if bundle_id:
bundle_store.extract_bundle_to(bundle_id, workdir)
bundle_store.write_workdir_files(workdir, workdir_files)
Expand Down
31 changes: 27 additions & 4 deletions lib/iris/src/iris/cluster/runtime/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,20 @@ def get_disk_bytes(self) -> int | None:
return self.resources.disk_bytes


@dataclass(frozen=True)
class WorkdirSpec:
    """Declares the workdir configuration a task wants.

    The runtime interprets this according to its capabilities:
    - Docker: mounts tmpfs with size limit when disk_bytes > 0
    - K8s: sets emptyDir sizeLimit in the pod spec
    - Process: no enforcement
    """

    # Requested disk budget in bytes; 0 (the default) requests no enforcement.
    disk_bytes: int = 0
    # Whether the backing storage should be a RAM-backed tmpfs mount
    # (honored by the Docker runtime; other runtimes ignore it).
    tmpfs: bool = False


@dataclass
class ContainerResult:
container_id: str
Expand Down Expand Up @@ -208,6 +222,14 @@ def stats(self) -> ContainerStats:
"""Get resource usage statistics."""
...

def disk_usage_mb(self) -> int:
    """Return disk usage in MB for this container's workdir.

    Docker/Process: shutil.disk_usage on the host workdir path.
    K8s: 0 (workdir lives inside the pod, not on the worker node).

    NOTE(review): shutil.disk_usage reports usage of the entire backing
    filesystem, so the value reflects per-task usage only when the workdir
    is a dedicated mount (e.g. a tmpfs sized via WorkdirSpec.disk_bytes);
    otherwise it includes everything else on that filesystem.
    """
    ...

def profile(self, duration_seconds: int, profile_type: cluster_pb2.ProfileType) -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump.

Expand Down Expand Up @@ -250,12 +272,13 @@ def stage_bundle(
workdir: Path,
workdir_files: dict[str, bytes],
bundle_store: BundleStore,
workdir_spec: WorkdirSpec | None = None,
) -> None:
"""Materialize task bundle/workdir files for this runtime.
"""Provision the workdir and materialize task bundle/workdir files.

Runtimes that execute from worker-local paths (docker/process)
stage the bundle into ``workdir`` directly. Kubernetes runtime may no-op
and materialize inside the task Pod instead.
The runtime sets up any backing storage described by *workdir_spec*
(e.g. Docker mounts tmpfs) before extracting the bundle. Kubernetes
may no-op and materialize inside the task Pod instead.
"""
...

Expand Down
21 changes: 0 additions & 21 deletions lib/iris/src/iris/cluster/worker/env_probe.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,27 +186,6 @@ def _get_disk_bytes() -> int:
return 100 * 1024**3 # Default 100GB


def collect_workdir_size_mb(workdir: Path) -> int:
    """Calculate workdir size in MB using ``du -sm``.

    Best-effort: returns 0 when the directory does not exist, when ``du``
    fails, or when its output cannot be parsed (e.g. files vanishing during
    the scan can produce partial/garbled output).

    Args:
        workdir: Directory whose recursive size should be measured.

    Returns:
        Size in whole megabytes, or 0 on any failure.
    """
    if not workdir.exists():
        return 0

    result = subprocess.run(
        ["du", "-sm", str(workdir)],
        capture_output=True,
        text=True,
        check=False,
    )

    if result.returncode != 0:
        return 0

    # du -sm output format: "SIZE\tPATH"
    output = result.stdout.strip()
    size_str = output.split("\t")[0]

    try:
        return int(size_str)
    except ValueError:
        # Unexpected du output (locale text, empty stdout) — treat as unknown.
        return 0


def _build_worker_attributes(
*,
accelerator_type: int,
Expand Down
15 changes: 11 additions & 4 deletions lib/iris/src/iris/cluster/worker/task_attempt.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@
ContainerPhase,
ContainerRuntime,
RuntimeLogReader,
WorkdirSpec,
)
from iris.cluster.types import (
JobName,
TaskAttempt as TaskAttemptIdentity,
is_task_finished,
)
from iris.cluster.bundle import BundleStore
from iris.cluster.worker.env_probe import collect_workdir_size_mb
from iris.cluster.worker.port_allocator import PortAllocator
from iris.cluster.log_store import LogCursor, LogStore, task_log_key
from iris.logging import parse_log_level, str_to_log_level
Expand Down Expand Up @@ -87,6 +87,7 @@ def _format_exit_error(exit_code: int | None, oom_killed: bool = False) -> str:
# /dev/shm/iris into the worker container so this path is available on GCE VMs.
_TMPFS_DIR = Path("/dev/shm/iris")
_TMPFS_MIN_FREE_BYTES = 1 * 1024 * 1024 * 1024 # 1 GB
_DISK_CHECK_INTERVAL_SECONDS = 60.0


def get_fast_io_dir(cache_dir: Path) -> Path:
Expand Down Expand Up @@ -549,11 +550,14 @@ def _download_bundle(self) -> None:
# to BundleStore.extract_bundle_to if long downloads become a problem.)

assert self.workdir is not None
disk_bytes = self.request.resources.disk_bytes if self.request.HasField("resources") else 0
workdir_spec = WorkdirSpec(disk_bytes=disk_bytes, tmpfs=disk_bytes > 0) if disk_bytes > 0 else None
self._runtime.stage_bundle(
bundle_id=self.request.bundle_id,
workdir=self.workdir,
workdir_files=dict(self.request.entrypoint.workdir_files),
bundle_store=self._bundle_store,
workdir_spec=workdir_spec,
)

logger.info(
Expand Down Expand Up @@ -711,6 +715,7 @@ def _monitor_loop(
log_reader: RuntimeLogReader,
deadline: Deadline | None,
) -> None:
last_disk_check = 0.0
while True:
if rule := chaos("worker.task_monitor"):
time.sleep(rule.delay_seconds)
Expand Down Expand Up @@ -794,8 +799,10 @@ def _monitor_loop(
if stats.memory_mb > self.peak_memory_mb:
self.peak_memory_mb = stats.memory_mb

if self.workdir:
self.disk_mb = collect_workdir_size_mb(self.workdir)
now = time.monotonic()
if now - last_disk_check >= _DISK_CHECK_INTERVAL_SECONDS:
self.disk_mb = handle.disk_usage_mb()
last_disk_check = now
Comment on lines +768 to +770
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Refresh disk usage when tasks stop

This throttles disk sampling to every 60 seconds, but _monitor_loop exits as soon as status.phase == STOPPED before taking another sample. Because last_disk_check starts at 0, most short-lived tasks only record an early startup sample, so disk_mb can stay stale/near-zero even when the task writes significant data later in execution; this makes per-task disk telemetry misleading for the common sub-minute job case.

Useful? React with 👍 / 👎.

except Exception:
logger.debug("Stats collection failed for task %s", self.task_id, exc_info=True)

Expand Down Expand Up @@ -845,7 +852,7 @@ def _cleanup(self) -> None:
except Exception as e:
logger.warning("Failed to release ports for task %s: %s", self.task_id, e)

# Remove working directory
# Remove working directory (handle.cleanup() already released backing storage)
if self.workdir and self.workdir.exists():
try:
shutil.rmtree(self.workdir)
Expand Down
Loading
Loading