Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
9dc337d
research: iris tmpfs workdir and du monitoring findings
rjpower Mar 15, 2026
ede2d1a
clean up research artifact
rjpower Mar 15, 2026
df11570
Add prepare_workdir/cleanup_workdir to ContainerRuntime protocol
rjpower Mar 15, 2026
e0b6324
Wire prepare_workdir/cleanup_workdir into task lifecycle, replace du …
rjpower Mar 15, 2026
bbd6888
Add tests for tmpfs workdir management and disk_usage monitoring
rjpower Mar 15, 2026
a925138
fix(iris): cleanup_workdir warns instead of raising, document disk_us…
rjpower Mar 15, 2026
2850e03
refactor(iris): replace prepare/cleanup_workdir stubs with declarativ…
rjpower Mar 15, 2026
2db0bdd
fix(iris): grant SYS_ADMIN to worker container for tmpfs mounts
rjpower Mar 15, 2026
2c568c5
fix(iris): serialize dashboard build across xdist workers with filelock
rjpower Mar 15, 2026
87fbe9e
refactor(iris): replace mount tuples + WorkdirSpec with unified Mount…
rjpower Mar 15, 2026
1ef8d92
fix(iris): read workdir_host_path from config in DockerRuntime.create…
rjpower Mar 15, 2026
14a3b59
iris: replace filelock with fcntl in e2e conftest
github-actions[bot] Mar 15, 2026
af32041
iris: cleanup tmpfs on staging failure, keep failed umounts tracked
github-actions[bot] Mar 15, 2026
bf98e46
iris: consolidate get_fast_io_dir, always mount tmpfs for Docker work…
github-actions[bot] Mar 16, 2026
ef46478
fix(iris): serialize tmpfs mount/unmount to prevent race under high c…
rjpower Mar 16, 2026
21cce8c
fix(iris): move tmpfs mounting from stage_bundle to resolve_mounts
rjpower Mar 16, 2026
ab30adb
fix(iris): simplify tmpfs to use /dev/shm directly, fix mount-before-…
rjpower Mar 16, 2026
4d08420
fix(iris): shared mount propagation for cache bind, move fcntl to mod…
rjpower Mar 16, 2026
af45265
fix(iris): wait for controller VM deletion before clearing remote state
rjpower Mar 16, 2026
867ad72
fix(iris): make controller VM deletion synchronous
rjpower Mar 16, 2026
447112b
fix(iris): use unique device name for tmpfs mounts to avoid mount(8) …
rjpower Mar 16, 2026
b505e02
fix(iris): remove tmpfs mount machinery, use /dev/shm as cache_dir di…
rjpower Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/iris/src/iris/cluster/platform/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,10 @@ def replace_var(match: re.Match) -> str:
fi

# Start worker container without restart policy first (fail fast during bootstrap)
# SYS_ADMIN: required for mounting tmpfs workdirs (disk quota enforcement)
sudo docker run -d --name iris-worker \\
--network=host \\
--cap-add SYS_ADMIN \\
--ulimit core=0:0 \\
-v {{ cache_dir }}:{{ cache_dir }} \\
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enable shared propagation on worker cache bind mount

DockerRuntime now mounts tmpfs workdirs inside the worker process (_mount_tmpfs in lib/iris/src/iris/cluster/runtime/docker.py), but the worker creates task containers via the host daemon through /var/run/docker.sock; with this plain cache bind (-v {{ cache_dir }}:{{ cache_dir }}) and no shared propagation, those inner mounts are not propagated to the host namespace. In the default bootstrap path, bundle staging and generated scripts can be written to the worker-only tmpfs while the task container sees the underlying host directory, leading to missing files under /app at runtime.

Useful? React with 👍 / 👎.

-v /dev/shm/iris:/dev/shm/iris \\
Expand Down
163 changes: 147 additions & 16 deletions lib/iris/src/iris/cluster/runtime/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import os
import re
import shlex
import shutil
import subprocess
import sys
import threading
import time
import uuid
Expand All @@ -42,13 +44,39 @@
ContainerStats,
ContainerStatus,
ImageInfo,
MountKind,
MountSpec,
)
from iris.cluster.worker.worker_types import LogLine, TaskLogs
from iris.rpc import cluster_pb2
from iris.time_utils import Timestamp

logger = logging.getLogger(__name__)

_TMPFS_DIR = Path("/dev/shm/iris")
_TMPFS_MIN_FREE_BYTES = 1 * 1024 * 1024 * 1024 # 1 GB


def get_fast_io_dir(cache_dir: Path) -> Path:
    """Return a fast IO directory for ephemeral task data.

    Prefers /dev/shm/iris (tmpfs) for memory-speed IOPS when available and has
    sufficient free space. Falls back to *cache_dir* on persistent disk.

    Args:
        cache_dir: Persistent-disk directory used when tmpfs is unusable.

    Returns:
        ``_TMPFS_DIR`` when it exists with at least ``_TMPFS_MIN_FREE_BYTES``
        free, otherwise *cache_dir*.
    """
    try:
        if _TMPFS_DIR.is_dir():
            stat = os.statvfs(_TMPFS_DIR)
            free_bytes = stat.f_bavail * stat.f_frsize
            if free_bytes >= _TMPFS_MIN_FREE_BYTES:
                logger.info("Using tmpfs at %s for fast IO (%d MB free)", _TMPFS_DIR, free_bytes // (1024 * 1024))
                return _TMPFS_DIR
            # Fix: previously the generic "not available" warning fired here,
            # which was misleading when tmpfs exists but is simply low on space.
            logger.warning(
                "tmpfs at %s has only %d MB free (< %d MB required), falling back to persistent disk",
                _TMPFS_DIR,
                free_bytes // (1024 * 1024),
                _TMPFS_MIN_FREE_BYTES // (1024 * 1024),
            )
            return cache_dir
    except OSError:
        logger.warning("OSError checking tmpfs at %s, falling back to persistent disk", _TMPFS_DIR, exc_info=True)
        return cache_dir
    logger.warning("Fast IO (tmpfs) not available at %s, falling back to persistent disk", _TMPFS_DIR)
    return cache_dir


# Substrings that indicate a docker/registry infrastructure problem rather than
# a user-code error. Checked case-insensitively against stderr from docker
# create/start/pull.
Expand All @@ -72,6 +100,19 @@ def _is_docker_infra_error(stderr: str) -> bool:
return any(p.lower() in stderr_lower for p in _INFRA_ERROR_PATTERNS)


DEFAULT_WORKDIR_DISK_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB


@dataclass(frozen=True)
class ResolvedMount:
    """A MountSpec resolved to concrete host and container paths for Docker.

    Produced by ``DockerRuntime.resolve_mounts`` and consumed when building
    ``docker run -v`` flags; frozen so instances are safe to share across
    threads without copying.
    """

    # Absolute path on the host backing this mount.
    host_path: str
    # Destination path inside the container (e.g. "/app").
    container_path: str
    mode: str  # "rw" or "ro"
    # Semantic kind carried over from the originating MountSpec
    # (e.g. WORKDIR/CACHE/TMPFS); used to decide tmpfs release on cleanup.
    kind: MountKind


def _build_device_flags(config: ContainerConfig) -> list[str]:
"""Build Docker device flags based on resource configuration.

Expand Down Expand Up @@ -104,17 +145,17 @@ def _build_device_flags(config: ContainerConfig) -> list[str]:
return flags


def _detect_mount_user(mounts: list[tuple[str, str, str]]) -> str | None:
def _detect_mount_user(mounts: list[ResolvedMount]) -> str | None:
"""Detect user to run container as based on bind mount ownership.

When bind-mounting directories owned by non-root users, the container
must run as that user to have write access. Returns "uid:gid" for
--user flag, or None to run as root.
"""
for host_path, _container_path, mode in mounts:
if "w" not in mode:
for mount in mounts:
if "w" not in mount.mode:
continue
path = Path(host_path)
path = Path(mount.host_path)
if not path.exists():
continue
stat = path.stat()
Expand Down Expand Up @@ -234,6 +275,7 @@ class DockerContainerHandle:

config: ContainerConfig
runtime: "DockerRuntime"
_resolved_mounts: list[ResolvedMount] = field(default_factory=list, repr=False)
_run_container_id: str | None = field(default=None, repr=False)

@property
Expand Down Expand Up @@ -312,9 +354,9 @@ def _generate_setup_script(self) -> str:

def _write_setup_script(self, script: str) -> None:
"""Write the setup script to the workdir mount."""
for host_path, container_path, _mode in self.config.mounts:
if container_path == "/app":
(Path(host_path) / "_setup_env.sh").write_text(script)
for rm in self._resolved_mounts:
if rm.container_path == "/app":
(Path(rm.host_path) / "_setup_env.sh").write_text(script)
return
raise RuntimeError("No /app mount found in config")

Expand Down Expand Up @@ -356,9 +398,9 @@ def run(self) -> None:

def _write_run_script(self, script: str) -> None:
"""Write the run script to the workdir mount."""
for host_path, container_path, _mode in self.config.mounts:
if container_path == "/app":
(Path(host_path) / "_run.sh").write_text(script)
for rm in self._resolved_mounts:
if rm.container_path == "/app":
(Path(rm.host_path) / "_run.sh").write_text(script)
return
raise RuntimeError("No /app mount found in config")

Expand All @@ -383,6 +425,15 @@ def stats(self) -> ContainerStats:
return ContainerStats(memory_mb=0, cpu_percent=0, process_count=0, available=False)
return self._docker_stats(self._run_container_id)

def disk_usage_mb(self) -> int:
    """Return disk space used by this task's workdir tree, in MB.

    Walks the host directory backing the workdir mount and sums file sizes,
    so the figure reflects this task's own usage rather than the entire
    backing filesystem (which may be shared across tasks, e.g. /dev/shm/iris
    or the host root when resources.disk_bytes is unset).

    Returns:
        Used megabytes (truncated), or 0 when no workdir mount is resolved
        or its host path does not exist.
    """
    for rm in self._resolved_mounts:
        if rm.container_path != self.config.workdir:
            continue
        root = Path(rm.host_path)
        if not root.exists():
            continue
        total_bytes = 0
        # Files can disappear mid-walk while the task is running; tolerate
        # races rather than failing the metrics collection.
        for dirpath, _dirnames, filenames in os.walk(root, onerror=lambda _e: None):
            for name in filenames:
                try:
                    total_bytes += os.lstat(os.path.join(dirpath, name)).st_size
                except OSError:
                    continue
        return int(total_bytes / (1024 * 1024))
    return 0

def profile(self, duration_seconds: int, profile_type: "cluster_pb2.ProfileType") -> bytes:
"""Profile the running process using py-spy (CPU), memray (memory), or thread dump."""
container_id = self._run_container_id
Expand Down Expand Up @@ -482,11 +533,15 @@ def _profile_memory(
self._docker_rm_files(container_id, [trace_path, output_path])

def cleanup(self) -> None:
    """Remove the run container and clean up resources (including tmpfs mounts)."""
    container_id = self._run_container_id
    if container_id:
        self._docker_remove(container_id)
        self.runtime.untrack_container(container_id)
        self._run_container_id = None
    # Release tmpfs backing for WORKDIR and TMPFS mounts
    for mount in self._resolved_mounts:
        if mount.kind in (MountKind.WORKDIR, MountKind.TMPFS):
            self.runtime.release_tmpfs(Path(mount.host_path))

# -------------------------------------------------------------------------
# Docker CLI helpers
Expand Down Expand Up @@ -514,7 +569,7 @@ def _docker_create(
]

# Run as the owner of bind-mounted directories
user_flag = _detect_mount_user(config.mounts)
user_flag = _detect_mount_user(self._resolved_mounts)
if user_flag:
cmd.extend(["--user", user_flag])

Expand Down Expand Up @@ -555,8 +610,8 @@ def _docker_create(
cmd.extend(["-e", f"{k}={v}"])

# Mounts
for host, container, mode in config.mounts:
cmd.extend(["-v", f"{host}:{container}:{mode}"])
for rm in self._resolved_mounts:
cmd.extend(["-v", f"{rm.host_path}:{rm.container_path}:{rm.mode}"])

cmd.append(config.image)
cmd.extend(command)
Expand Down Expand Up @@ -712,9 +767,15 @@ class DockerRuntime:
Tracks all created containers for cleanup on shutdown.
"""

def __init__(self) -> None:
def __init__(self, cache_dir: Path) -> None:
self._cache_dir = cache_dir
self._fast_io_dir = get_fast_io_dir(cache_dir)
self._handles: list[DockerContainerHandle] = []
self._created_containers: set[str] = set()
self._tmpfs_mounts: set[Path] = set()
# Serializes tmpfs mount/unmount to avoid concurrent `mount` failures
# under high task parallelism (e.g. 200 tasks mounting under /dev/shm).
self._mount_lock = threading.Lock()
# Serializes `docker pull` per image tag so that concurrent task threads
# don't each trigger docker-credential-gcloud against the metadata server,
# which causes sporadic "no active account" errors under load.
Expand Down Expand Up @@ -764,13 +825,36 @@ def ensure_image(self, image: str) -> None:
logger.info("Image %s pulled successfully", image)
self._pulled_images.add(image)

def resolve_mounts(self, mounts: list[MountSpec], workdir_host_path: Path | None = None) -> list[ResolvedMount]:
    """Convert semantic MountSpecs to ResolvedMount instances.

    Creates host directories as needed. WORKDIR uses the explicit host path
    (created by task_attempt) and mounts tmpfs on it for isolation.
    CACHE and TMPFS get dirs under fast_io_dir.

    Raises:
        RuntimeError: for a WORKDIR mount without *workdir_host_path*.
    """
    resolved: list[ResolvedMount] = []
    for spec in mounts:
        access = "ro" if spec.read_only else "rw"
        if spec.kind == MountKind.WORKDIR:
            if workdir_host_path is None:
                raise RuntimeError("WORKDIR mount requires workdir_host_path")
            limit = spec.size_bytes if spec.size_bytes > 0 else DEFAULT_WORKDIR_DISK_BYTES
            # NOTE(review): mounting tmpfs here hides anything already staged
            # into workdir_host_path — confirm callers stage the bundle only
            # after create_container/resolve_mounts has run.
            self._mount_tmpfs(workdir_host_path, limit)
            resolved.append(ResolvedMount(str(workdir_host_path), spec.container_path, access, spec.kind))
        elif spec.kind in (MountKind.TMPFS, MountKind.CACHE):
            # Derive a flat, unique-ish host dir name from the container path.
            backing = self._fast_io_dir / spec.container_path.strip("/").replace("/", "-")
            backing.mkdir(parents=True, exist_ok=True)
            resolved.append(ResolvedMount(str(backing), spec.container_path, access, spec.kind))
    return resolved

def create_container(self, config: ContainerConfig) -> DockerContainerHandle:
    """Create a container handle from config.

    The handle is not started - call handle.build() then handle.run()
    to execute the container.
    """
    mounts = self.resolve_mounts(config.mounts, workdir_host_path=config.workdir_host_path)
    handle = DockerContainerHandle(config=config, runtime=self, _resolved_mounts=mounts)
    self._handles.append(handle)
    return handle

Expand All @@ -787,6 +871,53 @@ def stage_bundle(
bundle_store.extract_bundle_to(bundle_id, workdir)
bundle_store.write_workdir_files(workdir, workdir_files)

def _mount_tmpfs(self, workdir: Path, disk_bytes: int) -> None:
    """Mount a size-limited tmpfs at *workdir* and track it for cleanup.

    Serialized via ``_mount_lock`` because mount(8) can spuriously fail
    under high concurrency. Raises RuntimeError on non-Linux hosts or
    when the mount genuinely fails.
    """
    if sys.platform != "linux":
        raise RuntimeError("Docker workdir disk limits require Linux tmpfs mounts")
    workdir.mkdir(parents=True, exist_ok=True)
    mount_cmd = ["mount", "-t", "tmpfs", "-o", f"size={disk_bytes},nodev,nosuid", "tmpfs", str(workdir)]
    with self._mount_lock:
        if os.path.ismount(workdir):
            logger.info("Workdir %s is already a mountpoint; reusing", workdir)
            return
        proc = subprocess.run(mount_cmd, capture_output=True, text=True, check=False)
        if proc.returncode != 0:
            # Under high concurrency the mount utility can spuriously fail
            # even for distinct paths; if the target is now mounted, accept it.
            if not os.path.ismount(workdir):
                raise RuntimeError(f"Failed to mount tmpfs workdir {workdir}: {proc.stderr.strip()}")
            logger.warning(
                "mount command failed but %s is now a mountpoint; treating as success (stderr: %s)",
                workdir,
                proc.stderr.strip(),
            )
        self._tmpfs_mounts.add(workdir)
        logger.info("Mounted tmpfs workdir %s with size=%d bytes", workdir, disk_bytes)

def release_tmpfs(self, workdir: Path) -> None:
    """Unmount a tmpfs workdir if it was mounted by this runtime.

    On umount failure the path stays in ``_tmpfs_mounts`` so a later
    cleanup pass can retry, preventing leaked RAM-backed mounts.
    """
    with self._mount_lock:
        if workdir not in self._tmpfs_mounts:
            return
        if not os.path.ismount(workdir):
            # Already gone (e.g. unmounted externally); just drop tracking.
            self._tmpfs_mounts.discard(workdir)
            return
        proc = subprocess.run(["umount", str(workdir)], capture_output=True, text=True, check=False)
        if proc.returncode == 0:
            logger.info("Unmounted tmpfs workdir %s", workdir)
            # Only forget the mount on success so retries remain possible.
            self._tmpfs_mounts.discard(workdir)
        else:
            logger.warning("Failed to unmount tmpfs workdir %s: %s", workdir, proc.stderr.strip())

def track_container(self, container_id: str) -> None:
    """Track a container ID for cleanup.

    Recorded IDs are removed at runtime shutdown even if their handle's
    own cleanup never ran.
    """
    self._created_containers.add(container_id)
Expand Down
Loading
Loading