Commit 2acba3a
[iris] in-memory worker liveness, slim ListJobs, drop SnapshotView
Move worker last_heartbeat_ms/healthy/active/consecutive_failures/committed_* out of SQLite columns on the workers table and into the in-memory WorkerHealthTracker / WorkerCommitTracker, eliminating the per-heartbeat writer transaction that was bloating the WAL and starving dashboard reads. Migration 0042 drops the now-dormant columns. ListJobs no longer materializes a 26k-row snapshot: JobRow slims from 22 to 13 fields and the RPC serves directly from indexed SQL. Fixes the 212s ListJobs latencies seen on prod.
1 parent d13db14

41 files changed: 981 additions, 991 deletions
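The heart of the WAL fix is a data-structure swap: worker liveness used to be UPDATEd into SQLite on every heartbeat, and now it is a dict write behind a lock. The real WorkerHealthTracker lives in iris/cluster/controller/worker_health.py and is not shown in this diff; the following is a minimal sketch inferred only from its call sites below (ping(), all(), workers_over_threshold(), and the healthy / active / consecutive_ping_failures / last_heartbeat_ms fields), so treat names and defaults as assumptions.

import threading
import time
from dataclasses import dataclass


@dataclass
class WorkerLiveness:
    last_heartbeat_ms: int = 0
    healthy: bool = True
    active: bool = True
    consecutive_ping_failures: int = 0


class WorkerHealthTracker:
    """Per-worker liveness in process memory: a heartbeat is a locked dict
    write instead of a SQLite writer transaction."""

    def __init__(self, failure_threshold: int = 3) -> None:
        self._lock = threading.Lock()
        self._workers: dict[str, WorkerLiveness] = {}
        self._failure_threshold = failure_threshold

    def ping(self, worker_id: str, *, healthy: bool) -> None:
        with self._lock:
            liveness = self._workers.setdefault(worker_id, WorkerLiveness())
            liveness.last_heartbeat_ms = int(time.time() * 1000)
            liveness.healthy = healthy
            liveness.consecutive_ping_failures = (
                0 if healthy else liveness.consecutive_ping_failures + 1
            )

    def all(self) -> dict[str, WorkerLiveness]:
        # Snapshot copy so callers can filter without holding the lock.
        with self._lock:
            return dict(self._workers)

    def workers_over_threshold(self) -> list[str]:
        with self._lock:
            return [
                wid
                for wid, liveness in self._workers.items()
                if liveness.consecutive_ping_failures >= self._failure_threshold
            ]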


lib/iris/src/iris/cli/bug_report.py

Lines changed: 8 additions & 1 deletion
@@ -18,6 +18,7 @@
 from iris.cluster.types import JobName
 from iris.rpc import controller_pb2, job_pb2
 from iris.rpc.auth import AuthTokenInjector, TokenProvider
+from iris.rpc.compression import IRIS_RPC_COMPRESSIONS
 from iris.rpc.controller_connect import ControllerServiceClientSync
 from iris.rpc.proto_utils import format_resources, job_state_friendly, task_state_friendly
 from iris.time_proto import timestamp_from_proto
@@ -119,7 +120,13 @@ def gather_bug_report(
 ) -> BugReport:
     """Gather all diagnostic data for a job into a BugReport."""
     interceptors = [AuthTokenInjector(token_provider)] if token_provider else []
-    client = ControllerServiceClientSync(controller_url, timeout_ms=30000, interceptors=interceptors)
+    client = ControllerServiceClientSync(
+        controller_url,
+        timeout_ms=30000,
+        interceptors=interceptors,
+        accept_compression=IRIS_RPC_COMPRESSIONS,
+        send_compression=IRIS_RPC_COMPRESSIONS[0],
+    )
     log_client = LogClient.connect(controller_url, timeout_ms=30000, interceptors=interceptors)
     try:
         return _gather(client, log_client, job_id, tail=tail)
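Every ControllerServiceClientSync construction in this commit gains the same accept/send pair. iris/rpc/compression.py itself is not part of the diff; a plausible sketch of its shape, inferred only from the call sites (the preferred-codec-first ordering is read off send_compression=IRIS_RPC_COMPRESSIONS[0]; the codec names are placeholders):

# Hypothetical sketch of iris/rpc/compression.py -- not shown in this commit.
# Callers advertise the whole tuple via accept_compression and send with the
# first entry, so the preferred codec must sort first.
IRIS_RPC_COMPRESSIONS: tuple[str, ...] = ("gzip", "identity")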

lib/iris/src/iris/cli/job.py

Lines changed: 8 additions & 10 deletions
@@ -52,7 +52,6 @@
 from iris.rpc.auth import TokenProvider
 from iris.rpc.proto_utils import (
     PRIORITY_BAND_NAMES,
-    format_resources,
     job_state_friendly,
     priority_band_value,
     task_state_friendly,
@@ -1008,31 +1007,30 @@ def list_jobs(ctx, state: str | None, prefix: str | None, json_output: bool) ->
         click.echo("No jobs found.")
         return
 
-    # Build table rows
+    # Build table rows. The ListJobs response no longer includes the resource
+    # spec (it required a per-row proto decode in service of a CLI column most
+    # users skim past); call ``iris job status <id>`` for a single job's
+    # resources.
     rows: list[list[str]] = []
     has_reasons = False
 
     for j in jobs:
         job_id = j.job_id
         state_name = job_state_friendly(j.state)
         submitted = timestamp_from_proto(j.submitted_at).as_formatted_date() if j.submitted_at.epoch_ms else "-"
-        resources = format_resources(j.resources) if j.HasField("resources") else "-"
 
-        # Show error for failed jobs, pending_reason for pending/unschedulable
        reason = j.error or j.pending_reason or ""
        if reason:
            has_reasons = True
-            # Truncate long reasons
            reason = (reason[:60] + "...") if len(reason) > 63 else reason
 
-        rows.append([job_id, state_name, resources, submitted, reason])
+        rows.append([job_id, state_name, submitted, reason])
 
-    # Build headers - only include REASON column if there are any reasons
     if has_reasons:
-        headers = ["JOB ID", "STATE", "RESOURCES", "SUBMITTED", "REASON"]
+        headers = ["JOB ID", "STATE", "SUBMITTED", "REASON"]
     else:
-        headers = ["JOB ID", "STATE", "RESOURCES", "SUBMITTED"]
-        rows = [row[:4] for row in rows]
+        headers = ["JOB ID", "STATE", "SUBMITTED"]
+        rows = [row[:3] for row in rows]
 
     click.echo(tabulate(rows, headers=headers, tablefmt="plain"))

lib/iris/src/iris/cli/main.py

Lines changed: 8 additions & 1 deletion
@@ -18,6 +18,7 @@
 from iris.rpc import config_pb2, job_pb2
 from iris.rpc import controller_pb2 as _controller_pb2
 from iris.rpc.auth import AuthTokenInjector, GcpAccessTokenProvider, StaticTokenProvider, TokenProvider
+from iris.rpc.compression import IRIS_RPC_COMPRESSIONS
 from iris.rpc.controller_connect import ControllerServiceClientSync
 from iris.rpc.proto_utils import PRIORITY_BAND_NAMES, priority_band_name, priority_band_value
 
@@ -124,7 +125,13 @@ def rpc_client(
 ) -> ControllerServiceClientSync:
     """Create an RPC client with optional auth. Use as a context manager: ``with rpc_client(url) as c:``."""
     interceptors = [AuthTokenInjector(token_provider)] if token_provider else []
-    return ControllerServiceClientSync(address, timeout_ms=timeout_ms, interceptors=interceptors)
+    return ControllerServiceClientSync(
+        address,
+        timeout_ms=timeout_ms,
+        interceptors=interceptors,
+        accept_compression=IRIS_RPC_COMPRESSIONS,
+        send_compression=IRIS_RPC_COMPRESSIONS[0],
+    )
 
 
 def require_controller_url(ctx: click.Context) -> str:

lib/iris/src/iris/client/resolver.py

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,7 @@
 from iris.actor.resolver import ResolvedEndpoint, ResolveResult
 from iris.cluster.types import Namespace
 from iris.rpc import controller_pb2
+from iris.rpc.compression import IRIS_RPC_COMPRESSIONS
 from iris.rpc.controller_connect import ControllerServiceClientSync
 
 
@@ -54,6 +55,8 @@ def __init__(
         self._client = ControllerServiceClientSync(
             address=self._address,
             timeout_ms=int(timeout * 1000),
+            accept_compression=IRIS_RPC_COMPRESSIONS,
+            send_compression=IRIS_RPC_COMPRESSIONS[0],
         )
 
     def _namespace_prefix(self) -> str:

lib/iris/src/iris/cluster/client/remote_client.py

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,7 @@
 from iris.cluster.runtime.entrypoint import build_runtime_entrypoint
 from iris.cluster.types import Entrypoint, EnvironmentSpec, JobName, TaskAttempt, adjust_tpu_replicas, is_job_finished
 from iris.rpc import controller_pb2, job_pb2
+from iris.rpc.compression import IRIS_RPC_COMPRESSIONS
 from iris.rpc.controller_connect import ControllerServiceClientSync
 from iris.rpc.errors import call_with_retry, format_connect_error, poll_with_retries
 from iris.time_proto import duration_to_proto
@@ -78,6 +79,8 @@ def __init__(
             address=controller_address,
             timeout_ms=timeout_ms,
             interceptors=interceptors,
+            accept_compression=IRIS_RPC_COMPRESSIONS,
+            send_compression=IRIS_RPC_COMPRESSIONS[0],
         )
         self._log_client = LogClient.connect(
             controller_address,

lib/iris/src/iris/cluster/controller/autoscaler/recovery.py

Lines changed: 3 additions & 1 deletion
@@ -57,8 +57,10 @@ def load_autoscaler_checkpoint(db: ControllerDB) -> AutoscalerCheckpoint:
             "last_active_ms": decode_timestamp_ms,
         },
     )
+    # Failed workers have their DB row deleted (WorkerStore.remove), so
+    # surviving rows with a slice are by definition the live tracked set.
     tracked_rows = snapshot.raw(
-        "SELECT worker_id, slice_id, scale_group, address FROM workers WHERE slice_id != '' AND active = 1",
+        "SELECT worker_id, slice_id, scale_group, address FROM workers WHERE slice_id != ''",
     )
 
     slices_by_group: dict[str, list[SliceSnapshot]] = {}

lib/iris/src/iris/cluster/controller/controller.py

Lines changed: 14 additions & 22 deletions
@@ -114,7 +114,7 @@
     TaskUpdate,
     log_event,
 )
-from iris.cluster.controller.worker_health import WorkerHealthTracker
+from iris.cluster.controller.worker_health import WorkerCommitTracker, WorkerHealthTracker
 from iris.cluster.log_store_helpers import CONTROLLER_LOG_KEY
 from iris.cluster.providers.k8s.tasks import K8sTaskProvider
 from iris.cluster.providers.types import find_free_port, resolve_external_host
@@ -881,6 +881,8 @@ def _reservation_region_constraints(
     job_id_wire: str,
     claims: dict[WorkerId, ReservationClaim],
     queries: ControllerDB,
+    health: WorkerHealthTracker,
+    committed: WorkerCommitTracker,
     existing_constraints: list[Constraint],
 ) -> list[Constraint]:
     """Derive region constraints from claimed reservation workers.
@@ -897,7 +899,7 @@
     claimed_worker_ids = {worker_id for worker_id, claim in claims.items() if claim.job_id == job_id_wire}
     workers_by_id = {
         worker.worker_id: worker
-        for worker in healthy_active_workers_with_attributes(queries)
+        for worker in healthy_active_workers_with_attributes(queries, health, committed)
         if worker.worker_id in claimed_worker_ids
     }
     regions: set[str] = set()
@@ -1153,7 +1155,8 @@ def __init__(
             self._db = db
         else:
             self._db = ControllerDB(db_dir=config.local_state_dir / "db")
-        self._store = ControllerStore(self._db)
+        self._health = WorkerHealthTracker()
+        self._store = ControllerStore(self._db, health=self._health)
 
         # ThreadContainer must be initialized before the log service setup
         # because _start_local_log_server spawns a uvicorn thread.
@@ -1194,7 +1197,6 @@ def __init__(
         self._log_handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(message)s"))
         logging.getLogger("iris").addHandler(self._log_handler)
 
-        self._health = WorkerHealthTracker()
         self._transitions = ControllerTransitions(
             store=self._store,
             health=self._health,
@@ -1630,7 +1632,7 @@ def _profile_all_running_tasks(self) -> None:
         Memory profiling via memray is currently disabled because memray attach
         has been triggering segfaults in target processes.
         """
-        workers = healthy_active_workers_with_attributes(self._db)
+        workers = healthy_active_workers_with_attributes(self._db, self._health, self._store.committed)
         if not workers:
             return
         workers_by_id = {w.worker_id: w for w in workers}
@@ -1742,11 +1744,7 @@ def _cleanup_stale_claims(self, claims: dict[WorkerId, ReservationClaim] | None
         if claims is None:
             claims = _read_reservation_claims(self._db)
             persisted = True
-        with self._db.read_snapshot() as snapshot:
-            active_worker_ids = {
-                WorkerId(str(row[0]))
-                for row in snapshot.fetchall("SELECT w.worker_id FROM workers w WHERE w.active = 1")
-            }
+        active_worker_ids = {wid for wid, l in self._health.all().items() if l.active}
         claimed_job_ids = {JobName.from_wire(claim.job_id) for claim in claims.values()}
         claimed_jobs = list(_jobs_by_id(self._db, claimed_job_ids).values()) if claimed_job_ids else []
         jobs_by_id = {job.job_id.to_wire(): job for job in claimed_jobs}
@@ -1778,7 +1776,7 @@ def _claim_workers_for_reservations(self, claims: dict[WorkerId, ReservationClai
             persisted = True
         claimed_entries: set[tuple[str, int]] = {(c.job_id, c.entry_idx) for c in claims.values()}
         claimed_worker_ids: set[WorkerId] = set(claims.keys())
-        all_workers = healthy_active_workers_with_attributes(self._db)
+        all_workers = healthy_active_workers_with_attributes(self._db, self._health, self._store.committed)
         changed = False
 
         reservable_states = (
@@ -1916,7 +1914,7 @@ def _read_scheduling_state(self) -> _SchedulingStateRead:
         timer = Timer()
         with slow_log(logger, "scheduling state reads", threshold_ms=50):
             pending_tasks = _schedulable_tasks(self._db)
-            workers = healthy_active_workers_with_attributes(self._db)
+            workers = healthy_active_workers_with_attributes(self._db, self._health, self._store.committed)
         return _SchedulingStateRead(
             pending_tasks=pending_tasks,
             workers=workers,
@@ -2378,7 +2376,7 @@ def _stop_tasks_direct(
 
     def _get_active_worker_addresses(self) -> list[tuple[WorkerId, str | None]]:
         """Get healthy active workers as (worker_id, address) tuples for ping."""
-        workers = healthy_active_workers_with_attributes(self._db)
+        workers = healthy_active_workers_with_attributes(self._db, self._health, self._store.committed)
         return [(w.worker_id, w.address) for w in workers]
 
     def _run_ping_loop(self, stop_event: threading.Event) -> None:
@@ -2406,8 +2404,7 @@ def _run_ping_loop(self, stop_event: threading.Event) -> None:
                 self._health.ping(result.worker_id, healthy=True)
                 live_worker_ids.append(result.worker_id)
 
-        with self._store.transaction() as cur:
-            self._transitions.update_worker_pings(cur, live_worker_ids)
+        self._transitions.update_worker_pings(live_worker_ids)
 
         unhealthy = self._health.workers_over_threshold()
         if unhealthy:
@@ -2534,7 +2531,7 @@ def _run_autoscaler_once(self) -> None:
 
         worker_status_map = self._build_worker_status_map()
         self._autoscaler.refresh(worker_status_map)
-        workers = healthy_active_workers_with_attributes(self._db)
+        workers = healthy_active_workers_with_attributes(self._db, self._health, self._store.committed)
         demand_entries = compute_demand_entries(
             self._db,
             self._scheduler,
@@ -2546,12 +2543,7 @@ def _run_autoscaler_once(self) -> None:
     def _build_worker_status_map(self) -> WorkerStatusMap:
         """Build a map of worker_id to worker status for autoscaler idle tracking."""
         result: WorkerStatusMap = {}
-        with self._db.read_snapshot() as snapshot:
-            rows = snapshot.raw(
-                "SELECT worker_id FROM workers WHERE active = 1",
-                decoders={"worker_id": WorkerId},
-            )
-            worker_ids = {row.worker_id for row in rows}
+        worker_ids = {wid for wid, l in self._health.all().items() if l.active}
         running_by_worker = running_tasks_by_worker(self._db, worker_ids)
         for wid in worker_ids:
             result[wid] = WorkerStatus(

lib/iris/src/iris/cluster/controller/db.py

Lines changed: 39 additions & 13 deletions
@@ -20,6 +20,7 @@
 
 from iris.cluster.constraints import AttributeValue
 from iris.cluster.controller.schema import decode_timestamp_ms, decode_worker_id
+from iris.cluster.controller.worker_health import WorkerCommitTracker, WorkerHealthTracker
 from iris.cluster.types import TERMINAL_TASK_STATES, JobName, WorkerId
 from iris.rpc import job_pb2
 
@@ -919,32 +920,57 @@ def _worker_row_select() -> str:
     return WORKER_ROW_PROJECTION.select_clause()
 
 
-def healthy_active_workers_with_attributes(db: ControllerDB) -> list:
+def healthy_active_workers_with_attributes(
+    db: ControllerDB,
+    health: WorkerHealthTracker,
+    committed: WorkerCommitTracker,
+) -> list:
     """Fetch all healthy, active workers with their attributes populated.
 
     Returns WorkerRow (scalar-only) so the scheduling loop avoids loading metadata columns.
-    Uses the in-memory attribute cache to avoid a per-cycle SQL join.
+    Health/active filtering reads the in-memory tracker; committed-resource
+    arithmetic reads the in-memory commit tracker.
     """
     from iris.cluster.controller.schema import WORKER_ROW_PROJECTION
 
+    liveness = health.all()
+    healthy_active = {wid for wid, l in liveness.items() if l.healthy and l.active}
+    if not healthy_active:
+        return []
+    placeholders = ",".join("?" for _ in healthy_active)
     with db.read_snapshot() as q:
         workers = WORKER_ROW_PROJECTION.decode(
-            q.fetchall(f"SELECT {_worker_row_select()} FROM workers w WHERE w.healthy = 1 AND w.active = 1"),
+            q.fetchall(
+                f"SELECT {_worker_row_select()} FROM workers w WHERE w.worker_id IN ({placeholders})",
+                tuple(str(wid) for wid in healthy_active),
+            ),
         )
     if not workers:
         return []
     attrs_by_worker = db.get_worker_attributes()
-    return [
-        dc_replace(
-            w,
-            attributes=attrs_by_worker.get(w.worker_id, {}),
-            available_cpu_millicores=w.total_cpu_millicores - w.committed_cpu_millicores,
-            available_memory=w.total_memory_bytes - w.committed_mem,
-            available_gpus=w.total_gpu_count - w.committed_gpu,
-            available_tpus=w.total_tpu_count - w.committed_tpu,
+    hydrated = []
+    for w in workers:
+        commit = committed.get(w.worker_id)
+        l = liveness.get(w.worker_id)
+        hydrated.append(
+            dc_replace(
+                w,
+                healthy=True,
+                active=True,
+                consecutive_failures=l.consecutive_ping_failures if l is not None else 0,
+                last_heartbeat=Timestamp.from_ms(l.last_heartbeat_ms) if l is not None else w.last_heartbeat,
+                committed_cpu_millicores=commit.cpu_millicores,
+                committed_mem=commit.memory_bytes,
+                committed_gpu=commit.gpu,
+                committed_tpu=commit.tpu,
+                attributes=attrs_by_worker.get(w.worker_id, {}),
+                available_cpu_millicores=w.total_cpu_millicores - commit.cpu_millicores,
+                available_memory=w.total_memory_bytes - commit.memory_bytes,
+                available_gpus=w.total_gpu_count - commit.gpu,
+                available_tpus=w.total_tpu_count - commit.tpu,
+            )
        )
-        for w in workers
-    ]
+    return hydrated
 
 
 def insert_task_profile(
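The hydration loop above reads committed resources off a tracker entry with cpu_millicores / memory_bytes / gpu / tpu fields. A minimal sketch of the interface those call sites imply (the record's class name is invented here, since the diff never names it, and the real WorkerCommitTracker in worker_health.py is not shown):

from dataclasses import dataclass


@dataclass(frozen=True)
class CommittedResources:
    """Hypothetical record; the diff only ever reads these four fields."""

    cpu_millicores: int = 0
    memory_bytes: int = 0
    gpu: int = 0
    tpu: int = 0


class WorkerCommitTracker:
    """In-memory committed-resource ledger replacing the dropped committed_* columns."""

    def __init__(self) -> None:
        self._committed: dict[str, CommittedResources] = {}

    def get(self, worker_id: str) -> CommittedResources:
        # Workers with nothing scheduled read as all-zero commitments,
        # so available == total for them.
        return self._committed.get(worker_id, CommittedResources())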

lib/iris/src/iris/cluster/controller/migrations/0004_worker_indexes.py

Lines changed: 10 additions & 1 deletion
@@ -4,10 +4,19 @@
 import sqlite3
 
 
+def _has_column(conn: sqlite3.Connection, table: str, column: str) -> bool:
+    return column in {row[1] for row in conn.execute(f"PRAGMA table_info({table})").fetchall()}
+
+
 def migrate(conn: sqlite3.Connection) -> None:
     # Originally this migration also rewrote the `trg_txn_log_retention`
     # trigger; those statements were removed once migration 0037 dropped the
     # `txn_log` / `txn_actions` tables entirely. On DBs that already ran the
     # old form the trigger survives until 0037 executes; 0037 is idempotent
     # (`DROP TRIGGER IF EXISTS`) so no fixup is needed here.
-    conn.execute("CREATE INDEX IF NOT EXISTS idx_workers_healthy_active ON workers(healthy, active)")
+    #
+    # ``healthy`` / ``active`` were workers columns when this migration was
+    # authored. They are dropped in 0042; on a fresh DB the columns are absent
+    # at this point so the index is a no-op.
+    if _has_column(conn, "workers", "healthy") and _has_column(conn, "workers", "active"):
+        conn.execute("CREATE INDEX IF NOT EXISTS idx_workers_healthy_active ON workers(healthy, active)")
