
Commit edcc891

github-actions[bot], ravwojdyla, and claude committed
Move counters from Iris to Zephyr per Russell's feedback
Counters are now a Zephyr-only concept. Instead of file-based I/O through the Iris heartbeat/DB path, counters accumulate in-memory on each Zephyr worker and flow to the coordinator via the existing heartbeat RPC.

- Remove all counter code from Iris: proto fields, DB column, migration, transitions, service aggregation, worker monitor, task_attempt
- Add zephyr/counters.py with increment() / get_counters() API backed by WorkerContext (pure in-memory, zero I/O per increment)
- Extend WorkerContext protocol with increment_counter/get_counter_snapshot
- Wire counters through ZephyrWorker heartbeat → ZephyrCoordinator state
- Add counters to JobStatus dataclass and get_status() aggregation
- Log counters in coordinator periodic status lines for agent visibility
- Only send counters when values change (avoid steady-state DB/RPC churn)
- Reset counters per-task in _execute_shard to prevent cross-task leakage
- Update babysit-zephyr and babysit-job skills with counter monitoring docs

Co-authored-by: Rafal Wojdyla <ravwojdyla@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d21d09f commit edcc891

16 files changed: 263 additions & 422 deletions
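Based on the commit message, the new `zephyr/counters.py` surface is an `increment()` / `get_counters()` pair backed by in-memory state on each worker. A minimal sketch of that shape (hypothetical; the real module delegates to `WorkerContext`, which is not reproduced here):

```python
# Sketch of a pure in-memory counter API. Hypothetical: the actual
# implementation routes through WorkerContext rather than module globals.
import threading
from collections import defaultdict

_lock = threading.Lock()
_counters: dict[str, int] = defaultdict(int)


def increment(name: str, value: int = 1) -> None:
    """Accumulate a counter in memory; zero I/O per increment."""
    with _lock:
        _counters[name] += value


def get_counters() -> dict[str, int]:
    """Return a snapshot of all counters, e.g. for the heartbeat RPC."""
    with _lock:
        return dict(_counters)
```

The snapshot copies the dict under the lock so the heartbeat sender never observes a half-updated view.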


.agents/skills/babysit-job/SKILL.md

Lines changed: 7 additions & 0 deletions
```diff
@@ -103,6 +103,13 @@ data loading issues, unclear multi-file stack traces.
   or unsatisfied resources -> mark `degraded` and notify user.
 - If same error repeats after one fix attempt, do not retry blindly; report to user.
 
+### Zephyr Counters
+
+Zephyr pipelines support user-defined counters (e.g. `documents_processed`, `bytes_written`).
+Counters appear in coordinator progress logs and in `JobStatus.counters`. When babysitting
+a Zephyr job, monitor counter advancement as an additional throughput signal. See
+**babysit-zephyr** for details.
+
 ### When to Escalate
 
 - Debug Zephyr pipeline issues -> **debug-zephyr-job**
```

.agents/skills/babysit-zephyr/SKILL.md

Lines changed: 20 additions & 1 deletion
````diff
@@ -65,9 +65,23 @@ A healthy zephyr job has:
 
 The coordinator logs a progress line every 5s:
 ```
-[stage0-Map → Scatter] 347/1964 complete, 1617 in-flight, 0 queued, 1828/1891 workers alive, 63 dead
+[stage0-Map → Scatter] 347/1964 complete, 1617 in-flight, 0 queued, 1828/1891 workers alive, 63 dead, counters: bytes_written=4831838208 documents_processed=1200000
 ```
 
+### User-Defined Counters
+
+Zephyr pipelines can report user-defined counters via `zephyr.counters.increment()`. Counters are aggregated across all workers and appear in:
+- **Coordinator progress logs**: appended to the periodic status line (grep for `counters:`)
+- **`get_status()` RPC**: `JobStatus.counters` dict, accessible programmatically
+
+To report a counter from task code, use:
+```python
+from zephyr import counters
+counters.increment("documents_processed", batch_size)
+```
+
+Counters are sent to the coordinator via the worker heartbeat (every 5s) and only transmitted when values change, so idle workers incur no overhead.
+
 Fetch via the Iris CLI:
 ```bash
 uv run iris --config lib/iris/examples/marin.yaml rpc controller get-task-logs \
@@ -115,10 +129,15 @@ After submitting, monitor in escalating stages:
 3. Get the run command (or reuse the previous one).
 4. Submit and resume monitoring.
 
+## Monitoring Counters
+
+When babysitting a Zephyr job, check coordinator logs for counter lines. Counters give you insight into pipeline throughput (e.g. `documents_processed`, `bytes_written`, `validation_errors`). If counters stop advancing while shards are still in-flight, this may indicate a straggler or stuck worker; escalate to debug-zephyr-job.
+
 ## When to Escalate
 
 Escalate to **debug-zephyr-job** when:
 - A stage is stuck (no shard progress for an extended period)
 - Stragglers are holding up a stage (few in-flight, 0 queued, most workers idle)
 - Workers are failing repeatedly with the same error
+- Counters stop advancing while tasks remain in-flight
 - For controller issues (e.g., RPCs timing out), use the **debug-iris-controller** skill
````
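The aggregation this skill describes (per-worker counter snapshots summed into `JobStatus.counters`) can be sketched as follows. This is an illustration only: the `worker_counters` mapping shape and the function name are assumptions, not the coordinator's actual internals.

```python
def aggregate_counters(worker_counters: dict[str, dict[str, int]]) -> dict[str, int]:
    """Sum per-worker counter snapshots into a job-level view,
    as surfaced in JobStatus.counters (sketch, not the real code)."""
    totals: dict[str, int] = {}
    for counters in worker_counters.values():
        for name, value in counters.items():
            totals[name] = totals.get(name, 0) + value
    return totals


# Example: two workers both reporting documents_processed.
# aggregate_counters({"w1": {"documents_processed": 2},
#                     "w2": {"documents_processed": 3, "bytes_written": 1}})
# -> {"documents_processed": 5, "bytes_written": 1}
```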

lib/iris/src/iris/cluster/controller/db.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -624,7 +624,6 @@ class Task:
     current_worker_id: WorkerId | None = db_field("current_worker_id", _nullable(_decode_worker_id), default=None)
     current_worker_address: str | None = db_field("current_worker_address", _nullable(_decode_str), default=None)
     container_id: str | None = db_field("container_id", _nullable(_decode_str), default=None)
-    counters: dict[str, int] = db_field("counters_json", _decode_json_dict, default_factory=dict)
     attempts: tuple[Attempt, ...] = field(default_factory=tuple)
 
     def is_finished(self) -> bool:
```

lib/iris/src/iris/cluster/controller/migrations/0013_task_counters.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -173,8 +173,6 @@ def task_to_proto(task: Task, worker_address: str = "") -> cluster_pb2.TaskStatu
     proto.resource_usage.CopyFrom(task.resource_usage)
     if task.container_id:
         proto.container_id = task.container_id
-    if task.counters:
-        proto.counters.update(task.counters)
     # For pending tasks with prior terminal attempts, surface retry context.
     if task.state == cluster_pb2.TASK_STATE_PENDING and task.attempts and task.attempts[-1].is_terminal:
         last = task.attempts[-1]
@@ -843,12 +841,9 @@ def get_job_status(
     task_statuses = []
     total_failure_count = 0
     total_preemption_count = 0
-    total_counters: dict[str, int] = {}
     for task in tasks:
         total_failure_count += task.failure_count
         total_preemption_count += task.preemption_count
-        for name, value in task.counters.items():
-            total_counters[name] = total_counters.get(name, 0) + value
 
         task_statuses.append(task_to_proto(task, worker_address=worker_addr_by_id.get(task.worker_id, "")))
 
@@ -874,7 +869,6 @@ def get_job_status(
         tasks=task_statuses,
         name=job.request.name if job.request else "",
         pending_reason=pending_reason,
-        counters=total_counters,
     )
     if job.request:
         proto_job_status.resources.CopyFrom(job.request.resources)
```

lib/iris/src/iris/cluster/controller/transitions.py

Lines changed: 0 additions & 31 deletions
```diff
@@ -141,7 +141,6 @@ class TaskUpdate:
     resource_usage: cluster_pb2.ResourceUsage | None = None
     log_entries: list[logging_pb2.LogEntry] = field(default_factory=list)
     container_id: str | None = None
-    counters: dict[str, int] = field(default_factory=dict)
 
 
 @dataclass(frozen=True)
@@ -1024,7 +1023,6 @@ def _apply_single_heartbeat(
             or update.exit_code is not None
             or update.resource_usage is not None
            or update.log_entries
-            or update.counters
         )
         if update.new_state == prior_state and not has_new_data:
             continue
@@ -1049,12 +1047,6 @@ def _apply_single_heartbeat(
                 "UPDATE tasks SET resource_usage_proto = ? WHERE task_id = ?",
                 (usage_payload, update.task_id.to_wire()),
             )
-            if update.counters:
-                cur.execute(
-                    "UPDATE tasks SET counters_json = ? WHERE task_id = ?",
-                    (json.dumps(update.counters), update.task_id.to_wire()),
-                )
-
         terminal_ms: int | None = None
         started_ms: int | None = None
         task_state = prior_state
@@ -1104,15 +1096,6 @@ def _apply_single_heartbeat(
             task_state = cluster_pb2.TASK_STATE_PENDING
             terminal_ms = None
 
-            # Clear stale counters when the task is retried so that
-            # get_job_status() does not double-count values from the
-            # previous attempt.
-            if task_state == cluster_pb2.TASK_STATE_PENDING:
-                cur.execute(
-                    "UPDATE tasks SET counters_json = NULL WHERE task_id = ?",
-                    (update.task_id.to_wire(),),
-                )
-
         cur.execute(
             "UPDATE task_attempts SET state = ?, started_at_ms = COALESCE(started_at_ms, ?), "
             "finished_at_ms = COALESCE(finished_at_ms, ?), exit_code = COALESCE(?, exit_code), "
@@ -2138,11 +2121,6 @@ def apply_direct_provider_updates(self, updates: list[TaskUpdate]) -> TxResult:
                 "UPDATE tasks SET resource_usage_proto = ? WHERE task_id = ?",
                 (usage_payload, update.task_id.to_wire()),
             )
-            if update.counters:
-                cur.execute(
-                    "UPDATE tasks SET counters_json = ? WHERE task_id = ?",
-                    (json.dumps(update.counters), update.task_id.to_wire()),
-                )
             if update.container_id is not None:
                 cur.execute(
                     "UPDATE tasks SET container_id = ? WHERE task_id = ?",
@@ -2202,15 +2180,6 @@ def apply_direct_provider_updates(self, updates: list[TaskUpdate]) -> TxResult:
             task_state = cluster_pb2.TASK_STATE_PENDING
             terminal_ms = None
 
-            # Clear stale counters when the task is retried so that
-            # get_job_status() does not double-count values from the
-            # previous attempt.
-            if task_state == cluster_pb2.TASK_STATE_PENDING:
-                cur.execute(
-                    "UPDATE tasks SET counters_json = NULL WHERE task_id = ?",
-                    (update.task_id.to_wire(),),
-                )
-
         cur.execute(
             "UPDATE task_attempts SET state = ?, started_at_ms = COALESCE(started_at_ms, ?), "
             "finished_at_ms = COALESCE(finished_at_ms, ?), exit_code = COALESCE(?, exit_code), "
```

lib/iris/src/iris/cluster/controller/worker_provider.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -87,7 +87,6 @@ def _apply_request_from_response(
                 resource_usage=entry.resource_usage if entry.resource_usage.ByteSize() > 0 else None,
                 log_entries=list(entry.log_entries),
                 container_id=entry.container_id or None,
-                counters=dict(entry.counters) if entry.counters else {},
             )
         )
     return HeartbeatApplyRequest(
```

lib/iris/src/iris/cluster/worker/task_attempt.py

Lines changed: 0 additions & 26 deletions
```diff
@@ -7,7 +7,6 @@
 bundle download -> image build -> container run -> monitor -> cleanup.
 """
 
-import json
 import logging
 import shutil
 import socket
@@ -159,10 +158,6 @@ def build_iris_env(
     for name, port in task.ports.items():
        env[f"IRIS_PORT_{name.upper()}"] = str(port)
 
-    # Counter file is written by task code and read by the monitor loop.
-    # /app is the container-side mount point for the workdir.
-    env["IRIS_COUNTER_FILE"] = "/app/iris_counters.json"
-
     return env
 
 
@@ -257,9 +252,6 @@ def __init__(
         self.process_count: int = 0
         self.disk_mb: int = 0
 
-        # User-defined counters (read from iris_counters.json in workdir)
-        self.counters: dict[str, int] = {}
-
         # Build tracking
         self.build_started: Timestamp | None = None
         self.build_finished: Timestamp | None = None
@@ -452,9 +444,6 @@ def to_proto(self) -> cluster_pb2.TaskStatus:
             proto.build_metrics.build_started.CopyFrom(self.build_started.to_proto())
         if self.build_finished is not None:
             proto.build_metrics.build_finished.CopyFrom(self.build_finished.to_proto())
-        if self.counters:
-            proto.counters.update(self.counters)
-
         return proto
 
     def _check_cancelled(self) -> None:
@@ -754,7 +743,6 @@ def _monitor_loop(
                 )
                 # Final log fetch before container stops
                 self._stream_logs(log_reader)
-                self._read_counters()
 
                 # Container has stopped
                 if status.error:
@@ -802,8 +790,6 @@ def _monitor_loop(
             except Exception:
                 logger.debug("Stats collection failed for task %s", self.task_id, exc_info=True)
 
-            self._read_counters()
-
             # Sleep before next poll
             time.sleep(self._poll_interval_seconds)
@@ -823,18 +809,6 @@ def _stream_logs(self, reader: RuntimeLogReader) -> None:
         except Exception:
             logger.debug("Log streaming failed for task %s", self.task_id, exc_info=True)
 
-    def _read_counters(self) -> None:
-        """Read user-defined counters from iris_counters.json in the workdir."""
-        if self.workdir is None:
-            return
-        counter_file = self.workdir / "iris_counters.json"
-        if not counter_file.exists():
-            return
-        try:
-            self.counters = json.loads(counter_file.read_text())
-        except (json.JSONDecodeError, OSError):
-            logger.debug("Counter file read failed for task %s", self.task_id, exc_info=True)
-
     def _cleanup(self) -> None:
         """Clean up task resources: container, ports, image protection, workdir.
```

lib/iris/src/iris/cluster/worker/worker.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -625,8 +625,6 @@ def handle_heartbeat(self, request: cluster_pb2.HeartbeatRequest) -> cluster_pb2
                 entry.finished_at.CopyFrom(task_proto.finished_at)
             if task_proto.resource_usage.ByteSize() > 0:
                 entry.resource_usage.CopyFrom(task_proto.resource_usage)
-            if task_proto.counters:
-                entry.counters.update(task_proto.counters)
             tasks.append(entry)
 
         # Kill tasks not in expected_tasks - the controller has decided these
```

lib/iris/src/iris/counters.py

Lines changed: 0 additions & 108 deletions
This file was deleted.
