Skip to content

Commit 51475ac

Browse files
committed
Use protobuf JSON formatting for worker metadata
1 parent 4f0728d commit 51475ac

2 files changed

Lines changed: 40 additions & 23 deletions

File tree

lib/marin/src/marin/mcp/babysitter.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pathlib import Path
1212
from typing import Any
1313

14+
from google.protobuf import json_format
1415
from iris.cli.bug_report import gather_bug_report
1516
from iris.cli.job import build_job_summary
1617
from iris.cli.token_store import cluster_name_from_url, load_any_token, load_token
@@ -30,6 +31,7 @@
3031
DEFAULT_PROFILE_SECONDS = 1
3132
MAX_LIST_JOBS_PAGE_SIZE = 500
3233
DEFAULT_LIST_JOBS_LIMIT = 100
34+
_PROTO_TO_DICT_OPTIONS = dict(preserving_proto_field_name=True)
3335

3436
_ZEPHYR_PROGRESS_RE = re.compile(
3537
r"\[(?P<stage>[^\]]+)\]\s+"
@@ -183,16 +185,12 @@ def job_status_to_json(job: job_pb2.JobStatus, tasks: Iterable[job_pb2.TaskStatu
183185
}
184186

185187

186-
def _attribute_value_to_json(value) -> Any:
187-
kind = value.WhichOneof("value")
188-
if kind is None:
189-
return None
190-
return getattr(value, kind)
188+
def _worker_metadata_to_json(metadata: job_pb2.WorkerMetadata) -> dict[str, Any]:
189+
return json_format.MessageToDict(metadata, **_PROTO_TO_DICT_OPTIONS)
191190

192191

193192
def worker_status_to_json(worker: controller_pb2.Controller.WorkerHealthStatus) -> dict[str, Any]:
194193
"""Serialize Iris worker health into stable JSON."""
195-
metadata = worker.metadata
196194
return {
197195
"worker_id": worker.worker_id,
198196
"healthy": bool(worker.healthy),
@@ -201,23 +199,7 @@ def worker_status_to_json(worker: controller_pb2.Controller.WorkerHealthStatus)
201199
"running_job_ids": list(worker.running_job_ids),
202200
"address": worker.address,
203201
"status_message": worker.status_message,
204-
"metadata": {
205-
"hostname": metadata.hostname,
206-
"ip_address": metadata.ip_address,
207-
"cpu_count": int(metadata.cpu_count),
208-
"memory_bytes": int(metadata.memory_bytes),
209-
"disk_bytes": int(metadata.disk_bytes),
210-
"device": _device_config_to_json(metadata.device) if metadata.HasField("device") else _cpu_device_json(),
211-
"tpu_name": metadata.tpu_name,
212-
"tpu_worker_id": metadata.tpu_worker_id,
213-
"gpu_count": int(metadata.gpu_count),
214-
"gpu_name": metadata.gpu_name,
215-
"gpu_memory_mb": int(metadata.gpu_memory_mb),
216-
"gce_instance_name": metadata.gce_instance_name,
217-
"gce_zone": metadata.gce_zone,
218-
"git_hash": metadata.git_hash,
219-
"attributes": {key: _attribute_value_to_json(value) for key, value in metadata.attributes.items()},
220-
},
202+
"metadata": _worker_metadata_to_json(worker.metadata),
221203
}
222204

223205

tests/mcp/test_babysitter.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
classify_diagnosis,
1313
parse_zephyr_progress,
1414
task_status_to_json,
15+
worker_status_to_json,
1516
)
1617

1718

@@ -91,6 +92,40 @@ def test_task_status_json_includes_attempts_timestamps_and_usage():
9192
assert payload["attempts"][1]["exit_code"] == 137
9293

9394

95+
def test_worker_status_json_uses_protobuf_metadata_serializer():
96+
worker = controller_pb2.Controller.WorkerHealthStatus(
97+
worker_id="worker-a",
98+
healthy=True,
99+
consecutive_failures=1,
100+
last_heartbeat=_timestamp(12_345),
101+
running_job_ids=["/alice/train"],
102+
address="worker-a:10001",
103+
status_message="healthy",
104+
metadata=job_pb2.WorkerMetadata(
105+
hostname="worker-a",
106+
memory_bytes=123_456_789_012,
107+
device=job_pb2.DeviceConfig(gpu=job_pb2.GpuDevice(variant="H100", count=8)),
108+
tpu_worker_hostnames="worker-a,worker-b",
109+
attributes={
110+
"region": job_pb2.AttributeValue(string_value="us-central1"),
111+
"slots": job_pb2.AttributeValue(int_value=4),
112+
},
113+
),
114+
)
115+
116+
payload = worker_status_to_json(worker)
117+
118+
assert payload["last_heartbeat_ms"] == 12_345
119+
assert payload["metadata"]["hostname"] == "worker-a"
120+
assert payload["metadata"]["memory_bytes"] == "123456789012"
121+
assert payload["metadata"]["device"] == {"gpu": {"variant": "H100", "count": 8}}
122+
assert payload["metadata"]["tpu_worker_hostnames"] == "worker-a,worker-b"
123+
assert payload["metadata"]["attributes"] == {
124+
"region": {"string_value": "us-central1"},
125+
"slots": {"int_value": "4"},
126+
}
127+
128+
94129
def test_job_summary_payload_preserves_summary_task_fields():
95130
job = job_pb2.JobStatus(
96131
job_id="/alice/train",

0 commit comments

Comments
 (0)