fix: surface PENDING peers under their served_model_name during boot

robmsmt · robmsmt · commit 7591d55f174b · 2026-05-18T20:07:26.000+02:00
Before: when a launching peer is still in PENDING (no service advertised
yet), get_all_models surfaces it with id="" and worker_group_id set.
The frontend (ModelList.svelte) builds wgToModel from peers that already
carry an id, then drops any remaining id="" peer whose worker_group_id
doesn't appear in that map. During the brief PENDING window every peer
in the worker group is service-less, so wgToModel is empty for that
group and the replica is silently filtered out. The first moment it
could be rendered is when the service is advertised, but registrar.go
flips status from PENDING to READY in that same step — so PENDING is
never actually visible on the dashboard.

After: fall back to labels.served_model_name (already emitted by
model-launch's _ocf_labels on every peer) when synthesising the
no-service entry. The peer now has a real model id during boot, the
frontend's grouping succeeds, and the status pill renders "pending"
until the health check passes.

Tests updated: the multi-node-replica grouping test previously asserted
the follower kept id="". With served_model_name on every peer, both
peers in the group now resolve to the same id; we still verify the
shared worker_group_id keeps them in one replica. Added a defensive
test for the older-binary case (no served_model_name label) where the
id stays empty as before.
diff --git a/backend/services/model_service.py b/backend/services/model_service.py
@@ -63,8 +63,13 @@ def get_all_models(endpoint: str, with_details: bool = False):
             # worker_group_id and show it as part of a launching/follower set.
             if not meta["worker_group_id"]:
                 continue
+            # Fall back to the served_model_name label so the frontend can
+            # group PENDING peers under their eventual model card during boot.
+            # Without this, the brief PENDING window is invisible because the
+            # peer has no advertised service yet and nothing else maps its
+            # worker_group_id back to a model id.
             entry = {
-                "id": "",  # no model yet
+                "id": meta["labels"].get("served_model_name", ""),
                 "object": "model",
                 "created": "0x",
                 "owner": "0x",
diff --git a/backend/tests/test_model_service.py b/backend/tests/test_model_service.py
@@ -53,6 +53,7 @@ def json(self):
         "slurm_job_id": "12345",
         "worker_group_id": "12345",
         "framework": "sglang",
+        "served_model_name": "swiss-ai/Apertus-8B",
         "started_at": "2026-05-15T18:00:00Z",
     },
     "hardware": {"gpus": [{"name": "GH200"}] * 4},
@@ -100,9 +101,11 @@ def test_new_binary_head_carries_labels():
 
 
 def test_metrics_only_follower_groups_with_head_via_worker_group_id():
-    """A multi-node replica's follower has no `service` but does carry
-    worker_group_id. It should appear in the output with id='' so the
-    frontend can attribute it to the same replica as the head."""
+    """A peer with no advertised `service` (multi-node follower, or a head
+    still in PENDING during boot) should fall back to its served_model_name
+    label so the frontend can render the model card during the brief window
+    before the service is published. Without the fallback, the peer's id
+    stays empty and the frontend silently drops it."""
     with patch("backend.services.model_service.requests.get") as mock_get:
         mock_get.return_value = _dnt_response(
             {
@@ -114,15 +117,34 @@ def test_metrics_only_follower_groups_with_head_via_worker_group_id():
     assert len(out) == 2
     by_id = {e["peer_id"]: e for e in out}
     assert by_id["QmHead"]["id"] == "swiss-ai/Apertus-8B"
-    assert by_id["QmFollower"]["id"] == ""
-    # Shared worker_group_id lets the frontend group them.
+    # Follower inherits id from the served_model_name label — same model card.
+    assert by_id["QmFollower"]["id"] == "swiss-ai/Apertus-8B"
+    assert by_id["QmFollower"]["status"] == "pending"
+    # Shared worker_group_id lets the frontend group them within the model.
     assert (
         by_id["QmHead"]["worker_group_id"]
         == by_id["QmFollower"]["worker_group_id"]
         == "12345"
     )
 
 
+def test_pending_peer_without_served_model_name_label_falls_back_to_empty_id():
+    """Defensive: if a peer is mid-boot from an older binary that doesn't
+    emit served_model_name, we still surface it via worker_group_id with
+    id=''. The frontend then needs another peer in the same group with an
+    id to attribute it; otherwise it's dropped."""
+    peer = {
+        **PEER_NEW_BINARY_FOLLOWER,
+        "labels": {k: v for k, v in PEER_NEW_BINARY_FOLLOWER["labels"].items() if k != "served_model_name"},
+    }
+    with patch("backend.services.model_service.requests.get") as mock_get:
+        mock_get.return_value = _dnt_response({"/QmPending": peer})
+        out = get_all_models("http://x/v1/dnt/table", with_details=True)
+    assert len(out) == 1
+    assert out[0]["id"] == ""
+    assert out[0]["worker_group_id"] == "12345"
+
+
 def test_follower_without_worker_group_id_skipped():
     """Older binary follower with no labels and no service is uninformative —
     drop it so the model list stays clean."""
@@ -196,9 +218,10 @@ def test_real_prod_payload_returns_models():
 
 def test_upgraded_payload_groups_multinode_replica():
     """Simulated v0.0.6 deployment: the gemma 'multi-node demo' pair share a
-    worker_group_id. One has a service, the other is metrics-only with id=''.
-    Backend returns both entries with the shared worker_group_id so the
-    frontend can aggregate them into one logical replica."""
+    worker_group_id. Both peers carry the served_model_name label, so both
+    resolve to the same model id even though only one advertises a service.
+    Backend returns both entries with the shared worker_group_id + model id
+    so the frontend can aggregate them into one logical replica."""
     with patch("backend.services.model_service.requests.get") as mock_get:
         mock_get.return_value = type(
             "R",
@@ -212,7 +235,8 @@ def test_upgraded_payload_groups_multinode_replica():
         by_wg.setdefault(e["worker_group_id"], []).append(e)
     multi = [v for v in by_wg.values() if len(v) > 1]
     assert multi, "fixture should contain at least one multi-peer worker group"
-    # At least one peer in the multi-peer group should be metrics-only (id='').
     pair = multi[0]
-    assert any(e["id"] == "" for e in pair), pair
-    assert any(e["id"] != "" for e in pair), pair
+    # Both peers in the group share the same non-empty model id.
+    ids = {e["id"] for e in pair}
+    assert ids != {""}, pair
+    assert len(ids) == 1, f"peers in one worker group should share one model id: {ids}"