fix: pick multi-node TP head by has_service, not first peer

robmsmt · robmsmt · commit bad4dca9e4e0 · 2026-05-21T12:24:52.000+02:00
In a multi-node TP replica only rank-0 registers the `llm` service; the
other ranks run as background workers and their OCFs stay status=pending
forever. The frontend was picking the head as the first peer whose id
matched the model id — but every peer in the group shares that id (from
the served_model_name label), so rank-N could win and the whole replica
would render as pending despite serving traffic fine.

Surface `has_service` on each peer entry from the backend and prefer it
in the frontend's head selection. The same change also makes the
expanded-view "head" label match the node on which sglang actually runs
the API server.
diff --git a/backend/services/model_service.py b/backend/services/model_service.py
@@ -73,6 +73,7 @@ def get_all_models(endpoint: str, with_details: bool = False):
                 "object": "model",
                 "created": "0x",
                 "owner": "0x",
+                "has_service": False,
                 **meta,
             }
             if with_details:
@@ -93,6 +94,7 @@ def get_all_models(endpoint: str, with_details: bool = False):
                     "object": "model",
                     "created": "0x",
                     "owner": "0x",
+                    "has_service": True,
                     **meta,
                 }
                 if with_details:
diff --git a/backend/tests/test_model_service.py b/backend/tests/test_model_service.py
@@ -98,6 +98,7 @@ def test_new_binary_head_carries_labels():
     assert entry["worker_group_id"] == "12345"
     assert entry["framework"] == "sglang"
     assert entry["status"] == "ready"
+    assert entry["has_service"] is True
 
 
 def test_metrics_only_follower_groups_with_head_via_worker_group_id():
@@ -126,6 +127,12 @@ def test_metrics_only_follower_groups_with_head_via_worker_group_id():
         == by_id["QmFollower"]["worker_group_id"]
         == "12345"
     )
+    # has_service distinguishes the actually-serving peer from TP-worker /
+    # metrics-only peers — the frontend uses this to pick the head when
+    # aggregating replicas (otherwise a multi-node TP replica shows pending
+    # forever because rank-1..N never register an llm service).
+    assert by_id["QmHead"]["has_service"] is True
+    assert by_id["QmFollower"]["has_service"] is False
 
 
 def test_pending_peer_without_served_model_name_label_falls_back_to_empty_id():
diff --git a/frontend/src/components/ui/ModelList.svelte b/frontend/src/components/ui/ModelList.svelte
@@ -57,8 +57,15 @@
             replicaCount = 0;
             models = Array.from(modelsMap.values()).map(grouped => {
                 const replicas = Array.from(grouped.replicas.values()).map(r => {
-                    // The head is the peer that owns the serving entry.
-                    const head = r.peers.find(p => p.id === grouped.id) || r.peers[0];
+                    // The head is the peer that owns the serving entry. In a
+                    // multi-node TP replica only rank-0 registers `llm`, so
+                    // has_service uniquely identifies it; the other peers all
+                    // share the same id (from served_model_name) and would
+                    // otherwise be indistinguishable.
+                    const head =
+                        r.peers.find(p => p.has_service) ||
+                        r.peers.find(p => p.id === grouped.id) ||
+                        r.peers[0];
                     const followers = r.peers.filter(p => p !== head);
                     return {
                         worker_group_id: r.worker_group_id,