Skip to content

Commit bad4dca

Browse files
committed
fix: pick multi-node TP head by has_service, not first peer
In a multi-node TP replica only rank-0 registers the `llm` service; the other ranks run as background workers and their OCFs stay status=pending forever. The frontend was picking the head as the first peer whose id matched the model id — but every peer in the group shares that id (from the served_model_name label), so rank-N could win and the whole replica would render as pending despite serving traffic fine. Surface `has_service` on each peer entry from the backend and prefer it in the frontend's head selection. Same change also makes the expanded-view "head" label match the node sglang actually runs the API server on.
1 parent 70fe01d commit bad4dca

3 files changed

Lines changed: 18 additions & 2 deletions

File tree

backend/services/model_service.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def get_all_models(endpoint: str, with_details: bool = False):
7373
"object": "model",
7474
"created": "0x",
7575
"owner": "0x",
76+
"has_service": False,
7677
**meta,
7778
}
7879
if with_details:
@@ -93,6 +94,7 @@ def get_all_models(endpoint: str, with_details: bool = False):
9394
"object": "model",
9495
"created": "0x",
9596
"owner": "0x",
97+
"has_service": True,
9698
**meta,
9799
}
98100
if with_details:

backend/tests/test_model_service.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ def test_new_binary_head_carries_labels():
9898
assert entry["worker_group_id"] == "12345"
9999
assert entry["framework"] == "sglang"
100100
assert entry["status"] == "ready"
101+
assert entry["has_service"] is True
101102

102103

103104
def test_metrics_only_follower_groups_with_head_via_worker_group_id():
@@ -126,6 +127,12 @@ def test_metrics_only_follower_groups_with_head_via_worker_group_id():
126127
== by_id["QmFollower"]["worker_group_id"]
127128
== "12345"
128129
)
130+
# has_service distinguishes the actually-serving peer from TP-worker /
131+
# metrics-only peers — the frontend uses this to pick the head when
132+
# aggregating replicas (otherwise a multi-node TP replica shows pending
133+
# forever because rank-1..N never register an llm service).
134+
assert by_id["QmHead"]["has_service"] is True
135+
assert by_id["QmFollower"]["has_service"] is False
129136

130137

131138
def test_pending_peer_without_served_model_name_label_falls_back_to_empty_id():

frontend/src/components/ui/ModelList.svelte

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,15 @@
5757
replicaCount = 0;
5858
models = Array.from(modelsMap.values()).map(grouped => {
5959
const replicas = Array.from(grouped.replicas.values()).map(r => {
60-
// The head is the peer that owns the serving entry.
61-
const head = r.peers.find(p => p.id === grouped.id) || r.peers[0];
60+
// The head is the peer that owns the serving entry. In a
61+
// multi-node TP replica only rank-0 registers `llm`, so
62+
// has_service uniquely identifies it; the other peers all
63+
// share the same id (from served_model_name) and would
64+
// otherwise be indistinguishable.
65+
const head =
66+
r.peers.find(p => p.has_service) ||
67+
r.peers.find(p => p.id === grouped.id) ||
68+
r.peers[0];
6269
const followers = r.peers.filter(p => p !== head);
6370
return {
6471
worker_group_id: r.worker_group_id,

0 commit comments

Comments
 (0)