diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..2292e31
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,39 @@
+# Copy this to .env (the Makefile does this automatically on first run).
+# Fill in the REPLACE_WITH_* placeholders with values from
+# rob-poc/serving-api/dev/secrets.yaml. Never put prod values here: the
+# Makefile's _guard-local-db only rejects a non-local DATABASE_URL host.
+
+# ── Local Postgres (matches what `make db-up` spins up) ──────────────────────
+DATABASE_URL=postgresql://serving:serving@localhost:5433/serving
+
+# ── Auth0 (research.computer tenant) ─────────────────────────────────────────
+AUTH0_DOMAIN=researchcomputer.eu.auth0.com
+AUTH0_API_AUDIENCE=https://researchcomputer.eu.auth0.com/
+AUTH0_ISSUER=https://researchcomputer.eu.auth0.com/
+AUTH0_ALGORITHMS=RS256
+AUTH0_CLIENT_ID=REPLACE_WITH_AUTH0_CLIENT_ID
+AUTH0_CLIENT_SECRET=REPLACE_WITH_AUTH0_CLIENT_SECRET
+AUTH_SECRET=REPLACE_WITH_RANDOM_STRING
+AUTH_TRUST_HOST=true
+
+VITE_AUTH0_CLIENT_ID=REPLACE_WITH_VITE_AUTH0_CLIENT_ID
+VITE_AUTH0_DOMAIN=researchcomputer.eu.auth0.com
+
+# ── OpenTela / OCF (peer discovery + LLM routing) ────────────────────────────
+# Point at the dev OpenTela head for live model discovery, or use
+# OTELA_FIXTURE_PATH (set by `make dummy-run`) to read a static snapshot.
+OCF_HEAD_ADDR=http://148.187.108.177:8092
+
+# ── Langfuse (observability — optional; leave blank to disable) ──────────────
+LANGFUSE_HOST=https://cloud.langfuse.com
+LANGFUSE_PUBLIC_KEY=
+LANGFUSE_SECRET_KEY=
+
+# ── CSCS L1 passthrough (optional; leave blank to disable) ───────────────────
+# When both are set, requests for L1-hosted Apertus models forward here
+# instead of OpenTela. See backend/services/cscs_l1_service.py.
+CSCS_L1_BASE_URL=
+CSCS_L1_API_KEY=
+
+# ── Logfire (observability — optional) ───────────────────────────────────────
+LOGFIRE_TOKEN=
diff --git a/.gitignore b/.gitignore
index 4686944..c82f305 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
*.pyc
*.env
*.env*
+!.env.example
details.json
secrets/*.json
.venv/*
diff --git a/Makefile b/Makefile
index 9151fda..df5f012 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: install install-dev format check test run dummy-run db-up db-down migrate _ensure-env _ensure-frontend-env
+.PHONY: install install-dev format check test run dummy-run db-up db-down migrate _ensure-env _ensure-frontend-env _guard-local-db _guard-local-api
UV_EXTRA ?=
@@ -66,10 +66,38 @@ db-down:
-docker stop $(PG_CONTAINER) > /dev/null 2>&1
-docker rm $(PG_CONTAINER) > /dev/null 2>&1
-migrate: _ensure-env db-up
+# Refuse to run any DB-touching target if .env points at a non-local host.
+# We never want `make run` / `make migrate` to accidentally apply migrations
+# or open connections against a remote (prod/staging) database. URL userinfo
+# is optional and a bracketed IPv6 host ([::1]) is unwrapped before the check.
+_guard-local-db: _ensure-env
+	@url=$$(grep -E '^DATABASE_URL=' .env | head -1 | cut -d= -f2- | tr -d '"' | tr -d "'"); \
+	  host=$$(echo "$$url" | sed -E -e 's,^[^:]+://([^@/]*@)?(\[[^]]*\]|[^:/?]+).*,\2,' -e 's,^\[(.*)\]$$,\1,'); \
+	  case "$$host" in \
+	    localhost|127.0.0.1|::1|"") ;; \
+	    *) echo "REFUSING: .env DATABASE_URL host '$$host' is not local."; \
+	       echo "Local dev must not run against prod/staging. Set DATABASE_URL=$(DATABASE_URL) in .env."; \
+	       exit 1;; \
+	  esac
+
+# Same guard for the frontend — VITE_API_URL is what `npm run dev` reads,
+# so a prod URL there silently makes the local UI hit prod (this is what
+# tripped up dummy-run). Empty / unset is fine (frontend defaults apply).
+# URL userinfo is skipped and a bracketed IPv6 host ([::1]) is unwrapped.
+_guard-local-api: _ensure-frontend-env
+	@url=$$(grep -E '^VITE_API_URL=' frontend/.env | head -1 | cut -d= -f2- | tr -d '"' | tr -d "'"); \
+	  host=$$(echo "$$url" | sed -E -e 's,^[^:]+://([^@/]*@)?(\[[^]]*\]|[^:/?]+).*,\2,' -e 's,^\[(.*)\]$$,\1,'); \
+	  case "$$host" in \
+	    localhost|127.0.0.1|::1|"") ;; \
+	    *) echo "REFUSING: frontend/.env VITE_API_URL host '$$host' is not local."; \
+	       echo "Local dev must not call prod/staging API. Set VITE_API_URL=http://localhost:8080 in frontend/.env."; \
+	       exit 1;; \
+	  esac
+
+migrate: _ensure-env _guard-local-db db-up
alembic upgrade head
-run: _ensure-env _ensure-frontend-env db-up migrate
+run: _ensure-env _ensure-frontend-env _guard-local-api db-up migrate
uvicorn backend.main:app --reload --host 0.0.0.0 --port 8080 & \
cd frontend && npm run dev & \
wait
@@ -77,7 +105,7 @@ run: _ensure-env _ensure-frontend-env db-up migrate
# Same as `run` but forces the model list to come from the synthesised
# upgraded fixture instead of the live OpenTela endpoint. Useful for
# iterating on the model-card UI without depending on prod state.
-dummy-run: _ensure-env _ensure-frontend-env db-up migrate
+dummy-run: _ensure-env _ensure-frontend-env _guard-local-api db-up migrate
OTELA_FIXTURE_PATH=$(PWD)/backend/tests/fixtures/dnt_table_dev_live.json \
uvicorn backend.main:app --reload --host 0.0.0.0 --port 8080 & \
cd frontend && npm run dev & \
diff --git a/backend/services/model_service.py b/backend/services/model_service.py
index ec43bb6..3d2a7f3 100644
--- a/backend/services/model_service.py
+++ b/backend/services/model_service.py
@@ -73,6 +73,7 @@ def get_all_models(endpoint: str, with_details: bool = False):
"object": "model",
"created": "0x",
"owner": "0x",
+ "has_service": False,
**meta,
}
if with_details:
@@ -93,6 +94,7 @@ def get_all_models(endpoint: str, with_details: bool = False):
"object": "model",
"created": "0x",
"owner": "0x",
+ "has_service": True,
**meta,
}
if with_details:
diff --git a/backend/tests/fixtures/dnt_table_dev_live.json b/backend/tests/fixtures/dnt_table_dev_live.json
index b6ccb64..8394bc6 100644
--- a/backend/tests/fixtures/dnt_table_dev_live.json
+++ b/backend/tests/fixtures/dnt_table_dev_live.json
@@ -770,5 +770,352 @@
],
"status": "ready",
"version": "v0.1.11"
+ },
+ "/QmNWFQvMQSimNCYzRYpZb7yhwfER53nYo3pJhQJ9HAdM7U": {
+ "id": "QmNWFQvMQSimNCYzRYpZb7yhwfER53nYo3pJhQJ9HAdM7U",
+ "latency": 0,
+ "privileged": false,
+ "owner": "",
+ "current_offering": null,
+ "role": null,
+ "status": "ready",
+ "available_offering": null,
+ "service": [
+ {
+ "name": "llm",
+ "hardware": {
+ "gpus": null,
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "status": "connected",
+ "host": "localhost",
+ "port": "8080",
+ "identity_group": [
+ "model=meta-llama/Llama-3.3-70B-Instruct"
+ ]
+ }
+ ],
+ "last_seen": 1779267166,
+ "version": "sai-v0.0.6",
+ "hostname": "sglang-llama-70b-bb9cbb4b6-wn9x2",
+ "labels": {
+ "framework": "sglang",
+ "launched_by": "k8s",
+ "served_model_name": "meta-llama/Llama-3.3-70B-Instruct",
+ "started_at": "2026-05-19T15:58:39Z",
+ "worker_group_id": "sglang-llama-70b-bb9cbb4b6-wn9x2"
+ },
+ "public_address": "",
+ "hardware": {
+ "gpus": [
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 14
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 9
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 8
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 7
+ }
+ ],
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "connected": true,
+ "load": null
+ },
+ "/QmNpziFtdsDmFFPSdgL3XMwz6cf3jDaZxNVvb4e9Be39f3": {
+ "id": "QmNpziFtdsDmFFPSdgL3XMwz6cf3jDaZxNVvb4e9Be39f3",
+ "latency": 0,
+ "privileged": false,
+ "owner": "",
+ "current_offering": null,
+ "role": null,
+ "status": "ready",
+ "available_offering": null,
+ "service": [
+ {
+ "name": "llm",
+ "hardware": {
+ "gpus": null,
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "status": "connected",
+ "host": "localhost",
+ "port": "8080",
+ "identity_group": [
+ "model=Snowflake/snowflake-arctic-embed-l-v2.0"
+ ]
+ }
+ ],
+ "last_seen": 1779267169,
+ "version": "sai-v0.0.6",
+ "hostname": "vllm-snowflake-7468ff7dc8-4hv68",
+ "labels": {
+ "framework": "vllm",
+ "launched_by": "k8s",
+ "served_model_name": "Snowflake/snowflake-arctic-embed-l-v2.0",
+ "started_at": "2026-05-19T15:57:44Z",
+ "worker_group_id": "vllm-snowflake-7468ff7dc8-4hv68"
+ },
+ "public_address": "",
+ "hardware": {
+ "gpus": [
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 1
+ }
+ ],
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "connected": true,
+ "load": null
+ },
+ "/QmUsx1HD2WPJ9hjVGXcpA9RTcn5YmN6bDuxFF2mUburLRq": {
+ "id": "QmUsx1HD2WPJ9hjVGXcpA9RTcn5YmN6bDuxFF2mUburLRq",
+ "latency": 0,
+ "privileged": false,
+ "owner": "",
+ "current_offering": null,
+ "role": null,
+ "status": "ready",
+ "available_offering": null,
+ "service": [
+ {
+ "name": "llm",
+ "hardware": {
+ "gpus": null,
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "status": "connected",
+ "host": "localhost",
+ "port": "8080",
+ "identity_group": [
+ "model=Qwen/Qwen3.5-27B"
+ ]
+ }
+ ],
+ "last_seen": 1779267152,
+ "version": "sai-v0.0.6",
+ "hostname": "vllm-qwen35-27b-7fc7bb7ffc-4bgkf",
+ "labels": {
+ "framework": "vllm",
+ "launched_by": "k8s",
+ "served_model_name": "Qwen/Qwen3.5-27B",
+ "started_at": "2026-05-19T15:59:26Z",
+ "worker_group_id": "vllm-qwen35-27b-7fc7bb7ffc-4bgkf"
+ },
+ "public_address": "",
+ "hardware": {
+ "gpus": [
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 0
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 1
+ }
+ ],
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "connected": true,
+ "load": null
+ },
+ "/QmWZBsbbkn93QHp4wVDaQ92u9LEBZsgAUn9jbj1iBKdDhj": {
+ "id": "QmWZBsbbkn93QHp4wVDaQ92u9LEBZsgAUn9jbj1iBKdDhj",
+ "latency": 0,
+ "privileged": false,
+ "owner": "",
+ "current_offering": null,
+ "role": null,
+ "status": "ready",
+ "available_offering": null,
+ "service": [
+ {
+ "name": "llm",
+ "hardware": {
+ "gpus": null,
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "status": "connected",
+ "host": "localhost",
+ "port": "8080",
+ "identity_group": [
+ "model=swiss-ai/Apertus-70B-Instruct-2509"
+ ]
+ }
+ ],
+ "last_seen": 1779267164,
+ "version": "sai-v0.0.6",
+ "hostname": "vllm-apertus-70b-84d688fd7d-mbhbd",
+ "labels": {
+ "framework": "vllm",
+ "launched_by": "k8s",
+ "served_model_name": "swiss-ai/Apertus-70B-Instruct-2509",
+ "started_at": "2026-05-20T08:27:04Z",
+ "worker_group_id": "vllm-apertus-70b-84d688fd7d-mbhbd"
+ },
+ "public_address": "",
+ "hardware": {
+ "gpus": [
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 2
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 1
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 3
+ },
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 1
+ }
+ ],
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "connected": true,
+ "load": null
+ },
+ "/QmYTeswtJmGEzYnPEG64GFyQ6USnfCdER6czXkr2VAPE5u": {
+ "id": "QmYTeswtJmGEzYnPEG64GFyQ6USnfCdER6czXkr2VAPE5u",
+ "latency": 0,
+ "privileged": false,
+ "owner": "",
+ "current_offering": null,
+ "role": null,
+ "status": "ready",
+ "available_offering": null,
+ "service": [
+ {
+ "name": "llm",
+ "hardware": {
+ "gpus": null,
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "status": "connected",
+ "host": "localhost",
+ "port": "8080",
+ "identity_group": [
+ "model=swiss-ai/Apertus-8B-Instruct-2509"
+ ]
+ }
+ ],
+ "last_seen": 1779267169,
+ "version": "sai-v0.0.6",
+ "hostname": "sglang-apertus-8b-78b4f9d77-v2pbt",
+ "labels": {
+ "framework": "sglang",
+ "launched_by": "k8s",
+ "served_model_name": "swiss-ai/Apertus-8B-Instruct-2509",
+ "started_at": "2026-05-19T16:11:00Z",
+ "worker_group_id": "sglang-apertus-8b-78b4f9d77-v2pbt"
+ },
+ "public_address": "",
+ "hardware": {
+ "gpus": [
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 3
+ }
+ ],
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "connected": true,
+ "load": null
+ },
+ "/QmbVVJVM1VwTCHEo8FCFZinWT5zEBoccDJmbZtj7zRK6Cx": {
+ "id": "QmbVVJVM1VwTCHEo8FCFZinWT5zEBoccDJmbZtj7zRK6Cx",
+ "latency": 0,
+ "privileged": false,
+ "owner": "",
+ "current_offering": null,
+ "role": null,
+ "status": "ready",
+ "available_offering": null,
+ "service": [
+ {
+ "name": "llm",
+ "hardware": {
+ "gpus": null,
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "status": "connected",
+ "host": "localhost",
+ "port": "8080",
+ "identity_group": [
+ "model=zai-org/GLM-4.7-Flash"
+ ]
+ }
+ ],
+ "last_seen": 1779267154,
+ "version": "sai-v0.0.6",
+ "hostname": "sglang-glm-47-flash-759d7dd467-fx9s4",
+ "labels": {
+ "framework": "sglang",
+ "launched_by": "k8s",
+ "served_model_name": "zai-org/GLM-4.7-Flash",
+ "started_at": "2026-05-19T15:58:03Z",
+ "worker_group_id": "sglang-glm-47-flash-759d7dd467-fx9s4"
+ },
+ "public_address": "",
+ "hardware": {
+ "gpus": [
+ {
+ "name": "NVIDIA GH200 120GB",
+ "total_memory": 97871,
+ "used_memory": 1
+ }
+ ],
+ "host_memory": 0,
+ "host_memory_bandwidth": 0,
+ "host_memory_used": 0
+ },
+ "connected": true,
+ "load": null
}
-}
\ No newline at end of file
+}
diff --git a/backend/tests/test_model_service.py b/backend/tests/test_model_service.py
index dceb6bc..f56a3b1 100644
--- a/backend/tests/test_model_service.py
+++ b/backend/tests/test_model_service.py
@@ -98,6 +98,7 @@ def test_new_binary_head_carries_labels():
assert entry["worker_group_id"] == "12345"
assert entry["framework"] == "sglang"
assert entry["status"] == "ready"
+ assert entry["has_service"] is True
def test_metrics_only_follower_groups_with_head_via_worker_group_id():
@@ -126,6 +127,12 @@ def test_metrics_only_follower_groups_with_head_via_worker_group_id():
== by_id["QmFollower"]["worker_group_id"]
== "12345"
)
+ # has_service distinguishes the actually-serving peer from TP-worker /
+ # metrics-only peers — the frontend uses this to pick the head when
+ # aggregating replicas (otherwise a multi-node TP replica shows pending
+ # forever because rank-1..N never register an llm service).
+ assert by_id["QmHead"]["has_service"] is True
+ assert by_id["QmFollower"]["has_service"] is False
def test_pending_peer_without_served_model_name_label_falls_back_to_empty_id():
diff --git a/frontend/.env.example b/frontend/.env.example
new file mode 100644
index 0000000..4908f82
--- /dev/null
+++ b/frontend/.env.example
@@ -0,0 +1,16 @@
+# Copy this to frontend/.env. The Makefile does it automatically.
+#
+# Only VITE_-prefixed vars are visible to the Astro/Vite frontend; do
+# NOT paste backend secrets (DATABASE_URL, AUTH0_CLIENT_SECRET, Neon
+# URLs, etc.) here — those go in the project-root .env or
+# rob-poc/serving-api/{dev,prod}/secrets.yaml only.
+
+# API the frontend talks to. Must be localhost during local dev so
+# `make run` / `make dummy-run` hit your own backend instead of prod.
+# The Makefile's _guard-local-api refuses non-local values.
+VITE_API_URL=http://localhost:8080
+
+# Auth0 SPA client (research.computer tenant). These values are safe to
+# commit because they are public client identifiers, not secrets.
+VITE_AUTH0_CLIENT_ID=REPLACE_WITH_VITE_AUTH0_CLIENT_ID
+VITE_AUTH0_DOMAIN=researchcomputer.eu.auth0.com
diff --git a/frontend/src/components/ui/ModelCard.svelte b/frontend/src/components/ui/ModelCard.svelte
index 1688e5b..66ed76e 100644
--- a/frontend/src/components/ui/ModelCard.svelte
+++ b/frontend/src/components/ui/ModelCard.svelte
@@ -1,6 +1,6 @@
diff --git a/frontend/src/lib/modelMetrics.ts b/frontend/src/lib/modelMetrics.ts
index 4764bbd..6139610 100644
--- a/frontend/src/lib/modelMetrics.ts
+++ b/frontend/src/lib/modelMetrics.ts
@@ -4,25 +4,24 @@ export type HostingTier = "L2" | "slurm";
type ModelConfig = {
metrics?: boolean;
- tier?: HostingTier;
};
-const models: Record = {
- "swiss-ai/Apertus-8B-Instruct-2509": { tier: "L2" },
- "zai-org/GLM-4.7-Flash": { tier: "L2" },
- "Snowflake/snowflake-arctic-embed-l-v2.0": { tier: "L2" },
- "cais/HarmBench-Llama-2-13b-cls": { tier: "L2" },
- "meta-llama/Llama-3.3-70B-Instruct": { tier: "L2" },
- "meta-llama/Llama-Guard-4-12B": { tier: "L2" },
- "swiss-ai/Apertus-70B-Instruct-2509": { tier: "L2" },
- "Qwen/Qwen3.5-27B": { tier: "L2" },
-};
+// Per-model overrides for the Grafana metrics dashboard URL. Add an entry
+// with `metrics: false` for models that have no panel — clicking through
+// to a blank dashboard is worse than hiding the button.
+const models: Record<string, ModelConfig> = {};
export function getModelMetricsUrl(modelName: string): string | null {
if (models[modelName]?.metrics === false) return null;
return `${METRICS_BASE}${encodeURIComponent(modelName)}`;
}
-export function getModelTier(modelName: string): HostingTier {
- return models[modelName]?.tier ?? "slurm";
+// Tier is now driven by the peer's `launched_by` label instead of a
+// hardcoded model list. Persistent infra launchers ("k8s", "cscs_L1") map
+// to the 24/7 badge; anything else (a username from model-launch, or an
+// older OpenTela binary that doesn't emit the label) is a Slurm job.
+const PERSISTENT_LAUNCHERS = new Set(["k8s", "cscs_L1"]);
+
+export function getTierFromLaunchedBy(launched_by: string | undefined): HostingTier {
+ return launched_by && PERSISTENT_LAUNCHERS.has(launched_by) ? "L2" : "slurm";
}