diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2292e31 --- /dev/null +++ b/.env.example @@ -0,0 +1,39 @@ +# Copy this to .env (the Makefile does this automatically on first run). +# Fill in the REPLACE_WITH_* placeholders with values from +# rob-poc/serving-api/dev/secrets.yaml. Never put prod values here — +# the Makefile's _guard-local-db will refuse non-local DATABASE_URL. + +# ── Local Postgres (matches what `make db-up` spins up) ────────────────────── +DATABASE_URL=postgresql://serving:serving@localhost:5433/serving + +# ── Auth0 (research.computer tenant) ───────────────────────────────────────── +AUTH0_DOMAIN=researchcomputer.eu.auth0.com +AUTH0_API_AUDIENCE=https://researchcomputer.eu.auth0.com/ +AUTH0_ISSUER=https://researchcomputer.eu.auth0.com/ +AUTH0_ALGORITHMS=RS256 +AUTH0_CLIENT_ID=REPLACE_WITH_AUTH0_CLIENT_ID +AUTH0_CLIENT_SECRET=REPLACE_WITH_AUTH0_CLIENT_SECRET +AUTH_SECRET=REPLACE_WITH_RANDOM_STRING +AUTH_TRUST_HOST=true + +VITE_AUTH0_CLIENT_ID=REPLACE_WITH_VITE_AUTH0_CLIENT_ID +VITE_AUTH0_DOMAIN=researchcomputer.eu.auth0.com + +# ── OpenTela / OCF (peer discovery + LLM routing) ──────────────────────────── +# Point at the dev OpenTela head for live model discovery, or use +# OTELA_FIXTURE_PATH (set by `make dummy-run`) to read a static snapshot. +OCF_HEAD_ADDR=http://148.187.108.177:8092 + +# ── Langfuse (observability — optional; leave blank to disable) ────────────── +LANGFUSE_HOST=https://cloud.langfuse.com +LANGFUSE_PUBLIC_KEY= +LANGFUSE_SECRET_KEY= + +# ── CSCS L1 passthrough (optional; leave blank to disable) ─────────────────── +# When both are set, requests for L1-hosted Apertus models forward here +# instead of OpenTela. See backend/services/cscs_l1_service.py. +CSCS_L1_BASE_URL= +CSCS_L1_API_KEY= + +# ── Logfire (observability — optional) ─────────────────────────────────────── +LOGFIRE_TOKEN= diff --git a/.gitignore b/.gitignore index 4686944..c82f305 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.pyc *.env *.env* +!.env.example details.json secrets/*.json .venv/* diff --git a/Makefile b/Makefile index 9151fda..df5f012 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: install install-dev format check test run dummy-run db-up db-down migrate _ensure-env _ensure-frontend-env +.PHONY: install install-dev format check test run dummy-run db-up db-down migrate _ensure-env _ensure-frontend-env _guard-local-db _guard-local-api UV_EXTRA ?= @@ -66,10 +66,38 @@ db-down: -docker stop $(PG_CONTAINER) > /dev/null 2>&1 -docker rm $(PG_CONTAINER) > /dev/null 2>&1 -migrate: _ensure-env db-up +# Refuse to run any DB-touching target if .env points at a non-local host. +# We never want `make run` / `make migrate` to accidentally apply migrations +# or open connections against a remote (prod/staging) database — the local +# Postgres container is the only acceptable target for dev commands. +_guard-local-db: _ensure-env + @url=$$(grep -E '^DATABASE_URL=' .env | head -1 | cut -d= -f2- | tr -d '"' | tr -d "'"); \ + host=$$(echo "$$url" | sed -E 's|^[^:]+://[^@]*@([^:/?]+).*|\1|'); \ + case "$$host" in \ + localhost|127.0.0.1|::1|"") ;; \ + *) echo "REFUSING: .env DATABASE_URL host '$$host' is not local."; \ + echo "Local dev must not run against prod/staging. Set DATABASE_URL=$(DATABASE_URL) in .env."; \ + exit 1;; \ + esac + +# Same guard for the frontend — VITE_API_URL is what `npm run dev` reads, +# so a prod URL there silently makes the local UI hit prod even when the +# local backend is running fine. That's exactly what tripped up dummy-run +# the first time around. Empty / unset is fine (frontend defaults apply). +_guard-local-api: _ensure-frontend-env + @url=$$(grep -E '^VITE_API_URL=' frontend/.env | head -1 | cut -d= -f2- | tr -d '"' | tr -d "'"); \ + host=$$(echo "$$url" | sed -E 's|^[^:]+://([^:/?]+).*|\1|'); \ + case "$$host" in \ + localhost|127.0.0.1|::1|"") ;; \ + *) echo "REFUSING: frontend/.env VITE_API_URL host '$$host' is not local."; \ + echo "Local dev must not call prod/staging API. Set VITE_API_URL=http://localhost:8080 in frontend/.env."; \ + exit 1;; \ + esac + +migrate: _ensure-env _guard-local-db db-up alembic upgrade head -run: _ensure-env _ensure-frontend-env db-up migrate +run: _ensure-env _ensure-frontend-env _guard-local-api db-up migrate uvicorn backend.main:app --reload --host 0.0.0.0 --port 8080 & \ cd frontend && npm run dev & \ wait @@ -77,7 +105,7 @@ run: _ensure-env _ensure-frontend-env db-up migrate # Same as `run` but forces the model list to come from the synthesised # upgraded fixture instead of the live OpenTela endpoint. Useful for # iterating on the model-card UI without depending on prod state. -dummy-run: _ensure-env _ensure-frontend-env db-up migrate +dummy-run: _ensure-env _ensure-frontend-env _guard-local-api db-up migrate OTELA_FIXTURE_PATH=$(PWD)/backend/tests/fixtures/dnt_table_dev_live.json \ uvicorn backend.main:app --reload --host 0.0.0.0 --port 8080 & \ cd frontend && npm run dev & \ diff --git a/backend/services/model_service.py b/backend/services/model_service.py index ec43bb6..3d2a7f3 100644 --- a/backend/services/model_service.py +++ b/backend/services/model_service.py @@ -73,6 +73,7 @@ def get_all_models(endpoint: str, with_details: bool = False): "object": "model", "created": "0x", "owner": "0x", + "has_service": False, **meta, } if with_details: @@ -93,6 +94,7 @@ def get_all_models(endpoint: str, with_details: bool = False): "object": "model", "created": "0x", "owner": "0x", + "has_service": True, **meta, } if with_details: diff --git a/backend/tests/fixtures/dnt_table_dev_live.json b/backend/tests/fixtures/dnt_table_dev_live.json index b6ccb64..8394bc6 100644 --- a/backend/tests/fixtures/dnt_table_dev_live.json +++ b/backend/tests/fixtures/dnt_table_dev_live.json @@ -770,5 +770,352 @@ ], "status": "ready", "version": "v0.1.11" + }, + "/QmNWFQvMQSimNCYzRYpZb7yhwfER53nYo3pJhQJ9HAdM7U": { + "id": "QmNWFQvMQSimNCYzRYpZb7yhwfER53nYo3pJhQJ9HAdM7U", + "latency": 0, + "privileged": false, + "owner": "", + "current_offering": null, + "role": null, + "status": "ready", + "available_offering": null, + "service": [ + { + "name": "llm", + "hardware": { + "gpus": null, + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "status": "connected", + "host": "localhost", + "port": "8080", + "identity_group": [ + "model=meta-llama/Llama-3.3-70B-Instruct" + ] + } + ], + "last_seen": 1779267166, + "version": "sai-v0.0.6", + "hostname": "sglang-llama-70b-bb9cbb4b6-wn9x2", + "labels": { + "framework": "sglang", + "launched_by": "k8s", + "served_model_name": "meta-llama/Llama-3.3-70B-Instruct", + "started_at": "2026-05-19T15:58:39Z", + "worker_group_id": "sglang-llama-70b-bb9cbb4b6-wn9x2" + }, + "public_address": "", + "hardware": { + "gpus": [ + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 14 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 9 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 8 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 7 + } + ], + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "connected": true, + "load": null + }, + "/QmNpziFtdsDmFFPSdgL3XMwz6cf3jDaZxNVvb4e9Be39f3": { + "id": "QmNpziFtdsDmFFPSdgL3XMwz6cf3jDaZxNVvb4e9Be39f3", + "latency": 0, + "privileged": false, + "owner": "", + "current_offering": null, + "role": null, + "status": "ready", + "available_offering": null, + "service": [ + { + "name": "llm", + "hardware": { + "gpus": null, + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "status": "connected", + "host": "localhost", + "port": "8080", + "identity_group": [ + "model=Snowflake/snowflake-arctic-embed-l-v2.0" + ] + } + ], + "last_seen": 1779267169, + "version": "sai-v0.0.6", + "hostname": "vllm-snowflake-7468ff7dc8-4hv68", + "labels": { + "framework": "vllm", + "launched_by": "k8s", + "served_model_name": "Snowflake/snowflake-arctic-embed-l-v2.0", + "started_at": "2026-05-19T15:57:44Z", + "worker_group_id": "vllm-snowflake-7468ff7dc8-4hv68" + }, + "public_address": "", + "hardware": { + "gpus": [ + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 1 + } + ], + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "connected": true, + "load": null + }, + "/QmUsx1HD2WPJ9hjVGXcpA9RTcn5YmN6bDuxFF2mUburLRq": { + "id": "QmUsx1HD2WPJ9hjVGXcpA9RTcn5YmN6bDuxFF2mUburLRq", + "latency": 0, + "privileged": false, + "owner": "", + "current_offering": null, + "role": null, + "status": "ready", + "available_offering": null, + "service": [ + { + "name": "llm", + "hardware": { + "gpus": null, + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "status": "connected", + "host": "localhost", + "port": "8080", + "identity_group": [ + "model=Qwen/Qwen3.5-27B" + ] + } + ], + "last_seen": 1779267152, + "version": "sai-v0.0.6", + "hostname": "vllm-qwen35-27b-7fc7bb7ffc-4bgkf", + "labels": { + "framework": "vllm", + "launched_by": "k8s", + "served_model_name": "Qwen/Qwen3.5-27B", + "started_at": "2026-05-19T15:59:26Z", + "worker_group_id": "vllm-qwen35-27b-7fc7bb7ffc-4bgkf" + }, + "public_address": "", + "hardware": { + "gpus": [ + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 0 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 1 + } + ], + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "connected": true, + "load": null + }, + "/QmWZBsbbkn93QHp4wVDaQ92u9LEBZsgAUn9jbj1iBKdDhj": { + "id": "QmWZBsbbkn93QHp4wVDaQ92u9LEBZsgAUn9jbj1iBKdDhj", + "latency": 0, + "privileged": false, + "owner": "", + "current_offering": null, + "role": null, + "status": "ready", + "available_offering": null, + "service": [ + { + "name": "llm", + "hardware": { + "gpus": null, + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "status": "connected", + "host": "localhost", + "port": "8080", + "identity_group": [ + "model=swiss-ai/Apertus-70B-Instruct-2509" + ] + } + ], + "last_seen": 1779267164, + "version": "sai-v0.0.6", + "hostname": "vllm-apertus-70b-84d688fd7d-mbhbd", + "labels": { + "framework": "vllm", + "launched_by": "k8s", + "served_model_name": "swiss-ai/Apertus-70B-Instruct-2509", + "started_at": "2026-05-20T08:27:04Z", + "worker_group_id": "vllm-apertus-70b-84d688fd7d-mbhbd" + }, + "public_address": "", + "hardware": { + "gpus": [ + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 2 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 1 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 3 + }, + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 1 + } + ], + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "connected": true, + "load": null + }, + "/QmYTeswtJmGEzYnPEG64GFyQ6USnfCdER6czXkr2VAPE5u": { + "id": "QmYTeswtJmGEzYnPEG64GFyQ6USnfCdER6czXkr2VAPE5u", + "latency": 0, + "privileged": false, + "owner": "", + "current_offering": null, + "role": null, + "status": "ready", + "available_offering": null, + "service": [ + { + "name": "llm", + "hardware": { + "gpus": null, + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "status": "connected", + "host": "localhost", + "port": "8080", + "identity_group": [ + "model=swiss-ai/Apertus-8B-Instruct-2509" + ] + } + ], + "last_seen": 1779267169, + "version": "sai-v0.0.6", + "hostname": "sglang-apertus-8b-78b4f9d77-v2pbt", + "labels": { + "framework": "sglang", + "launched_by": "k8s", + "served_model_name": "swiss-ai/Apertus-8B-Instruct-2509", + "started_at": "2026-05-19T16:11:00Z", + "worker_group_id": "sglang-apertus-8b-78b4f9d77-v2pbt" + }, + "public_address": "", + "hardware": { + "gpus": [ + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 3 + } + ], + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "connected": true, + "load": null + }, + "/QmbVVJVM1VwTCHEo8FCFZinWT5zEBoccDJmbZtj7zRK6Cx": { + "id": "QmbVVJVM1VwTCHEo8FCFZinWT5zEBoccDJmbZtj7zRK6Cx", + "latency": 0, + "privileged": false, + "owner": "", + "current_offering": null, + "role": null, + "status": "ready", + "available_offering": null, + "service": [ + { + "name": "llm", + "hardware": { + "gpus": null, + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "status": "connected", + "host": "localhost", + "port": "8080", + "identity_group": [ + "model=zai-org/GLM-4.7-Flash" + ] + } + ], + "last_seen": 1779267154, + "version": "sai-v0.0.6", + "hostname": "sglang-glm-47-flash-759d7dd467-fx9s4", + "labels": { + "framework": "sglang", + "launched_by": "k8s", + "served_model_name": "zai-org/GLM-4.7-Flash", + "started_at": "2026-05-19T15:58:03Z", + "worker_group_id": "sglang-glm-47-flash-759d7dd467-fx9s4" + }, + "public_address": "", + "hardware": { + "gpus": [ + { + "name": "NVIDIA GH200 120GB", + "total_memory": 97871, + "used_memory": 1 + } + ], + "host_memory": 0, + "host_memory_bandwidth": 0, + "host_memory_used": 0 + }, + "connected": true, + "load": null } -} \ No newline at end of file +} diff --git a/backend/tests/test_model_service.py b/backend/tests/test_model_service.py index dceb6bc..f56a3b1 100644 --- a/backend/tests/test_model_service.py +++ b/backend/tests/test_model_service.py @@ -98,6 +98,7 @@ def test_new_binary_head_carries_labels(): assert entry["worker_group_id"] == "12345" assert entry["framework"] == "sglang" assert entry["status"] == "ready" + assert entry["has_service"] is True def test_metrics_only_follower_groups_with_head_via_worker_group_id(): @@ -126,6 +127,12 @@ def test_metrics_only_follower_groups_with_head_via_worker_group_id(): == by_id["QmFollower"]["worker_group_id"] == "12345" ) + # has_service distinguishes the actually-serving peer from TP-worker / + # metrics-only peers — the frontend uses this to pick the head when + # aggregating replicas (otherwise a multi-node TP replica shows pending + # forever because rank-1..N never register an llm service). + assert by_id["QmHead"]["has_service"] is True + assert by_id["QmFollower"]["has_service"] is False def test_pending_peer_without_served_model_name_label_falls_back_to_empty_id(): diff --git a/frontend/.env.example b/frontend/.env.example new file mode 100644 index 0000000..4908f82 --- /dev/null +++ b/frontend/.env.example @@ -0,0 +1,16 @@ +# Copy this to frontend/.env. The Makefile does it automatically. +# +# Only VITE_-prefixed vars are visible to the Astro/Vite frontend; do +# NOT paste backend secrets (DATABASE_URL, AUTH0_CLIENT_SECRET, Neon +# URLs, etc.) here — those go in the project-root .env or +# rob-poc/serving-api/{dev,prod}/secrets.yaml only. + +# API the frontend talks to. Must be localhost during local dev so +# `make run` / `make dummy-run` hit your own backend instead of prod. +# The Makefile's _guard-local-api refuses non-local values. +VITE_API_URL=http://localhost:8080 + +# Auth0 SPA client (research.computer tenant). Safe to commit values +# since they're public client identifiers. +VITE_AUTH0_CLIENT_ID=REPLACE_WITH_VITE_AUTH0_CLIENT_ID +VITE_AUTH0_DOMAIN=researchcomputer.eu.auth0.com diff --git a/frontend/src/components/ui/ModelCard.svelte b/frontend/src/components/ui/ModelCard.svelte index 1688e5b..66ed76e 100644 --- a/frontend/src/components/ui/ModelCard.svelte +++ b/frontend/src/components/ui/ModelCard.svelte @@ -1,6 +1,6 @@ diff --git a/frontend/src/lib/modelMetrics.ts b/frontend/src/lib/modelMetrics.ts index 4764bbd..6139610 100644 --- a/frontend/src/lib/modelMetrics.ts +++ b/frontend/src/lib/modelMetrics.ts @@ -4,25 +4,24 @@ export type HostingTier = "L2" | "slurm"; type ModelConfig = { metrics?: boolean; - tier?: HostingTier; }; -const models: Record = { - "swiss-ai/Apertus-8B-Instruct-2509": { tier: "L2" }, - "zai-org/GLM-4.7-Flash": { tier: "L2" }, - "Snowflake/snowflake-arctic-embed-l-v2.0": { tier: "L2" }, - "cais/HarmBench-Llama-2-13b-cls": { tier: "L2" }, - "meta-llama/Llama-3.3-70B-Instruct": { tier: "L2" }, - "meta-llama/Llama-Guard-4-12B": { tier: "L2" }, - "swiss-ai/Apertus-70B-Instruct-2509": { tier: "L2" }, - "Qwen/Qwen3.5-27B": { tier: "L2" }, -}; +// Per-model overrides for the Grafana metrics dashboard URL. Add an entry +// with `metrics: false` for models that have no panel — clicking through +// to a blank dashboard is worse than hiding the button. +const models: Record = {}; export function getModelMetricsUrl(modelName: string): string | null { if (models[modelName]?.metrics === false) return null; return `${METRICS_BASE}${encodeURIComponent(modelName)}`; } -export function getModelTier(modelName: string): HostingTier { - return models[modelName]?.tier ?? "slurm"; +// Tier is now driven by the peer's `launched_by` label instead of a +// hardcoded model list. Persistent infra launchers ("k8s", "cscs_L1") map +// to the 24/7 badge; anything else (a username from model-launch, or an +// older OpenTela binary that doesn't emit the label) is a Slurm job. +const PERSISTENT_LAUNCHERS = new Set(["k8s", "cscs_L1"]); + +export function getTierFromLaunchedBy(launched_by: string | undefined): HostingTier { + return launched_by && PERSISTENT_LAUNCHERS.has(launched_by) ? "L2" : "slurm"; }