Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 61 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
.PHONY: install install-dev format check test run
.PHONY: install install-dev format check test run dummy-run db-up db-down migrate _ensure-env _ensure-frontend-env

UV_EXTRA ?=

PG_CONTAINER := serving-api-pg
PG_PORT := 5433
PG_USER := serving
PG_PASS := serving
PG_DB := serving
DATABASE_URL := postgresql://$(PG_USER):$(PG_PASS)@localhost:$(PG_PORT)/$(PG_DB)

install:
uv pip install $(UV_EXTRA) -r backend/requirements.txt

Expand All @@ -19,7 +26,59 @@ check:
test:
pytest backend/tests/ -v

run:
_ensure-env:
@if [ ! -f .env ]; then \
cp .env.example .env; \
echo "copied .env.example -> .env"; \
fi

_ensure-frontend-env:
@if [ ! -f frontend/.env ]; then \
cp frontend/.env.example frontend/.env; \
echo "copied frontend/.env.example -> frontend/.env (fill in AUTH0_* to enable login)"; \
fi

db-up:
@if [ -z "$$(docker ps -q -f name=^/$(PG_CONTAINER)$$)" ]; then \
if [ -n "$$(docker ps -aq -f name=^/$(PG_CONTAINER)$$)" ]; then \
echo "starting existing $(PG_CONTAINER) container"; \
docker start $(PG_CONTAINER) > /dev/null; \
else \
echo "creating $(PG_CONTAINER) container on :$(PG_PORT)"; \
docker run -d --name $(PG_CONTAINER) \
-e POSTGRES_USER=$(PG_USER) \
-e POSTGRES_PASSWORD=$(PG_PASS) \
-e POSTGRES_DB=$(PG_DB) \
-p $(PG_PORT):5432 \
postgres:16 > /dev/null; \
fi; \
fi
@printf "waiting for postgres"; \
for i in $$(seq 1 30); do \
if docker exec $(PG_CONTAINER) pg_isready -U $(PG_USER) -d $(PG_DB) > /dev/null 2>&1; then \
echo " ready"; exit 0; \
fi; \
printf "."; sleep 1; \
done; \
echo " timed out"; exit 1

db-down:
-docker stop $(PG_CONTAINER) > /dev/null 2>&1
-docker rm $(PG_CONTAINER) > /dev/null 2>&1

migrate: _ensure-env db-up
alembic upgrade head

run: _ensure-env _ensure-frontend-env db-up migrate
uvicorn backend.main:app --reload --host 0.0.0.0 --port 8080 & \
cd frontend && npm run dev & \
wait

# Same as `run` but forces the model list to come from the synthesised
# upgraded fixture instead of the live OpenTela endpoint. Useful for
# iterating on the model-card UI without depending on prod state.
dummy-run: _ensure-env _ensure-frontend-env db-up migrate
OTELA_FIXTURE_PATH=$(PWD)/backend/tests/fixtures/dnt_table_upgraded.json \
uvicorn backend.main:app --reload --host 0.0.0.0 --port 8080 & \
cd frontend && npm run dev & \
wait
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Frontend and backend API proxy for SwissAI LLM serving. For examples on how to l
┌─────────────────┐
OCF │ OpenTela P2P routing → model=apertus-...
OpenTela │ P2P routing → model=apertus-...
└────────┬────────┘
Expand All @@ -42,7 +42,7 @@ frontend/ # web UI (Astro + Svelte)
meta/ # example Dockerfiles, example k8s manifests, build scripts
```

OCF (Open Compute Framework) now renamed to OpenTela upstream is maintained at [eth-easl/OpenTela](https://github.com/eth-easl/OpenTela). We maintain a fork at [swiss-ai/OpenTela](https://github.com/swiss-ai/opentela) to control deployments to dev+prod.
OpenTela (formerly OCF / "Open Compute Framework") is maintained upstream at [eth-easl/OpenTela](https://github.com/eth-easl/OpenTela). We maintain a fork at [swiss-ai/OpenTela](https://github.com/swiss-ai/opentela) to control deployments to dev+prod.

## Dev Quick Start

Expand Down
20 changes: 18 additions & 2 deletions backend/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from pydantic_settings import BaseSettings
from functools import lru_cache

from pydantic import AliasChoices, Field
from pydantic_settings import BaseSettings


@lru_cache()
def get_settings():
Expand All @@ -17,7 +19,20 @@ class Settings(BaseSettings):
database_url: str = ""
auth_secret: str = ""
auth_trust_host: bool = False
ocf_head_addr: str = ""
# Accept the historical OCF_* env var names in addition to the canonical
# OTELA_* ones so existing deployments keep working through the rename.
# Python attribute access stays `settings.otela_*`.
otela_head_addr: str = Field(
default="",
validation_alias=AliasChoices("otela_head_addr", "ocf_head_addr"),
)
# When set, /v1/models* reads this JSON file instead of calling
# $otela_head_addr/v1/dnt/table. Used for UI iteration against synthesised
# upgraded payloads (see backend/tests/fixtures/build_upgraded.py).
otela_fixture_path: str = Field(
default="",
validation_alias=AliasChoices("otela_fixture_path", "ocf_fixture_path"),
)
langfuse_host: str = ""
langfuse_public_key: str = ""
langfuse_secret_key: str = ""
Expand All @@ -28,6 +43,7 @@ class Settings(BaseSettings):

class Config:
env_file = ".env"
populate_by_name = True


def parse_hardware_info(hardware_info):
Expand Down
4 changes: 2 additions & 2 deletions backend/routers/completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ async def chat_completion(
)

response = await llm_proxy(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
request=llm_request,
)
Expand Down Expand Up @@ -125,7 +125,7 @@ async def completion(
)

response = await llm_proxy_completions(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
request=llm_request,
)
Expand Down
2 changes: 1 addition & 1 deletion backend/routers/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ async def embeddings(
data["app_title"] = app_title

response = await llm_proxy_embeddings(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
**data,
)
Expand Down
14 changes: 10 additions & 4 deletions backend/routers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,17 @@
settings = get_settings()


def _dnt_endpoint() -> str:
"""When OTELA_FIXTURE_PATH is set, read DNT from disk instead of HTTP —
used for iterating on the UI against synthesised post-upgrade payloads."""
if settings.otela_fixture_path:
return settings.otela_fixture_path
return settings.otela_head_addr + "/v1/dnt/table"


@router.get("/v1/models_detailed")
async def list_models_detailed():
models = get_all_models(settings.ocf_head_addr + "/v1/dnt/table", with_details=True)
models = get_all_models(_dnt_endpoint(), with_details=True)
return dict(
object="list",
data=models,
Expand All @@ -17,9 +25,7 @@ async def list_models_detailed():

@router.get("/v1/models")
async def list_models():
models = get_all_models(
settings.ocf_head_addr + "/v1/dnt/table", with_details=False
)
models = get_all_models(_dnt_endpoint(), with_details=False)
return dict(
object="list",
data=models,
Expand Down
4 changes: 2 additions & 2 deletions backend/routers/rerank.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async def rerank(
):
data = await request.json()
response = await llm_proxy_rerank(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
payload=data,
model=data.get("model", "unknown"),
Expand All @@ -29,7 +29,7 @@ async def score(
):
data = await request.json()
response = await llm_proxy_score(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
payload=data,
model=data.get("model", "unknown"),
Expand Down
2 changes: 1 addition & 1 deletion backend/routers/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ async def create_response(
stream = data.get("stream", False)

response = await llm_proxy_responses(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
payload=data,
stream=stream,
Expand Down
4 changes: 2 additions & 2 deletions backend/routers/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async def tokenize(
):
data = await request.json()
response = await llm_proxy_tokenize(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
payload=data,
model=data.get("model", "unknown"),
Expand All @@ -29,7 +29,7 @@ async def detokenize(
):
data = await request.json()
response = await llm_proxy_detokenize(
endpoint=settings.ocf_head_addr + "/v1/service/llm/v1/",
endpoint=settings.otela_head_addr + "/v1/service/llm/v1/",
api_key=token,
payload=data,
model=data.get("model", "unknown"),
Expand Down
102 changes: 77 additions & 25 deletions backend/services/model_service.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,96 @@
import json
import pathlib

import requests

from backend.config import parse_hardware_info


def _peer_metadata(node_info: dict) -> dict:
"""Pull the surfaced launch-time fields off a DNT peer entry.

Older OpenTela binaries (<v0.0.6) don't emit hostname/status/labels —
we return whatever's present and let consumers treat missing keys as
'unknown'. labels.worker_group_id is what the frontend groups by to
count replicas of a single model.
"""
labels = node_info.get("labels") or {}
return {
"peer_id": node_info.get("id", ""),
"hostname": node_info.get("hostname", ""),
"otela_version": node_info.get("version", ""),
"status": node_info.get("status", ""),
"labels": labels,
# Convenience pulls — frontends can just read these directly
# without having to dig into labels every time.
"worker_group_id": labels.get("worker_group_id", ""),
"launched_by": labels.get("launched_by", ""),
"slurm_job_id": labels.get("slurm_job_id", ""),
"framework": labels.get("framework", ""),
"started_at": labels.get("started_at", ""),
"expires_at": labels.get("expires_at", ""),
}


def _load_dnt(endpoint: str) -> dict:
"""Fetch DNT data. If endpoint points at a local file (no scheme), read
it as JSON — that's the fixture-mode dev path. Otherwise HTTP-GET it."""
if endpoint and not endpoint.startswith(("http://", "https://")):
return json.loads(pathlib.Path(endpoint).read_text())
return requests.get(endpoint).json()


def get_all_models(endpoint: str, with_details: bool = False):
"""Return one entry per (peer, model) pair served on the network.

The frontend aggregates these by model id and by worker_group_id to
produce the model card + replica count. We keep the granularity at the
peer level so multi-node replicas show their full topology (head +
metrics-only followers all share the same worker_group_id).
"""
try:
data = requests.get(endpoint).json()
data = _load_dnt(endpoint)
except Exception:
return []
models = []
for node_info in data.values():
if not node_info.get("service"):
continue
meta = _peer_metadata(node_info)
device_info = parse_hardware_info(node_info.get("hardware"))
for service in node_info["service"]:
services = node_info.get("service") or []
if not services:
# Metrics-only / pending peer: surface it under a sentinel id so
# the frontend can attribute it to the right replica via
# worker_group_id and show it as part of a launching/follower set.
if not meta["worker_group_id"]:
continue
entry = {
"id": "", # no model yet
"object": "model",
"created": "0x",
"owner": "0x",
**meta,
}
if with_details:
entry["device"] = device_info
models.append(entry)
continue
for service in services:
if not service.get("identity_group"):
continue
model_names = [
identity[len("model=") :]
for identity in service["identity_group"]
if identity.startswith("model=")
]
if with_details:
models.extend(
{
"id": model_name,
"device": device_info,
"object": "model",
"created": "0x",
"owner": "0x",
}
for model_name in model_names
)
else:
models.extend(
{
"id": model_name,
"object": "model",
"created": "0x",
"owner": "0x",
}
for model_name in model_names
)
for model_name in model_names:
entry = {
"id": model_name,
"object": "model",
"created": "0x",
"owner": "0x",
**meta,
}
if with_details:
entry["device"] = device_info
models.append(entry)
return models
Loading
Loading