Commit 23f7026

kumquat committed

Add autocompare tracing, UI model add, and CI tests

Implements OpenAI/Anthropic autocompare instrumentation, trace-first logging and dashboard views, monthly model presets, and secondary metrics callbacks. Adds UI + model replay flow and a minimal CI workflow with pytest + Playwright coverage.

Made-with: Cursor

1 parent 4d60c0b commit 23f7026

25 files changed

Lines changed: 1327 additions & 565 deletions

.github/workflows/ci.yml

Lines changed: 35 additions & 0 deletions

```yaml
name: ci

on:
  pull_request:
  push:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e ".[dev]"

      - name: Lint
        run: |
          ruff check .
          ruff format --check .

      - name: Install Playwright browsers
        run: python -m playwright install --with-deps chromium

      - name: Unit and UI tests
        run: pytest -q
```
README.md

Lines changed: 81 additions & 5 deletions

````diff
@@ -20,7 +20,7 @@ pip install smollest[all] # both
 
 ## Usage
 
-Install `openai` from `smollest` and then write your code as normal!
+Install `openai` from `smollest` and write your code as normal:
 
 ```python
 from smollest import openai
@@ -66,6 +66,33 @@ result = client.messages.create(
 )
 ```
 
+Or instrument existing SDK usage with `autocompare()`:
+
+```python
+import openai
+from smollest.openai import autocompare
+
+autocompare(project="my-project")
+client = openai.OpenAI()
+client.chat.completions.create(
+    model="gpt-4.1-mini",
+    messages=[{"role": "user", "content": "Return JSON sentiment"}],
+)
+```
+
+```python
+import anthropic
+from smollest.anthropic import autocompare
+
+autocompare(project="my-project")
+client = anthropic.Anthropic()
+client.messages.create(
+    model="claude-sonnet-4-20250514",
+    max_tokens=120,
+    messages=[{"role": "user", "content": "Return JSON sentiment"}],
+)
+```
+
 ## How it works
 
 1. Your API call goes to the baseline model as normal
@@ -75,6 +102,31 @@ result = client.messages.create(
 
 Remote candidates run in parallel; local candidates run sequentially.
 
+## Model presets
+
+Default comparison candidates come from a date-indexed preset list and resolve to the latest month automatically. You can inspect or pin a month:
+
+```python
+from smollest import get_default_candidates
+
+latest = get_default_candidates()
+march = get_default_candidates("2026-03")
+```
+
+## Secondary metrics
+
+You can register callbacks to compute arbitrary metrics for baseline and candidate runs:
+
+```python
+from smollest import register_secondary_metric
+
+def co2_metric(payload: dict) -> dict[str, float]:
+    tokens = payload.get("input_tokens", 0) + payload.get("output_tokens", 0)
+    return {"co2_g": tokens * 0.00009}
+
+register_secondary_metric(co2_metric)
+```
+
 ## Dashboard
 
 ```bash
@@ -83,12 +135,36 @@ smollest show
 
 Opens a web dashboard with projects in the sidebar, a results table with truncation for long outputs, latency and cost per model, and aggregate match rates. The image above shows the UI, which you can reproduce by cloning this repo and running: `python examples/demo_dashboard.py`
 
+The dashboard now includes:
 
-## Roadmap
+- Trace view with input/output inspection
+- Model size badges
+- Secondary metrics display
+- A `+` column action to add another model and replay saved traces against it
 
-- Allow adding additional models directly through the UI
-- Add LLM as judge to score outputs that are not structured
-- Let developers eaisly fine tune models on outputs
+## Examples
+
+Two runnable example groups are provided:
+
+- `examples/mock/` for quick local seeding to inspect UI states
+  - `seed_basic.py`
+  - `seed_traces.py`
+  - `seed_secondary_metrics.py`
+- `examples/real/` for real SDK usage patterns (requires API keys)
+  - `openai_wrapper_basic.py`
+  - `openai_autocompare_chat.py`
+  - `openai_autocompare_responses.py`
+  - `anthropic_wrapper_basic.py`
+  - `anthropic_autocompare_messages.py`
+  - `openai_secondary_metrics.py`
+
+Run one:
+
+```bash
+python examples/mock/seed_traces.py
+smollest show
+```
 
 ## License
 
````

examples/demo_dashboard.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -8,7 +8,6 @@
 import json
 import random
 from datetime import datetime, timedelta, timezone
-from pathlib import Path
 
 from smollest.results import DATA_DIR
 from smollest.web import show
```

examples/mock/seed_basic.py

Lines changed: 58 additions & 0 deletions

```python
from __future__ import annotations

import uuid

from smollest.compare import ComparisonResult
from smollest.results import log_result


def main() -> None:
    project = "mock-basic"
    baseline_messages = [
        {"role": "system", "content": "Return compact JSON."},
        {"role": "user", "content": "Classify: I loved the service."},
    ]
    for i in range(3):
        trace_id = str(uuid.uuid4())
        baseline_content = '{"label":"positive","confidence":0.9}'
        for candidate_name, score in [
            ("Qwen/Qwen3.5-3B-Instruct", 1.0),
            ("meta-llama/Llama-3.1-8B-Instruct", 0.5),
        ]:
            comparison = ComparisonResult(
                candidate=candidate_name,
                score=score,
                total_fields=2,
                matching_fields=["label"] if score < 1.0 else ["label", "confidence"],
                mismatched_fields=[]
                if score == 1.0
                else [{"field": "confidence", "baseline": 0.9, "candidate": 0.7}],
            )
            log_result(
                project=project,
                provider="openai",
                baseline_model="gpt-4.1-mini",
                baseline_model_size="small",
                baseline_messages=baseline_messages,
                baseline_content=baseline_content,
                baseline_latency_ms=180 + i * 11,
                baseline_input_tokens=34,
                baseline_output_tokens=12,
                baseline_cost=0.00004,
                baseline_secondary_metrics={},
                comparison=comparison,
                candidate_content='{"label":"positive","confidence":0.7}',
                candidate_model_size="small",
                candidate_latency_ms=120 + i * 8,
                candidate_input_tokens=34,
                candidate_output_tokens=12,
                candidate_cost=0.0,
                candidate_secondary_metrics={},
                trace_id=trace_id,
                parent_span_id=str(uuid.uuid4()),
                input_payload={"model": "gpt-4.1-mini"},
            )


if __name__ == "__main__":
    main()
```
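The seeded scores above (1.0 for two matching fields, 0.5 for one of two) follow a field-match pattern that can be sketched as follows. `field_match_score` is a hypothetical illustration of that scoring, not the `smollest.compare` implementation:

```python
import json


def field_match_score(baseline: str, candidate: str) -> float:
    # Fraction of baseline JSON fields the candidate reproduces exactly.
    base = json.loads(baseline)
    cand = json.loads(candidate)
    if not base:
        return 1.0
    matches = sum(1 for key, value in base.items() if cand.get(key) == value)
    return matches / len(base)
```

Against the seed data, a candidate matching `label` but not `confidence` scores 0.5, as logged above.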
examples/mock/seed_secondary_metrics.py

Lines changed: 46 additions & 0 deletions

```python
from __future__ import annotations

import uuid

from smollest.compare import ComparisonResult
from smollest.results import log_result


def main() -> None:
    project = "mock-secondary-metrics"
    messages = [{"role": "user", "content": "Extract topic from this paragraph."}]
    trace_id = str(uuid.uuid4())
    baseline = "topic=mlops"
    models = [
        ("mistralai/Mistral-Small-24B-Instruct-2501", "large", 240.0, 0.00011, 0.024),
        ("Qwen/Qwen3.5-3B-Instruct", "small", 90.0, 0.0, 0.007),
    ]
    for model, size, latency_ms, cost, co2 in models:
        log_result(
            project=project,
            provider="anthropic",
            baseline_model="claude-sonnet-4-20250514",
            baseline_model_size="large",
            baseline_messages=messages,
            baseline_content=baseline,
            baseline_latency_ms=330.0,
            baseline_input_tokens=98,
            baseline_output_tokens=15,
            baseline_cost=0.0005,
            baseline_secondary_metrics={"co2_g": 0.041},
            comparison=ComparisonResult(candidate=model, score=1.0, total_fields=1),
            candidate_content=baseline,
            candidate_model_size=size,
            candidate_latency_ms=latency_ms,
            candidate_input_tokens=98,
            candidate_output_tokens=15,
            candidate_cost=cost,
            candidate_secondary_metrics={"co2_g": co2},
            trace_id=trace_id,
            parent_span_id=str(uuid.uuid4()),
            input_payload={"model": "claude-sonnet-4-20250514"},
        )


if __name__ == "__main__":
    main()
```
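The metric registry this seed data exercises could, in outline, look like the sketch below. The internal names (`_metrics`, `compute_secondary_metrics`) are assumptions for illustration, not smollest's actual internals; only `register_secondary_metric` appears in the README:

```python
from typing import Callable

SecondaryMetric = Callable[[dict], dict[str, float]]
_metrics: list[SecondaryMetric] = []


def register_secondary_metric(metric: SecondaryMetric) -> None:
    _metrics.append(metric)


def compute_secondary_metrics(payload: dict) -> dict[str, float]:
    # Merge the outputs of every registered callback into one dict,
    # which is what lands in the *_secondary_metrics columns above.
    combined: dict[str, float] = {}
    for metric in _metrics:
        combined.update(metric(payload))
    return combined


# The README's co2 example, registered against this sketch.
register_secondary_metric(
    lambda p: {"co2_g": (p.get("input_tokens", 0) + p.get("output_tokens", 0)) * 0.00009}
)
```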

examples/mock/seed_traces.py

Lines changed: 59 additions & 0 deletions

```python
from __future__ import annotations

import json
import uuid

from smollest.compare import ComparisonResult
from smollest.results import log_result


def seed_trace(trace_input: str, baseline_output: str, project: str) -> None:
    trace_id = str(uuid.uuid4())
    base_messages = [{"role": "user", "content": trace_input}]
    candidates = [
        ("Qwen/Qwen3.5-3B-Instruct", baseline_output, 1.0),
        # Perturb the answer digit so the mismatch is real for every trace;
        # a bare .replace("2", "3") would leave '{"answer":"4"}' unchanged
        # while still logging it with score 0.0.
        (
            "meta-llama/Llama-3.1-8B-Instruct",
            baseline_output.replace("2", "3").replace("4", "5"),
            0.0,
        ),
    ]
    for model, output, score in candidates:
        comparison = ComparisonResult(
            candidate=model,
            score=score,
            total_fields=1,
            matching_fields=["answer"] if score == 1.0 else [],
            mismatched_fields=[]
            if score == 1.0
            else [
                {
                    "field": "answer",
                    # Derive the mismatch from the actual payloads rather than
                    # hardcoding "2" vs "3", which was wrong for the 2+2 trace.
                    "baseline": json.loads(baseline_output)["answer"],
                    "candidate": json.loads(output)["answer"],
                }
            ],
        )
        log_result(
            project=project,
            provider="openai",
            baseline_model="gpt-4.1",
            baseline_model_size="large",
            baseline_messages=base_messages,
            baseline_content=baseline_output,
            baseline_latency_ms=250.0,
            baseline_input_tokens=20,
            baseline_output_tokens=5,
            baseline_cost=0.00009,
            baseline_secondary_metrics={},
            comparison=comparison,
            candidate_content=output,
            candidate_model_size="small",
            candidate_latency_ms=110.0,
            candidate_input_tokens=20,
            candidate_output_tokens=5,
            candidate_cost=0.0,
            candidate_secondary_metrics={},
            trace_id=trace_id,
            parent_span_id=str(uuid.uuid4()),
            input_payload={"model": "gpt-4.1"},
        )


def main() -> None:
    project = "mock-traces"
    seed_trace('{"task":"math","question":"1+1?"}', '{"answer":"2"}', project)
    seed_trace('{"task":"math","question":"2+2?"}', '{"answer":"4"}', project)


if __name__ == "__main__":
    main()
```
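The trace view that consumes this seed data needs to bucket logged rows by their shared `trace_id`. A minimal sketch of that grouping step (a hypothetical helper, not smollest's implementation):

```python
from collections import defaultdict


def group_by_trace(rows: list[dict]) -> dict[str, list[dict]]:
    # Bucket result rows by trace_id so all candidate runs for one
    # input appear together in a single trace view.
    grouped: dict[str, list[dict]] = defaultdict(list)
    for row in rows:
        grouped[row["trace_id"]].append(row)
    return dict(grouped)
```

Each `seed_trace` call above logs two rows under one `trace_id`, so grouping yields one bucket of two candidates per trace.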
examples/real/anthropic_autocompare_messages.py

Lines changed: 32 additions & 0 deletions

```python
from __future__ import annotations

import anthropic

from smollest.anthropic import autocompare


def main() -> None:
    autocompare(
        project="real-anthropic-autocompare",
        candidates=[
            "Qwen/Qwen3.5-3B-Instruct",
            "mistralai/Mistral-Small-24B-Instruct-2501",
        ],
    )
    client = anthropic.Anthropic()
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=120,
        system="Return compact JSON only.",
        messages=[
            {
                "role": "user",
                "content": 'Extract entities from: "Apple acquired a startup in Paris."',
            }
        ],
    )
    print(response.content[0].text)


if __name__ == "__main__":
    main()
```
examples/real/anthropic_wrapper_basic.py

Lines changed: 22 additions & 0 deletions

```python
from __future__ import annotations

from smollest import anthropic


def main() -> None:
    client = anthropic.Anthropic(project="real-anthropic-wrapper-basic")
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=120,
        messages=[
            {
                "role": "user",
                "content": 'Return JSON with fields "intent" and "priority" for: "Server is down in eu-west-1"',
            }
        ],
    )
    print(response.content[0].text)


if __name__ == "__main__":
    main()
```
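In outline, a project-scoped wrapper like the `smollest.anthropic.Anthropic` used above could delegate to the real SDK while recording each request. Everything below (`RecordingMessages`, `StubMessages`) is invented for illustration — not smollest's actual implementation — and runs without API keys:

```python
class RecordingMessages:
    # Hypothetical sketch: forward create() to an inner messages client
    # while recording each request under a project name.
    def __init__(self, inner, project: str):
        self._inner = inner
        self.project = project
        self.calls: list[dict] = []

    def create(self, **kwargs):
        self.calls.append({"project": self.project, "request": kwargs})
        return self._inner.create(**kwargs)


class StubMessages:
    # Stand-in for the real SDK so the sketch runs offline.
    def create(self, **kwargs):
        return {"model": kwargs.get("model"), "content": "ok"}


messages = RecordingMessages(StubMessages(), project="real-anthropic-wrapper-basic")
response = messages.create(model="claude-sonnet-4-20250514", max_tokens=120)
```

The recorded `calls` list is what a comparison layer could later replay against candidate models.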
