From 8290cfe3c30ea6af5917efeb43d26d089aa21a69 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 28 May 2026 18:44:15 +0800
Subject: [PATCH 01/26] feat: background job system

---
 app/desktop/desktop_server.py                 |   2 +
 app/desktop/git_sync/middleware.py            |   6 +-
 app/desktop/git_sync/save_context.py          |  66 ++
 app/desktop/git_sync/test_save_context.py     | 219 ++++++
 app/desktop/studio_server/jobs/__init__.py    |   0
 app/desktop/studio_server/jobs/api.py         | 308 ++++++++
 app/desktop/studio_server/jobs/error_log.py   |  67 ++
 app/desktop/studio_server/jobs/events.py      | 105 +++
 app/desktop/studio_server/jobs/models.py      | 180 +++++
 app/desktop/studio_server/jobs/registry.py    | 479 ++++++++++++
 app/desktop/studio_server/jobs/test_api.py    | 704 +++++++++++++++++
 .../studio_server/jobs/test_error_log.py      |  67 ++
 app/desktop/studio_server/jobs/test_events.py |  90 +++
 .../studio_server/jobs/test_registry.py       | 723 ++++++++++++++++++
 .../studio_server/jobs/workers/__init__.py    |   0
 .../studio_server/jobs/workers/eval.py        | 136 ++++
 .../studio_server/jobs/workers/noop.py        |  47 ++
 .../studio_server/jobs/workers/test_eval.py   | 535 +++++++++++++
 app/web_ui/src/lib/api_schema.d.ts            | 630 +++++++++++++++
 .../lib/components/SidebarJobsBadge.svelte    |  32 +
 .../lib/components/SidebarJobsBadge.test.ts   |  40 +
 app/web_ui/src/lib/stores/job_status.test.ts  | 128 ++++
 app/web_ui/src/lib/stores/job_status.ts       | 109 +++
 app/web_ui/src/lib/stores/jobs_api.test.ts    | 150 ++++
 app/web_ui/src/lib/stores/jobs_api.ts         | 118 +++
 app/web_ui/src/lib/stores/jobs_store.test.ts  | 305 ++++++++
 app/web_ui/src/lib/stores/jobs_store.ts       | 244 ++++++
 app/web_ui/src/lib/ui/icons/jobs_icon.svelte  |  23 +
 app/web_ui/src/lib/ui/section.ts              |   1 +
 app/web_ui/src/routes/(app)/+layout.svelte    |  14 +
 app/web_ui/src/routes/(app)/jobs/+page.svelte | 362 +++++++++
 .../src/routes/(app)/sidebar_rail.svelte      |   9 +
 libs/server/kiln_server/server.py             |   4 +
 .../background_job_system/architecture.md     | 108 +++
 .../background_job_system/functional_spec.md  | 350 +++++++++
 .../implementation_plan.md                    |  14 +
 .../phase_plans/phase_1.md                    | 186 +++++
 .../phase_plans/phase_2.md                    | 169 ++++
 .../phase_plans/phase_3.md                    | 133 ++++
 .../phase_plans/phase_4.md                    | 200 +++++
 .../background_job_system/project_overview.md |  48 ++
 41 files changed, 7110 insertions(+), 1 deletion(-)
 create mode 100644 app/desktop/git_sync/save_context.py
 create mode 100644 app/desktop/git_sync/test_save_context.py
 create mode 100644 app/desktop/studio_server/jobs/__init__.py
 create mode 100644 app/desktop/studio_server/jobs/api.py
 create mode 100644 app/desktop/studio_server/jobs/error_log.py
 create mode 100644 app/desktop/studio_server/jobs/events.py
 create mode 100644 app/desktop/studio_server/jobs/models.py
 create mode 100644 app/desktop/studio_server/jobs/registry.py
 create mode 100644 app/desktop/studio_server/jobs/test_api.py
 create mode 100644 app/desktop/studio_server/jobs/test_error_log.py
 create mode 100644 app/desktop/studio_server/jobs/test_events.py
 create mode 100644 app/desktop/studio_server/jobs/test_registry.py
 create mode 100644 app/desktop/studio_server/jobs/workers/__init__.py
 create mode 100644 app/desktop/studio_server/jobs/workers/eval.py
 create mode 100644 app/desktop/studio_server/jobs/workers/noop.py
 create mode 100644 app/desktop/studio_server/jobs/workers/test_eval.py
 create mode 100644 app/web_ui/src/lib/components/SidebarJobsBadge.svelte
 create mode 100644 app/web_ui/src/lib/components/SidebarJobsBadge.test.ts
 create mode 100644 app/web_ui/src/lib/stores/job_status.test.ts
 create mode 100644 app/web_ui/src/lib/stores/job_status.ts
 create mode 100644 app/web_ui/src/lib/stores/jobs_api.test.ts
 create mode 100644 app/web_ui/src/lib/stores/jobs_api.ts
 create mode 100644 app/web_ui/src/lib/stores/jobs_store.test.ts
 create mode 100644 app/web_ui/src/lib/stores/jobs_store.ts
 create mode 100644 app/web_ui/src/lib/ui/icons/jobs_icon.svelte
 create mode 100644 app/web_ui/src/routes/(app)/jobs/+page.svelte
 create mode 100644 specs/projects/background_job_system/architecture.md
 create mode 100644 specs/projects/background_job_system/functional_spec.md
 create mode 100644 specs/projects/background_job_system/implementation_plan.md
 create mode 100644 specs/projects/background_job_system/phase_plans/phase_1.md
 create mode 100644 specs/projects/background_job_system/phase_plans/phase_2.md
 create mode 100644 specs/projects/background_job_system/phase_plans/phase_3.md
 create mode 100644 specs/projects/background_job_system/phase_plans/phase_4.md
 create mode 100644 specs/projects/background_job_system/project_overview.md

diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py
index 0163c146c..639bbe27e 100644
--- a/app/desktop/desktop_server.py
+++ b/app/desktop/desktop_server.py
@@ -33,6 +33,7 @@
 from app.desktop.studio_server.eval_api import connect_evals_api
 from app.desktop.studio_server.finetune_api import connect_fine_tune_api
 from app.desktop.studio_server.import_api import connect_import_api
+from app.desktop.studio_server.jobs.api import connect_jobs_api
 from app.desktop.studio_server.prompt_api import connect_prompt_api
 from app.desktop.studio_server.prompt_optimization_job_api import (
     connect_prompt_optimization_job_api,
@@ -142,6 +143,7 @@ def make_app(tk_root: tk.Tk | None = None):
     connect_agent_api(app)
     connect_dev_tools(app)
     connect_chat_api(app)
+    connect_jobs_api(app)
     # Important: webhost must be last, it handles all other URLs
     connect_webhost(app)
     return app
diff --git a/app/desktop/git_sync/middleware.py b/app/desktop/git_sync/middleware.py
index bca2b3ae6..900f89cf2 100644
--- a/app/desktop/git_sync/middleware.py
+++ b/app/desktop/git_sync/middleware.py
@@ -351,7 +351,11 @@ def _resolve_endpoint(self, request: Request) -> Callable[..., Any] | None:
         return None
 
     def _get_manager_for_request(self, request: Request) -> GitSyncManager | None:
-        """Extract project_id from URL, resolve to path, return manager if auto-sync enabled."""
+        """Extract project_id from URL, resolve to path, return manager if auto-sync enabled.
+
+        Keep the project_id -> manager resolution below in sync with the request-free
+        copy in save_context.get_manager_for_project (used by background job workers).
+        """
         match = PROJECT_ID_PATTERN.match(request.url.path)
         if match is None:
             return None
diff --git a/app/desktop/git_sync/save_context.py b/app/desktop/git_sync/save_context.py
new file mode 100644
index 000000000..5ce24bedd
--- /dev/null
+++ b/app/desktop/git_sync/save_context.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from kiln_ai.utils.git_sync_protocols import SaveContext
+
+from app.desktop.git_sync.config import get_git_sync_config, project_path_from_id
+from app.desktop.git_sync.git_sync_manager import GitSyncManager
+from app.desktop.git_sync.registry import GitSyncRegistry
+
+
+def get_manager_for_project(project_id: str) -> GitSyncManager | None:
+    """Return the GitSyncManager for a project, or None when auto-sync is off.
+
+    Request-free counterpart of GitSyncMiddleware._get_manager_for_request
+    (everything except the URL parsing). None is returned when the project id
+    has no path, the path has no git-sync config, the config's sync_mode is
+    not "auto", or the config has no clone_path.
+
+    The config is looked up by project path, while the manager is keyed by
+    clone path. The manager always comes from GitSyncRegistry.get_or_create so
+    that background callers share the per-clone-path manager used by the HTTP
+    path.
+    """
+    project_path = project_path_from_id(project_id)
+    if project_path is None:
+        return None
+
+    sync_config = get_git_sync_config(project_path)
+    # A missing config and any mode other than "auto" both mean "not active".
+    if sync_config is None or sync_config["sync_mode"] != "auto":
+        return None
+
+    clone_dir = sync_config.get("clone_path")
+    if clone_dir is None:
+        return None
+
+    # Remote and credentials are passed through exactly as stored in the config.
+    return GitSyncRegistry.get_or_create(
+        repo_path=Path(clone_dir),
+        remote_name=sync_config["remote_name"],
+        pat_token=sync_config.get("pat_token"),
+        oauth_token=sync_config.get("oauth_token"),
+        auth_mode=sync_config["auth_mode"],
+    )
+
+
+def save_context_for_project(project_id: str, context: str) -> SaveContext | None:
+    """Build a SaveContext for callers that only know a project_id.
+
+    Returns None when get_manager_for_project finds no active git sync for the
+    project. Otherwise returns a zero-argument factory; each call opens
+    manager.atomic_write(context=context). Intended as the project_id-based
+    equivalent of build_save_context(request), e.g. for background job workers.
+    """
+    sync_manager = get_manager_for_project(project_id)
+    if sync_manager is None:
+        return None
+
+    # Ping the background sync registered for this repo, if there is one.
+    # This happens once, when the context is built, not on every write.
+    background_sync = GitSyncRegistry.get_background_sync(sync_manager.repo_path)
+    if background_sync is not None:
+        background_sync.notify_request()
+
+    return lambda: sync_manager.atomic_write(context=context)
diff --git a/app/desktop/git_sync/test_save_context.py b/app/desktop/git_sync/test_save_context.py
new file mode 100644
index 000000000..a26d4590a
--- /dev/null
+++ b/app/desktop/git_sync/test_save_context.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+from contextlib import ExitStack, asynccontextmanager
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from app.desktop.git_sync.config import GitSyncProjectConfig
+from app.desktop.git_sync.save_context import (
+    get_manager_for_project,
+    save_context_for_project,
+)
+
+PROJECT_ID = "project_abc"
+PROJECT_PATH = "/tmp/test/project.kiln"
+CLONE_PATH = "/tmp/test/clone"
+
+
+def _auto_config(clone_path: str | None = CLONE_PATH) -> GitSyncProjectConfig:
+    return GitSyncProjectConfig(
+        sync_mode="auto",  # the only mode get_manager_for_project treats as active
+        auth_mode="system_keys",
+        remote_name="origin",
+        branch="main",
+        clone_path=clone_path,
+        git_url=None,
+        pat_token=None,
+        oauth_token=None,
+    )
+
+
+def _manual_config() -> GitSyncProjectConfig:
+    return GitSyncProjectConfig(
+        sync_mode="manual",  # any non-"auto" mode must resolve to None
+        auth_mode="system_keys",
+        remote_name="origin",
+        branch="main",
+        clone_path=CLONE_PATH,
+        git_url=None,
+        pat_token=None,
+        oauth_token=None,
+    )
+
+
+class _FakeManager:
+    """Minimal AtomicWriteCapable stand-in that records atomic_write calls."""
+
+    def __init__(self, repo_path: Path = Path(CLONE_PATH)):
+        self.repo_path = repo_path
+        self.calls: list[str] = []  # context labels, in order of entry
+        self.entered = False  # set once an atomic_write context is entered
+
+    @asynccontextmanager
+    async def atomic_write(self, context: str):
+        self.calls.append(context)
+        self.entered = True
+        yield
+
+
+def _patch_resolution(project_path, config, manager=None, bg_sync=None):
+    """Patch the config + registry calls used by the helpers.
+
+    Targets are patched in the save_context module namespace, where the helpers
+    look them up. Returns the already-entered ExitStack; use it in a `with`.
+    """
+    stack = ExitStack()
+    stack.enter_context(
+        patch(
+            "app.desktop.git_sync.save_context.project_path_from_id",
+            return_value=project_path,
+        )
+    )
+    stack.enter_context(
+        patch(
+            "app.desktop.git_sync.save_context.get_git_sync_config",
+            return_value=config,
+        )
+    )
+    stack.enter_context(
+        patch(
+            "app.desktop.git_sync.save_context.GitSyncRegistry.get_or_create",
+            return_value=manager,
+        )
+    )
+    stack.enter_context(
+        patch(
+            "app.desktop.git_sync.save_context.GitSyncRegistry.get_background_sync",
+            return_value=bg_sync,
+        )
+    )
+    return stack
+
+
+# -- None branches -----------------------------------------------------------
+
+
+def test_returns_none_when_no_project_path():  # project id resolves to no path
+    with _patch_resolution(project_path=None, config=None):
+        assert save_context_for_project(PROJECT_ID, context="ctx") is None
+        assert get_manager_for_project(PROJECT_ID) is None
+
+
+def test_returns_none_when_no_git_sync_config():  # path known, config absent
+    with _patch_resolution(project_path=PROJECT_PATH, config=None):
+        assert save_context_for_project(PROJECT_ID, context="ctx") is None
+        assert get_manager_for_project(PROJECT_ID) is None
+
+
+def test_returns_none_when_sync_mode_not_auto():  # manual mode is not active
+    with _patch_resolution(project_path=PROJECT_PATH, config=_manual_config()):
+        assert save_context_for_project(PROJECT_ID, context="ctx") is None
+        assert get_manager_for_project(PROJECT_ID) is None
+
+
+def test_returns_none_when_clone_path_missing():  # auto mode but no clone_path
+    with _patch_resolution(
+        project_path=PROJECT_PATH, config=_auto_config(clone_path=None)
+    ):
+        assert save_context_for_project(PROJECT_ID, context="ctx") is None
+        assert get_manager_for_project(PROJECT_ID) is None
+
+
+# -- active branches ---------------------------------------------------------
+
+
+def test_get_manager_uses_registry_with_config_values():
+    fake_manager = _FakeManager()
+    with (
+        patch(
+            "app.desktop.git_sync.save_context.project_path_from_id",
+            return_value=PROJECT_PATH,
+        ),
+        patch(
+            "app.desktop.git_sync.save_context.get_git_sync_config",
+            return_value=_auto_config(),
+        ),
+        patch(
+            "app.desktop.git_sync.save_context.GitSyncRegistry.get_or_create",
+            return_value=fake_manager,
+        ) as get_or_create_mock,
+    ):
+        resolved = get_manager_for_project(PROJECT_ID)
+
+    assert resolved is fake_manager
+    get_or_create_mock.assert_called_once_with(
+        repo_path=Path(CLONE_PATH),
+        remote_name="origin",
+        pat_token=None,
+        oauth_token=None,
+        auth_mode="system_keys",
+    )
+
+
+async def test_save_context_enters_atomic_write_with_label():
+    fake_manager = _FakeManager()
+    with _patch_resolution(
+        project_path=PROJECT_PATH, config=_auto_config(), manager=fake_manager
+    ):
+        factory = save_context_for_project(PROJECT_ID, context="eval job e1/r1")
+
+    assert factory is not None
+    assert fake_manager.entered is False  # built lazily, not yet entered
+
+    async with factory():
+        pass
+
+    assert fake_manager.calls == ["eval job e1/r1"]
+
+
+def test_save_context_notifies_background_sync():
+    fake_manager = _FakeManager()
+    background_sync = MagicMock()
+    with _patch_resolution(
+        project_path=PROJECT_PATH,
+        config=_auto_config(),
+        manager=fake_manager,
+        bg_sync=background_sync,
+    ):
+        factory = save_context_for_project(PROJECT_ID, context="ctx")
+
+    assert factory is not None
+    background_sync.notify_request.assert_called_once()
+
+
+def test_save_context_no_background_sync_is_fine():
+    fake_manager = _FakeManager()
+    with _patch_resolution(
+        project_path=PROJECT_PATH,
+        config=_auto_config(),
+        manager=fake_manager,
+        bg_sync=None,
+    ):
+        factory = save_context_for_project(PROJECT_ID, context="ctx")
+
+    assert factory is not None
+
+
+# -- error propagation -------------------------------------------------------
+
+
+def test_propagates_when_config_lookup_raises():
+    # A raising config lookup must surface (failing the job) instead of turning
+    # into None, which would silently skip commits for an auto-sync project.
+    failure = RuntimeError("corrupt config")
+    with (
+        patch(
+            "app.desktop.git_sync.save_context.project_path_from_id",
+            return_value=PROJECT_PATH,
+        ),
+        patch(
+            "app.desktop.git_sync.save_context.get_git_sync_config",
+            side_effect=failure,
+        ),
+    ):
+        with pytest.raises(RuntimeError, match="corrupt config"):
+            get_manager_for_project(PROJECT_ID)
+        with pytest.raises(RuntimeError, match="corrupt config"):
+            save_context_for_project(PROJECT_ID, context="ctx")
diff --git a/app/desktop/studio_server/jobs/__init__.py b/app/desktop/studio_server/jobs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py
new file mode 100644
index 000000000..ec66c6fcd
--- /dev/null
+++ b/app/desktop/studio_server/jobs/api.py
@@ -0,0 +1,308 @@
+from __future__ import annotations
+
+import asyncio
+import json
+from datetime import datetime
+from typing import Annotated, Any, AsyncGenerator
+
+from fastapi import FastAPI, HTTPException, Path, Query, Response
+from kiln_server.cancellable_streaming_response import CancellableStreamingResponse
+from kiln_server.utils.agent_checks.policy import (
+    ALLOW_AGENT,
+    agent_policy_require_approval,
+)
+from pydantic import BaseModel, Field, ValidationError
+
+from . import error_log
+from .events import JobEvent
+from .models import BackgroundJobStatus, JobRecord
+from .registry import (
+    JobNotFoundError,
+    JobOperationError,
+    job_registry,
+)
+from .workers.eval import EvalJobWorker
+from .workers.noop import NoopJobWorker
+
+KEEPALIVE_SECONDS = 15.0
+
+_JOB_MUTATION_APPROVAL = agent_policy_require_approval(
+    "Allow agent to control background jobs (pause, resume, cancel, delete)?"
+)
+
+
+class CreateJobRequest(BaseModel):
+    """Request body for creating a job. Params are validated per job type."""
+
+    params: dict[str, Any] = Field(  # create_job validates these via params_model
+        default_factory=dict,
+        description="Type-specific job parameters, validated against the type's params model.",
+    )
+    project_id: str | None = Field(  # create_job applies the fallback with "or"
+        default=None,
+        description="Project to scope this job to (for filtering/visibility). "
+        "Falls back to the params' project_id when omitted.",
+    )
+    metadata: dict[str, Any] | None = Field(
+        default=None,
+        description="Free-form pass-through attribution, stored verbatim.",
+    )
+
+
+class CreateJobResponse(BaseModel):  # body of POST /api/jobs/{type} (HTTP 201)
+    """Response returned when a job is created."""
+
+    job_id: str = Field(description="The id of the newly created job.")
+    status: BackgroundJobStatus = Field(
+        description="The job's status immediately after creation."
+    )
+
+
+def _project_id_from_params(validated_params: BaseModel) -> str | None:
+    return getattr(validated_params, "project_id", None)  # None if no such field
+
+
+def _format_sse(event: JobEvent) -> str:  # one SSE frame; the blank line ends it
+    return f"event: {event.event}\ndata: {json.dumps(event.data)}\n\n"
+
+
+async def _event_stream(
+    job_id: str | None, type_name: str | None, project_id: str | None
+):
+    """Pure-observer SSE generator: bus events plus idle keepalive comments.
+
+    One __anext__() stays pending across keepalives; wait_for() would cancel it
+    on timeout, finalizing the subscription. Closing here only unsubscribes.
+    """
+    subscription: AsyncGenerator[JobEvent, None] = job_registry.events.subscribe(
+        job_id=job_id, type_name=type_name, project_id=project_id
+    )
+    pending: asyncio.Future[JobEvent] | None = None
+    try:
+        while True:
+            if pending is None:
+                pending = asyncio.ensure_future(subscription.__anext__())
+            done, _ = await asyncio.wait({pending}, timeout=KEEPALIVE_SECONDS)
+            if not done:
+                yield ": ping\n\n"
+                continue
+            finished, pending = pending, None
+            try:
+                event = finished.result()
+            except StopAsyncIteration:
+                break
+            yield _format_sse(event)
+    finally:
+        if pending is not None:
+            pending.cancel()  # must finish before aclose(), else RuntimeError
+            await asyncio.gather(pending, return_exceptions=True)
+        await subscription.aclose()
+
+
+def connect_jobs_api(app: FastAPI) -> None:
+    # Register the workers this server exposes. register_type overwrites by
+    # type_name, so repeated calls (e.g. multiple make_app() in tests) are safe.
+    job_registry.register_type(NoopJobWorker)
+    job_registry.register_type(EvalJobWorker)
+
+    @app.get(  # registered before /api/jobs/{id} so "events" is not read as an id
+        "/api/jobs/events",
+        summary="Stream Job Events",
+        tags=["Jobs"],
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def stream_job_events(
+        job_id: Annotated[
+            str | None, Query(description="Only stream events for this job id.")
+        ] = None,
+        type: Annotated[
+            str | None, Query(description="Only stream events for this job type.")
+        ] = None,
+        project_id: Annotated[
+            str | None, Query(description="Only stream events for this project id.")
+        ] = None,
+    ) -> CancellableStreamingResponse:
+        """Server-sent events for jobs. Emits an initial `snapshot`, then per-job
+        `job` and `deleted` events. A pure observer: disconnecting never stops a job."""
+        return CancellableStreamingResponse(
+            content=_event_stream(job_id, type, project_id),
+            media_type="text/event-stream",
+        )
+
+    @app.get(
+        "/api/jobs",
+        summary="List Jobs",
+        tags=["Jobs"],
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def list_jobs(
+        status: Annotated[
+            BackgroundJobStatus | None, Query(description="Filter by job status.")
+        ] = None,
+        type: Annotated[str | None, Query(description="Filter by job type.")] = None,
+        project_id: Annotated[
+            str | None, Query(description="Filter by project id.")
+        ] = None,
+        since: Annotated[
+            datetime | None,
+            Query(description="Only jobs created at or after this ISO-8601 time."),
+        ] = None,
+        limit: Annotated[
+            int | None, Query(description="Maximum number of jobs to return.")
+        ] = None,
+    ) -> list[JobRecord]:
+        return job_registry.list_jobs(
+            status=status,
+            type_name=type,
+            project_id=project_id,
+            since=since,
+            limit=limit,
+        )
+
+    @app.post(
+        "/api/jobs/{type}",
+        summary="Create Job",
+        tags=["Jobs"],
+        status_code=201,
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def create_job(
+        type: Annotated[str, Path(description="The registered job type to run.")],
+        request: CreateJobRequest,
+    ) -> CreateJobResponse:
+        try:
+            worker = job_registry.worker_for(type)
+        except JobOperationError:
+            raise HTTPException(status_code=404, detail=f"Unknown job type: {type}")
+
+        try:
+            validated = worker.params_model.model_validate(request.params)
+        except ValidationError as exc:
+            raise HTTPException(status_code=422, detail=exc.errors())
+
+        job = await job_registry.create(
+            type_name=type,
+            params=validated,
+            project_id=request.project_id or _project_id_from_params(validated),
+            metadata=request.metadata,
+        )
+        return CreateJobResponse(job_id=job.id, status=job.status)
+
+    @app.get(
+        "/api/jobs/{id}",
+        summary="Get Job",
+        tags=["Jobs"],
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def get_job(
+        id: Annotated[str, Path(description="The job id.")],
+    ) -> JobRecord:
+        job = await job_registry.get(id)
+        if job is None:
+            raise HTTPException(status_code=404, detail=f"Job not found: {id}")
+        return job
+
+    @app.get(
+        "/api/jobs/{id}/result",
+        summary="Get Job Result",
+        tags=["Jobs"],
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def get_job_result(
+        id: Annotated[str, Path(description="The job id.")],
+    ) -> dict[str, Any]:
+        job = await job_registry.get(id)
+        if job is None:
+            raise HTTPException(status_code=404, detail=f"Job not found: {id}")
+        if not job.status.is_terminal or job.result is None:
+            raise HTTPException(
+                status_code=404, detail="No result available for this job."
+            )
+        return job.result
+
+    @app.get(
+        "/api/jobs/{id}/errors",
+        summary="Get Job Errors",
+        tags=["Jobs"],
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def get_job_errors(
+        id: Annotated[str, Path(description="The job id.")],
+        run_id: Annotated[
+            str | None,
+            Query(description="Read the error log for a specific past run id."),
+        ] = None,
+    ) -> list[dict[str, Any]]:
+        # Always 200, never errors (functional_spec §5). A plain non-reconciling
+        # lookup of the current run_id — we don't recompute state for a
+        # best-effort diagnostic read.
+        resolved_run_id = run_id or job_registry.run_id_for(id)
+        if resolved_run_id is None:
+            return []
+        return error_log.read_errors(resolved_run_id)
+
+    @app.post(
+        "/api/jobs/{id}/pause",
+        summary="Pause Job",
+        tags=["Jobs"],
+        status_code=202,
+        openapi_extra=_JOB_MUTATION_APPROVAL,
+    )
+    async def pause_job(
+        id: Annotated[str, Path(description="The job id.")],
+    ) -> Response:
+        await _run_lifecycle(job_registry.pause, id)
+        return Response(status_code=202)
+
+    @app.post(
+        "/api/jobs/{id}/resume",
+        summary="Resume Job",
+        tags=["Jobs"],
+        status_code=202,
+        openapi_extra=_JOB_MUTATION_APPROVAL,
+    )
+    async def resume_job(
+        id: Annotated[str, Path(description="The job id.")],
+    ) -> Response:
+        await _run_lifecycle(job_registry.resume, id)
+        return Response(status_code=202)
+
+    @app.post(
+        "/api/jobs/{id}/cancel",
+        summary="Cancel Job",
+        tags=["Jobs"],
+        status_code=202,
+        openapi_extra=_JOB_MUTATION_APPROVAL,
+    )
+    async def cancel_job(
+        id: Annotated[str, Path(description="The job id.")],
+    ) -> Response:
+        await _run_lifecycle(job_registry.cancel, id)
+        return Response(status_code=202)
+
+    @app.delete(
+        "/api/jobs/{id}",
+        summary="Delete Job",
+        tags=["Jobs"],
+        status_code=204,
+        openapi_extra=_JOB_MUTATION_APPROVAL,
+    )
+    async def delete_job(
+        id: Annotated[str, Path(description="The job id.")],
+    ) -> Response:
+        await _run_lifecycle(job_registry.delete, id)
+        return Response(status_code=204)
+
+
+async def _run_lifecycle(operation, job_id: str) -> Any:
+    """Await a registry lifecycle operation, translating its errors to HTTP.
+
+    JobNotFoundError becomes 404; JobOperationError becomes 409 with its message.
+    """
+    try:
+        outcome = await operation(job_id)
+    except JobNotFoundError:
+        raise HTTPException(status_code=404, detail=f"Job not found: {job_id}")
+    except JobOperationError as exc:
+        raise HTTPException(status_code=409, detail=str(exc))
+    return outcome
diff --git a/app/desktop/studio_server/jobs/error_log.py b/app/desktop/studio_server/jobs/error_log.py
new file mode 100644
index 000000000..6e8e23715
--- /dev/null
+++ b/app/desktop/studio_server/jobs/error_log.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+from typing import Any
+
+ERROR_LOG_DIR_NAME = "kiln_jobs"
+
+
+def error_log_dir() -> Path:
+    return Path(tempfile.gettempdir()) / ERROR_LOG_DIR_NAME  # <temp dir>/kiln_jobs
+
+
+def error_log_path(run_id: str) -> Path:  # run_id can arrive via a query param
+    return error_log_dir() / f"{Path(run_id).name}.json"  # basename: no traversal
+
+
+def append_error(run_id: str, entry: dict[str, Any]) -> None:
+    """Best-effort: add one error entry to this run's log as a JSON line.
+
+    The log directory is created on demand. Any exception raised while
+    creating it, serializing the entry, or writing is suppressed; the log is
+    a diagnostic aid, not a guarantee.
+    """
+    try:
+        error_log_dir().mkdir(parents=True, exist_ok=True)
+        payload = json.dumps(entry, ensure_ascii=False) + "\n"
+        with error_log_path(run_id).open("a", encoding="utf-8") as log_file:
+            log_file.write(payload)
+    except Exception:
+        pass
+
+
+def read_errors(run_id: str) -> list[dict[str, Any]]:
+    """Best-effort: load a run's error log as a list of dicts. Never raises.
+
+    Returns [] if the file is missing; skips blank, unparsable, or non-object
+    lines; on a mid-read failure returns the entries parsed so far.
+    """
+    collected: list[dict[str, Any]] = []
+    try:
+        log_path = error_log_path(run_id)
+        if not log_path.exists():
+            return []
+        with log_path.open("r", encoding="utf-8") as log_file:
+            for raw_line in log_file:
+                text = raw_line.strip()
+                if not text:
+                    continue
+                try:
+                    record = json.loads(text)
+                except (ValueError, TypeError):
+                    continue
+                if isinstance(record, dict):
+                    collected.append(record)
+    except Exception:
+        pass
+    return collected
+
+
+def delete_errors(run_id: str) -> None:
+    """Best-effort delete of a run's error log file. Swallows every exception."""
+    try:
+        error_log_path(run_id).unlink(missing_ok=True)
+    except Exception:
+        pass
diff --git a/app/desktop/studio_server/jobs/events.py b/app/desktop/studio_server/jobs/events.py
new file mode 100644
index 000000000..b85f0f2d1
--- /dev/null
+++ b/app/desktop/studio_server/jobs/events.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import asyncio
+from typing import Any, AsyncGenerator, Callable, Literal
+
+from pydantic import BaseModel
+
+from .models import JobRecord
+
+
+class JobEvent(BaseModel):
+    """A single bus event. Per-job events carry the full record (idempotent snapshot)."""
+
+    event: Literal["snapshot", "job", "deleted"]  # also used as the SSE event name
+    data: dict[str, Any]  # JSON-ready payload (records are dumped with mode="json")
+
+
+class _Subscriber:
+    def __init__(
+        self,
+        job_id: str | None,
+        type_name: str | None,
+        project_id: str | None,
+    ) -> None:
+        self.job_id = job_id
+        self.type_name = type_name
+        self.project_id = project_id
+        self.queue: asyncio.Queue[JobEvent] = asyncio.Queue()
+
+    def matches(
+        self,
+        record_id: str | None,
+        record_type: str | None,
+        record_project_id: str | None,
+    ) -> bool:
+        """Return True when every filter that is set equals the record's value."""
+        filters = (
+            (self.job_id, record_id),
+            (self.type_name, record_type),
+            (self.project_id, record_project_id),
+        )
+        return all(want is None or want == got for want, got in filters)
+
+
+SnapshotProvider = Callable[[], list[JobRecord]]
+
+
+class JobEventBus:
+    """In-process async publish/subscribe hub behind the SSE endpoint (Phase 2).
+
+    Every subscriber first gets a `snapshot` event and afterwards the `job`
+    and `deleted` events that pass its job_id / type / project_id filters.
+    """
+
+    def __init__(self, snapshot_provider: SnapshotProvider | None = None) -> None:
+        self._snapshot_provider = snapshot_provider
+        self._subscribers: set[_Subscriber] = set()
+
+    def set_snapshot_provider(self, provider: SnapshotProvider) -> None:
+        self._snapshot_provider = provider
+
+    def _filtered_snapshot(self, subscriber: _Subscriber) -> list[JobRecord]:
+        provider = self._snapshot_provider
+        records = [] if provider is None else provider()
+        return [r for r in records if subscriber.matches(r.id, r.type, r.project_id)]
+
+    async def subscribe(
+        self,
+        job_id: str | None = None,
+        type_name: str | None = None,
+        project_id: str | None = None,
+    ) -> AsyncGenerator[JobEvent, None]:
+        """Yield one filtered `snapshot`, then queued events until closed."""
+        listener = _Subscriber(job_id, type_name, project_id)
+        self._subscribers.add(listener)
+        try:
+            jobs = self._filtered_snapshot(listener)
+            payload = {"jobs": [job.model_dump(mode="json") for job in jobs]}
+            yield JobEvent(event="snapshot", data=payload)
+            while True:
+                yield await listener.queue.get()
+        finally:
+            # Reached on close or cancellation: stop queueing for this listener.
+            self._subscribers.discard(listener)
+
+    def _broadcast(self, event: JobEvent, *record_fields: str | None) -> None:
+        """Queue `event` for each subscriber matching (id, type, project_id)."""
+        for listener in self._subscribers:
+            if listener.matches(*record_fields):
+                listener.queue.put_nowait(event)
+
+    def publish_job(self, record: JobRecord) -> None:
+        """Send a `job` event carrying the full record to matching subscribers."""
+        event = JobEvent(event="job", data=record.model_dump(mode="json"))
+        self._broadcast(event, record.id, record.type, record.project_id)
+
+    def publish_deleted(
+        self,
+        job_id: str,
+        type_name: str | None = None,
+        project_id: str | None = None,
+    ) -> None:
+        """Send a `deleted` event ({"id": job_id}) to matching subscribers."""
+        event = JobEvent(event="deleted", data={"id": job_id})
+        self._broadcast(event, job_id, type_name, project_id)
diff --git a/app/desktop/studio_server/jobs/models.py b/app/desktop/studio_server/jobs/models.py
new file mode 100644
index 000000000..7262934a4
--- /dev/null
+++ b/app/desktop/studio_server/jobs/models.py
@@ -0,0 +1,180 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from enum import Enum
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    ClassVar,
+    Generic,
+    TypeVar,
+)
+
+from pydantic import BaseModel, Field
+
+
+def _utc_now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+class BackgroundJobStatus(str, Enum):
+    PENDING = "pending"
+    RUNNING = "running"
+    PAUSED = "paused"
+    SUCCEEDED = "succeeded"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+    @property
+    def is_terminal(self) -> bool:
+        return self in TERMINAL_STATUSES
+
+
+TERMINAL_STATUSES = frozenset(
+    {
+        BackgroundJobStatus.SUCCEEDED,
+        BackgroundJobStatus.FAILED,
+        BackgroundJobStatus.CANCELLED,
+    }
+)
+
+
+class JobProgress(BaseModel):
+    """Count-based progress for a job.
+
+    Processed = success + error; remaining = total - success - error. The error
+    field is a count only — the actual messages live in the per-run error log.
+    """
+
+    total: int | None = None
+    success: int = 0
+    error: int = 0
+    message: str | None = None
+    updated_at: datetime = Field(default_factory=_utc_now)
+
+
+class JobDerivedState(BaseModel):
+    """A worker's view of the operation's true state, read from source-of-truth entities."""
+
+    total: int | None = None
+    success: int = 0
+    error: int = 0
+    is_complete: bool = False
+    message: str | None = None
+
+
+class JobError(BaseModel):
+    """Small failure summary stamped on the record. Detail lives in the error log."""
+
+    error: str | None = None
+    detail: dict[str, Any] | None = None
+
+
+class JobRecord(BaseModel):
+    """Ephemeral, in-memory bookkeeping for a single job. Never persisted to disk."""
+
+    id: str
+    type: str
+    status: BackgroundJobStatus
+    run_id: str | None = None
+    progress: JobProgress = Field(default_factory=JobProgress)
+    params: dict[str, Any] = Field(default_factory=dict)
+    result: dict[str, Any] | None = None
+    error: JobError | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    project_id: str | None = None
+    supports_pause: bool = False
+    created_at: datetime = Field(default_factory=_utc_now)
+    updated_at: datetime = Field(default_factory=_utc_now)
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+
+
+ReportProgress = Callable[["JobProgressUpdate"], Awaitable[None]]
+ReportError = Callable[[str, dict[str, Any]], Awaitable[None]]
+
+
+class JobProgressUpdate(BaseModel):
+    success: int
+    error: int = 0
+    total: int | None = None
+    message: str | None = None
+
+
+class JobContext:
+    """Provided to the worker by JobRegistry during run().
+
+    Holds the current job_id and run_id, plus registry-injected callbacks for
+    reporting progress (in-memory snapshot + event) and per-item errors (error log).
+    """
+
+    def __init__(
+        self,
+        job_id: str,
+        run_id: str,
+        report_progress: ReportProgress,
+        report_error: ReportError,
+    ) -> None:
+        self.job_id = job_id
+        self.run_id = run_id
+        self._report_progress = report_progress
+        self._report_error = report_error
+
+    async def report_progress(
+        self,
+        success: int,
+        error: int = 0,
+        total: int | None = None,
+        message: str | None = None,
+    ) -> None:
+        """Update the registry's in-memory progress snapshot and emit an event.
+
+        A UI-smoothing signal only — the authoritative progress comes from
+        compute_state(). Cheap to call often.
+        """
+        await self._report_progress(
+            JobProgressUpdate(
+                success=success,
+                error=error,
+                total=total,
+                message=message,
+            )
+        )
+
+    async def report_error(self, error_message: str, **extra: Any) -> None:
+        """Append one structured error entry to this run's error log.
+
+        For non-fatal per-item errors that don't stop the run. Best-effort: a
+        failed write is swallowed, never propagated. Does not itself bump the
+        progress error count — report that via report_progress.
+        """
+        await self._report_error(error_message, extra)
+
+
+TParams = TypeVar("TParams", bound=BaseModel)
+TResult = TypeVar("TResult", bound=BaseModel)
+
+
+class JobWorker(Generic[TParams, TResult]):
+    type_name: ClassVar[str]
+    params_model: ClassVar[type[BaseModel]]
+    result_model: ClassVar[type[BaseModel]]
+    supports_pause: ClassVar[bool] = False
+
+    async def compute_state(self, params: TParams) -> JobDerivedState | None:
+        """Read source-of-truth Kiln entities and return the operation's true state.
+
+        MUST be a pure read — no side effects, idempotent, safe to call any time.
+        Return None only when the worker has no backing entity to consult (e.g.
+        the NoopJob fixture); the registry then keeps the last believed snapshot.
+        Real workers must override this.
+        """
+        return None
+
+    async def run(self, params: TParams, ctx: JobContext) -> TResult:
+        """MUST be idempotent. Covers both first run and resume — the registry
+        calls run() again to resume a paused job; the worker re-orients via
+        compute_state(), not a handed-in checkpoint.
+        """
+        raise NotImplementedError
diff --git a/app/desktop/studio_server/jobs/registry.py b/app/desktop/studio_server/jobs/registry.py
new file mode 100644
index 000000000..e8d37d55b
--- /dev/null
+++ b/app/desktop/studio_server/jobs/registry.py
@@ -0,0 +1,479 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import secrets
+import traceback
+import uuid
+from datetime import datetime
+from typing import Any
+
+from pydantic import BaseModel
+
+from . import error_log
+from .events import JobEventBus
+from .models import (
+    BackgroundJobStatus,
+    JobContext,
+    JobDerivedState,
+    JobError,
+    JobProgress,
+    JobProgressUpdate,
+    JobRecord,
+    JobWorker,
+    _utc_now,
+)
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_CONCURRENT = 10
+MAX_CONCURRENT_ENV_VAR = "KILN_JOBS_MAX_CONCURRENT"
+
+_JOB_ID_ALPHABET = "abcdefghijklmnopqrstuvwxyz234567"
+_JOB_ID_LENGTH = 12
+
+
+class JobNotFoundError(Exception):
+    pass
+
+
+class JobOperationError(Exception):
+    """Raised for invalid lifecycle operations (e.g. pause a non-running job).
+
+    Phase 2 maps these to 409 Conflict.
+    """
+
+
+def _new_job_id() -> str:
+    suffix = "".join(secrets.choice(_JOB_ID_ALPHABET) for _ in range(_JOB_ID_LENGTH))
+    return f"j_{suffix}"
+
+
+def _resolve_max_concurrent(explicit: int | None) -> int:
+    """Resolve the cap: explicit arg, else a positive env int, else the default."""
+    if explicit is not None:
+        return explicit
+    raw = os.environ.get(MAX_CONCURRENT_ENV_VAR)
+    if not raw:
+        return DEFAULT_MAX_CONCURRENT
+    try:
+        parsed = int(raw)
+    except ValueError:
+        return DEFAULT_MAX_CONCURRENT
+    return parsed if parsed > 0 else DEFAULT_MAX_CONCURRENT
+
+
+class JobRegistry:
+    """In-memory registry owning job lifecycle, concurrency, and reconciliation.
+
+    Singleton per process. The in-memory index is the only store — no disk
+    persistence of state. Supervising tasks are owned here and decoupled from any
+    HTTP connection.
+    """
+
+    def __init__(self, max_concurrent: int | None = None) -> None:
+        self._max_concurrent = _resolve_max_concurrent(max_concurrent)
+        self._workers: dict[str, JobWorker] = {}
+        self._jobs: dict[str, JobRecord] = {}
+        self._tasks: dict[str, asyncio.Task] = {}
+        self._pending_ids: list[str] = []
+        self._cancel_intent: set[str] = set()
+        self._pause_intent: set[str] = set()
+        # Job ids whose supervising task received a real (delivered-to-a-live-
+        # task) cancellation. Distinguishes "worker swallowed a cancel" from
+        # "worker finished before any cancel landed" when the worker returns
+        # normally — the former must transition to paused/cancelled, the latter
+        # must keep its succeeded result.
+        self._cancel_delivered: set[str] = set()
+        self._running_count = 0
+        self.events = JobEventBus(snapshot_provider=self._snapshot)
+
+    # -- registration --------------------------------------------------------
+
+    def register_type(self, worker_cls: type[JobWorker]) -> None:
+        worker = worker_cls()
+        self._workers[worker_cls.type_name] = worker
+
+    def worker_for(self, type_name: str) -> JobWorker:
+        worker = self._workers.get(type_name)
+        if worker is None:
+            raise JobOperationError(f"Unknown job type: {type_name}")
+        return worker
+
+    # -- snapshots / reads ---------------------------------------------------
+
+    def _snapshot(self) -> list[JobRecord]:
+        return list(self._jobs.values())
+
+    def _require(self, job_id: str) -> JobRecord:
+        job = self._jobs.get(job_id)
+        if job is None:
+            raise JobNotFoundError(job_id)
+        return job
+
+    async def get(self, job_id: str) -> JobRecord | None:
+        job = self._jobs.get(job_id)
+        if job is None:
+            return None
+        await self._reconcile(job, emit_on_change=True)
+        return job
+
+    def run_id_for(self, job_id: str) -> str | None:
+        """Current run_id for a job, or None if unknown. A plain read — no
+        reconciliation (used by the best-effort errors endpoint)."""
+        job = self._jobs.get(job_id)
+        return job.run_id if job is not None else None
+
+    def list_jobs(
+        self,
+        status: BackgroundJobStatus | None = None,
+        type_name: str | None = None,
+        project_id: str | None = None,
+        since: datetime | None = None,
+        limit: int | None = None,
+    ) -> list[JobRecord]:
+        """Return jobs passing every non-None filter, newest first, cut to limit."""
+        records = [
+            r
+            for r in self._jobs.values()
+            if (status is None or r.status == status)
+            and (type_name is None or r.type == type_name)
+            and (project_id is None or r.project_id == project_id)
+            and (since is None or r.created_at >= since)
+        ]
+        records.sort(key=lambda r: r.created_at, reverse=True)
+        if limit is not None:
+            return records[:limit]
+        return records
+
+    # -- create --------------------------------------------------------------
+
+    async def create(
+        self,
+        type_name: str,
+        params: dict[str, Any] | BaseModel,
+        project_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> JobRecord:
+        worker = self.worker_for(type_name)
+        validated = self._validate_params(worker, params)
+        job_id = self._fresh_job_id()
+        job = JobRecord(
+            id=job_id,
+            type=type_name,
+            status=BackgroundJobStatus.PENDING,
+            params=validated.model_dump(mode="json"),
+            metadata=metadata or {},
+            project_id=project_id,
+            supports_pause=worker.supports_pause,
+        )
+        self._jobs[job_id] = job
+        self._pending_ids.append(job_id)
+        self._emit(job)
+        self._dispatch_pending()
+        return job
+
+    def _fresh_job_id(self) -> str:
+        """Generate a job id that is not already present in the job index."""
+        while (candidate := _new_job_id()) in self._jobs:
+            pass
+        return candidate
+
+    def _validate_params(
+        self, worker: JobWorker, params: dict[str, Any] | BaseModel
+    ) -> BaseModel:
+        if isinstance(params, worker.params_model):
+            return params
+        if isinstance(params, BaseModel):
+            params = params.model_dump()
+        return worker.params_model.model_validate(params)
+
+    # -- dispatch / supervision ---------------------------------------------
+
+    def _dispatch_pending(self) -> None:
+        while self._running_count < self._max_concurrent and self._pending_ids:
+            job_id = self._pending_ids.pop(0)
+            job = self._jobs.get(job_id)
+            if job is None or job.status != BackgroundJobStatus.PENDING:
+                continue
+            self._launch(job)
+
+    def _launch(self, job: JobRecord) -> None:
+        worker = self.worker_for(job.type)
+        run_id = str(uuid.uuid4())
+        job.run_id = run_id
+        job.status = BackgroundJobStatus.RUNNING
+        job.started_at = _utc_now()
+        self._touch(job)
+        self._running_count += 1
+        self._emit(job)
+        task = asyncio.create_task(self._supervise(job.id, worker, run_id))
+        self._tasks[job.id] = task
+
+    async def _supervise(self, job_id: str, worker: JobWorker, run_id: str) -> None:
+        """Drive one run of a job and record its outcome on the job record."""
+        job = self._jobs.get(job_id)
+        if job is None:
+            # _launch already claimed a slot for this task; hand it back.
+            self._release_slot(job_id)
+            return
+        ctx = self._build_context(job_id, run_id)
+        try:
+            try:
+                # Validate here so a bad payload ends FAILED and still frees the slot.
+                params = worker.params_model.model_validate(job.params)
+                await self._reconcile(job, emit_on_change=True)
+                if job.status == BackgroundJobStatus.SUCCEEDED:
+                    return
+                result = await worker.run(params, ctx)
+                # A worker that swallows CancelledError (even via uncancel())
+                # and returns normally must still end paused/cancelled
+                # (functional_spec §2), so trust the registry's own delivery
+                # record; with no delivery recorded the result stands.
+                if job_id in self._cancel_delivered:
+                    self._finish_cancelled_or_paused(job)
+                else:
+                    self._finish_succeeded(job, result)
+            except asyncio.CancelledError:
+                self._finish_cancelled_or_paused(job)
+                raise
+            except Exception as exc:
+                self._finish_failed(job, run_id, exc)
+        finally:
+            self._release_slot(job_id)
+
+    def _build_context(self, job_id: str, run_id: str) -> JobContext:
+        async def report_progress(update: JobProgressUpdate) -> None:
+            job = self._jobs.get(job_id)
+            if job is None or job.run_id != run_id:
+                return
+            job.progress = JobProgress(
+                total=update.total if update.total is not None else job.progress.total,
+                success=update.success,
+                error=update.error,
+                message=update.message
+                if update.message is not None
+                else job.progress.message,
+            )
+            self._touch(job)
+            self._emit(job)
+
+        async def report_error(message: str, extra: dict[str, Any]) -> None:
+            error_log.append_error(run_id, {"error_message": message, **extra})
+
+        return JobContext(job_id, run_id, report_progress, report_error)
+
+    def _finish_succeeded(self, job: JobRecord, result: BaseModel) -> None:
+        job.status = BackgroundJobStatus.SUCCEEDED
+        job.result = result.model_dump(mode="json")
+        job.ended_at = _utc_now()
+        self._touch(job)
+        self._emit(job)
+
+    def _finish_failed(self, job: JobRecord, run_id: str, exc: Exception) -> None:
+        """Mark the job failed, log the fatal error for this run, and emit.
+
+        The summary falls back to the exception class name when str(exc) is empty.
+        """
+        job.status = BackgroundJobStatus.FAILED
+        summary = str(exc) or exc.__class__.__name__
+        job.error = JobError(error=summary)
+        job.ended_at = _utc_now()
+        self._touch(job)
+        frames = traceback.format_exception(type(exc), exc, exc.__traceback__)
+        error_log.append_error(
+            run_id,
+            {"error_message": summary, "traceback": "".join(frames), "fatal": True},
+        )
+        self._emit(job)
+
+    def _finish_cancelled_or_paused(self, job: JobRecord) -> None:
+        if job.id in self._pause_intent:
+            job.status = BackgroundJobStatus.PAUSED
+        else:
+            job.status = BackgroundJobStatus.CANCELLED
+            job.ended_at = _utc_now()
+        self._touch(job)
+        self._emit(job)
+
+    # -- lifecycle controls --------------------------------------------------
+
+    async def pause(self, job_id: str) -> JobRecord:
+        job = self._require(job_id)
+        if not job.supports_pause:
+            raise JobOperationError(f"Job type '{job.type}' does not support pause")
+        if job.status != BackgroundJobStatus.RUNNING:
+            raise JobOperationError(
+                f"Cannot pause a job in status '{job.status.value}'"
+            )
+        self._pause_intent.add(job_id)
+        await self._cancel_task(job_id)
+        # If run() completed naturally during the cancel await, the job is
+        # already terminal — leave that state intact rather than forcing paused.
+        if job.status.is_terminal:
+            return job
+        if job.status != BackgroundJobStatus.PAUSED:
+            job.status = BackgroundJobStatus.PAUSED
+            self._touch(job)
+        worker = self.worker_for(job.type)
+        params = worker.params_model.model_validate(job.params)
+        derived = await worker.compute_state(params)
+        self._apply_derived(job, derived)
+        self._emit(job)
+        return job
+
+    async def resume(self, job_id: str) -> JobRecord:
+        """Re-queue a paused job, or mark it succeeded if it is already complete."""
+        job = self._require(job_id)
+        if job.status != BackgroundJobStatus.PAUSED:
+            raise JobOperationError(
+                f"Cannot resume a job in status '{job.status.value}'"
+            )
+        worker = self.worker_for(job.type)
+        params = worker.params_model.model_validate(job.params)
+        state = await worker.compute_state(params)
+        self._apply_derived(job, state)
+        already_done = state is not None and state.is_complete
+        if already_done:
+            job.status = BackgroundJobStatus.SUCCEEDED
+            job.ended_at = _utc_now()
+        else:
+            job.status = BackgroundJobStatus.PENDING
+            self._pending_ids.append(job_id)
+        self._touch(job)
+        self._emit(job)
+        if not already_done:
+            self._dispatch_pending()
+        return job
+
+    async def cancel(self, job_id: str) -> JobRecord:
+        """Cancel a pending, paused, or running job and return its record.
+
+        Raises JobNotFoundError for an unknown id and JobOperationError when
+        the job is already in a terminal status.
+        """
+        job = self._require(job_id)
+        if job.status.is_terminal:
+            raise JobOperationError(
+                f"Cannot cancel a job in status '{job.status.value}'"
+            )
+        if job.status in (BackgroundJobStatus.PENDING, BackgroundJobStatus.PAUSED):
+            # No live task to stop; dequeueing is a no-op for a paused job.
+            self._remove_pending(job_id)
+        else:
+            self._cancel_intent.add(job_id)
+            # Wait for the supervising task to unwind; it normally stamps the
+            # terminal status itself, making the fallback below a no-op.
+            await self._cancel_task(job_id)
+        if not job.status.is_terminal:
+            job.status = BackgroundJobStatus.CANCELLED
+            job.ended_at = _utc_now()
+            self._touch(job)
+            self._emit(job)
+        # Return the held record: re-indexing _jobs would raise KeyError if the
+        # job was deleted while the cancel was being awaited.
+        return job
+
+    async def delete(self, job_id: str) -> None:
+        job = self._require(job_id)
+        if not job.status.is_terminal:
+            raise JobOperationError(
+                f"Cannot delete a job in status '{job.status.value}'"
+            )
+        self._jobs.pop(job_id, None)
+        self._remove_pending(job_id)
+        if job.run_id is not None:
+            error_log.delete_errors(job.run_id)
+        self.events.publish_deleted(job_id, job.type, job.project_id)
+
+    async def _cancel_task(self, job_id: str) -> None:
+        task = self._tasks.get(job_id)
+        if task is None:
+            return
+        # cancel() returns True only if the request landed on a not-yet-done
+        # task — i.e. the cancellation is actually delivered to the worker. If
+        # it returns False the worker already finished naturally; we must not
+        # override that terminal result.
+        if task.cancel():
+            self._cancel_delivered.add(job_id)
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+        except Exception:
+            # The worker raised while we awaited its cancellation. _supervise
+            # already routed this to the failed/terminal state and logged it;
+            # we only debug-log here so it isn't silently discarded.
+            logger.debug(
+                "Worker for job %s raised during cancel await", job_id, exc_info=True
+            )
+        # If the task was cancelled before its coroutine body ever ran, its own
+        # finally never executed, so reclaim the slot here. Idempotent: whoever
+        # pops job_id from _tasks first owns the single decrement.
+        self._release_slot(job_id)
+
+    def _release_slot(self, job_id: str) -> None:
+        if self._tasks.pop(job_id, None) is None:
+            return
+        self._cancel_intent.discard(job_id)
+        self._pause_intent.discard(job_id)
+        self._cancel_delivered.discard(job_id)
+        self._running_count -= 1
+        self._dispatch_pending()
+
+    def _remove_pending(self, job_id: str) -> None:
+        try:
+            self._pending_ids.remove(job_id)
+        except ValueError:
+            pass
+
+    # -- reconciliation ------------------------------------------------------
+
+    async def _reconcile(self, job: JobRecord, emit_on_change: bool) -> bool:
+        worker = self._workers.get(job.type)
+        if worker is None:
+            return False
+        params = worker.params_model.model_validate(job.params)
+        derived = await worker.compute_state(params)
+        if derived is None:
+            return False
+        changed = self._apply_derived(job, derived)
+        if derived.is_complete and not job.status.is_terminal:
+            job.status = BackgroundJobStatus.SUCCEEDED
+            job.ended_at = _utc_now()
+            self._touch(job)
+            changed = True
+        if changed and emit_on_change:
+            self._emit(job)
+        return changed
+
+    def _apply_derived(self, job: JobRecord, derived: JobDerivedState | None) -> bool:
+        if derived is None:
+            return False
+        new_progress = JobProgress(
+            total=derived.total if derived.total is not None else job.progress.total,
+            success=derived.success,
+            error=derived.error,
+            message=derived.message
+            if derived.message is not None
+            else job.progress.message,
+        )
+        before = job.progress.model_dump(exclude={"updated_at"})
+        after = new_progress.model_dump(exclude={"updated_at"})
+        if before == after:
+            return False
+        job.progress = new_progress
+        self._touch(job)
+        return True
+
+    # -- helpers -------------------------------------------------------------
+
+    def _touch(self, job: JobRecord) -> None:
+        job.updated_at = _utc_now()
+
+    def _emit(self, job: JobRecord) -> None:
+        self.events.publish_job(job)
+
+
+job_registry = JobRegistry()
diff --git a/app/desktop/studio_server/jobs/test_api.py b/app/desktop/studio_server/jobs/test_api.py
new file mode 100644
index 000000000..9e5429b91
--- /dev/null
+++ b/app/desktop/studio_server/jobs/test_api.py
@@ -0,0 +1,704 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import uuid
+
+import httpx
+import pytest
+import pytest_asyncio
+from app.desktop.studio_server.jobs import api as jobs_api
+from app.desktop.studio_server.jobs import error_log
+from app.desktop.studio_server.jobs.api import connect_jobs_api
+from app.desktop.studio_server.jobs.models import (
+    BackgroundJobStatus,
+    JobDerivedState,
+    JobWorker,
+)
+from app.desktop.studio_server.jobs.registry import JobOperationError, JobRegistry
+from app.desktop.studio_server.jobs.workers.noop import NoopJobWorker
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+
+async def _safe_cancel(registry: JobRegistry, job_id: str) -> None:
+    """Best-effort cleanup cancel; ignore a job that already reached terminal."""
+    try:
+        await registry.cancel(job_id)
+    except JobOperationError:
+        pass
+
+
+@pytest.fixture(autouse=True)
+def temp_error_log_dir(tmp_path, monkeypatch):
+    monkeypatch.setattr(
+        "app.desktop.studio_server.jobs.error_log.tempfile.gettempdir",
+        lambda: str(tmp_path),
+    )
+
+
+# -- supporting test workers -------------------------------------------------
+
+
+class _ProjectParams(BaseModel):
+    project_id: str
+    steps: int = 50
+    sleep_per_step_seconds: float = 0.05
+
+
+class _EmptyResult(BaseModel):
+    pass
+
+
+class ProjectScopedWorker(JobWorker[_ProjectParams, _EmptyResult]):
+    """A worker whose params carry a project_id, so the record gets one."""
+
+    type_name = "project_scoped"
+    params_model = _ProjectParams
+    result_model = _EmptyResult
+    supports_pause = True
+
+    async def run(self, params, ctx):
+        await asyncio.sleep(5)
+        return _EmptyResult()
+
+
+class _EmptyParams(BaseModel):
+    pass
+
+
+class ReconcileCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    """compute_state flips to complete once `done` is set, so a GET reconciles
+    the running job straight to succeeded."""
+
+    type_name = "reconcile_complete"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = True
+    done = False
+
+    async def compute_state(self, params):
+        complete = type(self).done
+        return JobDerivedState(
+            total=3, success=3 if complete else 1, error=0, is_complete=complete
+        )
+
+    async def run(self, params, ctx):
+        await asyncio.sleep(5)
+        return _EmptyResult()
+
+
+class NonPausableWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    type_name = "nonpausable"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = False
+
+    async def run(self, params, ctx):
+        await asyncio.sleep(5)
+        return _EmptyResult()
+
+
+# -- fixtures ----------------------------------------------------------------
+
+
+@pytest.fixture
+def registry(monkeypatch):
+    """Patch a fresh registry in for isolation, then register the test workers."""
+    reg = JobRegistry(max_concurrent=10)
+    monkeypatch.setattr(jobs_api, "job_registry", reg)
+    reg.register_type(NoopJobWorker)
+    reg.register_type(ProjectScopedWorker)
+    reg.register_type(ReconcileCompleteWorker)
+    reg.register_type(NonPausableWorker)
+    return reg
+
+
+@pytest.fixture
+def fast_keepalive(monkeypatch):
+    # httpx's ASGITransport batches the SSE generator's output and only surfaces
+    # buffered lines once the next chunk (here, the keepalive ping) forces a
+    # flush. Shortening the keepalive makes that flush — and stream teardown —
+    # prompt in tests. Production keeps the 15s default.
+    monkeypatch.setattr(jobs_api, "KEEPALIVE_SECONDS", 0.1)
+
+
+@pytest.fixture
+def app(registry):
+    app = FastAPI()
+    connect_jobs_api(app)
+    return app
+
+
+@pytest_asyncio.fixture
+async def client(app):
+    # Async client over ASGI so handlers AND the registry's background tasks
+    # share the test's event loop — background jobs progress while we await.
+    transport = httpx.ASGITransport(app=app)
+    async with httpx.AsyncClient(
+        transport=transport, base_url="http://test"
+    ) as http_client:
+        yield http_client
+
+
+async def _wait_for_status(
+    registry: JobRegistry,
+    job_id: str,
+    target: BackgroundJobStatus | set[BackgroundJobStatus],
+    timeout: float = 3.0,
+) -> None:
+    targets = {target} if isinstance(target, BackgroundJobStatus) else target
+    deadline = asyncio.get_running_loop().time() + timeout  # loop's monotonic clock
+    while asyncio.get_running_loop().time() < deadline:
+        job = registry._jobs.get(job_id)
+        if job is not None and job.status in targets:
+            return
+        await asyncio.sleep(0.01)
+    job = registry._jobs.get(job_id)
+    actual = job.status if job else "missing"
+    raise AssertionError(f"Job {job_id} did not reach {targets}; was {actual}")
+
+
+async def _create_noop(client, **params) -> str:
+    body = {"steps": 50, "sleep_per_step_seconds": 0.05}
+    body.update(params)
+    resp = await client.post("/api/jobs/noop", json={"params": body})
+    assert resp.status_code == 201, resp.text
+    return resp.json()["job_id"]
+
+
+# -- create ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_create_returns_201_and_status(client):
+    resp = await client.post(
+        "/api/jobs/noop",
+        json={"params": {"steps": 3, "sleep_per_step_seconds": 0.01}},
+    )
+    assert resp.status_code == 201
+    body = resp.json()
+    assert body["job_id"].startswith("j_")
+    assert body["status"] in ("pending", "running")
+
+
+@pytest.mark.asyncio
+async def test_create_unknown_type_404(client):
+    resp = await client.post("/api/jobs/does_not_exist", json={"params": {}})
+    assert resp.status_code == 404
+    assert "Unknown job type" in resp.json()["detail"]
+
+
+@pytest.mark.asyncio
+async def test_create_invalid_params_422(client):
+    resp = await client.post("/api/jobs/noop", json={"params": {"steps": "not-an-int"}})
+    assert resp.status_code == 422
+
+
+@pytest.mark.asyncio
+async def test_create_stores_metadata_and_project_id(client, registry):
+    resp = await client.post(
+        "/api/jobs/project_scoped",
+        json={"params": {"project_id": "p_abc"}, "metadata": {"source": "test"}},
+    )
+    assert resp.status_code == 201
+    job_id = resp.json()["job_id"]
+    record = registry._jobs[job_id]
+    assert record.project_id == "p_abc"
+    assert record.metadata == {"source": "test"}
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_create_noop_has_null_project_id(client, registry):
+    job_id = await _create_noop(client)
+    assert registry._jobs[job_id].project_id is None
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_create_explicit_project_id_scopes_typeless_job(client, registry):
+    # A job whose params carry no project_id (noop) still gets scoped when the
+    # request body sets project_id explicitly — this is what the project-filtered
+    # jobs panel / SSE stream rely on to show such jobs.
+    resp = await client.post(
+        "/api/jobs/noop",
+        json={
+            "params": {"steps": 50, "sleep_per_step_seconds": 0.05},
+            "project_id": "p_explicit",
+        },
+    )
+    assert resp.status_code == 201
+    job_id = resp.json()["job_id"]
+    assert registry._jobs[job_id].project_id == "p_explicit"
+    rows = (await client.get("/api/jobs", params={"project_id": "p_explicit"})).json()
+    assert any(r["id"] == job_id for r in rows)
+    await registry.cancel(job_id)
+
+
+# -- list --------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_list_empty(client):
+    resp = await client.get("/api/jobs")
+    assert resp.status_code == 200
+    assert resp.json() == []
+
+
+@pytest.mark.asyncio
+async def test_list_returns_jobs_sorted_desc(client, registry):
+    first = await _create_noop(client)
+    second = await _create_noop(client)
+    resp = await client.get("/api/jobs")
+    assert resp.status_code == 200
+    ids = [r["id"] for r in resp.json()]
+    assert ids[0] == second
+    assert ids[1] == first
+    await registry.cancel(first)
+    await registry.cancel(second)
+
+
+@pytest.mark.asyncio
+async def test_list_filter_by_type(client, registry):
+    await _create_noop(client)
+    await client.post("/api/jobs/project_scoped", json={"params": {"project_id": "p1"}})
+    resp = await client.get("/api/jobs", params={"type": "project_scoped"})
+    assert resp.status_code == 200
+    rows = resp.json()
+    assert len(rows) == 1
+    assert rows[0]["type"] == "project_scoped"
+
+
+@pytest.mark.asyncio
+async def test_list_filter_by_status(client, registry):
+    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.get("/api/jobs", params={"status": "succeeded"})
+    assert [r["id"] for r in resp.json()] == [job_id]
+    resp = await client.get("/api/jobs", params={"status": "running"})
+    assert resp.json() == []
+
+
+@pytest.mark.asyncio
+async def test_list_filter_by_project_id(client):
+    await client.post(
+        "/api/jobs/project_scoped", json={"params": {"project_id": "p_one"}}
+    )
+    await client.post(
+        "/api/jobs/project_scoped", json={"params": {"project_id": "p_two"}}
+    )
+    resp = await client.get("/api/jobs", params={"project_id": "p_one"})
+    rows = resp.json()
+    assert len(rows) == 1
+    assert rows[0]["project_id"] == "p_one"
+
+
+@pytest.mark.asyncio
+async def test_list_limit(client):
+    for _ in range(3):
+        await _create_noop(client)
+    resp = await client.get("/api/jobs", params={"limit": 2})
+    assert len(resp.json()) == 2
+
+
+@pytest.mark.asyncio
+async def test_list_since_excludes_older(client, registry):
+    old_id = await _create_noop(client)
+    newer_id = await _create_noop(client)
+    cutoff = registry._jobs[newer_id].created_at.isoformat()
+    resp = await client.get("/api/jobs", params={"since": cutoff})
+    ids = [r["id"] for r in resp.json()]
+    assert newer_id in ids
+    assert old_id not in ids
+
+
+# -- get ---------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_get_returns_record(client, registry):
+    job_id = await _create_noop(client)
+    resp = await client.get(f"/api/jobs/{job_id}")
+    assert resp.status_code == 200
+    body = resp.json()
+    assert body["id"] == job_id
+    assert body["type"] == "noop"
+    assert "progress" in body
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_get_unknown_404(client):
+    resp = await client.get("/api/jobs/j_missing")
+    assert resp.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_get_reconciles_to_succeeded(client, registry):
+    ReconcileCompleteWorker.done = False
+    resp = await client.post("/api/jobs/reconcile_complete", json={"params": {}})
+    job_id = resp.json()["job_id"]
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    ReconcileCompleteWorker.done = True
+    got = await client.get(f"/api/jobs/{job_id}")
+    assert got.status_code == 200
+    assert got.json()["status"] == "succeeded"
+    assert got.json()["progress"]["success"] == 3
+
+
+# -- result ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_result_200_when_terminal(client, registry):
+    job_id = await _create_noop(client, steps=3, sleep_per_step_seconds=0.01)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.get(f"/api/jobs/{job_id}/result")
+    assert resp.status_code == 200
+    assert resp.json() == {"completed_steps": 3}
+
+
+@pytest.mark.asyncio
+async def test_result_404_when_not_terminal(client, registry):
+    job_id = await _create_noop(client)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    resp = await client.get(f"/api/jobs/{job_id}/result")
+    assert resp.status_code == 404
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_result_404_unknown(client):
+    resp = await client.get("/api/jobs/j_missing/result")
+    assert resp.status_code == 404
+
+
+# -- errors ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_errors_returns_array(client, registry):
+    resp = await client.post(
+        "/api/jobs/noop",
+        json={
+            "params": {
+                "steps": 4,
+                "sleep_per_step_seconds": 0.01,
+                "error_at_steps": [1, 3],
+            }
+        },
+    )
+    job_id = resp.json()["job_id"]
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.get(f"/api/jobs/{job_id}/errors")
+    assert resp.status_code == 200
+    messages = [e["error_message"] for e in resp.json()]
+    assert "intentional error at step 1" in messages
+    assert "intentional error at step 3" in messages
+
+
+@pytest.mark.asyncio
+async def test_errors_empty_when_none(client, registry):
+    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.get(f"/api/jobs/{job_id}/errors")
+    assert resp.status_code == 200
+    assert resp.json() == []
+
+
+@pytest.mark.asyncio
+async def test_errors_unknown_job_returns_empty_200(client):
+    resp = await client.get("/api/jobs/j_missing/errors")
+    assert resp.status_code == 200
+    assert resp.json() == []
+
+
+@pytest.mark.asyncio
+async def test_errors_specific_run_id(client):
+    run_id = str(uuid.uuid4())
+    error_log.append_error(run_id, {"error_message": "from a past run"})
+    resp = await client.get("/api/jobs/j_missing/errors", params={"run_id": run_id})
+    assert resp.status_code == 200
+    assert resp.json() == [{"error_message": "from a past run"}]
+
+
+# -- pause / resume / cancel -------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_pause_then_resume(client, registry):
+    job_id = await _create_noop(client, steps=50, sleep_per_step_seconds=0.03)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+
+    resp = await client.post(f"/api/jobs/{job_id}/pause")
+    assert resp.status_code == 202
+    assert registry._jobs[job_id].status == BackgroundJobStatus.PAUSED
+
+    resp = await client.post(f"/api/jobs/{job_id}/resume")
+    assert resp.status_code == 202
+    assert registry._jobs[job_id].status in (
+        BackgroundJobStatus.PENDING,
+        BackgroundJobStatus.RUNNING,
+    )
+
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_pause_409_when_not_running(client, registry):
+    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.post(f"/api/jobs/{job_id}/pause")
+    assert resp.status_code == 409
+
+
+@pytest.mark.asyncio
+async def test_pause_409_when_unsupported(client, registry):
+    resp = await client.post("/api/jobs/nonpausable", json={"params": {}})
+    job_id = resp.json()["job_id"]
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    resp = await client.post(f"/api/jobs/{job_id}/pause")
+    assert resp.status_code == 409
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_pause_unknown_404(client):
+    resp = await client.post("/api/jobs/j_missing/pause")
+    assert resp.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_resume_409_when_not_paused(client, registry):
+    job_id = await _create_noop(client)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    resp = await client.post(f"/api/jobs/{job_id}/resume")
+    assert resp.status_code == 409
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_cancel_202(client, registry):
+    job_id = await _create_noop(client)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    resp = await client.post(f"/api/jobs/{job_id}/cancel")
+    assert resp.status_code == 202
+    assert registry._jobs[job_id].status == BackgroundJobStatus.CANCELLED
+
+
+@pytest.mark.asyncio
+async def test_cancel_409_when_terminal(client, registry):
+    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.post(f"/api/jobs/{job_id}/cancel")
+    assert resp.status_code == 409
+
+
+@pytest.mark.asyncio
+async def test_cancel_unknown_404(client):
+    resp = await client.post("/api/jobs/j_missing/cancel")
+    assert resp.status_code == 404
+
+
+# -- delete ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_delete_204_when_terminal(client, registry):
+    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
+    resp = await client.delete(f"/api/jobs/{job_id}")
+    assert resp.status_code == 204
+    assert job_id not in registry._jobs
+    assert (await client.get("/api/jobs")).json() == []
+
+
+@pytest.mark.asyncio
+async def test_delete_409_when_in_flight(client, registry):
+    job_id = await _create_noop(client)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    resp = await client.delete(f"/api/jobs/{job_id}")
+    assert resp.status_code == 409
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_delete_unknown_404(client):
+    resp = await client.delete("/api/jobs/j_missing")
+    assert resp.status_code == 404
+
+
+# -- wiring ------------------------------------------------------------------
+
+
+def test_connect_jobs_api_registers_noop_idempotently(monkeypatch):
+    reg = JobRegistry(max_concurrent=2)
+    monkeypatch.setattr(jobs_api, "job_registry", reg)
+    app = FastAPI()
+    connect_jobs_api(app)
+    connect_jobs_api(app)  # second call must not raise
+    assert "noop" in reg._workers
+
+
+# -- SSE ---------------------------------------------------------------------
+
+
+def test_format_sse_wire_format():
+    from app.desktop.studio_server.jobs.events import JobEvent
+
+    event = JobEvent(event="job", data={"id": "j_abc", "status": "running"})
+    wire = jobs_api._format_sse(event)
+    assert wire == 'event: job\ndata: {"id": "j_abc", "status": "running"}\n\n'
+
+
+@pytest.mark.asyncio
+async def test_event_stream_forwards_snapshot_then_job(registry):
+    # Unit-level test of the generator (independent of any HTTP transport): a
+    # subscriber gets the initial snapshot, and a job created afterward produces
+    # a `job` event. Proves pure-observer forwarding of the Phase 1 bus.
+    stream = jobs_api._event_stream(job_id=None, type_name=None, project_id=None)
+    try:
+        first = await asyncio.wait_for(stream.__anext__(), timeout=3.0)
+        assert first.startswith("event: snapshot\n")
+
+        job = await registry.create(
+            "noop", {"steps": 40, "sleep_per_step_seconds": 0.05}
+        )
+        # Drain until we see a job event for our job.
+        deadline = asyncio.get_running_loop().time() + 3.0
+        saw_job = False
+        while asyncio.get_running_loop().time() < deadline:
+            chunk = await asyncio.wait_for(stream.__anext__(), timeout=3.0)
+            if chunk.startswith("event: job\n") and job.id in chunk:
+                saw_job = True
+                break
+        assert saw_job
+        await _safe_cancel(registry, job.id)
+    finally:
+        await stream.aclose()
+
+
+def _parse_sse_block(block: str) -> tuple[str | None, dict | None]:
+    """Split one SSE block into (event name, decoded data JSON); None if absent."""
+    name, payload = None, None
+    for raw in block.splitlines():
+        if raw.startswith("event:"):
+            name = raw.removeprefix("event:").strip()
+        elif raw.startswith("data:"):
+            payload = json.loads(raw.removeprefix("data:").strip())
+    return name, payload
+
+
+async def _read_until_event(line_iter, target: str, timeout: float = 3.0) -> dict:
+    """Return the data of the next SSE block named `target`, read from a shared
+    line iterator (httpx streams a body only once). `timeout` bounds the whole
+    wait, so other lines on the stream cannot postpone asyncio.TimeoutError."""
+    buffer, now = "", asyncio.get_running_loop().time
+    deadline = now() + timeout
+    while True:
+        line = await asyncio.wait_for(line_iter.__anext__(), max(deadline - now(), 0))
+        if line == "":
+            (event_name, data), buffer = _parse_sse_block(buffer), ""
+            if event_name == target and data is not None:
+                return data
+        else:
+            buffer += line + "\n"
+
+
+@pytest.mark.asyncio
+async def test_sse_empty_snapshot(app, fast_keepalive):
+    # Connecting with no jobs yields an empty snapshot. (httpx's ASGITransport
+    # sends http.disconnect right after the GET body, so we only assert the
+    # initial snapshot here; live-event delivery is covered below with a job
+    # that is already running before we connect.)
+    transport = httpx.ASGITransport(app=app)
+    async with httpx.AsyncClient(
+        transport=transport, base_url="http://test"
+    ) as http_client:
+        async with http_client.stream("GET", "/api/jobs/events") as response:
+            assert response.status_code == 200
+            assert response.headers["content-type"].startswith("text/event-stream")
+            snapshot = await _read_until_event(response.aiter_lines(), "snapshot")
+            assert snapshot == {"jobs": []}
+
+
+@pytest.mark.asyncio
+async def test_sse_snapshot_then_job_event(app, registry, fast_keepalive):
+    # Start a long-running job first, so it appears in the snapshot and keeps
+    # emitting live `job` progress events while we observe the stream.
+    job = await registry.create("noop", {"steps": 40, "sleep_per_step_seconds": 0.05})
+
+    transport = httpx.ASGITransport(app=app)
+    async with httpx.AsyncClient(
+        transport=transport, base_url="http://test"
+    ) as http_client:
+        async with http_client.stream("GET", "/api/jobs/events") as response:
+            assert response.status_code == 200
+            assert response.headers["content-type"].startswith("text/event-stream")
+            lines = response.aiter_lines()
+
+            snapshot = await _read_until_event(lines, "snapshot")
+            assert [j["id"] for j in snapshot["jobs"]] == [job.id]
+
+            data = await _read_until_event(lines, "job")
+            assert data["id"] == job.id
+            assert data["type"] == "noop"
+
+    await _safe_cancel(registry, job.id)
+
+
+@pytest.mark.asyncio
+async def test_sse_filters_by_job_id(app, registry, fast_keepalive):
+    # Both jobs run; only `target`'s events should reach a job_id-filtered stream.
+    other = await registry.create("noop", {"steps": 40, "sleep_per_step_seconds": 0.05})
+    target = await registry.create(
+        "noop", {"steps": 40, "sleep_per_step_seconds": 0.05}
+    )
+
+    transport = httpx.ASGITransport(app=app)
+    async with httpx.AsyncClient(
+        transport=transport, base_url="http://test"
+    ) as http_client:
+        async with http_client.stream(
+            "GET", "/api/jobs/events", params={"job_id": target.id}
+        ) as response:
+            lines = response.aiter_lines()
+            snapshot = await _read_until_event(lines, "snapshot")
+            snapshot_ids = {j["id"] for j in snapshot["jobs"]}
+            assert target.id in snapshot_ids
+            assert other.id not in snapshot_ids
+
+            # The progress event that arrives is for the target, never `other`.
+            data = await _read_until_event(lines, "job")
+            assert data["id"] == target.id
+
+    await _safe_cancel(registry, other.id)
+    await _safe_cancel(registry, target.id)
+
+
+@pytest.mark.asyncio
+async def test_sse_disconnect_leaves_job_running(app, registry, fast_keepalive):
+    """The decoupling guarantee: dropping the SSE stream mid-run must NOT stop
+    the job. Only explicit cancel/pause stops a job."""
+    job = await registry.create("noop", {"steps": 6, "sleep_per_step_seconds": 0.05})
+
+    transport = httpx.ASGITransport(app=app)
+    async with httpx.AsyncClient(
+        transport=transport, base_url="http://test"
+    ) as http_client:
+        async with http_client.stream("GET", "/api/jobs/events") as response:
+            lines = response.aiter_lines()
+            await _read_until_event(lines, "snapshot")
+            # Observe at least one live job event so we know the run is underway.
+            await _read_until_event(lines, "job")
+        # Exiting the `stream` context drops the client connection, which cancels
+        # the SSE subscription generator (CancellableStreamingResponse). The job
+        # task lives in the registry and must keep running.
+
+    assert registry._jobs[job.id].status in (
+        BackgroundJobStatus.RUNNING,
+        BackgroundJobStatus.SUCCEEDED,
+    )
+    await _wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    assert registry._jobs[job.id].result == {"completed_steps": 6}
diff --git a/app/desktop/studio_server/jobs/test_error_log.py b/app/desktop/studio_server/jobs/test_error_log.py
new file mode 100644
index 000000000..d4291c9de
--- /dev/null
+++ b/app/desktop/studio_server/jobs/test_error_log.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import uuid
+
+import pytest
+
+from app.desktop.studio_server.jobs import error_log
+
+
+@pytest.fixture
+def run_id(tmp_path, monkeypatch):
+    monkeypatch.setattr(
+        "app.desktop.studio_server.jobs.error_log.tempfile.gettempdir",
+        lambda: str(tmp_path),
+    )
+    return str(uuid.uuid4())
+
+
+def test_append_and_read_round_trip(run_id):
+    error_log.append_error(run_id, {"error_message": "first", "step": 1})
+    error_log.append_error(run_id, {"error_message": "second", "item_id": "x"})
+
+    entries = error_log.read_errors(run_id)
+    assert entries == [
+        {"error_message": "first", "step": 1},
+        {"error_message": "second", "item_id": "x"},
+    ]
+
+
+def test_read_missing_file_returns_empty(run_id):
+    assert error_log.read_errors(run_id) == []
+
+
+def test_read_skips_unparsable_lines(run_id):
+    error_log.append_error(run_id, {"error_message": "good"})
+    with error_log.error_log_path(run_id).open("a", encoding="utf-8") as f:
+        f.write("not json at all\n")
+        f.write("\n")
+    error_log.append_error(run_id, {"error_message": "also good"})
+
+    entries = error_log.read_errors(run_id)
+    assert entries == [
+        {"error_message": "good"},
+        {"error_message": "also good"},
+    ]
+
+
+def test_delete_removes_file(run_id):
+    error_log.append_error(run_id, {"error_message": "x"})
+    assert error_log.error_log_path(run_id).exists()
+
+    error_log.delete_errors(run_id)
+    assert not error_log.error_log_path(run_id).exists()
+    assert error_log.read_errors(run_id) == []
+
+
+def test_delete_missing_file_is_noop(run_id):
+    error_log.delete_errors(run_id)
+    assert error_log.read_errors(run_id) == []
+
+
+def test_append_never_raises_on_bad_dir(monkeypatch, run_id):
+    def boom(*args, **kwargs):
+        raise OSError("disk full")
+
+    monkeypatch.setattr("app.desktop.studio_server.jobs.error_log.Path.mkdir", boom)
+    error_log.append_error(run_id, {"error_message": "swallowed"})
diff --git a/app/desktop/studio_server/jobs/test_events.py b/app/desktop/studio_server/jobs/test_events.py
new file mode 100644
index 000000000..2a60e3f2f
--- /dev/null
+++ b/app/desktop/studio_server/jobs/test_events.py
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+from app.desktop.studio_server.jobs.events import JobEvent, JobEventBus
+from app.desktop.studio_server.jobs.models import BackgroundJobStatus, JobRecord
+
+
+def _record(
+    job_id: str = "j_aaaaaaaaaaaa",
+    type_name: str = "noop",
+    project_id: str | None = None,
+    status: BackgroundJobStatus = BackgroundJobStatus.RUNNING,
+) -> JobRecord:
+    return JobRecord(
+        id=job_id,
+        type=type_name,
+        status=status,
+        project_id=project_id,
+    )
+
+
+async def _next_event(gen, timeout: float = 1.0) -> JobEvent:
+    return await asyncio.wait_for(gen.__anext__(), timeout=timeout)
+
+
+@pytest.mark.asyncio
+async def test_snapshot_then_job_event():
+    existing = _record("j_existing0001")
+    bus = JobEventBus(snapshot_provider=lambda: [existing])
+
+    gen = bus.subscribe()
+    snapshot = await _next_event(gen)
+    assert snapshot.event == "snapshot"
+    assert [j["id"] for j in snapshot.data["jobs"]] == ["j_existing0001"]
+
+    new = _record("j_new000000001")
+    bus.publish_job(new)
+    job_event = await _next_event(gen)
+    assert job_event.event == "job"
+    assert job_event.data["id"] == "j_new000000001"
+
+    await gen.aclose()
+
+
+@pytest.mark.asyncio
+async def test_deleted_event():
+    bus = JobEventBus(snapshot_provider=lambda: [])
+    gen = bus.subscribe()
+    await _next_event(gen)  # snapshot
+
+    bus.publish_deleted("j_gone00000001")
+    event = await _next_event(gen)
+    assert event.event == "deleted"
+    assert event.data == {"id": "j_gone00000001"}
+
+    await gen.aclose()
+
+
+@pytest.mark.asyncio
+async def test_filter_by_project_id():
+    matching = _record("j_match0000001", project_id="p_keep")
+    other = _record("j_other0000001", project_id="p_drop")
+    bus = JobEventBus(snapshot_provider=lambda: [matching, other])
+
+    gen = bus.subscribe(project_id="p_keep")
+    snapshot = await _next_event(gen)
+    assert [j["id"] for j in snapshot.data["jobs"]] == ["j_match0000001"]
+
+    bus.publish_job(other)
+    bus.publish_job(matching)
+    event = await _next_event(gen)
+    assert event.data["id"] == "j_match0000001"
+
+    await gen.aclose()
+
+
+@pytest.mark.asyncio
+async def test_filter_by_type_and_job_id():
+    bus = JobEventBus(snapshot_provider=lambda: [])
+    gen = bus.subscribe(type_name="eval", job_id="j_target000001")
+    await _next_event(gen)  # snapshot
+
+    bus.publish_job(_record("j_other0000001", type_name="noop"))
+    bus.publish_job(_record("j_target000001", type_name="eval"))
+    event = await _next_event(gen)
+    assert event.data["id"] == "j_target000001"
+
+    await gen.aclose()
diff --git a/app/desktop/studio_server/jobs/test_registry.py b/app/desktop/studio_server/jobs/test_registry.py
new file mode 100644
index 000000000..2dab8909c
--- /dev/null
+++ b/app/desktop/studio_server/jobs/test_registry.py
@@ -0,0 +1,723 @@
+from __future__ import annotations
+
+import asyncio
+import uuid
+
+import pytest
+from pydantic import BaseModel
+
+from app.desktop.studio_server.jobs import error_log
+from app.desktop.studio_server.jobs.models import (
+    JobDerivedState,
+    BackgroundJobStatus,
+    JobWorker,
+)
+from app.desktop.studio_server.jobs.registry import (
+    JobNotFoundError,
+    JobOperationError,
+    JobRegistry,
+    _new_job_id,
+)
+from app.desktop.studio_server.jobs.workers.noop import NoopJobWorker
+
+
+@pytest.fixture(autouse=True)
+def temp_error_log_dir(tmp_path, monkeypatch):
+    monkeypatch.setattr(
+        "app.desktop.studio_server.jobs.error_log.tempfile.gettempdir",
+        lambda: str(tmp_path),
+    )
+
+
+@pytest.fixture
+def registry():
+    reg = JobRegistry(max_concurrent=10)
+    reg.register_type(NoopJobWorker)
+    return reg
+
+
+async def wait_for_status(
+    registry: JobRegistry,
+    job_id: str,
+    target: BackgroundJobStatus | set[BackgroundJobStatus],
+    timeout: float = 3.0,
+) -> None:
+    """Poll until the job is in `target`; raise AssertionError after `timeout`."""
+    targets = {target} if isinstance(target, BackgroundJobStatus) else target
+    deadline = asyncio.get_running_loop().time() + timeout
+    while asyncio.get_running_loop().time() < deadline:
+        if (job := registry._jobs.get(job_id)) is not None and job.status in targets:
+            return
+        await asyncio.sleep(0.01)
+    job = registry._jobs.get(job_id)
+    actual = job.status if job else "missing"
+    raise AssertionError(f"Job {job_id} did not reach {targets}; was {actual}")
+
+
+# -- supporting test workers ------------------------------------------------
+
+
+class _EmptyParams(BaseModel):
+    pass
+
+
+class _EmptyResult(BaseModel):
+    pass
+
+
+class NonPausableWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    type_name = "nonpausable"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = False
+
+    async def run(self, params, ctx):
+        await asyncio.sleep(5)
+        return _EmptyResult()
+
+
+class AlreadyCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    type_name = "already_complete"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = True
+    run_called = False
+
+    async def compute_state(self, params):
+        return JobDerivedState(total=5, success=5, error=0, is_complete=True)
+
+    async def run(self, params, ctx):
+        type(self).run_called = True
+        return _EmptyResult()
+
+
+class PartialProgressWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    """First reports the full set (total + message), then a count-only update.
+    The later partial update must preserve the earlier total/message, not null
+    them.
+    """
+
+    type_name = "partial_progress"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = False
+
+    async def run(self, params, ctx):
+        await ctx.report_progress(success=1, total=50, message="starting")
+        await ctx.report_progress(success=5)
+        return _EmptyResult()
+
+
+class RaceCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    """run() blocks on a test-controlled gate, then returns normally without
+    ever observing a cancellation. The test opens the gate (so run() returns and
+    the supervising task drives the job to its terminal succeeded state) and only
+    then issues pause/cancel — reproducing the completion-vs-cancel race where
+    the job finished naturally during the cancel await.
+    """
+
+    type_name = "race_complete"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = True
+    gate: asyncio.Event
+
+    async def run(self, params, ctx):
+        await type(self).gate.wait()
+        return _EmptyResult()
+
+
+class SwallowCancelWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    """Catches CancelledError, fully clears the cancellation (uncancel) so it is
+    not re-raised, and returns normally — the worst-case "swallows CancelledError
+    and returns silently" worker. The cancellation transition is unconditional,
+    so the registry itself must land the job in paused/cancelled rather than
+    trusting the worker to re-raise.
+
+    `started` is set once run() is actually suspended at its await point, so a
+    test can guarantee the cancellation is delivered into the worker body (not
+    before it runs) before issuing pause/cancel.
+    """
+
+    type_name = "swallow_cancel"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = True
+    started: asyncio.Event
+    gate: asyncio.Event
+
+    async def run(self, params, ctx):
+        type(self).started.set()
+        try:
+            await type(self).gate.wait()
+        except asyncio.CancelledError:
+            task = asyncio.current_task()
+            if task is not None:
+                task.uncancel()
+        return _EmptyResult()
+
+
+class TotalThenNoneWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    """run() reports a known total via report_progress, then compute_state at
+    pause returns total=None alongside success/error counts. The reconcile must
+    preserve the prior total rather than wiping the denominator to None.
+    """
+
+    type_name = "total_then_none"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = True
+    started: asyncio.Event
+    gate: asyncio.Event
+
+    async def compute_state(self, params):
+        return JobDerivedState(total=None, success=2, error=1, is_complete=False)
+
+    async def run(self, params, ctx):
+        await ctx.report_progress(success=0, total=10, message="starting")
+        type(self).started.set()
+        try:
+            await type(self).gate.wait()
+        except asyncio.CancelledError:
+            task = asyncio.current_task()
+            if task is not None:
+                task.uncancel()
+        return _EmptyResult()
+
+
+class ReconcileCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    """compute_state reports complete only once the test flips `done`, so a
+    get() issued while the job is still running (run() is a long sleep)
+    reconciles it straight to succeeded mid-flight.
+    """
+
+    type_name = "reconcile_complete"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    supports_pause = True
+    done = False
+
+    async def compute_state(self, params):
+        complete = type(self).done
+        return JobDerivedState(
+            total=3, success=3 if complete else 1, error=0, is_complete=complete
+        )
+
+    async def run(self, params, ctx):
+        await asyncio.sleep(5)
+        return _EmptyResult()
+
+
+# -- job id ------------------------------------------------------------------
+
+
+def test_job_id_format():
+    job_id = _new_job_id()
+    assert job_id.startswith("j_")
+    suffix = job_id[2:]
+    assert len(suffix) == 12
+    assert all(c in "abcdefghijklmnopqrstuvwxyz234567" for c in suffix)
+
+
+# -- lifecycle ---------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_full_lifecycle_succeeds(registry):
+    job = await registry.create("noop", {"steps": 3, "sleep_per_step_seconds": 0.01})
+    assert job.status in (BackgroundJobStatus.PENDING, BackgroundJobStatus.RUNNING)
+    assert job.supports_pause is True
+
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    final = registry._jobs[job.id]
+    assert final.result == {"completed_steps": 3}
+    assert final.started_at is not None
+    assert final.ended_at is not None
+    assert final.run_id is not None
+    assert final.progress.success == 3
+
+
+@pytest.mark.asyncio
+async def test_failure_path_captures_error_log(registry):
+    job = await registry.create(
+        "noop",
+        {"steps": 5, "sleep_per_step_seconds": 0.01, "fail_at_step": 2},
+    )
+    await wait_for_status(registry, job.id, BackgroundJobStatus.FAILED)
+
+    final = registry._jobs[job.id]
+    assert final.error is not None
+    assert final.error.error is not None
+    assert "intentional fail at step 2" in final.error.error
+
+    entries = error_log.read_errors(final.run_id)
+    fatal = [e for e in entries if e.get("fatal")]
+    assert len(fatal) == 1
+    assert "intentional fail at step 2" in fatal[0]["error_message"]
+
+
+@pytest.mark.asyncio
+async def test_non_fatal_errors_logged_and_counted(registry):
+    job = await registry.create(
+        "noop",
+        {
+            "steps": 4,
+            "sleep_per_step_seconds": 0.01,
+            "error_at_steps": [1, 3],
+        },
+    )
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+
+    final = registry._jobs[job.id]
+    assert final.progress.error == 2  # steps 1 and 3
+    assert final.progress.success == 2  # steps 0 and 2
+
+    entries = error_log.read_errors(final.run_id)
+    messages = [e["error_message"] for e in entries]
+    assert "intentional error at step 1" in messages
+    assert "intentional error at step 3" in messages
+    steps = sorted(e["step"] for e in entries if "step" in e)
+    assert steps == [1, 3]  # the step= keyword passed to report_error is logged
+
+
+@pytest.mark.asyncio
+async def test_error_log_missing_returns_empty():
+    assert error_log.read_errors(str(uuid.uuid4())) == []  # random, unused run id
+
+
+# -- cancel ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_cancel_pending_job_never_starts():
+    reg = JobRegistry(max_concurrent=1)  # one slot: the second job must queue
+    reg.register_type(NoopJobWorker)
+    running = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(reg, running.id, BackgroundJobStatus.RUNNING)
+    pending = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    assert reg._jobs[pending.id].status == BackgroundJobStatus.PENDING
+
+    await reg.cancel(pending.id)
+    assert reg._jobs[pending.id].status == BackgroundJobStatus.CANCELLED
+    assert pending.id not in reg._tasks  # no task is tracked for the queued job
+
+    await reg.cancel(running.id)  # cleanup: stop the job still occupying the slot
+
+
+@pytest.mark.asyncio
+async def test_cancel_from_running(registry):
+    job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    await registry.cancel(job.id)  # cancel while the job is still RUNNING
+    assert registry._jobs[job.id].status == BackgroundJobStatus.CANCELLED
+
+
+@pytest.mark.asyncio
+async def test_cancel_immediately_after_create_reclaims_slot():
+    # Cancelling right after create can race the supervising task before its
+    # coroutine body runs; the registry must still reclaim the concurrency slot.
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(NoopJobWorker)
+    ids = []
+    for _ in range(6):
+        job = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.02})
+        ids.append(job.id)
+    for job_id in ids:
+        await reg.cancel(job_id)
+    await asyncio.sleep(0.05)  # presumably lets cancelled tasks finish unwinding
+
+    assert all(reg._jobs[i].status == BackgroundJobStatus.CANCELLED for i in ids)
+    assert reg._running_count == 0  # no concurrency slot is still counted as used
+    assert reg._tasks == {}
+    assert reg._pending_ids == []
+
+
+@pytest.mark.asyncio
+async def test_cancel_terminal_raises(registry):
+    job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    with pytest.raises(JobOperationError):  # the job has already succeeded
+        await registry.cancel(job.id)
+
+
+# -- pause / resume ----------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_pause_then_resume_succeeds(registry):
+    job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.03})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    first_run_id = registry._jobs[job.id].run_id
+
+    await registry.pause(job.id)
+    assert registry._jobs[job.id].status == BackgroundJobStatus.PAUSED
+
+    # Resume must re-queue or re-launch the job, and the new run gets a fresh run_id.
+    await registry.resume(job.id)
+    assert registry._jobs[job.id].status in (
+        BackgroundJobStatus.PENDING,
+        BackgroundJobStatus.RUNNING,
+    )
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    second_run_id = registry._jobs[job.id].run_id
+    assert second_run_id is not None
+    assert second_run_id != first_run_id
+
+    await registry.cancel(job.id)  # cleanup: do not wait out the remaining steps
+
+
+@pytest.mark.asyncio
+async def test_resume_to_succeeded_when_complete():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(NoopJobWorker)
+    reg.register_type(AlreadyCompleteWorker)
+    AlreadyCompleteWorker.run_called = False
+
+    # No pause/resume is exercised here: the job is simply created, and the
+    # test expects it to reach SUCCEEDED without run() ever being called
+    # (presumably via the registry's reconcile-at-launch path; confirm).
+    job = await reg.create("already_complete", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.SUCCEEDED)
+    assert AlreadyCompleteWorker.run_called is False
+    assert reg._jobs[job.id].progress.success == 5
+
+
+@pytest.mark.asyncio
+async def test_pause_rejected_when_not_supported():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(NonPausableWorker)
+    job = await reg.create("nonpausable", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    with pytest.raises(JobOperationError):
+        await reg.pause(job.id)
+    await reg.cancel(job.id)  # cleanup: the job is still running after the rejection
+
+
+@pytest.mark.asyncio
+async def test_pause_rejected_when_not_running(registry):
+    job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    with pytest.raises(JobOperationError):  # the job has already succeeded
+        await registry.pause(job.id)
+
+
+@pytest.mark.asyncio
+async def test_resume_rejected_when_not_paused(registry):
+    job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    with pytest.raises(JobOperationError):  # the job is RUNNING, not PAUSED
+        await registry.resume(job.id)
+    await registry.cancel(job.id)  # cleanup
+
+
+async def _drive_completion_race(operation: str) -> JobRegistry:
+    # Reproduce the completion-vs-cancel race deterministically: the worker's
+    # run() is gated; we open the gate at the exact moment the lifecycle op
+    # begins its cancel await, so the supervising task finishes naturally
+    # (job -> succeeded, task done) before/while task.cancel() lands. The job
+    # was running at the op's entry check, so it gets past the guard, but the
+    # terminal succeeded state must survive.
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(RaceCompleteWorker)
+    RaceCompleteWorker.gate = asyncio.Event()
+    job = await reg.create("race_complete", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+
+    original_cancel_task = reg._cancel_task  # keep the real one to delegate to
+
+    async def open_gate_then_cancel(job_id: str) -> None:
+        # Let run() return and the supervising task drive to terminal first.
+        RaceCompleteWorker.gate.set()
+        task = reg._tasks.get(job_id)
+        if task is not None:
+            try:
+                await task
+            except asyncio.CancelledError:
+                pass
+        await original_cancel_task(job_id)
+
+    reg._cancel_task = open_gate_then_cancel  # type: ignore[method-assign]
+
+    if operation == "pause":
+        await reg.pause(job.id)
+    else:  # any value other than "pause" drives cancel()
+        await reg.cancel(job.id)
+    return reg  # callers inspect reg._jobs for the final state
+
+
+@pytest.mark.asyncio
+async def test_pause_loses_race_to_natural_completion_keeps_succeeded():
+    # Regression: if run() completes naturally during pause()'s cancel-await,
+    # the job is already terminal (succeeded) and pause() must not clobber it
+    # back to paused (which would drop the result and allow a resume re-run).
+    reg = await _drive_completion_race("pause")
+    job_id = next(iter(reg._jobs))  # the helper creates exactly one job
+    assert reg._jobs[job_id].status == BackgroundJobStatus.SUCCEEDED
+    assert reg._jobs[job_id].result is not None
+
+
+@pytest.mark.asyncio
+async def test_cancel_loses_race_to_natural_completion_keeps_succeeded():
+    # The cancel() path already guards on is_terminal; lock it in.
+    reg = await _drive_completion_race("cancel")
+    job_id = next(iter(reg._jobs))  # the helper creates exactly one job
+    assert reg._jobs[job_id].status == BackgroundJobStatus.SUCCEEDED
+    assert reg._jobs[job_id].result is not None
+
+
+@pytest.mark.asyncio
+async def test_pause_enforced_when_worker_swallows_cancel():
+    # A worker that catches CancelledError (and uncancels it) then returns
+    # normally must still be paused, not succeeded — the cancellation transition
+    # is unconditional and enforced by the registry, not the worker.
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(SwallowCancelWorker)
+    SwallowCancelWorker.started = asyncio.Event()  # fresh events for this test
+    SwallowCancelWorker.gate = asyncio.Event()
+    job = await reg.create("swallow_cancel", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    await asyncio.wait_for(SwallowCancelWorker.started.wait(), timeout=3.0)
+
+    result = await reg.pause(job.id)
+    assert result.status == BackgroundJobStatus.PAUSED
+    assert reg._jobs[job.id].result is None  # paused, so no result recorded
+
+
+@pytest.mark.asyncio
+async def test_cancel_enforced_when_worker_swallows_cancel():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(SwallowCancelWorker)
+    SwallowCancelWorker.started = asyncio.Event()  # fresh events for this test
+    SwallowCancelWorker.gate = asyncio.Event()
+    job = await reg.create("swallow_cancel", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    await asyncio.wait_for(SwallowCancelWorker.started.wait(), timeout=3.0)
+
+    result = await reg.cancel(job.id)
+    assert result.status == BackgroundJobStatus.CANCELLED  # not SUCCEEDED
+    assert reg._jobs[job.id].result is None  # cancelled, so no result recorded
+
+
+@pytest.mark.asyncio
+async def test_cancel_from_paused():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(NoopJobWorker)
+    job = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.03})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    await reg.pause(job.id)
+    assert reg._jobs[job.id].status == BackgroundJobStatus.PAUSED
+
+    result = await reg.cancel(job.id)  # cancel a job that is PAUSED, not RUNNING
+    assert result.status == BackgroundJobStatus.CANCELLED
+    assert reg._jobs[job.id].status == BackgroundJobStatus.CANCELLED
+    assert reg._jobs[job.id].ended_at is not None
+
+
+# -- delete ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_delete_terminal_emits_deleted(registry):
+    job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+
+    events = []  # events received after the initial snapshot
+    gen = registry.events.subscribe()
+    await asyncio.wait_for(gen.__anext__(), timeout=1.0)  # snapshot
+
+    async def collect():
+        async for event in gen:
+            events.append(event)
+
+    collector = asyncio.create_task(collect())
+    await registry.delete(job.id)
+    await asyncio.sleep(0.05)  # presumably gives the collector time to receive it
+    collector.cancel()
+    try:
+        await collector
+    except asyncio.CancelledError:
+        pass
+
+    assert job.id not in registry._jobs
+    assert any(e.event == "deleted" and e.data["id"] == job.id for e in events)
+
+
+@pytest.mark.asyncio
+async def test_delete_running_raises(registry):
+    job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    with pytest.raises(JobOperationError):  # delete is rejected while RUNNING
+        await registry.delete(job.id)
+    await registry.cancel(job.id)  # cleanup
+
+
+@pytest.mark.asyncio
+async def test_delete_pending_raises():
+    reg = JobRegistry(max_concurrent=1)  # one slot: the second job must queue
+    reg.register_type(NoopJobWorker)
+    running = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(reg, running.id, BackgroundJobStatus.RUNNING)
+    pending = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    assert reg._jobs[pending.id].status == BackgroundJobStatus.PENDING
+    with pytest.raises(JobOperationError):  # delete is rejected while PENDING
+        await reg.delete(pending.id)
+    await reg.cancel(running.id)  # cleanup
+    await reg.cancel(pending.id)
+
+
+# -- reconciliation ----------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_compute_state_none_keeps_snapshot(registry):
+    # Noop's compute_state returns None, so the believed snapshot from
+    # report_progress is preserved and never flipped to complete early.
+    job = await registry.create("noop", {"steps": 4, "sleep_per_step_seconds": 0.02})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    # get() triggers reconcile; with None it must not change progress/status.
+    got = await registry.get(job.id)
+    assert got is not None
+    assert got.status in (BackgroundJobStatus.RUNNING, BackgroundJobStatus.SUCCEEDED)
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    assert registry._jobs[job.id].progress.success == 4  # all four steps counted
+
+
+@pytest.mark.asyncio
+async def test_report_progress_preserves_total_and_message_when_omitted():
+    # A count-only report_progress call must not wipe a total/message set by an
+    # earlier call.
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(PartialProgressWorker)
+    job = await reg.create("partial_progress", {})  # created with empty params
+    await wait_for_status(reg, job.id, BackgroundJobStatus.SUCCEEDED)
+
+    final = reg._jobs[job.id]
+    assert final.progress.success == 5
+    assert final.progress.total == 50  # must survive the later count-only call
+    assert final.progress.message == "starting"  # likewise preserved
+
+
+@pytest.mark.asyncio
+async def test_apply_derived_preserves_total_when_compute_state_returns_none():
+    # A compute_state that returns total=None (unknown denominator) alongside
+    # success/error counts must not wipe a total set earlier via report_progress.
+    # total=None means "unknown, keep what we had", mirroring message handling.
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(TotalThenNoneWorker)
+    TotalThenNoneWorker.started = asyncio.Event()  # fresh events for this test
+    TotalThenNoneWorker.gate = asyncio.Event()
+    job = await reg.create("total_then_none", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    await asyncio.wait_for(TotalThenNoneWorker.started.wait(), timeout=3.0)
+    assert reg._jobs[job.id].progress.total == 10  # total before the pause
+
+    # pause() runs compute_state (total=None, success=2, error=1) through
+    # _apply_derived; the prior total of 10 must survive.
+    result = await reg.pause(job.id)
+    assert result.status == BackgroundJobStatus.PAUSED
+    assert result.progress.total == 10
+    assert result.progress.success == 2
+    assert result.progress.error == 1
+
+
+@pytest.mark.asyncio
+async def test_get_reconciles_running_job_to_succeeded_mid_flight():
+    # A long-running job whose source-of-truth state flips to complete should be
+    # reconciled straight to succeeded by get() (the running/get() reconcile
+    # path), not only at launch time.
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(ReconcileCompleteWorker)
+    ReconcileCompleteWorker.done = False  # start out as "not complete"
+    job = await reg.create("reconcile_complete", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    # Still running here (run() is a 5s sleep); now flip the source of truth.
+    assert reg._jobs[job.id].status == BackgroundJobStatus.RUNNING
+    ReconcileCompleteWorker.done = True
+
+    got = await reg.get(job.id)  # get() is what must trigger the reconcile
+    assert got is not None
+    assert got.status == BackgroundJobStatus.SUCCEEDED
+    assert got.progress.success == 3
+    assert got.ended_at is not None
+
+
+# -- concurrency -------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_semaphore_caps_concurrency_fifo():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(NoopJobWorker)
+
+    jobs = []
+    for _ in range(4):
+        jobs.append(
+            await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+        )
+
+    await asyncio.sleep(0.05)
+    statuses = [reg._jobs[j.id].status for j in jobs]
+    running = [s for s in statuses if s == BackgroundJobStatus.RUNNING]
+    pending = [s for s in statuses if s == BackgroundJobStatus.PENDING]
+    assert len(running) == 2
+    assert len(pending) == 2
+    # FIFO: the first two created are the running ones.
+    assert statuses[0] == BackgroundJobStatus.RUNNING
+    assert statuses[1] == BackgroundJobStatus.RUNNING
+    assert statuses[2] == BackgroundJobStatus.PENDING
+    assert statuses[3] == BackgroundJobStatus.PENDING
+
+    # Cancel the running ones; pending should be promoted.
+    await reg.cancel(jobs[0].id)
+    await reg.cancel(jobs[1].id)
+    await wait_for_status(reg, jobs[2].id, BackgroundJobStatus.RUNNING)
+    await wait_for_status(reg, jobs[3].id, BackgroundJobStatus.RUNNING)
+
+    await reg.cancel(jobs[2].id)
+    await reg.cancel(jobs[3].id)
+
+
+# -- events ------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_registry_emits_snapshot_and_job_events(registry):
+    gen = registry.events.subscribe()
+    snapshot = await asyncio.wait_for(gen.__anext__(), timeout=1.0)
+    assert snapshot.event == "snapshot"  # first event a subscriber receives
+    assert snapshot.data["jobs"] == []  # no jobs have been created yet
+
+    events = []
+
+    async def collect():
+        async for event in gen:
+            events.append(event)
+
+    collector = asyncio.create_task(collect())
+    job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    await asyncio.sleep(0.02)  # presumably lets trailing events reach the collector
+    collector.cancel()
+    try:
+        await collector
+    except asyncio.CancelledError:
+        pass
+
+    job_events = [e for e in events if e.event == "job"]
+    assert len(job_events) >= 2
+    assert any(e.data["status"] == "running" for e in job_events)
+    assert any(e.data["status"] == "succeeded" for e in job_events)
+
+
+# -- not found ---------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_get_unknown_returns_none(registry):
+    assert await registry.get("j_doesnotexist") is None  # get() does not raise
+
+
+@pytest.mark.asyncio
+async def test_lifecycle_op_unknown_raises(registry):
+    with pytest.raises(JobNotFoundError):  # unlike get(), cancel() raises
+        await registry.cancel("j_doesnotexist")
diff --git a/app/desktop/studio_server/jobs/workers/__init__.py b/app/desktop/studio_server/jobs/workers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/app/desktop/studio_server/jobs/workers/eval.py b/app/desktop/studio_server/jobs/workers/eval.py
new file mode 100644
index 000000000..89f540fa6
--- /dev/null
+++ b/app/desktop/studio_server/jobs/workers/eval.py
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+from app.desktop.git_sync.save_context import save_context_for_project
+from kiln_ai.adapters.eval.eval_runner import EvalRunner
+from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
+from kiln_ai.datamodel.eval import Eval, EvalConfig
+from kiln_ai.datamodel.task import Task
+from pydantic import BaseModel
+
+from ...eval_api import eval_config_from_id, task_run_config_from_id
+from ..models import JobContext, JobDerivedState, JobWorker
+
+
+class EvalJobParams(BaseModel):
+    project_id: str  # passed to eval_config_from_id / task_run_config_from_id
+    task_id: str  # passed to eval_config_from_id / task_run_config_from_id
+    eval_id: str  # passed to eval_config_from_id
+    eval_config_id: str  # passed to eval_config_from_id
+    run_config_id: str  # matched against EvalRun.task_run_config_id
+
+
+class EvalJobResult(BaseModel):
+    total: int  # full eval-set size (baseline + runner's remaining total)
+    success: int  # baseline already-scored items + items completed this run
+    error: int  # errors reported by the runner during this run
+
+
+class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
+    """Background worker that runs an eval against a single run config.
+
+    Wraps the existing EvalRunner unchanged. Idempotent: EvalRunner excludes
+    already-run (eval_config, run_config, dataset) triples, so a paused-then-
+    resumed (or re-triggered) job skips completed items and writes no duplicate
+    EvalRun entities — hence supports_pause = True.
+    """
+
+    type_name = "eval"
+    params_model = EvalJobParams
+    result_model = EvalJobResult
+    supports_pause = True
+
+    async def compute_state(self, params: EvalJobParams) -> JobDerivedState:
+        eval_config = eval_config_from_id(
+            params.project_id,
+            params.task_id,
+            params.eval_id,
+            params.eval_config_id,
+        )
+        eval, task = self._eval_and_task(eval_config)
+
+        # The eval-set filter defines the universe of dataset items in scope.
+        # EvalRunner only works items that BOTH pass this filter AND lack a
+        # matching EvalRun, so progress must be measured against this set.
+        filter = dataset_filter_from_id(eval.eval_set_filter_id)
+        in_filter_ids = {
+            task_run.id for task_run in task.runs(readonly=True) if filter(task_run)
+        }
+        total = len(in_filter_ids)
+
+        # Count only scored items that are still in the filter set. Items that
+        # were scored but later drifted out of the filter must not be counted,
+        # or success/is_complete would overcount and a resume could short-circuit
+        # to succeeded while real work remains.
+        scored_ids = {
+            run.dataset_id
+            for run in eval_config.runs(readonly=True)
+            if run.task_run_config_id == params.run_config_id
+        }
+        success = len(scored_ids & in_filter_ids)
+
+        return JobDerivedState(
+            total=total,
+            success=success,
+            error=0,
+            is_complete=success >= total,
+        )
+
+    async def run(self, params: EvalJobParams, ctx: JobContext) -> EvalJobResult:
+        # Baseline: items already scored (and still in-filter) before this run.
+        # EvalRunner only works the unfinished remainder, so its Progress counts
+        # are relative to that remainder. We add the baseline back so progress
+        # and the returned result are reported against the FULL eval-set size,
+        # not just the work left for this run.
+        baseline = await self.compute_state(params)
+        baseline_success = baseline.success
+
+        eval_runner = self._build_eval_runner(params)
+
+        success = baseline_success
+        total = baseline.total if baseline.total is not None else baseline_success
+        error = 0
+        async for progress in eval_runner.run():
+            # progress.total = full - baseline_success (the unfinished remainder),
+            # so baseline_success + progress.total = the full eval-set size.
+            success = baseline_success + progress.complete
+            total = baseline_success + progress.total
+            error = progress.errors
+            await ctx.report_progress(
+                success=success,
+                error=error,
+                total=total,
+            )
+
+        return EvalJobResult(total=total, success=success, error=error)
+
+    def _build_eval_runner(self, params: EvalJobParams) -> EvalRunner:
+        eval_config = eval_config_from_id(
+            params.project_id,
+            params.task_id,
+            params.eval_id,
+            params.eval_config_id,
+        )
+        run_config = task_run_config_from_id(
+            params.project_id,
+            params.task_id,
+            params.run_config_id,
+        )
+        save_context = save_context_for_project(
+            params.project_id,
+            context=f"eval job {params.eval_id}/{params.run_config_id}",
+        )
+        return EvalRunner(
+            eval_configs=[eval_config],
+            run_configs=[run_config],
+            eval_run_type="task_run_eval",
+            save_context=save_context,
+        )
+
+    def _eval_and_task(self, eval_config: EvalConfig) -> tuple[Eval, Task]:
+        eval = eval_config.parent_eval()
+        if eval is None:
+            raise ValueError("Eval config has no parent eval")
+        task = eval.parent_task()
+        if task is None:
+            raise ValueError("Eval has no parent task")
+        return eval, task
diff --git a/app/desktop/studio_server/jobs/workers/noop.py b/app/desktop/studio_server/jobs/workers/noop.py
new file mode 100644
index 000000000..23cc8d04a
--- /dev/null
+++ b/app/desktop/studio_server/jobs/workers/noop.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import asyncio
+
+from pydantic import BaseModel
+
+from ..models import JobContext, JobDerivedState, JobWorker
+
+
+class NoopJobParams(BaseModel):
+    steps: int = 10  # number of loop iterations run() performs
+    sleep_per_step_seconds: float = 0.5  # asyncio.sleep duration before each step
+    fail_at_step: int | None = None  # 0-based step at which run() raises
+    error_at_steps: list[int] = []  # 0-based steps reported as non-fatal errors
+
+
+class NoopJobResult(BaseModel):
+    completed_steps: int  # success + error count at the end of run()
+
+
+class NoopJobWorker(JobWorker[NoopJobParams, NoopJobResult]):
+    type_name = "noop"
+    params_model = NoopJobParams
+    result_model = NoopJobResult
+    supports_pause = True
+
+    async def compute_state(self, params: NoopJobParams) -> JobDerivedState | None:
+        return None
+
+    async def run(self, params: NoopJobParams, ctx: JobContext) -> NoopJobResult:
+        success = error = 0
+        for i in range(params.steps):
+            await asyncio.sleep(params.sleep_per_step_seconds)
+            if params.fail_at_step == i:
+                raise RuntimeError(f"intentional fail at step {i}")
+            if i in params.error_at_steps:
+                error += 1
+                await ctx.report_error(f"intentional error at step {i}", step=i)
+            else:
+                success += 1
+            await ctx.report_progress(
+                success=success,
+                error=error,
+                total=params.steps,
+                message=f"step {i + 1}/{params.steps}",
+            )
+        return NoopJobResult(completed_steps=success + error)
diff --git a/app/desktop/studio_server/jobs/workers/test_eval.py b/app/desktop/studio_server/jobs/workers/test_eval.py
new file mode 100644
index 000000000..0715344f9
--- /dev/null
+++ b/app/desktop/studio_server/jobs/workers/test_eval.py
@@ -0,0 +1,535 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import AsyncIterator
+from unittest.mock import patch
+
+import pytest
+from app.desktop.studio_server.jobs.models import BackgroundJobStatus
+from app.desktop.studio_server.jobs.registry import JobRegistry
+from app.desktop.studio_server.jobs.workers.eval import (
+    EvalJobParams,
+    EvalJobResult,
+    EvalJobWorker,
+)
+from kiln_ai.adapters.ml_model_list import ModelProviderName
+from kiln_ai.datamodel import (
+    DataSource,
+    DataSourceType,
+    Project,
+    Task,
+    TaskOutput,
+    TaskOutputRatingType,
+    TaskRun,
+)
+from kiln_ai.datamodel.eval import (
+    Eval,
+    EvalConfig,
+    EvalOutputScore,
+    EvalRun,
+)
+from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
+from kiln_ai.datamodel.task import StructuredOutputMode, TaskRunConfig
+from kiln_ai.utils.async_job_runner import Progress
+
+
+@pytest.fixture
+def project(tmp_path):
+    project = Project(
+        id="project1", name="Test Project", path=tmp_path / "project.kiln"
+    )
+    project.save_to_file()  # written under pytest's per-test tmp_path
+    return project
+
+
+@pytest.fixture
+def task(project):
+    task = Task(
+        id="task1",
+        name="Test Task",
+        description="test",
+        instruction="do the thing",
+        parent=project,
+    )
+    task.save_to_file()
+    return task
+
+
+@pytest.fixture
+def eval(task):
+    eval = Eval(
+        id="eval1",
+        name="Test Eval",
+        description="test",
+        eval_set_filter_id="tag::eval_set",
+        eval_configs_filter_id="tag::golden",
+        output_scores=[
+            EvalOutputScore(
+                name="Accuracy",
+                instruction="Check accuracy",
+                type=TaskOutputRatingType.pass_fail,
+            ),
+        ],
+        parent=task,
+    )
+    eval.save_to_file()
+    return eval
+
+
+@pytest.fixture
+def eval_config(eval):
+    eval_config = EvalConfig(
+        id="eval_config1",
+        name="Test Eval Config",
+        model_name="gpt-4",
+        model_provider="openai",
+        properties={"eval_steps": ["step1", "step2"]},
+        parent=eval,
+    )
+    eval_config.save_to_file()
+    return eval_config
+
+
+@pytest.fixture
+def run_config(task):
+    run_config = TaskRunConfig(
+        id="run_config1",
+        name="Test Run Config",
+        description="test",
+        run_config_properties=KilnAgentRunConfigProperties(
+            model_name="gpt-4",
+            model_provider_name=ModelProviderName.openai,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode=StructuredOutputMode.json_schema,
+        ),
+        parent=task,
+    )
+    run_config.save_to_file()
+    return run_config
+
+
+@pytest.fixture
+def data_source():
+    return DataSource(
+        type=DataSourceType.synthetic,  # used as input_source in _make_task_run
+        properties={
+            "model_name": "gpt-4",
+            "model_provider": "openai",
+            "adapter_name": "test_adapter",
+        },
+    )
+
+
+@pytest.fixture
+def params():
+    return EvalJobParams(
+        project_id="project1",  # same id as the project fixture
+        task_id="task1",  # same id as the task fixture
+        eval_id="eval1",  # same id as the eval fixture
+        eval_config_id="eval_config1",  # same id as the eval_config fixture
+        run_config_id="run_config1",  # same id as the run_config fixture
+    )
+
+
+@pytest.fixture
+def resolve_project(project):
+    """Make the eval_api entity helpers resolve the on-disk project by id.
+
+    task_from_id binds project_from_id into kiln_server.task_api, so we patch it
+    there (the name as looked up), not at its definition site.
+    """
+    with patch("kiln_server.task_api.project_from_id", return_value=project):
+        yield project  # the patch stays active until the fixture is torn down
+
+
+def _make_task_run(task, data_source, tag: str) -> TaskRun:
+    task_run = TaskRun(
+        parent=task,
+        input="test",
+        input_source=data_source,
+        tags=[tag],
+        output=TaskOutput(output="test"),
+    )
+    task_run.save_to_file()
+    return task_run
+
+
+def _make_eval_run(eval_config, dataset_id, run_config_id) -> EvalRun:
+    eval_run = EvalRun(
+        parent=eval_config,
+        dataset_id=dataset_id,
+        task_run_config_id=run_config_id,
+        input="test",
+        output="test",
+        scores={"accuracy": 1.0},
+    )
+    eval_run.save_to_file()
+    return eval_run
+
+
+@contextmanager
+def _stub_eval_runner_run(progresses: list[Progress]):
+    async def fake_run(self, concurrency: int = 25) -> AsyncIterator[Progress]:
+        for progress in progresses:
+            yield progress  # replay the canned Progress values in order
+
+    with patch(
+        "kiln_ai.adapters.eval.eval_runner.EvalRunner.run",
+        new=fake_run,
+    ):
+        yield  # EvalRunner.run stays replaced while the caller's block runs
+
+
+# -- compute_state -----------------------------------------------------------
+
+
+async def test_compute_state_no_eval_runs(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    for _ in range(3):
+        _make_task_run(task, data_source, "eval_set")
+    # A task run outside the eval-set filter must not be counted toward total.
+    _make_task_run(task, data_source, "other")
+
+    state = await EvalJobWorker().compute_state(params)
+
+    assert state.total == 3  # the "other"-tagged run is excluded
+    assert state.success == 0  # no EvalRuns were created
+    assert state.error == 0
+    assert state.is_complete is False
+
+
+async def test_compute_state_counts_already_scored(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(3)]
+    _make_eval_run(eval_config, task_runs[0].id, run_config.id)
+    _make_eval_run(eval_config, task_runs[1].id, run_config.id)
+
+    state = await EvalJobWorker().compute_state(params)
+
+    assert state.total == 3
+    assert state.success == 2  # two of the three runs have an EvalRun
+    assert state.is_complete is False  # one in-filter run is still unscored
+
+
+async def test_compute_state_is_complete(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(2)]
+    for task_run in task_runs:
+        _make_eval_run(eval_config, task_run.id, run_config.id)
+
+    state = await EvalJobWorker().compute_state(params)
+
+    assert state.total == 2
+    assert state.success == 2
+    assert state.is_complete is True  # success (2) >= total (2)
+
+
+async def test_compute_state_ignores_other_run_config(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(2)]
+    # Scored under a different run config — must not be counted.
+    _make_eval_run(eval_config, task_runs[0].id, "some_other_run_config")
+
+    state = await EvalJobWorker().compute_state(params)
+
+    assert state.total == 2
+    assert state.success == 0  # the only EvalRun belongs to another run config
+    assert state.is_complete is False
+
+
+async def test_compute_state_ignores_scored_items_out_of_filter(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    # Two items in the eval-set filter, both scored.
+    in_filter = [_make_task_run(task, data_source, "eval_set") for _ in range(2)]
+    for task_run in in_filter:
+        _make_eval_run(eval_config, task_run.id, run_config.id)
+
+    # An item that was scored under this run config but is NOT in the eval-set
+    # filter (e.g. it drifted out / was tagged differently). EvalRunner would
+    # never work it, so it must not count toward success or flip is_complete.
+    out_of_filter = _make_task_run(task, data_source, "other")
+    _make_eval_run(eval_config, out_of_filter.id, run_config.id)
+
+    state = await EvalJobWorker().compute_state(params)
+
+    # total reflects only in-filter items; the out-of-filter scored item is
+    # neither counted in total nor in success.
+    assert state.total == 2
+    assert state.success == 2
+    assert state.is_complete is True
+
+
+async def test_compute_state_out_of_filter_does_not_short_circuit(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    # Three in-filter items; only one scored. Two remain to be worked.
+    in_filter = [_make_task_run(task, data_source, "eval_set") for _ in range(3)]
+    _make_eval_run(eval_config, in_filter[0].id, run_config.id)
+
+    # Extra scored items that are out-of-filter. A naive count would inflate
+    # success to 3 and falsely report is_complete, short-circuiting a resume.
+    for _ in range(5):
+        out_of_filter = _make_task_run(task, data_source, "other")
+        _make_eval_run(eval_config, out_of_filter.id, run_config.id)
+
+    state = await EvalJobWorker().compute_state(params)
+
+    assert state.total == 3
+    assert state.success == 1
+    assert state.is_complete is False
+
+
+async def test_compute_state_missing_eval_config_raises(
+    resolve_project, task, run_config, data_source
+):
+    # No EvalConfig (or Eval) with this id exists on disk: the entity loader
+    # raises rather than silently reporting "no progress", so the failure is
+    # visible to the registry during reconciliation.
+    bad_params = EvalJobParams(
+        project_id="project1",
+        task_id="task1",
+        eval_id="missing_eval",
+        eval_config_id="missing_eval_config",
+        run_config_id="run_config1",
+    )
+
+    with pytest.raises(Exception):
+        await EvalJobWorker().compute_state(bad_params)
+
+
+# -- run ---------------------------------------------------------------------
+
+
+async def test_run_maps_progress_and_returns_result(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    progresses = [
+        Progress(complete=0, total=3, errors=0),
+        Progress(complete=1, total=3, errors=0),
+        Progress(complete=2, total=3, errors=1),
+    ]
+
+    reported: list[tuple[int, int, int | None]] = []
+
+    class FakeCtx:
+        job_id = "j_test"
+        run_id = "run_test"
+
+        async def report_progress(self, success, error=0, total=None, message=None):
+            reported.append((success, error, total))
+
+        async def report_error(self, error_message, **extra):
+            pass
+
+    with _stub_eval_runner_run(progresses):
+        result = await EvalJobWorker().run(params, FakeCtx())
+
+    assert reported == [(0, 0, 3), (1, 0, 3), (2, 1, 3)]
+    assert result == EvalJobResult(total=3, success=2, error=1)
+
+
+async def test_run_no_items_returns_zero_summary(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    class FakeCtx:
+        job_id = "j_test"
+        run_id = "run_test"
+
+        async def report_progress(self, success, error=0, total=None, message=None):
+            pass
+
+        async def report_error(self, error_message, **extra):
+            pass
+
+    # Real EvalRunner with an empty dataset yields only the initial Progress(0,0,0).
+    result = await EvalJobWorker().run(params, FakeCtx())
+
+    assert result == EvalJobResult(total=0, success=0, error=0)
+
+
+async def test_run_idempotent_skips_already_scored(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(3)]
+    # Two of three already scored.
+    _make_eval_run(eval_config, task_runs[0].id, run_config.id)
+    _make_eval_run(eval_config, task_runs[1].id, run_config.id)
+
+    processed_dataset_ids: list = []
+
+    async def fake_run_job(self, job) -> bool:
+        processed_dataset_ids.append(job.item.id)
+        EvalRun(
+            parent=job.eval_config,
+            dataset_id=job.item.id,
+            task_run_config_id=job.task_run_config.id,
+            input="test",
+            output="test",
+            scores={"accuracy": 1.0},
+        ).save_to_file()
+        return True
+
+    class FakeCtx:
+        job_id = "j_test"
+        run_id = "run_test"
+
+        async def report_progress(self, success, error=0, total=None, message=None):
+            pass
+
+        async def report_error(self, error_message, **extra):
+            pass
+
+    with patch(
+        "kiln_ai.adapters.eval.eval_runner.EvalRunner.run_job",
+        new=fake_run_job,
+    ):
+        result = await EvalJobWorker().run(params, FakeCtx())
+
+    # Only the single not-yet-scored item should have been processed.
+    assert processed_dataset_ids == [task_runs[2].id]
+    # Totals are reported against the FULL eval-set size (3), not just the work
+    # remaining for this run. Two were already scored (baseline), one processed.
+    assert result.total == 3
+    assert result.success == 3
+
+    # No duplicate EvalRuns: three task runs, three EvalRuns total.
+    assert len(eval_config.runs(readonly=True)) == 3
+
+
+async def test_run_reports_full_set_totals_on_partial_resume(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    # 5-item eval set, 2 already scored (baseline). The stubbed runner only sees
+    # the remaining 3 items, so its Progress.total is 3 — but the worker must add
+    # the baseline back and report against the full set of 5.
+    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(5)]
+    _make_eval_run(eval_config, task_runs[0].id, run_config.id)
+    _make_eval_run(eval_config, task_runs[1].id, run_config.id)
+
+    # EvalRunner.run() yields counts relative to the unfinished remainder (3).
+    progresses = [
+        Progress(complete=0, total=3, errors=0),
+        Progress(complete=1, total=3, errors=0),
+        Progress(complete=2, total=3, errors=0),
+        Progress(complete=3, total=3, errors=0),
+    ]
+
+    reported: list[tuple[int, int, int | None]] = []
+
+    class FakeCtx:
+        job_id = "j_test"
+        run_id = "run_test"
+
+        async def report_progress(self, success, error=0, total=None, message=None):
+            reported.append((success, error, total))
+
+        async def report_error(self, error_message, **extra):
+            pass
+
+    with _stub_eval_runner_run(progresses):
+        result = await EvalJobWorker().run(params, FakeCtx())
+
+    # Reported success = baseline (2) + complete; total = baseline (2) + 3 = 5.
+    # The snapshot must not regress below the baseline of 2 already-scored items.
+    assert reported == [(2, 0, 5), (3, 0, 5), (4, 0, 5), (5, 0, 5)]
+    assert result == EvalJobResult(total=5, success=5, error=0)
+
+
+# -- save_context wiring -----------------------------------------------------
+
+
+def test_build_eval_runner_passes_save_context_when_git_sync_enabled(
+    resolve_project, task, eval_config, run_config, params
+):
+    sentinel = object()
+
+    with patch(
+        "app.desktop.studio_server.jobs.workers.eval.save_context_for_project",
+        return_value=sentinel,
+    ) as mock_helper:
+        runner = EvalJobWorker()._build_eval_runner(params)
+
+    mock_helper.assert_called_once_with(
+        params.project_id,
+        context=f"eval job {params.eval_id}/{params.run_config_id}",
+    )
+    # The helper's SaveContext is threaded straight into the runner.
+    assert runner._save_context is sentinel
+
+
+def test_build_eval_runner_defaults_to_noop_when_not_git_sync(
+    resolve_project, task, eval_config, run_config, params
+):
+    from kiln_ai.utils.git_sync_protocols import default_save_context
+
+    with patch(
+        "app.desktop.studio_server.jobs.workers.eval.save_context_for_project",
+        return_value=None,
+    ) as mock_helper:
+        runner = EvalJobWorker()._build_eval_runner(params)
+
+    mock_helper.assert_called_once()
+    # EvalRunner coalesces None to the no-op default_save_context.
+    assert runner._save_context is default_save_context
+
+
+# -- end-to-end via registry -------------------------------------------------
+
+
+async def test_eval_job_through_registry(
+    resolve_project, task, eval_config, run_config, data_source, params
+):
+    for _ in range(2):
+        _make_task_run(task, data_source, "eval_set")
+
+    progresses = [
+        Progress(complete=0, total=2, errors=0),
+        Progress(complete=1, total=2, errors=0),
+        Progress(complete=2, total=2, errors=0),
+    ]
+
+    registry = JobRegistry()
+    registry.register_type(EvalJobWorker)
+
+    with _stub_eval_runner_run(progresses):
+        job = await registry.create("eval", params, project_id=params.project_id)
+        task_handle = registry._tasks[job.id]
+        await task_handle
+
+    final = registry._jobs[job.id]
+    assert final.status == BackgroundJobStatus.SUCCEEDED
+    assert final.result == {"total": 2, "success": 2, "error": 0}
+    assert final.progress.success == 2
+    assert final.progress.total == 2
+    assert final.project_id == "project1"
+
+
+async def test_eval_job_missing_entity_marks_failed(
+    resolve_project, task, run_config, data_source
+):
+    # A job whose eval/eval_config does not exist: compute_state (run during
+    # reconciliation) raises, and the registry marks the job failed rather than
+    # treating the missing entity as "no progress".
+    bad_params = EvalJobParams(
+        project_id="project1",
+        task_id="task1",
+        eval_id="missing_eval",
+        eval_config_id="missing_eval_config",
+        run_config_id="run_config1",
+    )
+
+    registry = JobRegistry()
+    registry.register_type(EvalJobWorker)
+
+    job = await registry.create("eval", bad_params, project_id="project1")
+    task_handle = registry._tasks[job.id]
+    await task_handle
+
+    final = registry._jobs[job.id]
+    assert final.status == BackgroundJobStatus.FAILED
+    assert final.error is not None
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
index e123cdc80..2828ba19f 100644
--- a/app/web_ui/src/lib/api_schema.d.ts
+++ b/app/web_ui/src/lib/api_schema.d.ts
@@ -3076,6 +3076,164 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/jobs/events": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /**
+         * Stream Job Events
+         * @description Server-sent events for jobs. Emits an initial `snapshot`, then per-job
+         *     `job` and `deleted` events. A pure observer: disconnecting never stops a job.
+         */
+        get: operations["stream_job_events_api_jobs_events_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** List Jobs */
+        get: operations["list_jobs_api_jobs_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{type}": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        /** Create Job */
+        post: operations["create_job_api_jobs__type__post"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{id}": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** Get Job */
+        get: operations["get_job_api_jobs__id__get"];
+        put?: never;
+        post?: never;
+        /** Delete Job */
+        delete: operations["delete_job_api_jobs__id__delete"];
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{id}/result": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** Get Job Result */
+        get: operations["get_job_result_api_jobs__id__result_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{id}/errors": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** Get Job Errors */
+        get: operations["get_job_errors_api_jobs__id__errors_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{id}/pause": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        /** Pause Job */
+        post: operations["pause_job_api_jobs__id__pause_post"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{id}/resume": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        /** Resume Job */
+        post: operations["resume_job_api_jobs__id__resume_post"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/jobs/{id}/cancel": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        /** Cancel Job */
+        post: operations["cancel_job_api_jobs__id__cancel_post"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
 }
 export type webhooks = Record<string, never>;
 export interface components {
@@ -3539,6 +3697,11 @@ export interface components {
              */
             provider_type: "builtin" | "custom";
         };
+        /**
+         * BackgroundJobStatus
+         * @enum {string}
+         */
+        BackgroundJobStatus: "pending" | "running" | "paused" | "succeeded" | "failed" | "cancelled";
         /**
          * BasePrompt
          * @description A prompt for a task. This is the basic data storage format which can be used throughout a project.
@@ -4348,6 +4511,44 @@ export interface components {
             data_strategy: components["schemas"]["ChatStrategy"];
             run_config_properties?: components["schemas"]["KilnAgentRunConfigProperties"] | null;
         };
+        /**
+         * CreateJobRequest
+         * @description Request body for creating a job. Params are validated per job type.
+         */
+        CreateJobRequest: {
+            /**
+             * Params
+             * @description Type-specific job parameters, validated against the type's params model.
+             */
+            params?: {
+                [key: string]: unknown;
+            };
+            /**
+             * Project Id
+             * @description Project to scope this job to (for filtering/visibility). Falls back to the params' project_id when omitted.
+             */
+            project_id?: string | null;
+            /**
+             * Metadata
+             * @description Free-form pass-through attribution, stored verbatim.
+             */
+            metadata?: {
+                [key: string]: unknown;
+            } | null;
+        };
+        /**
+         * CreateJobResponse
+         * @description Response returned when a job is created.
+         */
+        CreateJobResponse: {
+            /**
+             * Job Id
+             * @description The id of the newly created job.
+             */
+            job_id: string;
+            /** @description The job's status immediately after creation. */
+            status: components["schemas"]["BackgroundJobStatus"];
+        };
         /** CreateKilnCopilotApiKeyRequest */
         CreateKilnCopilotApiKeyRequest: {
             /**
@@ -6744,6 +6945,94 @@ export interface components {
             /** Jailbroken Examples */
             jailbroken_examples: string;
         };
+        /**
+         * JobError
+         * @description Small failure summary stamped on the record. Detail lives in the error log.
+         */
+        JobError: {
+            /** Error */
+            error?: string | null;
+            /** Detail */
+            detail?: {
+                [key: string]: unknown;
+            } | null;
+        };
+        /**
+         * JobProgress
+         * @description Count-based progress for a job.
+         *
+         *     Processed = success + error; remaining = total - success - error. The error
+         *     field is a count only — the actual messages live in the per-run error log.
+         */
+        JobProgress: {
+            /** Total */
+            total?: number | null;
+            /**
+             * Success
+             * @default 0
+             */
+            success: number;
+            /**
+             * Error
+             * @default 0
+             */
+            error: number;
+            /** Message */
+            message?: string | null;
+            /**
+             * Updated At
+             * Format: date-time
+             */
+            updated_at?: string;
+        };
+        /**
+         * JobRecord
+         * @description Ephemeral, in-memory bookkeeping for a single job. Never persisted to disk.
+         */
+        JobRecord: {
+            /** Id */
+            id: string;
+            /** Type */
+            type: string;
+            status: components["schemas"]["BackgroundJobStatus"];
+            /** Run Id */
+            run_id?: string | null;
+            progress?: components["schemas"]["JobProgress"];
+            /** Params */
+            params?: {
+                [key: string]: unknown;
+            };
+            /** Result */
+            result?: {
+                [key: string]: unknown;
+            } | null;
+            error?: components["schemas"]["JobError"] | null;
+            /** Metadata */
+            metadata?: {
+                [key: string]: unknown;
+            };
+            /** Project Id */
+            project_id?: string | null;
+            /**
+             * Supports Pause
+             * @default false
+             */
+            supports_pause: boolean;
+            /**
+             * Created At
+             * Format: date-time
+             */
+            created_at?: string;
+            /**
+             * Updated At
+             * Format: date-time
+             */
+            updated_at?: string;
+            /** Started At */
+            started_at?: string | null;
+            /** Ended At */
+            ended_at?: string | null;
+        };
         /**
          * JobStatus
          * @enum {string}
@@ -17508,4 +17797,345 @@ export interface operations {
             };
         };
     };
+    stream_job_events_api_jobs_events_get: {
+        parameters: {
+            query?: {
+                /** @description Only stream events for this job id. */
+                job_id?: string | null;
+                /** @description Only stream events for this job type. */
+                type?: string | null;
+                /** @description Only stream events for this project id. */
+                project_id?: string | null;
+            };
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": unknown;
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    list_jobs_api_jobs_get: {
+        parameters: {
+            query?: {
+                /** @description Filter by job status. */
+                status?: components["schemas"]["BackgroundJobStatus"] | null;
+                /** @description Filter by job type. */
+                type?: string | null;
+                /** @description Filter by project id. */
+                project_id?: string | null;
+                /** @description Only jobs created at or after this ISO-8601 time. */
+                since?: string | null;
+                /** @description Maximum number of jobs to return. */
+                limit?: number | null;
+            };
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["JobRecord"][];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    create_job_api_jobs__type__post: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The registered job type to run. */
+                type: string;
+            };
+            cookie?: never;
+        };
+        requestBody: {
+            content: {
+                "application/json": components["schemas"]["CreateJobRequest"];
+            };
+        };
+        responses: {
+            /** @description Successful Response */
+            201: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["CreateJobResponse"];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    get_job_api_jobs__id__get: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["JobRecord"];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    delete_job_api_jobs__id__delete: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            204: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content?: never;
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    get_job_result_api_jobs__id__result_get: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": {
+                        [key: string]: unknown;
+                    };
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    get_job_errors_api_jobs__id__errors_get: {
+        parameters: {
+            query?: {
+                /** @description Read the error log for a specific past run id. */
+                run_id?: string | null;
+            };
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": {
+                        [key: string]: unknown;
+                    }[];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    pause_job_api_jobs__id__pause_post: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            202: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": unknown;
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    resume_job_api_jobs__id__resume_post: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            202: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": unknown;
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    cancel_job_api_jobs__id__cancel_post: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            202: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": unknown;
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
 }
diff --git a/app/web_ui/src/lib/components/SidebarJobsBadge.svelte b/app/web_ui/src/lib/components/SidebarJobsBadge.svelte
new file mode 100644
index 000000000..af843392e
--- /dev/null
+++ b/app/web_ui/src/lib/components/SidebarJobsBadge.svelte
@@ -0,0 +1,32 @@
+<script lang="ts">
+  import { active_jobs_count } from "$lib/stores/jobs_store"
+
+  // "rail": top-right overlay on a sidebar icon. "inline": next to a label.
+  export let variant: "rail" | "inline" = "inline"
+
+  // Defaults to the live count; the override makes the badge render-testable.
+  export let count: number | undefined = undefined
+
+  $: resolved = count ?? $active_jobs_count
+  $: label = resolved > 99 ? "99+" : `${resolved}`
+  // Spoken label keeps the exact count and uses the singular noun for one job.
+  $: spoken = resolved === 1 ? "1 active job" : `${resolved} active jobs`
+</script>
+
+{#if resolved > 0}
+  {#if variant === "rail"}
+    <span
+      class="absolute -top-1 -right-1 min-w-4 h-4 px-1 rounded-full bg-primary text-primary-content text-[10px] leading-4 font-medium text-center"
+      aria-label={spoken}
+    >
+      {label}
+    </span>
+  {:else}
+    <span
+      class="badge badge-sm badge-primary"
+      aria-label={spoken}
+    >
+      {label}
+    </span>
+  {/if}
+{/if}
diff --git a/app/web_ui/src/lib/components/SidebarJobsBadge.test.ts b/app/web_ui/src/lib/components/SidebarJobsBadge.test.ts
new file mode 100644
index 000000000..7873285ec
--- /dev/null
+++ b/app/web_ui/src/lib/components/SidebarJobsBadge.test.ts
@@ -0,0 +1,40 @@
+// @vitest-environment jsdom
+import { describe, it, expect, vi } from "vitest"
+import { render } from "@testing-library/svelte"
+import { writable } from "svelte/store"
+
+vi.mock("$lib/api_client", () => ({
+  base_url: "http://localhost:8757",
+  client: {},
+}))
+
+vi.mock("$lib/stores", () => ({
+  ui_state: writable({ current_project_id: null }),
+}))
+
+const SidebarJobsBadge = (await import("./SidebarJobsBadge.svelte")).default
+
+describe("SidebarJobsBadge", () => {
+  it("renders the count when greater than zero", () => {
+    const { getByText } = render(SidebarJobsBadge, { props: { count: 3 } })
+    expect(getByText("3")).not.toBeNull()
+  })
+
+  it("renders nothing when count is zero", () => {
+    const { container } = render(SidebarJobsBadge, { props: { count: 0 } })
+    expect(container.textContent?.trim()).toBe("")
+  })
+
+  it("caps the displayed count at 99+", () => {
+    const { getByText } = render(SidebarJobsBadge, { props: { count: 150 } })
+    expect(getByText("99+")).not.toBeNull()
+  })
+
+  it("uses the rail variant styling when requested", () => {
+    const { container } = render(SidebarJobsBadge, {
+      props: { count: 2, variant: "rail" },
+    })
+    const span = container.querySelector("span")
+    expect(span?.className).toContain("absolute")
+  })
+})
diff --git a/app/web_ui/src/lib/stores/job_status.test.ts b/app/web_ui/src/lib/stores/job_status.test.ts
new file mode 100644
index 000000000..4e6f91ce4
--- /dev/null
+++ b/app/web_ui/src/lib/stores/job_status.test.ts
@@ -0,0 +1,128 @@
+import { describe, it, expect } from "vitest"
+import {
+  available_actions,
+  is_active,
+  is_terminal,
+  job_status_badge_class,
+  job_status_display,
+  progress_label,
+  progress_percent,
+} from "./job_status"
+import type { BackgroundJobStatus, JobRecord } from "./jobs_api"
+
+function makeJob(overrides: Partial<JobRecord> = {}): JobRecord {
+  const defaults: JobRecord = {
+    id: "j_1",
+    type: "noop",
+    status: "running",
+    supports_pause: false,
+  }
+  return { ...defaults, ...overrides }
+}
+
+describe("is_active / is_terminal", () => {
+  it("treats pending, running, paused as active", () => {
+    expect(is_active("pending")).toBe(true)
+    expect(is_active("running")).toBe(true)
+    expect(is_active("paused")).toBe(true)
+  })
+
+  it("treats terminal statuses as not active", () => {
+    expect(is_active("succeeded")).toBe(false)
+    expect(is_active("failed")).toBe(false)
+    expect(is_active("cancelled")).toBe(false)
+  })
+
+  it("identifies terminal statuses", () => {
+    expect(is_terminal("succeeded")).toBe(true)
+    expect(is_terminal("failed")).toBe(true)
+    expect(is_terminal("cancelled")).toBe(true)
+    expect(is_terminal("running")).toBe(false)
+  })
+})
+
+describe("available_actions", () => {
+  it("running without pause support: cancel only", () => {
+    expect(available_actions(makeJob({ status: "running" }))).toEqual([
+      "cancel",
+    ])
+  })
+
+  it("running with pause support: pause then cancel", () => {
+    expect(
+      available_actions(makeJob({ status: "running", supports_pause: true })),
+    ).toEqual(["pause", "cancel"])
+  })
+
+  it("paused: resume and cancel", () => {
+    expect(
+      available_actions(makeJob({ status: "paused", supports_pause: true })),
+    ).toEqual(["resume", "cancel"])
+  })
+
+  it("pending: cancel only", () => {
+    expect(available_actions(makeJob({ status: "pending" }))).toEqual([
+      "cancel",
+    ])
+  })
+
+  it("terminal states: delete only", () => {
+    for (const status of [
+      "succeeded",
+      "failed",
+      "cancelled",
+    ] as BackgroundJobStatus[]) {
+      expect(available_actions(makeJob({ status }))).toEqual(["delete"])
+    }
+  })
+})
+
+describe("job_status_display / job_status_badge_class", () => {
+  const cases: [BackgroundJobStatus, string, string][] = [
+    ["pending", "Pending", "badge-ghost"],
+    ["running", "Running", "badge-info"],
+    ["paused", "Paused", "badge-warning"],
+    ["succeeded", "Succeeded", "badge-success"],
+    ["failed", "Failed", "badge-error"],
+    ["cancelled", "Cancelled", "badge-ghost"],
+  ]
+  it.each(cases)("maps %s", (status, label, badge) => {
+    expect(job_status_display(status)).toBe(label)
+    expect(job_status_badge_class(status)).toBe(badge)
+  })
+})
+
+describe("progress_label", () => {
+  it("shows count only when total is null", () => {
+    expect(progress_label({ success: 3, error: 0 })).toBe("3")
+  })
+
+  it("shows success / total", () => {
+    expect(progress_label({ success: 3, error: 0, total: 10 })).toBe("3 / 10")
+  })
+
+  it("appends errored count when present", () => {
+    expect(progress_label({ success: 3, error: 2, total: 10 })).toBe(
+      "3 / 10 (2 errored)",
+    )
+  })
+
+  it("handles undefined progress", () => {
+    expect(progress_label(undefined)).toBe("0")
+  })
+})
+
+describe("progress_percent", () => {
+  it("returns 0 when total is null or zero", () => {
+    expect(progress_percent({ success: 1, error: 0 })).toBe(0)
+    expect(progress_percent({ success: 1, error: 0, total: 0 })).toBe(0)
+  })
+
+  it("computes processed / total as a percent", () => {
+    expect(progress_percent({ success: 2, error: 1, total: 10 })).toBe(30)
+  })
+
+  it("returns 100 when complete", () => {
+    expect(progress_percent({ success: 8, error: 2, total: 10 })).toBe(100)
+  })
+})
diff --git a/app/web_ui/src/lib/stores/job_status.ts b/app/web_ui/src/lib/stores/job_status.ts
new file mode 100644
index 000000000..9d6cdfd7c
--- /dev/null
+++ b/app/web_ui/src/lib/stores/job_status.ts
@@ -0,0 +1,109 @@
+import type { BackgroundJobStatus, JobProgress, JobRecord } from "./jobs_api"
+
+export const ACTIVE_STATUSES: readonly BackgroundJobStatus[] = [
+  "pending",
+  "running",
+  "paused",
+] // statuses for which is_active() returns true
+
+export const TERMINAL_STATUSES: readonly BackgroundJobStatus[] = [
+  "succeeded",
+  "failed",
+  "cancelled",
+] // statuses for which is_terminal() returns true
+
+export function is_active(status: BackgroundJobStatus): boolean {
+  return ACTIVE_STATUSES.some((candidate) => candidate === status)
+}
+
+export function is_terminal(status: BackgroundJobStatus): boolean {
+  return TERMINAL_STATUSES.some((candidate) => candidate === status)
+}
+
+// Human-readable label for a job status.
+export function job_status_display(status: BackgroundJobStatus): string {
+  // Typing the table as a full Record keeps the compile-time exhaustiveness
+  // check: a status added to the union without a label is a type error.
+  const labels: Record<BackgroundJobStatus, string> = {
+    pending: "Pending",
+    running: "Running",
+    paused: "Paused",
+    succeeded: "Succeeded",
+    failed: "Failed",
+    cancelled: "Cancelled",
+  }
+  // Own-property lookup only, so an unexpected runtime value (even one that
+  // names an Object.prototype member) is returned unchanged rather than
+  // resolved through the prototype chain.
+  if (Object.prototype.hasOwnProperty.call(labels, status)) {
+    return labels[status]
+  }
+  return status
+}
+
+// Badge modifier CSS class for a job status.
+export function job_status_badge_class(status: BackgroundJobStatus): string {
+  switch (status) {
+    case "pending":
+    case "cancelled":
+      return "badge-ghost"
+    case "running":
+      return "badge-info"
+    case "paused":
+      return "badge-warning"
+    case "succeeded":
+      return "badge-success"
+    case "failed":
+      return "badge-error"
+    default: {
+      const unreachable: never = status
+      return unreachable
+    }
+  }
+}
+
+export type JobAction = "pause" | "resume" | "cancel" | "delete"
+
+// Lifecycle actions a job accepts right now, derived from its status and from
+// whether its worker supports pause. Mirrors the state machine
+// (functional_spec §3) and the delete policy (architecture open item #7:
+// delete only on terminal state).
+export function available_actions(job: JobRecord): JobAction[] {
+  if (job.status === "running") {
+    // Pause, when the worker supports it, is listed ahead of cancel.
+    return job.supports_pause ? ["pause", "cancel"] : ["cancel"]
+  }
+  if (job.status === "paused") {
+    return ["resume", "cancel"]
+  }
+  if (job.status === "pending") {
+    return ["cancel"]
+  }
+  if (
+    job.status === "succeeded" ||
+    job.status === "failed" ||
+    job.status === "cancelled"
+  ) {
+    return ["delete"]
+  }
+  // All union members are handled above; this only type-checks exhaustiveness.
+  const unreachable: never = job.status
+  return unreachable
+}
+
+// "<success>" or "<success> / <total>", plus " (<n> errored)" when n > 0.
+export function progress_label(progress: JobProgress | undefined): string {
+  const { success, error, total } = progress ?? ({} as Partial<JobProgress>)
+  const done = success ?? 0
+  const head = total == null ? `${done}` : `${done} / ${total}`
+  return (error ?? 0) > 0 ? `${head} (${error} errored)` : head
+}
+
+// Share of items processed (succeeded + errored) as a whole percent, 0-100.
+// Yields 0 while the total is missing, zero or negative.
+export function progress_percent(progress: JobProgress | undefined): number {
+  const total = progress?.total
+  const handled = (progress?.success ?? 0) + (progress?.error ?? 0)
+  const percent = total && total > 0 ? Math.round((handled / total) * 100) : 0
+  return Math.min(100, Math.max(0, percent))
+}
diff --git a/app/web_ui/src/lib/stores/jobs_api.test.ts b/app/web_ui/src/lib/stores/jobs_api.test.ts
new file mode 100644
index 000000000..84770438c
--- /dev/null
+++ b/app/web_ui/src/lib/stores/jobs_api.test.ts
@@ -0,0 +1,150 @@
+import { describe, it, expect, vi, beforeEach } from "vitest"
+import { client } from "$lib/api_client"
+import {
+  cancel_job,
+  create_job,
+  delete_job,
+  get_job,
+  get_job_errors,
+  get_job_result,
+  list_jobs,
+  pause_job,
+  resume_job,
+} from "./jobs_api"
+
+vi.mock("$lib/api_client", () => ({
+  client: {
+    GET: vi.fn(),
+    POST: vi.fn(),
+    DELETE: vi.fn(),
+  },
+  base_url: "http://localhost:8757",
+}))
+
+const mockGET = client.GET as unknown as ReturnType<typeof vi.fn>
+const mockPOST = client.POST as unknown as ReturnType<typeof vi.fn>
+const mockDELETE = client.DELETE as unknown as ReturnType<typeof vi.fn>
+
+describe("jobs_api", () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+  })
+
+  it("list_jobs calls GET /api/jobs with the query and returns data", async () => {
+    mockGET.mockResolvedValue({ data: [{ id: "j_1" }], error: undefined })
+    const result = await list_jobs({ project_id: "p_1", status: "running" })
+    expect(mockGET).toHaveBeenCalledWith("/api/jobs", {
+      params: { query: { project_id: "p_1", status: "running" } },
+    })
+    expect(result).toEqual([{ id: "j_1" }])
+  })
+
+  it("list_jobs throws when the client returns an error", async () => {
+    mockGET.mockResolvedValue({ data: undefined, error: { detail: "boom" } })
+    await expect(list_jobs()).rejects.toEqual({ detail: "boom" })
+  })
+
+  it("get_job calls GET /api/jobs/{id}", async () => {
+    mockGET.mockResolvedValue({ data: { id: "j_2" }, error: undefined })
+    const result = await get_job("j_2")
+    expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}", {
+      params: { path: { id: "j_2" } },
+    })
+    expect(result).toEqual({ id: "j_2" })
+  })
+
+  it("create_job calls POST /api/jobs/{type} with params and metadata", async () => {
+    mockPOST.mockResolvedValue({
+      data: { job_id: "j_3", status: "pending" },
+      error: undefined,
+    })
+    const result = await create_job("eval", { eval_id: "e_1" }, { src: "ui" })
+    expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{type}", {
+      params: { path: { type: "eval" } },
+      body: {
+        params: { eval_id: "e_1" },
+        metadata: { src: "ui" },
+        project_id: null,
+      },
+    })
+    expect(result).toEqual({ job_id: "j_3", status: "pending" })
+  })
+
+  it("create_job passes an explicit project_id in the body", async () => {
+    mockPOST.mockResolvedValue({
+      data: { job_id: "j_3b", status: "pending" },
+      error: undefined,
+    })
+    await create_job("noop", { steps: 5 }, null, "p_current")
+    expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{type}", {
+      params: { path: { type: "noop" } },
+      body: { params: { steps: 5 }, metadata: null, project_id: "p_current" },
+    })
+  })
+
+  it("get_job_result calls GET /api/jobs/{id}/result", async () => {
+    mockGET.mockResolvedValue({ data: { total: 5 }, error: undefined })
+    const result = await get_job_result("j_4")
+    expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}/result", {
+      params: { path: { id: "j_4" } },
+    })
+    expect(result).toEqual({ total: 5 })
+  })
+
+  it("get_job_errors calls GET /api/jobs/{id}/errors with optional run_id", async () => {
+    mockGET.mockResolvedValue({
+      data: [{ error_message: "oops" }],
+      error: undefined,
+    })
+    const result = await get_job_errors("j_5", "run_xyz")
+    expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}/errors", {
+      params: { path: { id: "j_5" }, query: { run_id: "run_xyz" } },
+    })
+    expect(result).toEqual([{ error_message: "oops" }])
+  })
+
+  it("get_job_errors omits run_id query when not provided", async () => {
+    mockGET.mockResolvedValue({ data: [], error: undefined })
+    await get_job_errors("j_6")
+    expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}/errors", {
+      params: { path: { id: "j_6" }, query: {} },
+    })
+  })
+
+  it("pause_job calls POST /api/jobs/{id}/pause", async () => {
+    mockPOST.mockResolvedValue({ data: undefined, error: undefined })
+    await pause_job("j_7")
+    expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{id}/pause", {
+      params: { path: { id: "j_7" } },
+    })
+  })
+
+  it("resume_job calls POST /api/jobs/{id}/resume", async () => {
+    mockPOST.mockResolvedValue({ data: undefined, error: undefined })
+    await resume_job("j_8")
+    expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{id}/resume", {
+      params: { path: { id: "j_8" } },
+    })
+  })
+
+  it("cancel_job calls POST /api/jobs/{id}/cancel", async () => {
+    mockPOST.mockResolvedValue({ data: undefined, error: undefined })
+    await cancel_job("j_9")
+    expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{id}/cancel", {
+      params: { path: { id: "j_9" } },
+    })
+  })
+
+  it("delete_job calls DELETE /api/jobs/{id}", async () => {
+    mockDELETE.mockResolvedValue({ data: undefined, error: undefined })
+    await delete_job("j_10")
+    expect(mockDELETE).toHaveBeenCalledWith("/api/jobs/{id}", {
+      params: { path: { id: "j_10" } },
+    })
+  })
+
+  it("lifecycle calls throw on client error", async () => {
+    mockPOST.mockResolvedValue({ data: undefined, error: { detail: "409" } })
+    await expect(cancel_job("j_11")).rejects.toEqual({ detail: "409" })
+  })
+})
diff --git a/app/web_ui/src/lib/stores/jobs_api.ts b/app/web_ui/src/lib/stores/jobs_api.ts
new file mode 100644
index 000000000..d05993011
--- /dev/null
+++ b/app/web_ui/src/lib/stores/jobs_api.ts
@@ -0,0 +1,118 @@
+import { client } from "$lib/api_client"
+import type { components } from "$lib/api_schema"
+
+export type JobRecord = components["schemas"]["JobRecord"]
+export type JobProgress = components["schemas"]["JobProgress"]
+export type JobError = components["schemas"]["JobError"]
+export type BackgroundJobStatus = components["schemas"]["BackgroundJobStatus"]
+
+export type JobErrorEntry = {
+  error_message?: string
+} & Record<string, unknown>
+
+export type ListJobsQuery = {
+  status?: BackgroundJobStatus
+  type?: string
+  project_id?: string
+  since?: string
+  limit?: number
+}
+
+// Lists jobs, optionally narrowed by the filters in `query` (sent as the
+// request's query parameters). Rejects with the client's error payload.
+export async function list_jobs(
+  query: ListJobsQuery = {},
+): Promise<JobRecord[]> {
+  const res = await client.GET("/api/jobs", { params: { query } })
+  if (res.error) {
+    throw res.error
+  }
+  return res.data
+}
+
+// Fetches a single job record by id; rejects with the client's error payload.
+export async function get_job(id: string): Promise<JobRecord> {
+  // NOTE: the raw `error` value is thrown as-is, not wrapped in an Error.
+  const res = await client.GET("/api/jobs/{id}", { params: { path: { id } } })
+  if (res.error) {
+    throw res.error
+  }
+  return res.data
+}
+
+// Creates a `type` job from the given body fields; rejects on a client error.
+export async function create_job(
+  type: string,
+  params: Record<string, unknown> = {},
+  metadata: Record<string, unknown> | null = null,
+  project_id: string | null = null,
+): Promise<components["schemas"]["CreateJobResponse"]> {
+  const body = { params, metadata, project_id }
+  const path = { type }
+  const res = await client.POST("/api/jobs/{type}", { params: { path }, body })
+  if (res.error) {
+    throw res.error
+  }
+  return res.data
+}
+
+// Fetches the result payload of job `id`; rejects with the client's error.
+export async function get_job_result(
+  id: string,
+): Promise<Record<string, unknown>> {
+  const params = { path: { id } }
+  const res = await client.GET("/api/jobs/{id}/result", { params })
+  if (res.error) {
+    throw res.error
+  }
+  return res.data
+}
+
+// Fetches job `id`'s error entries; a truthy `run_id` is sent as a query filter.
+export async function get_job_errors(
+  id: string,
+  run_id?: string,
+): Promise<JobErrorEntry[]> {
+  const params = { path: { id }, query: run_id ? { run_id } : {} }
+  const res = await client.GET("/api/jobs/{id}/errors", { params })
+  if (res.error) {
+    throw res.error
+  }
+  return res.data as JobErrorEntry[]
+}
+
+// Requests a pause of job `id`; rejects with the client's error payload.
+export async function pause_job(id: string): Promise<void> {
+  const params = { path: { id } }
+  const res = await client.POST("/api/jobs/{id}/pause", { params })
+  if (res.error) {
+    throw res.error
+  }
+}
+
+// Requests a resume of job `id`; rejects with the client's error payload.
+export async function resume_job(id: string): Promise<void> {
+  const params = { path: { id } }
+  const res = await client.POST("/api/jobs/{id}/resume", { params })
+  if (res.error) {
+    throw res.error
+  }
+}
+
+// Requests cancellation of job `id`; rejects with the client's error payload.
+export async function cancel_job(id: string): Promise<void> {
+  const params = { path: { id } }
+  const res = await client.POST("/api/jobs/{id}/cancel", { params })
+  if (res.error) {
+    throw res.error
+  }
+}
+
+// Requests deletion of job `id`; rejects with the client's error payload.
+export async function delete_job(id: string): Promise<void> {
+  const params = { path: { id } }
+  const res = await client.DELETE("/api/jobs/{id}", { params })
+  if (res.error) {
+    throw res.error
+  }
+}
diff --git a/app/web_ui/src/lib/stores/jobs_store.test.ts b/app/web_ui/src/lib/stores/jobs_store.test.ts
new file mode 100644
index 000000000..2eb1d5def
--- /dev/null
+++ b/app/web_ui/src/lib/stores/jobs_store.test.ts
@@ -0,0 +1,305 @@
+// @vitest-environment jsdom
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"
+import { get, writable } from "svelte/store"
+import type { JobRecord } from "./jobs_api"
+
+// ui_state drives the project filter. Provide a real writable so we can flip
+// the current project mid-test.
+const ui_state = writable<{ current_project_id: string | null }>({
+  current_project_id: null,
+})
+
+vi.mock("$lib/api_client", () => ({
+  base_url: "http://localhost:8757",
+  client: {},
+}))
+
+vi.mock("$lib/stores", () => ({
+  ui_state,
+}))
+
+// Spy on every mutation entry point. The store is a pure observer: it must
+// never call any of these. We assert that explicitly on teardown below.
+const mutationSpies = {
+  pause_job: vi.fn(),
+  resume_job: vi.fn(),
+  cancel_job: vi.fn(),
+  delete_job: vi.fn(),
+  create_job: vi.fn(),
+}
+vi.mock("./jobs_api", () => mutationSpies)
+
+// A controllable fake EventSource installed on globalThis. Records construction
+// URLs and close() calls so tests can assert the pure-observer / reconnect
+// behavior without a real network connection.
+type Listener = (event: MessageEvent) => void
+
+class FakeEventSource {
+  static instances: FakeEventSource[] = [] // every instance, in creation order
+  url: string
+  closed = false // flipped by close()
+  onerror: ((this: EventSource, ev: Event) => void) | null = null
+  private listeners: Record<string, Listener[]> = {} // keyed by event type
+
+  constructor(url: string) {
+    this.url = url
+    FakeEventSource.instances.push(this)
+  }
+
+  addEventListener(type: string, listener: Listener) {
+    ;(this.listeners[type] ||= []).push(listener)
+  }
+
+  close() {
+    this.closed = true
+  }
+
+  emit(type: string, data: unknown) {
+    const event = { data: JSON.stringify(data) } as MessageEvent // data only
+    for (const listener of this.listeners[type] || []) {
+      listener(event)
+    }
+  }
+
+  fail() {
+    this.onerror?.call(this as unknown as EventSource, new Event("error"))
+  }
+
+  static latest(): FakeEventSource {
+    return FakeEventSource.instances[FakeEventSource.instances.length - 1]
+  }
+
+  static reset() {
+    FakeEventSource.instances = []
+  }
+}
+
+function makeJob(overrides: Partial<JobRecord> = {}): JobRecord {
+  const defaults: JobRecord = {
+    id: "j_1",
+    type: "noop",
+    status: "running",
+    supports_pause: true,
+    created_at: "2026-05-28T12:00:00Z",
+  }
+  return { ...defaults, ...overrides }
+}
+
+// Import the module fresh per test so the ref-counted connection and the
+// module-level ui_state subscription start clean.
+async function loadStore() {
+  vi.resetModules()
+  ui_state.set({ current_project_id: null }) // start every test with no project
+  FakeEventSource.reset() // forget EventSources opened by earlier tests
+  return await import("./jobs_store")
+}
+
+describe("jobs_store", () => {
+  beforeEach(() => {
+    vi.useFakeTimers()
+    // @ts-expect-error install fake on global
+    globalThis.EventSource = FakeEventSource
+    for (const spy of Object.values(mutationSpies)) {
+      spy.mockClear()
+    }
+  })
+
+  afterEach(() => {
+    vi.useRealTimers()
+    vi.restoreAllMocks()
+  })
+
+  it("snapshot replaces the whole map", async () => {
+    const { jobs } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    const source = FakeEventSource.latest()
+
+    source.emit("snapshot", {
+      jobs: [makeJob({ id: "j_1" }), makeJob({ id: "j_2" })],
+    })
+    expect(
+      get(jobs)
+        .map((j) => j.id)
+        .sort(),
+    ).toEqual(["j_1", "j_2"])
+
+    // A second snapshot fully replaces the prior contents.
+    source.emit("snapshot", { jobs: [makeJob({ id: "j_3" })] })
+    expect(get(jobs).map((j) => j.id)).toEqual(["j_3"])
+    unsub()
+  })
+
+  it("job event inserts a new job", async () => {
+    const { jobs } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    const source = FakeEventSource.latest()
+    source.emit("snapshot", { jobs: [] })
+    source.emit("job", makeJob({ id: "j_new" }))
+    expect(get(jobs).map((j) => j.id)).toEqual(["j_new"])
+    unsub()
+  })
+
+  it("job event upserts status + progress for an existing job", async () => {
+    const { jobs } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    const source = FakeEventSource.latest()
+    source.emit("snapshot", {
+      jobs: [
+        makeJob({
+          id: "j_1",
+          status: "running",
+          progress: { success: 1, error: 0, total: 10 },
+        }),
+      ],
+    })
+    source.emit(
+      "job",
+      makeJob({
+        id: "j_1",
+        status: "succeeded",
+        progress: { success: 10, error: 0, total: 10 },
+      }),
+    )
+    const job = get(jobs)[0]
+    expect(job.status).toBe("succeeded")
+    expect(job.progress?.success).toBe(10)
+    unsub()
+  })
+
+  it("deleted event removes a job; unknown id is a no-op", async () => {
+    const { jobs } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    const source = FakeEventSource.latest()
+    source.emit("snapshot", {
+      jobs: [makeJob({ id: "j_1" }), makeJob({ id: "j_2" })],
+    })
+    source.emit("deleted", { id: "j_1" })
+    expect(get(jobs).map((j) => j.id)).toEqual(["j_2"])
+    source.emit("deleted", { id: "does_not_exist" })
+    expect(get(jobs).map((j) => j.id)).toEqual(["j_2"])
+    unsub()
+  })
+
+  it("reconnects on error and re-syncs from the fresh snapshot", async () => {
+    const { jobs } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    const first = FakeEventSource.latest()
+    first.emit("snapshot", { jobs: [makeJob({ id: "stale" })] })
+    expect(get(jobs).map((j) => j.id)).toEqual(["stale"])
+
+    first.fail()
+    expect(first.closed).toBe(true)
+
+    // After the backoff a new EventSource is constructed.
+    vi.advanceTimersByTime(2000)
+    expect(FakeEventSource.instances.length).toBe(2)
+    const second = FakeEventSource.latest()
+    expect(second).not.toBe(first)
+
+    second.emit("snapshot", { jobs: [makeJob({ id: "fresh" })] })
+    expect(get(jobs).map((j) => j.id)).toEqual(["fresh"])
+    unsub()
+  })
+
+  it("active_jobs_count counts only pending/running/paused", async () => {
+    const { jobs, active_jobs_count } = await loadStore()
+    const unsubJobs = jobs.subscribe(() => {})
+    const unsub = active_jobs_count.subscribe(() => {})
+    const source = FakeEventSource.latest()
+    source.emit("snapshot", {
+      jobs: [
+        makeJob({ id: "a", status: "pending" }),
+        makeJob({ id: "b", status: "running" }),
+        makeJob({ id: "c", status: "paused" }),
+        makeJob({ id: "d", status: "succeeded" }),
+        makeJob({ id: "e", status: "failed" }),
+      ],
+    })
+    expect(get(active_jobs_count)).toBe(3)
+    unsub()
+    unsubJobs()
+  })
+
+  it("closes the EventSource when the last subscriber unsubscribes (pure observer)", async () => {
+    const { jobs } = await loadStore()
+    const unsub1 = jobs.subscribe(() => {})
+    const unsub2 = jobs.subscribe(() => {})
+    const source = FakeEventSource.latest()
+    // Only one EventSource is opened regardless of subscriber count.
+    expect(FakeEventSource.instances.length).toBe(1)
+
+    unsub1()
+    expect(source.closed).toBe(false)
+    unsub2()
+    expect(source.closed).toBe(true)
+  })
+
+  it("opens with the project filter and re-opens when the project changes", async () => {
+    const { jobs } = await loadStore()
+    ui_state.set({ current_project_id: "p_1" })
+    const unsub = jobs.subscribe(() => {})
+    const first = FakeEventSource.latest()
+    expect(first.url).toContain("project_id=p_1")
+
+    ui_state.set({ current_project_id: "p_2" })
+    expect(first.closed).toBe(true)
+    const second = FakeEventSource.latest()
+    expect(second).not.toBe(first)
+    expect(second.url).toContain("project_id=p_2")
+    unsub()
+  })
+
+  it("ignores ui_state changes that don't touch current_project_id", async () => {
+    const { jobs } = await loadStore()
+    ui_state.set({ current_project_id: "p_1" })
+    const unsub = jobs.subscribe(() => {})
+    const first = FakeEventSource.latest()
+    expect(FakeEventSource.instances.length).toBe(1)
+
+    // An unrelated ui_state update with the same project id must not re-open.
+    ui_state.set({ current_project_id: "p_1", other: "x" } as {
+      current_project_id: string | null
+    })
+    expect(FakeEventSource.instances.length).toBe(1)
+    expect(first.closed).toBe(false)
+    unsub()
+  })
+
+  it("reports an errored connection when the stream fails before syncing", async () => {
+    const { jobs, connection } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    expect(get(connection)).toBe("connecting")
+
+    FakeEventSource.latest().fail()
+    expect(get(connection)).toBe("errored")
+    unsub()
+  })
+
+  it("connection becomes open once a snapshot arrives", async () => {
+    const { jobs, connection } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    FakeEventSource.latest().emit("snapshot", { jobs: [] })
+    expect(get(connection)).toBe("open")
+    unsub()
+  })
+
+  it("never calls a mutation endpoint (pure observer) across its full lifecycle", async () => {
+    const { jobs } = await loadStore()
+    const unsub = jobs.subscribe(() => {})
+    const source = FakeEventSource.latest()
+
+    // Drive every observable path: snapshot, job upsert, deletion, an error +
+    // reconnect, a project switch, and finally teardown.
+    source.emit("snapshot", { jobs: [makeJob({ id: "j_1" })] })
+    source.emit("job", makeJob({ id: "j_1", status: "succeeded" }))
+    source.emit("deleted", { id: "j_1" })
+    source.fail()
+    vi.advanceTimersByTime(2000)
+    ui_state.set({ current_project_id: "p_switch" })
+    unsub()
+
+    for (const spy of Object.values(mutationSpies)) {
+      expect(spy).not.toHaveBeenCalled()
+    }
+  })
+})
diff --git a/app/web_ui/src/lib/stores/jobs_store.ts b/app/web_ui/src/lib/stores/jobs_store.ts
new file mode 100644
index 000000000..1718f95bf
--- /dev/null
+++ b/app/web_ui/src/lib/stores/jobs_store.ts
@@ -0,0 +1,244 @@
+import { derived, get, writable, type Readable } from "svelte/store"
+import { base_url } from "$lib/api_client"
+import { ui_state } from "$lib/stores"
+import type { JobRecord } from "./jobs_api"
+import { is_active } from "./job_status"
+
+const RECONNECT_DELAY_MS = 2000
+
+type JobsMap = Map<string, JobRecord>
+
+// Connection state surfaced to the UI so the panel can distinguish "still
+// connecting" from "can't connect". Stays a pure observer: this only reports
+// the EventSource lifecycle, it never triggers a job mutation.
+export type JobsConnection = "idle" | "connecting" | "open" | "errored"
+
+function createJobsStore() {
+  const jobs_map = writable<JobsMap>(new Map())
+
+  // True once the first `snapshot` event for the current connection has been
+  // processed. Lets the panel show a loading state until the stream syncs.
+  const synced = writable(false)
+
+  // Lifecycle of the underlying EventSource. The panel pairs this with `synced`
+  // to show a "can't connect / retrying" affordance instead of spinning forever
+  // when the stream errors before its first snapshot.
+  const connection = writable<JobsConnection>("idle")
+
+  let event_source: EventSource | null = null
+  let reconnect_timer: ReturnType<typeof setTimeout> | null = null
+  let subscriber_count = 0
+  let current_project_id: string | null = null
+
+  function build_url(): string {
+    const url = new URL(`${base_url}/api/jobs/events`)
+    if (current_project_id) {
+      url.searchParams.set("project_id", current_project_id)
+    }
+    return url.toString()
+  }
+
+  function upsert(record: JobRecord) {
+    jobs_map.update((map) => {
+      const next = new Map(map)
+      next.set(record.id, record)
+      return next
+    })
+  }
+
+  function remove(id: string) {
+    jobs_map.update((map) => {
+      if (!map.has(id)) {
+        return map
+      }
+      const next = new Map(map)
+      next.delete(id)
+      return next
+    })
+  }
+
+  // Swap the whole map for one rebuilt from a full snapshot, keyed by job id.
+  function replace_all(records: JobRecord[]) {
+    const entries = records.map(
+      (record): [string, JobRecord] => [record.id, record],
+    )
+    jobs_map.set(new Map(entries))
+  }
+
+  function handle_snapshot(event: MessageEvent) {
+    try {
+      const parsed = JSON.parse(event.data) as { jobs?: JobRecord[] }
+      replace_all(parsed.jobs ?? [])
+      synced.set(true)
+      connection.set("open")
+    } catch {
+      // Ignore malformed payloads; the next snapshot will re-sync.
+    }
+  }
+
+  function handle_job(event: MessageEvent) {
+    try {
+      const record = JSON.parse(event.data) as JobRecord | null
+      if (record?.id) upsert(record)
+    } catch {
+      // Bad JSON is ignored; id-less records were already skipped above.
+    }
+  }
+
+  function handle_deleted(event: MessageEvent) {
+    try {
+      const parsed = JSON.parse(event.data) as { id?: string }
+      if (parsed.id) {
+        remove(parsed.id)
+      }
+    } catch {
+      // Ignore malformed payloads.
+    }
+  }
+
+  function clear_reconnect() {
+    if (reconnect_timer !== null) {
+      clearTimeout(reconnect_timer)
+      reconnect_timer = null
+    }
+  }
+
+  function schedule_reconnect() {
+    if (reconnect_timer !== null || subscriber_count === 0) {
+      return
+    }
+    reconnect_timer = setTimeout(() => {
+      reconnect_timer = null
+      if (subscriber_count > 0) {
+        connect()
+      }
+    }, RECONNECT_DELAY_MS)
+  }
+
+  function close_source() {
+    if (event_source) {
+      event_source.close()
+      event_source = null
+    }
+  }
+
+  function connect() {
+    // Pure observer: opening or closing this stream never affects a job. A
+    // dropped connection is recovered by reconnecting; the fresh `snapshot`
+    // re-syncs the map (no Last-Event-ID needed).
+    const EventSourceCtor = globalThis.EventSource
+    if (!EventSourceCtor) {
+      return
+    }
+    close_source()
+    clear_reconnect()
+    synced.set(false)
+    connection.set("connecting")
+
+    const source = new EventSourceCtor(build_url())
+    event_source = source
+
+    source.addEventListener("snapshot", handle_snapshot as EventListener)
+    source.addEventListener("job", handle_job as EventListener)
+    source.addEventListener("deleted", handle_deleted as EventListener)
+    source.onerror = () => {
+      // Only reconnect if this is still the active source (avoids racing a
+      // teardown or a project switch).
+      if (event_source !== source) {
+        return
+      }
+      close_source()
+      connection.set("errored")
+      schedule_reconnect()
+    }
+  }
+
+  function disconnect() {
+    close_source()
+    clear_reconnect()
+    synced.set(false)
+    connection.set("idle")
+  }
+
+  // Re-open the stream against a new project filter (called by the ui_state
+  // subscription below; exposed for tests). The old project's jobs are dropped
+  // right away so a slow or failed reconnect can't keep showing them.
+  function set_project(project_id: string | null) {
+    if (project_id === current_project_id) return
+    current_project_id = project_id
+    jobs_map.set(new Map())
+    if (subscriber_count > 0) {
+      connect()
+    }
+  }
+
+  // Track the active project from UI state so the badge/panel stay scoped to
+  // the project the user is viewing. `ui_state` fires on any field change, so
+  // we react only when `current_project_id` actually differs from what we last
+  // saw — keeping rapid project switches correct (the old source is closed by
+  // `connect()` before the new one opens, so there's no leak).
+  current_project_id = get(ui_state).current_project_id ?? null
+  let last_seen_project_id = current_project_id
+  ui_state.subscribe((state) => {
+    const next = state.current_project_id ?? null
+    if (next === last_seen_project_id) {
+      return
+    }
+    last_seen_project_id = next
+    set_project(next)
+  })
+
+  const subscribe: Readable<JobsMap>["subscribe"] = (run, invalidate) => {
+    if (subscriber_count === 0) {
+      connect()
+    }
+    subscriber_count += 1
+    const unsubscribe = jobs_map.subscribe(run, invalidate)
+    return () => {
+      unsubscribe()
+      subscriber_count -= 1
+      if (subscriber_count <= 0) {
+        subscriber_count = 0
+        disconnect()
+      }
+    }
+  }
+
+  return {
+    subscribe,
+    synced: { subscribe: synced.subscribe } as Readable<boolean>,
+    connection: {
+      subscribe: connection.subscribe,
+    } as Readable<JobsConnection>,
+    set_project,
+    // Exposed for tests / explicit teardown; not part of normal usage.
+    _disconnect: disconnect,
+  }
+}
+
+export const jobs_store = createJobsStore()
+
+export const synced: Readable<boolean> = jobs_store.synced
+
+export const connection: Readable<JobsConnection> = jobs_store.connection
+
+// Jobs as an array sorted newest-first by `created_at`; a record without a
+// timestamp sorts as the epoch (0).
+export const jobs: Readable<JobRecord[]> = derived(jobs_store, ($map) => {
+  const created_at_ms = (job: JobRecord) =>
+    new Date(job.created_at ?? 0).getTime()
+  return [...$map.values()].sort((a, b) => created_at_ms(b) - created_at_ms(a))
+})
+
+// Number of jobs currently counted as active, per `is_active(job.status)`.
+// Recomputed from the jobs map on every store update.
+export const active_jobs_count: Readable<number> = derived(
+  jobs_store,
+  ($map) => {
+    let active = 0
+    $map.forEach((job) => {
+      active += is_active(job.status) ? 1 : 0
+    })
+    return active
+  },
+)
diff --git a/app/web_ui/src/lib/ui/icons/jobs_icon.svelte b/app/web_ui/src/lib/ui/icons/jobs_icon.svelte
new file mode 100644
index 000000000..065ddaac4
--- /dev/null
+++ b/app/web_ui/src/lib/ui/icons/jobs_icon.svelte
@@ -0,0 +1,23 @@
+<svg
+  class="w-full h-full"
+  viewBox="0 0 24 24"
+  fill="none"
+  xmlns="http://www.w3.org/2000/svg"
+>
+  <path
+    d="M2 14C2 11.1716 2 9.75736 2.87868 8.87868C3.75736 8 5.17157 8 8 8H16C18.8284 8 20.2426 8 21.1213 8.87868C22 9.75736 22 11.1716 22 14C22 16.8284 22 18.2426 21.1213 19.1213C20.2426 20 18.8284 20 16 20H8C5.17157 20 3.75736 20 2.87868 19.1213C2 18.2426 2 16.8284 2 14Z"
+    stroke="currentColor"
+    stroke-width="1.5"
+  />
+  <path
+    d="M16 8V7C16 5.11438 16 4.17157 15.4142 3.58579C14.8284 3 13.8856 3 12 3C10.1144 3 9.17157 3 8.58579 3.58579C8 4.17157 8 5.11438 8 7V8"
+    stroke="currentColor"
+    stroke-width="1.5"
+  />
+  <path
+    d="M2 13H22"
+    stroke="currentColor"
+    stroke-width="1.5"
+    stroke-linecap="round"
+  />
+</svg>
diff --git a/app/web_ui/src/lib/ui/section.ts b/app/web_ui/src/lib/ui/section.ts
index 0dd772847..3fbeccf63 100644
--- a/app/web_ui/src/lib/ui/section.ts
+++ b/app/web_ui/src/lib/ui/section.ts
@@ -12,5 +12,6 @@ export enum Section {
   Skills,
   Optimize,
   Assistant,
+  Jobs,
   None,
 }
diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index ac367455b..3c1c66e92 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -18,6 +18,8 @@
   import ToolsIcon from "$lib/ui/icons/tools_icon.svelte"
   import ChatBar from "./chat_bar.svelte"
   import ChatIcon from "$lib/ui/icons/chat_icon.svelte"
+  import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
+  import SidebarJobsBadge from "$lib/components/SidebarJobsBadge.svelte"
   import { Section } from "$lib/ui/section"
   import Dialog from "$lib/ui/dialog.svelte"
   import SidebarRail from "./sidebar_rail.svelte"
@@ -108,6 +110,8 @@
       section = Section.Specs
     } else if (path_start("/optimize", $page.url.pathname)) {
       section = Section.Optimize
+    } else if (path_start("/jobs", $page.url.pathname)) {
+      section = Section.Jobs
     } else if (path_start("/assistant", $page.url.pathname)) {
       section = Section.Assistant
     } else {
@@ -276,6 +280,16 @@
           >
         </li>
 
+        <li class="menu-sm">
+          <a href="/jobs" class={section == Section.Jobs ? "active" : ""}>
+            <div class="sidebar-icon">
+              <JobsIcon />
+            </div>
+            Jobs
+            <SidebarJobsBadge variant="inline" />
+          </a>
+        </li>
+
         <li class="menu-sm">
           <a
             href={`/optimize/${$ui_state.current_project_id}/${$ui_state.current_task_id}`}
diff --git a/app/web_ui/src/routes/(app)/jobs/+page.svelte b/app/web_ui/src/routes/(app)/jobs/+page.svelte
new file mode 100644
index 000000000..bbfd10191
--- /dev/null
+++ b/app/web_ui/src/routes/(app)/jobs/+page.svelte
@@ -0,0 +1,362 @@
+<script lang="ts">
+  import AppPage from "../app_page.svelte"
+  import Dialog from "$lib/ui/dialog.svelte"
+  import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
+  import { jobs, synced, connection } from "$lib/stores/jobs_store"
+  import {
+    available_actions,
+    is_terminal,
+    job_status_badge_class,
+    job_status_display,
+    progress_label,
+    progress_percent,
+    type JobAction,
+  } from "$lib/stores/job_status"
+  import {
+    cancel_job,
+    create_job,
+    delete_job,
+    get_job_errors,
+    get_job_result,
+    pause_job,
+    resume_job,
+    type JobError,
+    type JobErrorEntry,
+    type JobRecord,
+  } from "$lib/stores/jobs_api"
+  import { formatDate } from "$lib/utils/formatters"
+  import { KilnError, createKilnError } from "$lib/utils/error_handlers"
+  import { capitalize } from "$lib/utils/formatters"
+  import { agentInfo } from "$lib/agent"
+  import { ui_state } from "$lib/stores"
+
+  agentInfo.set({
+    name: "Background Jobs",
+    description:
+      "Background job panel. Lists jobs (evals and others) with status, progress, and lifecycle controls.",
+  })
+
+  let action_error: KilnError | null = null
+  let in_flight: Record<string, boolean> = {}
+  let creating_test_job = false
+
+  // Kicks off a no-op job: a simulated long-running task (sleeps per step,
+  // streams progress, logs a couple of non-fatal errors) for exercising the
+  // panel end-to-end. The new job appears via the SSE stream — no local mutation.
+  async function start_test_job() {
+    action_error = null
+    creating_test_job = true
+    try {
+      await create_job(
+        "noop",
+        {
+          steps: 20,
+          sleep_per_step_seconds: 1,
+          error_at_steps: [4, 12],
+        },
+        null,
+        $ui_state.current_project_id,
+      )
+    } catch (e) {
+      action_error = createKilnError(e)
+    } finally {
+      creating_test_job = false
+    }
+  }
+
+  $: action_buttons = [
+    {
+      label: creating_test_job ? "Starting…" : "Start test job",
+      handler: start_test_job,
+      primary: true,
+      loading: creating_test_job,
+      disabled: creating_test_job,
+    },
+  ]
+
+  const action_runners: Record<JobAction, (id: string) => Promise<void>> = {
+    pause: pause_job,
+    resume: resume_job,
+    cancel: cancel_job,
+    delete: delete_job,
+  }
+
+  const action_labels: Record<JobAction, string> = {
+    pause: "Pause",
+    resume: "Resume",
+    cancel: "Cancel",
+    delete: "Delete",
+  }
+
+  async function run_action(action: JobAction, id: string) {
+    action_error = null
+    in_flight = { ...in_flight, [id]: true }
+    try {
+      await action_runners[action](id)
+      // The SSE stream reflects the resulting transition; no local mutation.
+    } catch (e) {
+      action_error = createKilnError(e)
+    } finally {
+      in_flight = { ...in_flight, [id]: false }
+    }
+  }
+
+  // Human-readable label for a job type: "noop" has a hand-written label,
+  // every other type goes through `capitalize`.
+  function job_type_display(type: string): string {
+    const is_noop = type === "noop"
+    return is_noop ? "No-op" : capitalize(type)
+  }
+
+  function has_errors(job: JobRecord): boolean {
+    return (job.progress?.error ?? 0) > 0 || job.status === "failed"
+  }
+
+  // Only show a result once the job is in a terminal state — a non-null
+  // `result` mid-run would be partial and misleading.
+  function has_result(job: JobRecord): boolean {
+    return is_terminal(job.status) && job.result != null
+  }
+
+  // Surface the record's failure summary inline for failed jobs.
+  function failure_error(job: JobRecord): JobError | null {
+    return job.status === "failed" ? job.error ?? null : null
+  }
+
+  // Errors dialog state
+  let errors_dialog: Dialog
+  let errors_loading = false
+  let errors_load_error: KilnError | null = null
+  let error_entries: JobErrorEntry[] = []
+  let errors_summary: JobError | null = null
+
+  async function open_errors(job: JobRecord) {
+    error_entries = []
+    errors_load_error = null
+    errors_summary = failure_error(job)
+    errors_loading = true
+    errors_dialog?.show()
+    try {
+      error_entries = await get_job_errors(job.id)
+    } catch (e) {
+      errors_load_error = createKilnError(e)
+    } finally {
+      errors_loading = false
+    }
+  }
+
+  // Result dialog state
+  let result_dialog: Dialog
+  let result_loading = false
+  let result_load_error: KilnError | null = null
+  let result_data: Record<string, unknown> | null = null
+
+  async function open_result(job: JobRecord) {
+    result_data = null
+    result_load_error = null
+    result_loading = true
+    result_dialog?.show()
+    try {
+      result_data = await get_job_result(job.id)
+    } catch (e) {
+      result_load_error = createKilnError(e)
+    } finally {
+      result_loading = false
+    }
+  }
+</script>
+
+<AppPage
+  title="Jobs"
+  subtitle="Background work for the current project."
+  sub_subtitle="Jobs keep running even if you navigate away or close this panel."
+  {action_buttons}
+>
+  {#if action_error}
+    <div role="alert" class="alert alert-error text-sm mb-4">
+      <span>{action_error.getMessage() || "An action failed."}</span>
+    </div>
+  {/if}
+
+  {#if !$synced && $connection === "errored"}
+    <div
+      class="flex flex-col items-center justify-center min-h-[50vh] text-center max-w-md mx-auto"
+    >
+      <div class="text-gray-400 mb-3">
+        <span class="loading loading-spinner loading-md"></span>
+      </div>
+      <h3 class="text-lg font-medium">Can't connect to the job stream</h3>
+      <p class="text-sm text-gray-500 mt-2">
+        We lost the connection to the background job updates and are retrying
+        automatically. Jobs keep running in the background — this page will
+        refresh once the connection is restored.
+      </p>
+    </div>
+  {:else if !$synced}
+    <div class="w-full min-h-[50vh] flex justify-center items-center">
+      <div class="loading loading-spinner loading-lg"></div>
+    </div>
+  {:else if $jobs.length === 0}
+    <div
+      class="flex flex-col items-center justify-center min-h-[55vh] text-center max-w-md mx-auto"
+    >
+      <div class="w-12 h-12 text-gray-400 mb-4" aria-hidden="true">
+        <JobsIcon />
+      </div>
+      <h3 class="text-lg font-medium">No jobs yet</h3>
+      <p class="text-sm text-gray-500 mt-2">
+        Long-running work like eval runs shows up here. Jobs run in the
+        background — you can leave this page and they'll keep going. Come back
+        any time to check progress, pause, or cancel them.
+      </p>
+    </div>
+  {:else}
+    <div class="overflow-x-auto rounded-lg border">
+      <table class="table">
+        <thead>
+          <tr>
+            <th>Type</th>
+            <th>Status</th>
+            <th>Progress</th>
+            <th>Message</th>
+            <th>Created</th>
+            <th class="text-right">Actions</th>
+          </tr>
+        </thead>
+        <tbody>
+          {#each $jobs as job (job.id)}
+            <tr>
+              <td class="font-medium">{job_type_display(job.type)}</td>
+              <td>
+                <span class="badge {job_status_badge_class(job.status)}">
+                  {job_status_display(job.status)}
+                </span>
+              </td>
+              <td>
+                <div class="flex flex-col gap-1 min-w-32">
+                  <span class="text-sm">{progress_label(job.progress)}</span>
+                  {#if job.progress?.total}
+                    <progress
+                      class="progress progress-primary w-32 h-1.5"
+                      value={progress_percent(job.progress)}
+                      max="100"
+                    ></progress>
+                  {/if}
+                </div>
+              </td>
+              <td class="text-sm text-gray-500 max-w-48">
+                {#if failure_error(job)?.error}
+                  <span
+                    class="text-error block truncate"
+                    title={failure_error(job)?.error}
+                    >{failure_error(job)?.error}</span
+                  >
+                {:else}
+                  <span class="block truncate"
+                    >{job.progress?.message || ""}</span
+                  >
+                {/if}
+              </td>
+              <td class="text-sm text-gray-500 whitespace-nowrap">
+                {formatDate(job.created_at)}
+              </td>
+              <td>
+                <div class="flex flex-row gap-1 justify-end flex-wrap">
+                  {#if has_result(job)}
+                    <button
+                      class="btn btn-xs btn-ghost"
+                      on:click={() => open_result(job)}
+                    >
+                      Result
+                    </button>
+                  {/if}
+                  {#if has_errors(job)}
+                    <button
+                      class="btn btn-xs btn-ghost"
+                      on:click={() => open_errors(job)}
+                    >
+                      Errors
+                    </button>
+                  {/if}
+                  {#each available_actions(job) as action}
+                    <button
+                      class="btn btn-xs {action === 'delete' ||
+                      action === 'cancel'
+                        ? 'btn-ghost text-error'
+                        : 'btn-ghost'}"
+                      disabled={in_flight[job.id]}
+                      on:click={() => run_action(action, job.id)}
+                    >
+                      {action_labels[action]}
+                    </button>
+                  {/each}
+                </div>
+              </td>
+            </tr>
+          {/each}
+        </tbody>
+      </table>
+    </div>
+  {/if}
+</AppPage>
+
+<Dialog bind:this={errors_dialog} title="Job Errors" width="wide">
+  {#if errors_summary?.error}
+    <div
+      role="alert"
+      class="alert alert-error text-sm mb-4 flex flex-col items-start gap-1"
+    >
+      <span class="font-medium break-words">{errors_summary.error}</span>
+      {#if errors_summary.detail}
+        <pre
+          class="text-xs w-full bg-base-200 text-base-content rounded-md p-2 overflow-x-auto max-h-48">{JSON.stringify(
+            errors_summary.detail,
+            null,
+            2,
+          )}</pre>
+      {/if}
+    </div>
+  {/if}
+  {#if errors_loading}
+    <div class="flex justify-center py-8">
+      <div class="loading loading-spinner loading-lg"></div>
+    </div>
+  {:else if errors_load_error}
+    <div class="text-error text-sm">
+      {errors_load_error.getMessage() || "Could not load errors."}
+    </div>
+  {:else if error_entries.length === 0}
+    <p class="text-sm text-gray-500">
+      No error messages recorded for this job.
+    </p>
+  {:else}
+    <ul class="flex flex-col gap-2 max-h-[60vh] overflow-y-auto">
+      {#each error_entries as entry, index (index)}
+        <li class="text-sm bg-base-200 rounded-md p-3 font-mono break-words">
+          {entry.error_message || JSON.stringify(entry)}
+        </li>
+      {/each}
+    </ul>
+  {/if}
+</Dialog>
+
+<Dialog bind:this={result_dialog} title="Job Result" width="wide">
+  {#if result_loading}
+    <div class="flex justify-center py-8">
+      <div class="loading loading-spinner loading-lg"></div>
+    </div>
+  {:else if result_load_error}
+    <div class="text-error text-sm">
+      {result_load_error.getMessage() || "Could not load result."}
+    </div>
+  {:else if result_data}
+    <pre
+      class="text-xs bg-base-200 rounded-md p-3 overflow-x-auto max-h-[60vh]">{JSON.stringify(
+        result_data,
+        null,
+        2,
+      )}</pre>
+  {:else}
+    <p class="text-sm text-gray-500">No result available.</p>
+  {/if}
+</Dialog>
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail.svelte b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
index 1fc1d4c3c..05332dd44 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
@@ -8,6 +8,8 @@
   import SidebarRailSettings from "./sidebar_rail_settings.svelte"
   import ChatIcon from "$lib/ui/icons/chat_icon.svelte"
   import EvalIcon from "$lib/ui/icons/eval_icon.svelte"
+  import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
+  import SidebarJobsBadge from "$lib/components/SidebarJobsBadge.svelte"
 
   export let section: Section = Section.None
   export let openTaskDialog: () => void
@@ -109,6 +111,13 @@
     </div>
   </SidebarRailItem>
 
+  <SidebarRailItem href="/jobs" active={section === Section.Jobs} label="Jobs">
+    <div slot="icon" class="w-full h-full relative">
+      <JobsIcon />
+      <SidebarJobsBadge variant="rail" />
+    </div>
+  </SidebarRailItem>
+
   <SidebarRailOptimizeGroup {section} />
 
   <div class="flex-1"></div>
diff --git a/libs/server/kiln_server/server.py b/libs/server/kiln_server/server.py
index 5efb8e963..16e9f423b 100644
--- a/libs/server/kiln_server/server.py
+++ b/libs/server/kiln_server/server.py
@@ -104,6 +104,10 @@ def _get_version() -> str:
         "name": "Settings & Utilities",
         "description": "Server settings, connectivity checks, and utility endpoints.",
     },
+    {
+        "name": "Jobs",
+        "description": "Run, monitor, and control background jobs, and stream their events.",
+    },
 ]
 
 
diff --git a/specs/projects/background_job_system/architecture.md b/specs/projects/background_job_system/architecture.md
new file mode 100644
index 000000000..1d3b4ad9d
--- /dev/null
+++ b/specs/projects/background_job_system/architecture.md
@@ -0,0 +1,108 @@
+---
+status: complete
+---
+
+# Architecture: Background Job System
+
+Internal mechanics. The externally observable surface (record shape, REST API, SSE events, state machine, pause/resume semantics, worker contract) is in `functional_spec.md`. This doc covers state management, concurrency, the (non-)recovery story, code layout, and open items to verify during implementation.
+
+## 1. JobRegistry
+
+Singleton per process. Responsibilities:
+
+- Type registration (`register_type(WorkerClass)`).
+- In-memory index `{job_id → JobRecord}` — the only store. Starts empty on each process boot.
+- Supervising asyncio task per running job (`asyncio.Task` tracked in a dict). Its lifetime is owned entirely by the registry and is **decoupled from any HTTP request or SSE connection** — created at `create`/`resume`, ended only by completion or an explicit `cancel`/`pause`. Closing the web UI or dropping the SSE stream has no effect on it.
+- Global semaphore for max-concurrent `running` jobs (configurable; default 10).
+- Pub/sub bus that feeds the SSE endpoint.
+- Progress coalescing: rapid `report_progress` calls update the in-memory record freely but may be throttled before emitting an SSE `job` event (so a 500-item eval doesn't flood subscribers). Status transitions emit immediately.
+- **Reconciliation:** at every lifecycle transition (start, pause, resume) and on status reads (`GET /api/jobs/{id}`), call the worker's `compute_state(params)`, reconcile the in-memory snapshot against the derived truth, and emit a `job` event if it changed. This is what keeps the believed state honest without persistence. If `compute_state` returns `None` (fixture with no source of truth), keep the believed snapshot.
+- **Per-run identity & error log:** mint a fresh `run_id` (UUID) on each `run()` invocation and stamp it on the record. Route `ctx.report_error(...)` calls (and the final exception on a failed run) to an append-only JSON file keyed by that `run_id` in the OS temp dir. All file IO here is best-effort — a failed write or missing file never propagates.
+- Lifecycle methods: `create`, `pause`, `resume`, `cancel`, `delete`.
+
+## 2. State management (no persistence)
+
+There is no disk persistence. The in-memory index is the registry's entire store, and it is never the source of truth — it is a best-effort view of operations whose authoritative state lives in the Kiln project entities they touch (eval runs, task runs, etc.).
+
+```
+source of truth   →  Kiln project entities (eval runs, task runs, ...)
+                        │  worker.compute_state(params) reads these
+                        ▼
+registry view      →  in-memory {job_id → JobRecord}, reconciled against
+                       compute_state at transitions / status reads; lost on restart
+```
+
+**Why no files.** Job records are transient visibility/control data. Persisting them would create a second, drifting copy of state that we'd then have to reconcile against the real entities. Instead we lean on the idempotency contract and `compute_state`: every worker can re-derive "what's done" from the project, so the registry never needs to remember anything across a restart — and the in-memory snapshot self-corrects whenever it's recomputed.
+
+- **Project scope.** The record carries `project_id` purely for filtering (`GET /api/jobs?project_id=`, SSE filter). It does not dictate any storage location, because there is no storage.
+- **Result.** The `result` field holds a small in-memory summary; the actual output already lives in the entities the worker wrote. No sibling result file, no size threshold.
+- **Coalescing, not flushing.** Any debouncing applies only to SSE emission frequency — there are no disk writes to debounce.
+
+**State vs. diagnostics — the one allowed file.** The "no persistence" rule is about *state*: status/progress must stay derivable, never copied to disk. Error *messages* are not state — they're diagnostic spillover with no representation in the Kiln entities. Keeping them in the long-lived registry forever would leak memory, so they spool to an ephemeral, per-`run_id` JSON file in the OS temp dir (`{tempfile.gettempdir()}/kiln_jobs/{run_id}.json`). This doesn't reintroduce a competing source of truth: the file is non-authoritative, the OS may delete it, and every reader treats "missing" as "empty." It's the single deliberate exception, scoped to bulky diagnostics that can't live in memory.
+
+## 3. Concurrency
+
+- One global asyncio semaphore caps `running` jobs (default 10, configurable via env var, e.g. `KILN_JOBS_MAX_CONCURRENT=10`).
+- Excess jobs stay in `pending` until a slot frees. Order: FIFO by `created_at`.
+- Per-type caps are not in v1 but the registry should keep the door open (`{type: semaphore}` map ready to grow).
+- Cancellation = `asyncio.Task.cancel()` from outside; the registry transitions state in-memory and emits the SSE event. `pause` and `cancel` share the same cancellation mechanism, differing only in the resulting state.
+
+## 4. Restart behavior (no recovery)
+
+There is nothing to recover. On process restart the in-memory index starts empty, so every prior job record is simply gone — including any that were `running` or `paused`. There is no orphan scan, no `interrupted` state, no rehydration step.
+
+This is safe precisely because of the idempotency contract: the operation's true state still lives in the Kiln entities. To "recover," the user just re-triggers the job; on start the registry calls `compute_state` to seed the real progress, and `run()` continues from where the project actually left off, without duplicating completed work.
+
+If cross-restart *visibility* into past jobs is ever wanted, it should be reconstructed by querying the Kiln entities (e.g. "show me recent eval runs"), not by persisting job records — that keeps a single source of truth.
+
+## 5. Code layout (suggested)
+
+```
+Kiln/app/desktop/studio_server/jobs/
+  __init__.py
+  registry.py       # JobRegistry singleton: in-memory index, semaphore, supervising tasks, lifecycle, reconciliation
+  models.py         # JobRecord, JobProgress, JobDerivedState, JobStatus, JobContext, JobWorker base
+  events.py         # in-process pub/sub bus
+  error_log.py      # per-run error log: append / read / delete by run_id, in the OS temp dir
+  api.py            # FastAPI router: create/list/get/result/errors/pause/resume/cancel/delete + SSE
+  workers/
+    __init__.py
+    noop.py         # NoopJobWorker
+    eval.py         # EvalJobWorker
+```
+
+No `persistence.py` — the registry is purely in-memory. `error_log.py` is the one module that touches disk, and only for ephemeral, best-effort diagnostic logs (§2), never for state.
+
+Registration happens once at server startup (alongside the existing route registration), e.g.:
+
+```python
+job_registry.register_type(NoopJobWorker)
+job_registry.register_type(EvalJobWorker)
+```
+
+Frontend (Svelte) — out of strict scope for this spec, but the natural shape:
+
+```
+Kiln/app/web_ui/src/lib/stores/
+  jobs_store.ts          # subscribes to /api/jobs/events
+  jobs_api.ts            # thin REST client
+Kiln/app/web_ui/src/routes/(app)/jobs/+page.svelte    # jobs panel
+Kiln/app/web_ui/src/lib/components/SidebarJobsBadge.svelte
+```
+
+## 6. Open items — verify during implementation
+
+Sensible defaults are listed; flip them if the code disagrees.
+
+1. **`EvalRunner` idempotency — CONFIRMED.** Verified against the code: `EvalRunner.collect_tasks_for_task_run_eval()` builds an `already_run` set from existing `EvalRun` children (keyed by `(eval_config_id, task_run_config_id, dataset_id)`) and excludes already-run triples (`libs/core/kiln_ai/adapters/eval/eval_runner.py` ~L147–173). So re-running skips completed items and never writes duplicate result entities. EvalJob is therefore idempotent → **`supports_pause = True`**. Pause is a hard task-cancel mid-run; resume re-invokes `run()` and EvalRunner re-collects only the unfinished items. This is the *same* cancellation the legacy `run_comparison` endpoint already performs on client disconnect, so it carries no new corruption risk. `compute_state` counts `EvalRun`s whose `task_run_config_id` matches, against the eval's dataset-filter size for `total`. (Runtime errors aren't persisted as entities — a failed item simply isn't saved — so derived `error` is 0; the live `error` count comes from `Progress.errors` during the run.)
+2. **Multi-project scope — RESOLVED.** Nothing is persisted, so there's no startup scan. A single in-session registry tracks every job regardless of project; `project_id` is an optional filter on list/SSE. For eval jobs `project_id` comes from `EvalJobParams.project_id`; for noop it's `null`.
+3. **Active-project hook — RESOLVED.** The local server has **no** server-side "active project" to default to (confirmed: `project_id` is always an explicit identifier; the active project is frontend UI state `$ui_state.current_project_id`). Don't invent one. `?project_id=` is a plain optional filter (omitted = all jobs); the frontend passes its current `$ui_state.current_project_id`.
+4. **Auth — RESOLVED.** Studio-server routes use no FastAPI auth dependency; they mark agent-callability via `openapi_extra` policy constants (`ALLOW_AGENT`, etc.) as `eval_api.py` does. Mirror that convention; introduce no new scheme.
+5. **Max-concurrent default.** Set to 10; expose as env var `KILN_JOBS_MAX_CONCURRENT`. Revisit if mixed job types (e.g. evals + future bandwidth-heavy syncs) starve each other; if they do, introduce per-type caps.
+6. **Job ID format.** `j_{12-char-base32-lowercase}` (e.g. `j_a1b2c3d4e5f6`). Compact, grep-friendly, collision space is fine for local-only.
+7. **Delete policy.** Allowed only on terminal status (`succeeded`, `failed`, `cancelled`). A paused job must first be cancelled, or resumed and allowed to reach a terminal status. (No `interrupted` state exists.)
+8. **`compute_state` read cost.** Reconciliation calls `compute_state` on every status read, which reads Kiln entities from disk. For a frequently-polled jobs panel this could get expensive. Default: recompute on lifecycle transitions and on explicit `GET /api/jobs/{id}`, but let the SSE stream ride on `report_progress` deltas between recomputations (don't recompute per progress tick). If polling proves hot, add a short TTL cache on the derived state. Confirm `compute_state` for `EvalJob` is cheap enough (a count query, not a full re-score).
+9. **SSE keepalive / heartbeat.** Match whatever the existing chat / eval SSE endpoints do. If unclear, send a `: ping\n\n` comment every 15s to keep proxies happy.
+10. **Error-message capture for `EvalJob`.** The error *count* is easy (`progress.errors`). Whether we can capture per-item error *messages* via `report_error` depends on whether `EvalRunner` surfaces individual failures (vs. just counting them). If it doesn't, the `/errors` endpoint stays empty for evals until `EvalRunner` exposes them — acceptable for v1; the `NoopJob` fixture still exercises the full error-log path. Tie-in with open item #1.
+11. **Error-log file format & cleanup.** Default to append-friendly JSON Lines internally (`{tempdir}/kiln_jobs/{run_id}.json`, one error object per line), parsed into a JSON array on read. `DELETE /api/jobs/{id}` best-effort removes the current run's file; past-run files in the OS temp dir are left to the OS to reap. Confirm the temp subdir is created lazily and writes never block the worker (consider a background writer if `report_error` volume is high).
+12. **Git-sync for background eval jobs — RESOLVED via option C.** Tension surfaced during Phase 3: the legacy eval-run endpoint lives under `/api/projects/...`, where `GitSyncMiddleware` + `build_save_context(request)` wrap each `EvalRun` save in `manager.atomic_write` (git commit/push). Background jobs are deliberately request-decoupled (a core design goal) and write `EvalRun`s from a registry-owned task, so the original worker passed `save_context=None` and those writes were **not** committed/pushed (and could be stashed away by a later `ensure_clean`). **Resolution:** `app/desktop/git_sync/save_context.py` adds a request-free `save_context_for_project(project_id, context) -> SaveContext | None` (and `get_manager_for_project`) that mirrors the middleware's `_get_manager_for_request` resolution (config keyed by `project_path`, manager by `clone_path`, via the shared `GitSyncRegistry.get_or_create`), returning `None` for every "not auto-sync" branch. `EvalJobWorker._build_eval_runner` passes this through, so each `EvalRun` is committed/pushed per item — converging to the same behavior as the legacy SSE path (which already runs at concurrency 25 through the same non-reentrant per-project lock; contention, not deadlock). For non-auto-sync projects (the default) it stays a no-op, identical to before. The resolution logic is intentionally duplicated from the middleware (a clean delegating refactor would break the middleware's test patches); both copies carry a "keep in sync" note.
diff --git a/specs/projects/background_job_system/functional_spec.md b/specs/projects/background_job_system/functional_spec.md
new file mode 100644
index 000000000..f833ec409
--- /dev/null
+++ b/specs/projects/background_job_system/functional_spec.md
@@ -0,0 +1,350 @@
+---
+status: complete
+---
+
+# Functional Spec: Background Job System
+
+This doc captures the externally observable behavior of the job system: the job record shape, the worker contract, the state machine, the REST API, and the SSE stream. Internal mechanics (concurrency primitives, code layout) live in `architecture.md`.
+
+**Core principle.** A job record is ephemeral, in-memory bookkeeping — for visibility and control only. It is **never** a source of truth and is never persisted to disk. The authoritative state of whatever the job is doing lives in the Kiln project entities it reads/writes (eval runs, task runs, etc.). Workers must be idempotent (see §2).
+
+The believed status/progress in the record is **recomputed from source of truth**, not accumulated from deltas. Each worker exposes a `compute_state(params)` method (§2) that reads the relevant Kiln entities and returns the operation's true progress and whether it's complete. The registry calls it at every lifecycle transition (start, pause, resume) and on status reads, then reconciles the in-memory snapshot against the result and emits an updated event if anything changed. Live `report_progress` calls during a run are just a smoothing layer on top for the UI between recomputations — they never override the derived truth. A snapshot may still briefly lag the true state, and the worker remains responsible for its own consistency.
+
+## 1. Job record (base shape)
+
+Lives in the registry's in-memory index; serialized to JSON only for HTTP/SSE responses (not to disk).
+
+```jsonc
+{
+  "id": "j_a1b2c3d4e5f6",
+  "type": "eval",
+  "status": "running",
+  "run_id": "8f3c1e0a-...-uuid",   /* UUID of the current/most-recent run() invocation */
+  "progress": {
+    "total":   50,
+    "success": 11,                 /* items completed without error */
+    "error":   1,                  /* items that errored (count only; messages in the error log) */
+    "message": "scoring item 12",
+    "updated_at": "2026-05-28T12:34:56Z"
+  },
+  "params":   { /* type-specific opaque JSON, validated against the type's params_model */ },
+  "result":   null,                /* small summary populated on success; detail lives in Kiln entities */
+  "error":    null,                /* populated on failure; short string + optional structured detail */
+  "metadata": {},                  /* free-form pass-through from caller; this layer never interprets it */
+  "project_id":      "p_abc",
+  "supports_pause":  true,         /* stamped at creation from the worker class */
+  "created_at":      "2026-05-28T12:30:00Z",
+  "updated_at":      "2026-05-28T12:34:56Z",
+  "started_at":      "2026-05-28T12:30:01Z",
+  "ended_at":        null
+}
+```
+
+- `type` is the discriminator. Each registered type declares typed `params_model` and `result_model` (pydantic). The base record stores them as plain JSON.
+- `progress` reports counts, not a single cursor: `total`, `success` (completed without error), `error` (errored count), and a free-text `message`. Processed = `success + error`; remaining = `total − success − error`. The `error` field is just a count — the actual error *messages* live in the per-run error log (see §1.1 below).
+- `run_id` is a fresh UUID assigned on each `run()` invocation (first run and every resume/re-trigger). It keys the per-run error log so messages from different runs of the same job don't collide. `null` before the first run.
+- `metadata` is a free-form pass-through: callers may attach arbitrary attribution (any JSON object). This layer never reads, writes, or interprets it — it just stores and returns it verbatim.
+- `result` is a **small summary** (counts, status, references to the Kiln entities that hold the detail). It is not a place to stash large blobs — the real output already lives in the project entities the worker wrote. There is no sibling result file and no size threshold.
+- No `schema_version`, no checkpoint file, no persisted *state* of any kind — records exist only while the process is alive. (The error log in §1.1 is diagnostic spillover, not state.)
+
+### 1.1 Per-run error log
+
+Error *counts* live in `progress`; error *messages* would bloat the in-memory record if kept forever, so they spill to an ephemeral file instead — never to a Kiln entity (they aren't source of truth).
+
+- **Location.** A file in the OS temp dir, keyed by `run_id` — e.g. `{tempdir}/kiln_jobs/{run_id}.json` (`tempdir` = `tempfile.gettempdir()`, which honors `TMPDIR`/`TEMP`/`TMP` — typically `/tmp/kiln_jobs/…` on Linux and a per-user `$TMPDIR` path on macOS; portable to Windows). Temp storage is deliberately non-authoritative: the OS may clear it, and that's fine.
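+- **Shape.** Logically an array of objects, each at minimum `{ "error_message": "..." }`. Objects (not bare strings) so we can add fields later (`item_id`, `timestamp`, `traceback_ref`, …) without a format break. This is the shape readers see; the on-disk encoding may be append-friendly (e.g. one object per line) and is parsed into an array on read — see `architecture.md` open item #11.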
+- **Writing.** Workers append via `ctx.report_error(...)` (§2) for non-fatal per-item errors; the registry also appends the final exception when a `run()` raises. Append-only, so it survives a crash mid-run.
+- **Reading.** `GET /api/jobs/{id}/errors` (§5) returns the array for the job's current `run_id`. **If the file is gone, return `[]` with `200` — never an error.** This keeps the feature best-effort: logs are a debugging convenience, not a guarantee.
+
+## 2. Worker contract
+
+A worker is two methods: `compute_state()` — a pure read that derives true state from source of truth — and `run()` — the idempotent do-the-work method. There is no `resume()` and no checkpoint: pause is task cancellation, and resume is just a fresh `run()` (see §4) that re-orients itself via `compute_state()`.
+
+```python
+class JobDerivedState(BaseModel):
+    """A worker's view of the operation's true state, read from source-of-truth entities."""
+    total: int | None = None
+    success: int = 0          # completed without error
+    error: int = 0            # errored count
+    is_complete: bool = False
+    message: str | None = None
+
+
+class JobContext:
+    """Provided to the worker by JobRegistry during run()."""
+    job_id: str
+
+    async def report_progress(
+        self,
+        success: int,
+        error: int = 0,
+        total: int | None = None,
+        message: str | None = None,
+    ) -> None:
+        """Update the registry's in-memory progress snapshot and emit an SSE event.
+        Cheap to call often; a UI-smoothing signal only — the authoritative progress
+        comes from compute_state(). The registry may coalesce rapid calls before emitting."""
+
+    async def report_error(self, error_message: str, **extra) -> None:
+        """Append one structured error entry — {"error_message": ..., **extra} — to this
+        run's error log (a JSON file in the OS temp dir, keyed by run_id; see §1.1).
+        For non-fatal per-item errors that don't stop the run. Best-effort: a failed
+        write is swallowed, never propagated to the worker. Does not itself bump the
+        progress `error` count — report that via report_progress."""
+
+    # Cancellation is just asyncio.CancelledError on the supervising task —
+    # workers may catch it for cleanup, but the transition is unconditional. A worker
+    # must leave any in-flight atomic unit of work consistent before returning.
+
+
+class JobWorker(Generic[TParams, TResult]):
+    type_name: ClassVar[str]                  # discriminator value
+    params_model: ClassVar[type[BaseModel]]   # pydantic model for params
+    result_model: ClassVar[type[BaseModel]]   # pydantic model for result
+    supports_pause: ClassVar[bool] = False    # worker is idempotent & safe to cancel-and-re-run
+
+    async def compute_state(self, params: TParams) -> JobDerivedState | None:
+        """Read source-of-truth Kiln entities and return the operation's true progress
+        and whether it's already complete. MUST be a pure read — no side effects,
+        idempotent, safe to call any time (before start, while paused, on a status read).
+        This is the authority; the in-memory snapshot is reconciled against it.
+
+        Return None only when the worker has no backing entity to consult (e.g. the
+        NoopJob fixture); the registry then keeps the last believed snapshot. Real
+        workers must override this."""
+        return None
+
+    async def run(self, params: TParams, ctx: JobContext) -> TResult:
+        """MUST be idempotent. Should call compute_state() to learn what's already done,
+        then perform only the remaining work, reporting progress as it goes. This single
+        method covers both first run and resume — the registry calls run() again to resume
+        a paused job; the worker re-orients via compute_state(), not a handed-in checkpoint."""
+```
+
+**The idempotency contract is the load-bearing invariant of this system.** Because nothing is persisted and resume is just a re-run, every worker author must guarantee that calling `run()` twice (or after an interruption) does not double-write, duplicate rows, or otherwise corrupt the project. `compute_state()` is how a worker stays honest: it derives status from the project rather than trusting in-memory deltas, so the system self-corrects after interruptions, restarts, or concurrent edits. `supports_pause` advertises that a worker meets this bar *and* is safe to cancel mid-flight and re-run; default `False` is conservative.
+
+## 3. State machine
+
+```
+                    ┌─────────────┐
+                    │   pending   │
+                    └─────┬───────┘
+                          ▼
+                    ┌─────────────┐
+   ┌───────────────►│   running   │
+   │ resume         └──┬───┬───┬──┘
+   │ (re-run)          │   │   │
+   │                   ▼   ▼   ▼
+┌──┴───────┐      terminal states
+│  paused  │   ┌──────────┬─────────┬──────────┐
+└────▲─────┘   │succeeded │ failed  │cancelled │
+     │         └──────────┴─────────┴──────────┘
+     │  pause
+     └─ (cancel task,
+        keep resumable)
+```
+
+There is no `interrupted` state. Records are in-memory only, so a process restart simply drops every record — there are no orphans to recover and nothing to flip.
+
+Transitions:
+
+| From → To | Trigger |
+|---|---|
+| `pending → running` | semaphore slot frees, worker task started |
+| `pending → cancelled` | cancel before run started |
+| `running → succeeded` | worker returns normally |
+| `running → failed` | worker raises (other than `CancelledError`) |
+| `running → cancelled` | `cancel` issued; `asyncio.Task.cancel()`, `CancelledError` reaches worker |
+| `running → paused` | `pause` issued; same task cancellation, but marked resumable |
+| `paused → running` | `resume` called; a fresh `run()` task is started |
+| `paused → cancelled` | cancel from paused state |
+| `succeeded / failed / cancelled → (deleted)` | explicit DELETE |
+
+`pending → paused` is not allowed (pausing a not-yet-started job = cancel + recreate). `pause` and `cancel` both cancel the supervising task; they differ only in the resulting state and whether resume is permitted.
+
+## 4. Pause / resume semantics
+
+Non-cooperative and checkpoint-free. Pause is task cancellation; resume is a fresh `run()`. The worker's idempotency is what makes this safe — on resume it reads source-of-truth Kiln entities and continues from wherever the project state left off.
+
+**Pause flow.**
+1. Client calls `POST /api/jobs/{id}/pause`. Registry returns `202`.
+2. Registry calls `asyncio.Task.cancel()` on the supervising task. The worker receives `CancelledError` at its next `await`; it should finish or unwind its current atomic unit so the project is left consistent.
+3. Once the task has settled, the registry calls `compute_state(params)` to record the true progress as of the pause (rather than the last reported delta), transitions `running → paused`, and emits an event. (Distinguished from `cancel` only by the target state.)
+
+**Resume flow.**
+1. Client calls `POST /api/jobs/{id}/resume`. Registry returns `202`.
+2. Registry calls `compute_state(params)` to re-seed the progress snapshot. If it reports `is_complete`, the job goes straight to `succeeded` without re-running. Otherwise the registry schedules a new task.
+3. The registry calls `run(params, ctx)` again — there is no separate `resume()` method and no checkpoint is handed in.
+4. The worker calls `compute_state()` itself to determine what is already done and continues. A re-run must not duplicate completed work (idempotency contract, §2).
+
+Workers that don't support pause: `supports_pause = False`. Pause endpoint returns `409 Conflict`. Cancel still works (it's terminal and doesn't require re-runnability).
+
+## 5. REST API
+
+All endpoints live under `/api/jobs`. Authentication piggybacks on whatever the local server uses today (don't introduce a new scheme).
+
+| Method | Path | Body | Response | Notes |
+|---|---|---|---|---|
+| `POST` | `/api/jobs/{type}` | `{ params: <type-specific> }` | `201 { job_id, status }` | `type` must be registered. `params` validated against `params_model`. Job starts as `pending`, runs as soon as semaphore allows. |
+| `GET` | `/api/jobs` | — | `200 [ <record>, ... ]` | Filters: `?status=`, `?type=`, `?project_id=`, `?since=<iso8601>`, `?limit=`. Default sort: `created_at desc`. |
+| `GET` | `/api/jobs/{id}` | — | `200 <record>` | 404 if unknown. Recomputes status via the worker's `compute_state` (source of truth), reconciles the in-memory snapshot, and emits a `job` event if it changed before returning. |
+| `GET` | `/api/jobs/{id}/result` | — | `200 <result summary>` | 404 if not terminal or no result. Returns the small in-memory summary; detail lives in the Kiln entities the job wrote. |
+| `GET` | `/api/jobs/{id}/errors` | — | `200 [ { "error_message": "...", ... }, ... ]` | Error log for the job's current `run_id` (§1.1). Optional `?run_id=<uuid>` for a specific past run. **Always `200`; returns `[]` if the file is missing/unreadable** — never errors. |
+| `POST` | `/api/jobs/{id}/pause` | — | `202` / `409` | 409 if not running, or worker doesn't support pause. |
+| `POST` | `/api/jobs/{id}/resume` | — | `202` / `409` | 409 if not paused. |
+| `POST` | `/api/jobs/{id}/cancel` | — | `202` / `409` | 409 if already terminal. Idempotent for `pending`. |
+| `DELETE` | `/api/jobs/{id}` | — | `204` / `409` | 409 unless the job is in a terminal status (i.e. it is still `pending`, `running`, or `paused`). Drops the in-memory record and best-effort removes the run's error log file(s). |
+| `GET` | `/api/jobs/events` | — | `200 text/event-stream` | SSE; see §6. |
+
+All state-changing endpoints (pause/resume/cancel) are async-effecting: they return immediately with `202 Accepted`; the actual state transition is published via the SSE stream.
+
+Error envelopes follow the existing local-server convention (`{ "detail": "..." }`).
+
+## 6. SSE stream
+
+`GET /api/jobs/events?job_id=&type=&project_id=` — all filters optional, combinable.
+
+**The stream is a pure observer — jobs run independently of it.** This is the critical difference from today's eval flow. The existing blocking `run_comparison` SSE endpoint runs the eval *inside the request*, so `CancellableStreamingResponse` cancelling on client disconnect also cancels the eval. Here, the job is a supervising task owned by the registry; the SSE endpoint only subscribes to the event bus and forwards snapshots. A client disconnecting (closing the tab, even quitting the whole web UI) must tear down **only** the subscription — never the job. Jobs keep running, and a later reconnect resyncs via the `snapshot` event. The *only* things that stop a job are explicit `POST /api/jobs/{id}/cancel` or `/pause`.
+
+Implementation: reuse the `CancellableStreamingResponse` pattern from `Kiln/app/desktop/studio_server/eval_api.py`, but scope its cancellation to the **subscription generator** (unsubscribe from the bus, stop the keepalive) — do not let it reach into any job task. Don't create the supervising task inside the request handler; it lives in the registry, created at `create`/`resume`, with a lifetime decoupled from any HTTP connection.
+
+Events are **idempotent snapshots, not deltas.** Every per-job event carries the full current record; the client keeps a map keyed by `id` and upserts. There is no `from`/`to` transition payload to apply in order — a client that drops or reorders events still converges as long as it processes the latest snapshot per id. A snapshot reflects the registry's *believed* state at emit time and may briefly lag the worker's true state (e.g. under concurrent edits); the worker owns its own consistency.
+
+Event types:
+
+```
+event: snapshot
+data: { "jobs": [ <record>, ... ] }
+```
+Sent once on connect with the full current set of jobs matching the filter. Lets the UI sync without a parallel GET.
+
+```
+event: job
+data: <record>
+```
+Emitted on every change to a single job — creation, status transition, and progress update all use this one event, each carrying the complete record (including the latest `status` and `progress` with its `success`/`error` counts). The registry may coalesce rapid progress updates before emitting so a 500-item eval doesn't flood subscribers. Error *messages* are not streamed — the snapshot carries only the `error` count; clients fetch messages on demand via `GET /api/jobs/{id}/errors`.
+
+```
+event: deleted
+data: { "id": "j_..." }
+```
+A tombstone — the only non-snapshot event, since a deleted record has no state to send.
+
+One stream serves the sidebar badge, jobs panel, and any future in-chat widget. Clients reconnect on disconnect; the fresh `snapshot` event resyncs them. No need for `Last-Event-ID` replay in v1 — snapshots are self-healing.
+
+Why SSE over Socket.IO: matches every other streaming endpoint in the codebase (chat, eval, calibration); no new dependency; no client-to-server streaming need.
+
+## 7. Worker implementations
+
+### Reference: `NoopJob` (validation / smoke test)
+
+```python
+class NoopJobParams(BaseModel):
+    steps: int = 10
+    sleep_per_step_seconds: float = 0.5
+    fail_at_step: int | None = None         # fatal: raises (tests the failed path)
+    error_at_steps: list[int] = []          # non-fatal: logs an error, keeps going
+
+class NoopJobResult(BaseModel):
+    completed_steps: int
+
+class NoopJobWorker(JobWorker[NoopJobParams, NoopJobResult]):
+    type_name = "noop"
+    params_model = NoopJobParams
+    result_model = NoopJobResult
+    supports_pause = True
+
+    async def compute_state(self, params):
+        return None  # no backing entity — registry keeps the believed snapshot
+
+    async def run(self, params, ctx):
+        success = error = 0
+        for i in range(params.steps):
+            await asyncio.sleep(params.sleep_per_step_seconds)
+            if params.fail_at_step == i:
+                raise RuntimeError(f"intentional fail at step {i}")
+            if i in params.error_at_steps:
+                error += 1
+                await ctx.report_error(f"intentional error at step {i}", step=i)
+            else:
+                success += 1
+            await ctx.report_progress(
+                success=success,
+                error=error,
+                total=params.steps,
+                message=f"step {i+1}/{params.steps}",
+            )
+        return NoopJobResult(completed_steps=success + error)
+```
+
+`NoopJob` is the canary: it exercises pause / resume / cancel / error-log capture end to end, without needing real LLM calls or `EvalRunner`. `error_at_steps` exercises the non-fatal `report_error` path (errors accumulate in the log and the `error` count without stopping the run); `fail_at_step` exercises the fatal path. It has **no** backing Kiln entity, so `compute_state` returns `None` and its `run()` simply restarts from step 0 on resume. That's an honest limitation of a source-of-truth-free fixture and is fine: the canary's purpose is to exercise lifecycle transitions and the error log, not work-skipping. Real workers derive their state instead of restarting.
+
+### `EvalJob` (first real consumer)
+
+```python
+class EvalJobParams(BaseModel):
+    project_id: str
+    task_id: str
+    eval_id: str
+    eval_config_id: str
+    run_config_id: str
+
+class EvalJobResult(BaseModel):
+    total: int
+    success: int
+    error: int
+    # just a summary — per-row results live in the eval run entity (source of truth)
+
+class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
+    type_name = "eval"
+    params_model = EvalJobParams
+    result_model = EvalJobResult
+    supports_pause = True   # EvalRunner is confirmed idempotent: collect_tasks excludes
+                            # already-run (eval_config, run_config, dataset_id) triples,
+                            # so pause (cancel) + resume (re-run) skips completed items
+                            # and writes no duplicates. See architecture.md open item #1.
+
+    async def compute_state(self, params):
+        # Source of truth: EvalRun entities, intersected with the eval-set filter so we
+        # count exactly the candidate set EvalRunner.collect_tasks would (open item #1).
+        in_filter_ids = dataset_ids_passing_eval_filter(params)          # task runs in the eval set
+        scored_ids    = scored_dataset_ids(params, params.run_config_id) # existing EvalRuns
+        success = len(scored_ids & in_filter_ids)
+        total   = len(in_filter_ids)
+        # Runtime errors aren't persisted as entities (a failed item simply isn't saved),
+        # so derived error is 0; the live error count comes from Progress.errors during run().
+        return JobDerivedState(total=total, success=success, error=0,
+                               is_complete=(success >= total))
+
+    async def run(self, params, ctx):
+        # EvalRunner.collect_tasks excludes already-scored items, so Progress counts only the
+        # REMAINING work (Progress.total = full − already_done, Progress.complete starts at 0).
+        # Add the already-done baseline so progress/result are on the full-set scale.
+        baseline = (await self.compute_state(params)).success
+        eval_runner = build_eval_runner(params)  # same construction as eval_api.py uses today
+        progress = None
+        async for progress in eval_runner.run():
+            await ctx.report_progress(
+                success=baseline + progress.complete,
+                error=progress.errors,
+                total=baseline + progress.total,   # baseline + remaining = full eval-set size
+            )
+        return EvalJobResult(
+            total=baseline + (progress.total if progress else 0),
+            success=baseline + (progress.complete if progress else 0),
+            error=(progress.errors if progress else 0),
+        )
+```
+
+`EvalRunner` is unchanged. Internally it still uses `AsyncJobRunner` for per-item parallelism. The translation is `Progress → JobContext.report_progress()` for counts. Capturing individual eval error *messages* via `report_error` depends on whether `EvalRunner` surfaces per-item failures (see `architecture.md` open item #10); if it only exposes an error count, the messages endpoint stays empty for evals until that's wired up.
+
+The idempotency contract bears directly on this worker: a paused-then-resumed (or re-triggered) eval re-invokes `run()`, which re-invokes `EvalRunner.run()`. This is confirmed safe — `EvalRunner.collect_tasks` excludes already-run `(eval_config, run_config, dataset_id)` triples, so completed items are skipped and no duplicate `EvalRun` entities are written (architecture.md open item #1). Hence `supports_pause = True`.
+
+## 8. What's NOT in this spec
+
+- Full per-job log capture / streaming. Error *messages* are collected per run (§1.1) and fetched via `GET /api/jobs/{id}/errors`, but general stdout/stderr/log streaming is out — workers still use the standard logger for that.
+- Job dependencies / DAGs. One job, one task.
+- Retries at the job level. `AsyncJobRunner` already retries individual sub-tasks for workers that use it; whole-job retry is the caller's problem (or a future feature).
+- Cross-project listings. Records carry `project_id`; the SSE/list endpoints can filter by it, but there's no global "all jobs everywhere" view.
+- Multi-machine / remote execution. All jobs are local asyncio tasks. Cloud Run is GEPA's path and isn't generalized here.
+- Pre-run approval / authorization gates. The endpoints follow whatever auth the local server has; no new approval scheme.
diff --git a/specs/projects/background_job_system/implementation_plan.md b/specs/projects/background_job_system/implementation_plan.md
new file mode 100644
index 000000000..77f163bf6
--- /dev/null
+++ b/specs/projects/background_job_system/implementation_plan.md
@@ -0,0 +1,14 @@
+---
+status: complete
+---
+
+# Implementation Plan: Background Job System
+
+Derived from the "Quick start" section of the original spec, lightly re-split so each phase is one CR-sized chunk.
+
+## Phases
+
+- [x] Phase 1: Core layer + NoopJob (no HTTP yet) — `models.py` (incl. `JobDerivedState`, `JobProgress` with success/error counts), `registry.py` (in-memory index, semaphore, supervising tasks, lifecycle, per-run `run_id`, `compute_state` reconciliation at transitions/status reads), `events.py`, `error_log.py` (append/read/delete by `run_id` in the OS temp dir, all best-effort), `workers/noop.py`. No persistence layer for state. Verify the full lifecycle (`create / pause / resume / cancel / delete`) via Python tests against `NoopJobWorker`, including pause = task-cancel → `paused`, resume = fresh `run()`, reconciliation when `compute_state` returns `None`, and error-log capture (`error_at_steps` non-fatal + `fail_at_step` fatal), including graceful `[]` when the file is missing.
+- [x] Phase 2: REST API + SSE — `api.py` (FastAPI router, incl. `GET /api/jobs/{id}/errors`), wired into the local server alongside existing routes. Idempotent-snapshot events (`snapshot` / `job` / `deleted`). Reuse `CancellableStreamingResponse` from `eval_api.py`, but scope its cancellation to the subscription generator only. Verify via curl + the SSE stream against `NoopJob` — **including the decoupling test: start a long `NoopJob`, connect then disconnect the SSE stream, and confirm the job keeps running to completion (only explicit cancel/pause stops it).**
+- [x] Phase 3: `EvalJobWorker` — wraps existing `EvalRunner` unchanged, plus `compute_state` that counts `EvalRun`s with matching `task_run_config_id` (idempotency confirmed → `supports_pause = True`; see architecture open item #1). Wire `report_error` to per-item failures if `EvalRunner` surfaces them (open item #10; otherwise the `/errors` endpoint stays empty for evals in v1). `POST /api/jobs/eval` returns a job_id and runs in the background, alongside the legacy blocking eval-run SSE endpoint. Confirm progress (success/error counts) flows correctly.
+- [ ] Phase 4: Frontend — `jobs_store.ts` (subscribes to `/api/jobs/events`, upserts by id), `api.ts`, jobs panel at `/jobs`, sidebar badge component.
diff --git a/specs/projects/background_job_system/phase_plans/phase_1.md b/specs/projects/background_job_system/phase_plans/phase_1.md
new file mode 100644
index 000000000..bfd98c1dd
--- /dev/null
+++ b/specs/projects/background_job_system/phase_plans/phase_1.md
@@ -0,0 +1,186 @@
+---
+status: complete
+---
+
+# Phase 1: Core layer + NoopJob (no HTTP)
+
+## Overview
+
+Build the in-memory core of the background job system inside a new package
+`app/desktop/studio_server/jobs/`. This phase delivers the data models, the
+worker contract, the in-process event bus, the per-run error log, and the
+`JobRegistry` singleton that owns the full job lifecycle. No FastAPI router and
+no SSE endpoint — those land in Phase 2. The only consumer wired up here is the
+`NoopJobWorker` fixture, which is exercised end-to-end by Python tests.
+
+The design follows `functional_spec.md` and `architecture.md` exactly:
+
+- Job records are ephemeral, in-memory only. No disk persistence of state.
+- Status/progress is reconciled against `worker.compute_state(params)` at every
+  lifecycle transition (start, pause, resume) and on `get`. `None` means keep
+  the believed snapshot.
+- The supervising `asyncio.Task` per running job is owned by the registry and
+  decoupled from any HTTP connection.
+- A fresh `run_id` (uuid4) is minted per `run()` invocation. Error messages
+  (`report_error` + the fatal exception of a failed run) spill to a best-effort
+  per-`run_id` JSON file in the OS temp dir.
+- Pause = `task.cancel()` -> `paused`; resume = a fresh `run()`. No
+  `interrupted` state, no checkpoints, no `resume()` method.
+
+## Steps
+
+1. `jobs/__init__.py` — empty package marker.
+
+2. `jobs/models.py` — pydantic v2 models and the worker contract.
+   - `JobStatus(str, Enum)`: `PENDING="pending"`, `RUNNING="running"`,
+     `PAUSED="paused"`, `SUCCEEDED="succeeded"`, `FAILED="failed"`,
+     `CANCELLED="cancelled"`. Add a `terminal` helper / set
+     `{SUCCEEDED, FAILED, CANCELLED}`.
+   - `JobProgress(BaseModel)`: `total: int | None = None`, `success: int = 0`,
+     `error: int = 0`, `message: str | None = None`,
+     `updated_at: datetime` (default factory utc now).
+   - `JobDerivedState(BaseModel)`: `total: int | None = None`, `success: int = 0`,
+     `error: int = 0`, `is_complete: bool = False`, `message: str | None = None`.
+   - `JobError(BaseModel)`: `error: str | None = None`,
+     `detail: dict | None = None` — small failure summary on the record.
+   - `JobRecord(BaseModel)`: fields per functional_spec §1 — `id`, `type`,
+     `status: JobStatus`, `run_id: str | None`, `progress: JobProgress`,
+     `params: dict`, `result: dict | None`, `error: JobError | None`,
+     `metadata: dict`, `project_id: str | None`, `supports_pause: bool`,
+     `created_at`, `updated_at`, `started_at: datetime | None`,
+     `ended_at: datetime | None`.
+   - `JobContext`: holds `job_id`, `run_id`, and references to the registry's
+     progress-reporting + error-logging callbacks. Async methods:
+     `report_progress(success, error=0, total=None, message=None)` and
+     `report_error(error_message, **extra)`. Implemented as a small class taking
+     two async callables so the registry can inject behavior without a circular
+     import.
+   - `JobWorker(Generic[TParams, TResult])`: classvars `type_name`,
+     `params_model`, `result_model`, `supports_pause: bool = False`. Methods
+     `async def compute_state(self, params) -> JobDerivedState | None` (default
+     returns `None`) and `async def run(self, params, ctx) -> TResult` (raises
+     `NotImplementedError`).
+
+3. `jobs/events.py` — in-process async pub/sub bus.
+   - `JobEvent` union shape: emit dataclass/pydantic events of kind
+     `snapshot` / `job` / `deleted`. Keep it simple: a small `JobEvent` model
+     with `event: Literal["snapshot","job","deleted"]` and a `data` payload.
+   - `JobEventBus`: holds a set of subscriber `asyncio.Queue`s. `subscribe()`
+     is an async generator / context that registers a queue, immediately yields
+     a `snapshot` event (built from a snapshot provider callback) filtered by
+     `job_id` / `type` / `project_id`, then yields subsequent matching events.
+   - `publish_job(record)` / `publish_deleted(job_id, project_id, type_name)`
+     fan out to all subscriber queues, applying each subscriber's filter.
+   - Filtering helper that matches a record against optional `job_id`, `type`,
+     `project_id`.
+   - Unsubscribe removes the queue (used by Phase 2's SSE teardown). For Phase 1
+     this is tested directly without HTTP.
+
+4. `jobs/error_log.py` — per-`run_id` best-effort error log.
+   - Dir: `{tempfile.gettempdir()}/kiln_jobs`. Path helper
+     `error_log_path(run_id)`.
+   - `append_error(run_id, entry: dict)` — JSON-lines append; create dir lazily;
+     swallow all exceptions.
+   - `read_errors(run_id) -> list[dict]` — read JSON-lines, skip unparsable
+     lines; missing/unreadable file -> `[]`. Never raises.
+   - `delete_errors(run_id)` — best-effort unlink; swallow exceptions.
+
+5. `jobs/registry.py` — `JobRegistry`.
+   - `__init__(max_concurrent: int | None = None)`: semaphore sized from arg or
+     env `KILN_JOBS_MAX_CONCURRENT` (default 10); in-memory
+     `dict[str, JobRecord]`; `dict[str, JobWorker]` type map;
+     `dict[str, asyncio.Task]` supervising tasks; FIFO `pending` queue of job
+     ids; a `JobEventBus`.
+   - `register_type(worker_cls)`: instantiate and index by `type_name`.
+   - `_new_job_id()`: `j_` + 12 lowercase base32 chars (from `secrets`/`uuid4`
+     bytes, mapped to `abcdefghijklmnopqrstuvwxyz234567`).
+   - `create(type_name, params, project_id=None, metadata=None) -> JobRecord`:
+     validate params against `params_model`, build a `pending` record stamped
+     with `supports_pause`, enqueue, emit a `job` event, then try to start
+     pending jobs (respecting the semaphore). Returns the record.
+   - `_try_start_pending()`: while semaphore slots available and FIFO queue
+     non-empty, pop next still-`pending` job and launch its supervising task.
+   - `_launch(job)`: mint `run_id`, set `running` + `started_at`, reconcile via
+     `compute_state` (if `is_complete` -> straight to `succeeded`), emit, then
+     create the supervising `asyncio.Task` running `_supervise`.
+   - `_supervise(job_id, params)`: acquire semaphore inside the task; build a
+     `JobContext`; call `worker.run`; on normal return set `succeeded` + store
+     result summary; on `CancelledError` honor the pending intent (pause ->
+     `paused` after `compute_state` reconcile, else `cancelled`); on other
+     exception set `failed`, append the exception to the error log, store a
+     `JobError`. Always release the slot and kick `_try_start_pending`.
+   - Progress callback: `report_progress` updates the record's `JobProgress`
+     and emits a `job` event (coalescing is a Phase-2 SSE concern; Phase 1 emits
+     per call). `report_error` callback writes to the error log via
+     `error_log.append_error(run_id, {...})`.
+   - `pause(job_id)`: only valid for `running` + `supports_pause`; flag intent
+     `paused`, cancel the task. (Not-running or not-pausable raises a clear
+     error -> Phase 2 maps to 409.)
+   - `resume(job_id)`: only valid for `paused`; reconcile via `compute_state`
+     (if `is_complete` -> `succeeded`), else set back to `pending`/enqueue and
+     `_try_start_pending` (fresh `run()` / fresh `run_id`).
+   - `cancel(job_id)`: `pending` -> `cancelled` immediately (dequeue);
+     `running`/`paused` -> flag intent `cancelled`, cancel task; terminal ->
+     raise.
+   - `delete(job_id)`: terminal only (else raise); drop record, best-effort
+     delete error-log file for its `run_id`, emit a `deleted` event.
+   - `get(job_id) -> JobRecord | None`: reconcile via `compute_state` and emit
+     `job` if changed, then return the record.
+   - `list_jobs(status=None, type=None, project_id=None, since=None, limit=None)`:
+     filter + sort `created_at desc`.
+   - `_reconcile(job, derived)`: when `derived` is not `None`, update progress
+     counts/total/message and, if `is_complete` on a non-terminal job, mark
+     `succeeded`. Returns whether anything changed.
+   - Reconciliation correctly keeps the believed snapshot when `compute_state`
+     returns `None` (the Noop case).
+   - Provide a module-level `job_registry` singleton plus the class so tests can
+     instantiate fresh isolated registries.
+
+6. `jobs/workers/__init__.py` — package marker.
+
+7. `jobs/workers/noop.py` — `NoopJobParams`, `NoopJobResult`, `NoopJobWorker`
+   exactly per functional_spec §7 (`steps`, `sleep_per_step_seconds`,
+   `fail_at_step`, `error_at_steps`; `compute_state` -> `None`; `run` reports
+   success/error counts and calls `report_error` for `error_at_steps`).
+
+## Tests
+
+Tests live in `app/desktop/studio_server/jobs/` as `test_*.py`, async style
+(`@pytest.mark.asyncio`), using fresh `JobRegistry` instances and a short
+`sleep_per_step_seconds` for speed. Helper to poll until a job reaches a target
+status with a timeout.
+
+- `test_error_log.py`
+  - append + read round-trips a list of entries; entries preserve `**extra`.
+  - missing file -> `[]`; unreadable/garbage lines skipped -> partial list.
+  - delete removes the file; delete of missing file is a no-op.
+- `test_events.py`
+  - subscribe yields an initial `snapshot` containing current jobs.
+  - a subsequent `publish_job` is delivered as a `job` event.
+  - `publish_deleted` delivers a `deleted` tombstone with the id.
+  - filtering by `project_id` / `type` / `job_id` excludes non-matching events
+    and scopes the snapshot.
+- `test_registry.py`
+  - full lifecycle: create -> running -> succeeded; `result.completed_steps`
+    equals `steps`; `started_at`/`ended_at` populated.
+  - failure path: `fail_at_step` -> `failed`; `error` summary set; the fatal
+    exception is captured in the error log for the run.
+  - cancel from pending (job never started) -> `cancelled`, no task.
+  - cancel from running -> `cancelled`.
+  - pause running -> `paused`; resume -> running -> succeeded; a fresh `run_id`
+    is minted on resume (differs from the first run).
+  - pause rejected when `supports_pause = False` (use a tiny non-pausable test
+    worker) and when not running.
+  - delete on terminal succeeds and emits `deleted`; delete while running/pending
+    raises.
+  - error-log capture: `error_at_steps` entries are readable via the run's
+    `run_id` and the progress `error` count matches; missing file -> `[]`.
+  - `compute_state` returning `None` keeps the believed snapshot (Noop never
+    flips to complete early; progress comes from `report_progress`).
+  - `compute_state` returning `is_complete=True` (test worker) reconciles a job
+    to `succeeded` without running real work.
+  - semaphore caps concurrency: with `max_concurrent=2` and 4 long jobs, exactly
+    2 run while the other 2 stay `pending` (FIFO); as the first finish, pending
+    ones start.
+  - registry emits bus events: subscribing then creating/finishing a job yields
+    `snapshot` + `job` events; deleting yields `deleted`.
diff --git a/specs/projects/background_job_system/phase_plans/phase_2.md b/specs/projects/background_job_system/phase_plans/phase_2.md
new file mode 100644
index 000000000..94ee0c555
--- /dev/null
+++ b/specs/projects/background_job_system/phase_plans/phase_2.md
@@ -0,0 +1,169 @@
+---
+status: complete
+---
+
+# Phase 2: REST API + SSE
+
+## Overview
+
+Phase 1 built the in-memory `JobRegistry` (lifecycle, semaphore, supervising
+tasks, reconciliation, per-run error log) plus the `NoopJobWorker`. Phase 2
+exposes that registry over HTTP without changing it: a FastAPI router
+(`api.py`) covering create / list / get / result / errors / pause / resume /
+cancel / delete, plus an SSE stream (`/api/jobs/events`).
+
+The load-bearing requirement is SSE decoupling: the stream is a pure observer
+of the Phase 1 event bus. A client disconnect tears down only the subscription
+(unsubscribe + stop keepalive); it must never cancel, pause, or otherwise touch
+a job's supervising task. Jobs keep running; only explicit `cancel`/`pause`
+stops them.
+
+Follows functional_spec §5 (REST) and §6 (SSE) exactly. Paths are `/api/jobs/...`
+(not project-scoped). Auth mirrors the studio convention (`openapi_extra`
+policy constants, no FastAPI auth dependency). Error envelope is the existing
+convention (`HTTPException(detail=...)`).
+
+## Steps
+
+1. **`app/desktop/studio_server/jobs/api.py`** — new module exposing the
+   process-singleton `job_registry` over HTTP via `connect_jobs_api(app: FastAPI)`.
+
+   - Request/response models:
+     - `CreateJobRequest(BaseModel)`: `params: dict[str, Any]`,
+       `metadata: dict[str, Any] | None = None`. (`project_id` is derived from
+       params when the params model carries one, not from the request body.)
+     - `CreateJobResponse(BaseModel)`: `job_id: str`, `status: JobStatus`.
+   - Helper `_project_id_from_params(worker, validated_params) -> str | None`:
+     returns `getattr(validated_params, "project_id", None)` so eval jobs get a
+     `project_id` and noop jobs get `null`. (Open item #2/#3: plain optional
+     filter, no server-side active project.)
+   - Helper `_record_json(record: JobRecord) -> dict`: `record.model_dump(mode="json")`.
+
+   Route ordering (static routes are declared before the `{id}`/`{type}`
+   catch-alls so they are not shadowed):
+   - `GET /api/jobs/events` — SSE (declared first).
+   - `GET /api/jobs` — list with filters.
+   - Then the dynamic routes. POST uses `{type}`; GET/DELETE use `{id}`. They do
+     not collide because, although they share the same one-segment path shape,
+     they use different HTTP methods (`POST /api/jobs/{type}` vs
+     `GET`/`DELETE /api/jobs/{id}`), and the sub-action routes (`/{id}/result`,
+     `/{id}/errors`, `/{id}/pause|resume|cancel`) have an extra path segment.
+
+   Endpoints:
+   - `POST /api/jobs/{type}` (`openapi_extra=ALLOW_AGENT`): validate the type is
+     registered (`JobOperationError` → 404 if unknown type) and `params`
+     against `params_model` (pydantic `ValidationError` → 422). Derive
+     `project_id`. `await job_registry.create(...)`. Return
+     `201 CreateJobResponse`.
+     - Unknown type → 404. Implementation: check `type in registry workers`
+       before validating; raise `HTTPException(404)`.
+     - Invalid params → 422 (raise `RequestValidationError`/`HTTPException(422)`
+       from the caught pydantic `ValidationError`).
+   - `GET /api/jobs` (`ALLOW_AGENT`): query params `status`, `type`,
+     `project_id`, `since` (iso8601 datetime), `limit` (int). Maps to
+     `registry.list_jobs(...)`. Returns `200 list[JobRecord]` (serialized),
+     default sort `created_at desc` (registry already does this).
+   - `GET /api/jobs/{id}` (`ALLOW_AGENT`): `await registry.get(id)` (reconciles +
+     emits). 404 if `None`. Returns `200 <record>`.
+   - `GET /api/jobs/{id}/result` (`ALLOW_AGENT`): get record (no reconcile
+     needed beyond `get`); 404 if unknown, 404 if not terminal or `result is
+     None`. Returns `200 <result dict>`.
+   - `GET /api/jobs/{id}/errors` (`ALLOW_AGENT`): optional `run_id` query.
+     Resolve the run_id (query param if given, else the record's current
+     `run_id`). ALWAYS `200`. Returns `error_log.read_errors(run_id)` or `[]`
+     (also `[]` when the job is unknown or has no run_id — never errors).
+   - `POST /api/jobs/{id}/pause` (mutation policy mirroring eval mutations →
+     `agent_policy_require_approval(...)`): `await registry.pause(id)`;
+     `JobNotFoundError` → 404, `JobOperationError` → 409. Return `202` (empty
+     body, `status_code=202`).
+   - `POST /api/jobs/{id}/resume`: same pattern, `registry.resume`. 202 / 404 / 409.
+   - `POST /api/jobs/{id}/cancel`: same pattern, `registry.cancel`. 202 / 404 / 409.
+   - `DELETE /api/jobs/{id}`: `await registry.delete(id)`; 404 / 409. Return
+     `204` (`status_code=204`, no body).
+   - `GET /api/jobs/events` (`ALLOW_AGENT`): query `job_id`, `type`, `project_id`.
+     Returns `CancellableStreamingResponse(content=_event_stream(...),
+     media_type="text/event-stream")`.
+
+   SSE generator `_event_stream(job_id, type_name, project_id)`:
+   - `subscription = job_registry.events.subscribe(job_id, type_name, project_id)`.
+   - Loop: `event = await asyncio.wait_for(subscription.__anext__(), timeout=KEEPALIVE_SECONDS)`;
+     on success `yield _format_sse(event)`; on `asyncio.TimeoutError` `yield ": ping\n\n"`.
+   - `finally: await subscription.aclose()` (unsubscribe via the generator's
+     `finally`). Cancelling the generator (client disconnect via
+     `CancellableStreamingResponse`) only closes the subscription — the registry
+     and its supervising tasks are untouched.
+   - `_format_sse(event: JobEvent) -> str`: `f"event: {event.event}\n"` +
+     `f"data: {json.dumps(event.data)}\n\n"` (matches the `event:`/`data:` wire
+     format; snapshot/job/deleted carry their `data` dict as built by the bus).
+   - `KEEPALIVE_SECONDS = 15` (open item #9).
+
+2. **Wire into `desktop_server.py`** — add `connect_jobs_api(app)` in
+   `make_app()` alongside the other `connect_*_api(app)` calls, before
+   `connect_webhost(app)` (which stays last). The `connect_jobs_api` function
+   registers `NoopJobWorker` on the singleton `job_registry` (idempotent: guard
+   against double-registration of the same type so repeated `make_app()` calls
+   in tests don't error). Do NOT register `EvalJobWorker` (Phase 3). The
+   registry creates asyncio tasks lazily inside `create`, which runs within a
+   request's running loop, so no special lifespan startup is needed (registration
+   is pure dict mutation, loop-safe).
+
+3. **Regenerate the OpenAPI client schema** — after the API is in, run
+   `app/web_ui/src/lib/generate_schema.sh` so `api_schema.d.ts` reflects the new
+   endpoints and `check_schema.sh` passes. Leave the regenerated file in the
+   working tree (do not commit).
+
+## Tests
+
+`app/desktop/studio_server/jobs/test_api.py` using FastAPI `TestClient` (sync
+endpoints) and `httpx.AsyncClient` + `ASGITransport` for the streaming
+decoupling test. A fresh `JobRegistry` is patched in per test (module-level
+`job_registry` reference) so tests are isolated; `NoopJobWorker` registered.
+`temp_error_log_dir` autouse fixture (monkeypatch tempdir) mirrors
+`test_registry.py`.
+
+- `test_create_returns_201_and_pending` — `POST /api/jobs/noop` with valid
+  params returns 201, body has `job_id` + `status` in {pending, running}.
+- `test_create_unknown_type_404` — `POST /api/jobs/nope` → 404.
+- `test_create_invalid_params_422` — `POST /api/jobs/noop` with `steps:"abc"` → 422.
+- `test_list_empty` — `GET /api/jobs` → 200 `[]`.
+- `test_list_returns_jobs_sorted_desc` — create two jobs, list returns newest first.
+- `test_list_filter_by_status_and_type` — filters narrow results.
+- `test_list_filter_by_project_id` — only matching project_id returned (uses a
+  worker whose params carry project_id, or asserts noop → null filtered out).
+- `test_list_since_and_limit` — `since` excludes older, `limit` caps count.
+- `test_get_returns_record` — `GET /api/jobs/{id}` → 200 with full record.
+- `test_get_unknown_404` — `GET /api/jobs/j_missing` → 404.
+- `test_get_reconciles` — a worker whose compute_state flips to complete is
+  reconciled to succeeded on GET (mirrors registry reconcile test via a stub
+  worker registered on the test registry).
+- `test_result_returns_200_when_terminal` — succeeded noop → 200 result dict
+  `{"completed_steps": n}`.
+- `test_result_404_when_not_terminal` — running job → 404.
+- `test_result_404_unknown` — unknown id → 404.
+- `test_errors_returns_array` — job with `error_at_steps` → 200 list of error
+  objects with `error_message`.
+- `test_errors_empty_when_none` — succeeded clean job → 200 `[]`.
+- `test_errors_unknown_job_returns_empty_200` — unknown id → 200 `[]` (never 404).
+- `test_errors_specific_run_id` — `?run_id=` reads that run's log.
+- `test_pause_then_resume` — pause running → 202, status paused; resume → 202.
+- `test_pause_409_when_not_running` — pause terminal → 409.
+- `test_pause_409_when_unsupported` — non-pausable worker → 409.
+- `test_resume_409_when_not_paused` — resume running → 409.
+- `test_cancel_202` — cancel running → 202, becomes cancelled.
+- `test_cancel_409_when_terminal` — cancel succeeded → 409.
+- `test_cancel_unknown_404` — cancel unknown → 404.
+- `test_delete_204_when_terminal` — delete succeeded → 204, gone from list.
+- `test_delete_409_when_in_flight` — delete running → 409.
+- `test_delete_unknown_404` — delete unknown → 404.
+- SSE:
+  - `test_sse_snapshot_then_job_event` — async client streams `/api/jobs/events`,
+    first event is `snapshot` (empty), then create a noop and observe a `job`
+    event carrying the record.
+  - `test_sse_disconnect_leaves_job_running` (DECOUPLING) — start a long noop,
+    connect + read snapshot/a job event, disconnect the stream mid-run, then
+    assert via the registry that the job continues and reaches succeeded. Proves
+    the stream is a pure observer.
+  - `test_sse_filters_by_job_id` — subscribing with `?job_id=` only sees that
+    job's events.
+- `test_connect_jobs_api_registers_noop_idempotently` — calling
+  `connect_jobs_api` twice does not raise (guard) and registers noop.
diff --git a/specs/projects/background_job_system/phase_plans/phase_3.md b/specs/projects/background_job_system/phase_plans/phase_3.md
new file mode 100644
index 000000000..5e4142d8a
--- /dev/null
+++ b/specs/projects/background_job_system/phase_plans/phase_3.md
@@ -0,0 +1,133 @@
+---
+status: draft
+---
+
+# Phase 3: EvalJobWorker (first real consumer)
+
+## Overview
+
+Add the first real background worker, `EvalJobWorker`, that wraps the existing
+`EvalRunner` unchanged and plugs it into the Phase 1/2 job system. The worker:
+
+- Derives true progress from source-of-truth `EvalRun` entities via
+  `compute_state` (a pure read), so resume/re-trigger reconciles honestly.
+- Runs the eval in the background by streaming `EvalRunner.run()`'s `Progress`
+  yields into `ctx.report_progress`, returning a small `EvalJobResult` summary.
+- Advertises `supports_pause = True` because `EvalRunner.collect_tasks_for_task_run_eval`
+  excludes already-run `(eval_config, run_config, dataset)` triples — cancel +
+  re-run skips completed items and writes no duplicate `EvalRun`s (architecture
+  open item #1, CONFIRMED).
+
+No new endpoint is needed: the generic `POST /api/jobs/{type}` from Phase 2
+dispatches to it once `EvalJobWorker` is registered alongside `NoopJobWorker`.
+
+## Key design decisions (verified against current code)
+
+- **`save_context = None` (KNOWN OPEN ITEM — not equivalent to the request path
+  for git-sync-enabled projects).** `build_save_context(request)` reads
+  `request.state.git_sync_manager` and returns `None` only when git sync isn't
+  active; when it IS active it returns a context that wraps each save in
+  `manager.atomic_write(...)` (commit + push). A background worker has no request
+  and passes `save_context=None`, so `EvalRunner` falls back to
+  `default_save_context` (a no-op). This is identical to the request path ONLY
+  for projects that do NOT have git sync in `auto` mode. For a git-sync-enabled
+  project, background-eval `EvalRun` writes do NOT participate in request-scoped
+  git-sync: they are written to disk but are NOT committed or pushed by the job,
+  unlike the legacy SSE eval endpoint under `/api/projects/...` (which goes
+  through `GitSyncMiddleware` + `build_save_context`). The uncommitted writes sit
+  dirty in the working tree until the next write-locked request triggers
+  `GitSyncManager.ensure_clean()`, whose crash-recovery path stashes dirty files
+  (and hard-resets unpushed commits) — so the background-eval results can be
+  swept out of the working tree into a stash with no UI to recover them, and are
+  never backed up to the remote. We are keeping `save_context=None` for v1; this
+  is a known open item pending a design decision (do not treat it as safe/
+  equivalent for git-sync projects).
+- **Entity loading.** Reuse `eval_config_from_id` / `task_run_config_from_id`
+  from `eval_api.py`. They take only string IDs (resolve the project via
+  `project_from_id` → `task_from_id`), need no `Request`, and raise
+  `HTTPException(404)` on missing entities. In `run()` that surfaces as a normal
+  exception → the registry marks the job `failed` (acceptable). `compute_state`
+  loads the same way; a missing entity there propagates out of reconciliation
+  (the registry only special-cases a `None` return; it does not catch exceptions)
+  so the failure is visible rather than silently treated as "no progress".
+- **`compute_state` counts.** `total` = task runs matching
+  `dataset_filter_from_id(eval.eval_set_filter_id)`. `success` = `EvalRun`
+  children of the eval_config whose `task_run_config_id == run_config_id`.
+  `error = 0` — failed items aren't persisted as entities; the live error count
+  comes from `Progress.errors` during the run only. `is_complete = success >= total`.
+- **Errors (open item #10).** `Progress` exposes only an error *count*, not
+  per-item messages, so `report_progress(error=...)` carries the count and the
+  `/errors` endpoint stays empty for evals in v1. No `report_error` wiring and
+  no change to `EvalRunner`.
+
+## Steps
+
+1. Add `app/desktop/studio_server/jobs/workers/eval.py`:
+
+   ```python
+   class EvalJobParams(BaseModel):
+       project_id: str
+       task_id: str
+       eval_id: str
+       eval_config_id: str
+       run_config_id: str
+
+   class EvalJobResult(BaseModel):
+       total: int
+       success: int
+       error: int
+
+   class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
+       type_name = "eval"
+       params_model = EvalJobParams
+       result_model = EvalJobResult
+       supports_pause = True
+
+       async def compute_state(self, params) -> JobDerivedState: ...
+       async def run(self, params, ctx) -> EvalJobResult: ...
+   ```
+
+   - A private `_build_eval_runner(params) -> EvalRunner` helper that loads the
+     eval_config + run_config and constructs
+     `EvalRunner(eval_configs=[eval_config], run_configs=[run_config],
+     eval_run_type="task_run_eval", save_context=None)` — mirroring
+     `run_eval_config` in `eval_api.py`.
+   - `compute_state` loads the eval_config (and its parent eval), counts
+     filtered task runs for `total`, counts matching `EvalRun`s for `success`,
+     returns `JobDerivedState(total, success, error=0, is_complete=success>=total)`.
+   - `run` builds the runner, iterates `async for progress in eval_runner.run():`
+     calling `await ctx.report_progress(success=progress.complete,
+     error=progress.errors, total=progress.total)`, and returns
+     `EvalJobResult(total, success, error)` from the last `progress`.
+     `EvalRunner.run()` always yields at least an initial `Progress`, so a
+     `last_progress` is guaranteed; default to a zero summary defensively.
+
+2. Register the worker in `connect_jobs_api` (`api.py`) next to
+   `NoopJobWorker`: `job_registry.register_type(EvalJobWorker)`.
+
+3. Verify the OpenAPI schema is unchanged (no new route — the generic create
+   route already exists) via `check_schema.sh`.
+
+## Tests
+
+`app/desktop/studio_server/jobs/workers/test_eval.py`, mirroring the entity
+fixtures from `test_eval_api.py` / `test_eval_runner.py` (Project/Task/Eval/
+EvalConfig/TaskRunConfig/TaskRun in `tmp_path`, pre-seeded `EvalRun`s), patching
+`project_from_id` so the on-disk project resolves.
+
+- `compute_state` with no `EvalRun`s: `total` = number of filtered task runs,
+  `success = 0`, `error = 0`, `is_complete = False`.
+- `compute_state` counts already-scored items: seed `EvalRun`s with matching
+  `task_run_config_id`; `success` equals the seeded count; `is_complete` flips
+  true only when `success >= total`.
+- `compute_state` ignores `EvalRun`s with a different `task_run_config_id`
+  (doesn't over-count).
+- `run` maps `Progress` → `report_progress` and returns the right
+  `EvalJobResult`: patch/stub `EvalRunner.run` to yield canned `Progress`
+  objects, assert the recorded `report_progress` calls and the returned result.
+- Idempotent re-run: seed some `EvalRun`s, run a real `EvalRunner` whose
+  `run_job` is stubbed to write an `EvalRun` per remaining item, assert only the
+  not-yet-scored items are processed and no duplicate `EvalRun`s are written.
+- End-to-end via the registry: `registry.register_type(EvalJobWorker)`,
+  `registry.create("eval", params)` with `EvalRunner.run` stubbed, drive to
+  `succeeded`, assert the final `result` summary and progress counts.
diff --git a/specs/projects/background_job_system/phase_plans/phase_4.md b/specs/projects/background_job_system/phase_plans/phase_4.md
new file mode 100644
index 000000000..0d531286c
--- /dev/null
+++ b/specs/projects/background_job_system/phase_plans/phase_4.md
@@ -0,0 +1,200 @@
+---
+status: draft
+---
+
+# Phase 4: Frontend — jobs store, REST client, jobs panel, sidebar badge
+
+## Overview
+
+The final phase. Phases 1–3 built the in-memory `JobRegistry`, the `/api/jobs`
+REST + SSE surface, and the first real worker (`EvalJobWorker`). This phase is the
+Svelte UI that consumes that surface:
+
+- A store backed by the `GET /api/jobs/events` SSE stream that holds a live,
+  keyed `Map<id, JobRecord>`, handling the three named events (`snapshot`,
+  `job`, `deleted`) per functional_spec §6.
+- A thin typed REST client over the generated OpenAPI `client` for the
+  create/list/get/result/errors/pause/resume/cancel/delete endpoints.
+- A jobs panel at `/jobs` listing jobs with per-job lifecycle actions (only the
+  ones valid for the current status + `supports_pause`), plus drill-in for the
+  per-run error log and the result summary.
+- A small sidebar badge showing the count of active (`pending` / `running` /
+  `paused`) jobs, driven by the same store.
+- A nav entry into both the icon rail (`sidebar_rail.svelte`) and the wide
+  drawer sidebar (`(app)/+layout.svelte`).
+
+### Key design decisions (resolved from the integration map)
+
+- **Store location.** The repo's strong convention is `src/lib/stores/` (every
+  other store + its `*.test.ts` lives there). The spec *suggests* `lib/jobs/`.
+  We follow the repo: `jobs_store.ts` and `jobs_api.ts` live in
+  `src/lib/stores/`. (The architecture doc's `lib/jobs/` path is explicitly
+  "out of strict scope … the natural shape", so matching the repo wins.)
+- **SSE named events.** The jobs stream uses `event: snapshot|job|deleted`
+  (confirmed in `app/desktop/studio_server/jobs/api.py::_format_sse`). So we use
+  `addEventListener('snapshot'|'job'|'deleted', …)`, not the single `onmessage`
+  the extractor store uses.
+- **Pure observer.** The store opens one `EventSource`, reconnects on error, and
+  closes it only when the last subscriber unsubscribes (ref-counted). No job
+  action is ever tied to connection lifecycle. A fresh `snapshot` re-syncs the
+  map on reconnect (no `Last-Event-ID`).
+- **Project filter.** The store opens the stream with
+  `?project_id=$ui_state.current_project_id` when one is set; it re-opens the
+  stream when the active project changes (the badge / panel are project-scoped,
+  matching `?project_id=` list semantics). NoopJobs (no project) only show when
+  no project filter is active — acceptable; the panel is project-scoped.
+- **Stream URL.** The (re)connect URL uses the schema path, but `EventSource`
+  needs a raw URL, so we build it from `base_url` (mirroring
+  `extractor_progress_store`), not the openapi-fetch `client` (which can't do SSE).
+
+## Steps
+
+1. **`src/lib/stores/jobs_api.ts`** — thin REST client. Re-export the generated
+   record type for convenience and wrap each endpoint:
+
+   ```ts
+   import { client } from "$lib/api_client"
+   import type { components } from "$lib/api_schema"
+
+   export type JobRecord = components["schemas"]["JobRecord"]
+   export type BackgroundJobStatus = components["schemas"]["BackgroundJobStatus"]
+   export type JobError = components["schemas"]["JobError"]
+
+   export async function list_jobs(query?: {...}): Promise<JobRecord[]>
+   export async function get_job(id: string): Promise<JobRecord>
+   export async function create_job(type, params, metadata?): Promise<{job_id, status}>
+   export async function get_job_result(id): Promise<Record<string, unknown>>
+   export async function get_job_errors(id, run_id?): Promise<Array<{error_message?: string} & Record<string, unknown>>>
+   export async function pause_job(id): Promise<void>
+   export async function resume_job(id): Promise<void>
+   export async function cancel_job(id): Promise<void>
+   export async function delete_job(id): Promise<void>
+   ```
+
+   Each unwraps `{ data, error }` from openapi-fetch and throws `error` when set
+   (so callers can wrap with `createKilnError`). Lifecycle calls (`pause` etc.)
+   return `void` (the backend returns `202`/`204` with no useful body).
+
+2. **`src/lib/stores/jobs_store.ts`** — the SSE-backed store.
+
+   - Internal `writable<Map<string, JobRecord>>`.
+   - `connect()`: builds `${base_url}/api/jobs/events` with optional
+     `?project_id=`, opens an `EventSource`, registers listeners:
+     - `snapshot`: `JSON.parse(data).jobs` → replace the whole map.
+     - `job`: `JSON.parse(data)` (a full `JobRecord`) → upsert by `id`.
+     - `deleted`: `JSON.parse(data).id` → delete by `id`.
+     - `onerror`: close + schedule a reconnect (small backoff); the next
+       `snapshot` re-syncs.
+   - Ref-counted lifecycle: `subscribe` increments a counter and `connect()`s on
+     first subscriber; the returned unsubscribe decrements and `disconnect()`s
+     (closes the `EventSource`, cancels any pending reconnect) when it hits zero.
+     Closing never touches a job — pure observer.
+   - Re-open on project change: subscribe to `ui_state`; when
+     `current_project_id` changes while connected, tear down and reconnect with
+     the new filter. (Implemented with an exposed `set_project(id)` the module
+     wires to `ui_state`, kept testable by allowing an injected project id.)
+   - Derived exports:
+     - `jobs`: a `Readable<JobRecord[]>` sorted by `created_at desc` (matches the
+       REST default sort) for the panel.
+     - `active_jobs_count`: a `Readable<number>` counting
+       `pending|running|paused` for the badge.
+   - Export an `ACTIVE_STATUSES` set and a helper `is_active(status)` so the
+     badge logic is unit-testable without the DOM.
+   - To make `EventSource` injectable for tests, read the constructor from
+     `globalThis.EventSource` at connect time (tests install a fake on
+     `globalThis`).
+
+3. **`src/lib/stores/job_status.ts`** (small helpers, colocated) — pure
+   functions used by both the panel and tests:
+   - `job_status_display(status)`: human label.
+   - `job_status_badge_class(status)`: DaisyUI badge color class
+     (`badge-info` running, `badge-success` succeeded, `badge-error` failed,
+     `badge-warning` paused, `badge-ghost` pending, neutral cancelled).
+   - `available_actions(job)`: returns which of
+     `pause|resume|cancel|delete` are valid given `status` + `supports_pause`,
+     per state machine (§3) + delete policy (open item #7: terminal only):
+     - `running`: cancel; pause iff `supports_pause`.
+     - `paused`: resume, cancel.
+     - `pending`: cancel.
+     - terminal (`succeeded|failed|cancelled`): delete.
+   - `progress_label(progress)`: `"{success} / {total}"` (+ error count when > 0),
+     and `progress_percent(progress)` for the bar.
+
+4. **`src/lib/components/SidebarJobsBadge.svelte`** — count bubble. Renders the
+   `active_jobs_count`; shows a small primary pill with the number when > 0,
+   nothing when 0. Designed to overlay the rail icon (absolute, top-right) and to
+   sit inline in the wide drawer. Accept a `count` prop (default: subscribe to
+   the store) so it's render-testable in isolation; expose a `variant`
+   (`rail` | `inline`) for placement styling.
+
+5. **`src/routes/(app)/jobs/+page.svelte`** — the panel. Uses `AppPage`
+   (`../../app_page.svelte`) with title "Jobs" and a short subtitle. Subscribes
+   to `jobs`. States:
+   - Loading: spinner until the first `snapshot` arrives (track a
+     `connected/received-snapshot` flag on the store).
+   - Empty: educational empty state (icon + heading + one-liner explaining that
+     background jobs like evals run here and keep running even if you navigate
+     away). No destructive CTA.
+   - List: a table (`bg-base-200` header, matching the app's table style) with
+     columns: Type, Status (colored badge), Progress (`success/total`, error
+     count, thin progress bar), Message, Created, and an Actions cell.
+     - Actions render only `available_actions(job)`; each calls the matching
+       `jobs_api` fn, wrapped in try/catch → toast/inline error. Optimistic UI
+       is unnecessary — the SSE event will reflect the real transition.
+     - "Errors" button (always available when `progress.error > 0` or status is
+       `failed`) opens a `Dialog` that lazy-loads `get_job_errors(id)` and lists
+       `error_message` rows; "Result" button (when terminal + has result) opens a
+       `Dialog` showing the result summary JSON in a `<pre>`.
+   - Use `formatDate` from `$lib/utils/formatters` for timestamps.
+
+6. **`src/lib/ui/section.ts`** — add `Jobs` to the `Section` enum.
+
+7. **`src/routes/(app)/sidebar_rail.svelte`** — add a `SidebarRailItem`
+   `href="/jobs"` with a briefcase/stack icon and an overlaid `SidebarJobsBadge`
+   (rail variant). Place it after Evals / before the optimize group.
+
+8. **`src/routes/(app)/+layout.svelte`** —
+   - Add the `/jobs` → `Section.Jobs` branch to the section reactive block.
+   - Add a wide-drawer `<li>` nav entry mirroring the Evals entry, with the
+     inline badge.
+   - Import `SidebarJobsBadge`.
+
+## Tests
+
+`src/lib/stores/jobs_store.test.ts` (jsdom, fake `EventSource` installed on
+`globalThis`):
+
+- **snapshot replace**: dispatch a `snapshot` with two jobs → `jobs` has both;
+  dispatch a second `snapshot` with one different job → map fully replaced.
+- **job upsert (insert)**: `job` event for a new id adds it.
+- **job upsert (status transition + progress update)**: `job` event for an
+  existing id with a new `status`/`progress` replaces the stored record (counts
+  reflect the latest `job` event's progress, not an accumulated total).
+- **deleted removal**: `deleted` event removes the id; deleting an unknown id is
+  a no-op.
+- **reconnect re-sync**: trigger `onerror` → the fake records that `close()` was
+  called and a new `EventSource` is constructed after the backoff; a fresh
+  `snapshot` repopulates the map (stale entries from before are gone).
+- **active count derivation**: a mix of statuses → `active_jobs_count` counts
+  only `pending|running|paused`.
+- **pure-observer teardown**: last unsubscribe closes the `EventSource`; a job
+  action is never invoked by the store (assert no fetch/callback fired on close).
+- **project filter**: connecting with a project id builds the URL with
+  `?project_id=`; changing the project closes the old source and opens a new one
+  with the new filter.
+
+`src/lib/stores/job_status.test.ts` (pure, no DOM):
+
+- `available_actions` returns the correct sets for each status (running w/ &
+  w/o `supports_pause`, paused, pending, each terminal).
+- `job_status_badge_class` / `job_status_display` map each status.
+- `progress_label` / `progress_percent` for total=null, zero, partial, full.
+
+`src/lib/stores/jobs_api.test.ts` (mock `$lib/api_client`'s `client`):
+
+- each wrapper calls the expected client method + path with the right
+  params, and throws when the client returns `error`.
+
+`src/lib/components/SidebarJobsBadge.test.ts` (jsdom, render):
+
+- renders the count when > 0; renders nothing when 0.
diff --git a/specs/projects/background_job_system/project_overview.md b/specs/projects/background_job_system/project_overview.md
new file mode 100644
index 000000000..f06ef0e8c
--- /dev/null
+++ b/specs/projects/background_job_system/project_overview.md
@@ -0,0 +1,48 @@
+---
+status: complete
+---
+
+# Background Job System
+
+A generic background-job layer for the local Kiln app (FastAPI on `:8757`). It provides tracked, controllable jobs that run as asyncio tasks in-process, and exposes lifecycle (list / get / pause / resume / cancel / delete) and progress (SSE) over HTTP.
+
+Job records are **in-memory only** — they are ephemeral bookkeeping for visibility and control, never a source of truth. The authoritative state of any operation lives in the Kiln project entities it touches (eval runs, task runs, etc.). Every worker must be **idempotent**: it derives "what's already done" by reading those entities, so a re-run converges to the same end state without duplicating side effects. Because of this, nothing is persisted and there is nothing to recover at startup — re-triggering a job after a crash or restart is always safe.
+
+A standalone, general-purpose layer. It is intentionally generic (typed workers, opaque params/result, free-form `metadata`) so other features can build on it later, but this spec designs no integration with any specific consumer — future consumers adapt to this system, not the reverse.
+
+## Goal & scope
+
+**In scope.**
+- A generic `Job` shape: base record + per-type opaque payloads (params / result).
+- A `JobRegistry` that supervises asyncio tasks, tracks state in-memory, and emits events.
+- REST API for `create / list / get / result / errors / pause / resume / cancel / delete`.
+- SSE stream for live state and progress — success/error counts, idempotent snapshots, not deltas.
+- An idempotency contract on workers: each derives its true state from source-of-truth reads on Kiln entities, so re-runs (including pause→resume) converge without duplicating side effects.
+- Per-run error-message capture: errors spool to an ephemeral, best-effort JSON file in the OS temp dir, keyed by a per-run UUID, retrievable on demand and gracefully empty if gone.
+- A reference `NoopJob` worker for end-to-end validation.
+- An `EvalJob` worker that wraps the existing `EvalRunner` (which internally uses `AsyncJobRunner`). No changes to `EvalRunner` or `AsyncJobRunner`.
+
+**Out of scope (deferred).**
+- Any assistant / orchestration layer that consumes this system — separate, future work, not designed for here.
+- Cloud Run remoting / surviving the desktop-app process.
+- Full per-job log capture / streaming / replay (beyond the per-run error-message capture above).
+- Job dependencies / DAGs.
+- Plan-style multi-job approval.
+
+## Positioning vs. `AsyncJobRunner`
+
+`AsyncJobRunner` (`Kiln/libs/core/kiln_ai/utils/async_job_runner.py`) is a low-level worker pool that parallelizes "do N similar things" inside a single domain operation. It is in-memory, has no lifecycle beyond `.run()` returning, and is consumed by `EvalRunner`, `ExtractorRunner`, RAG runners, etc.
+
+This new layer sits **above** `AsyncJobRunner`. It does not replace it. The composition is:
+
+```
+JobRegistry              (new — tracked lifecycle, in-memory, HTTP, SSE)
+  └─ EvalJobWorker       (new — one tracked job per eval invocation)
+       └─ EvalRunner     (existing — unchanged)
+            └─ AsyncJobRunner   (existing — unchanged)
+                 └─ N parallel eval calls
+```
+
+Existing adapters keep using `AsyncJobRunner` internally. What changes for evals is the *HTTP entry point and tracking*: a new `POST /api/jobs/eval` returns a `job_id` while the eval itself runs in the background, alongside the existing blocking SSE `GET /api/.../run_comparison`, which stays for the legacy browser flow.
+
+A defining difference: the legacy blocking endpoint runs the eval *inside the HTTP request*, so closing the browser cancels it. A job in the new system runs independently of any connection — the user can close the web UI entirely and the job keeps running; the SSE stream only *observes* it (see functional spec §6).

From 90b200de5e8a9694e59336820cab0a1b532895c1 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 28 May 2026 19:53:19 +0800
Subject: [PATCH 02/26] harness for testing eval running through job

---
 app/web_ui/src/routes/(app)/jobs/+page.svelte |   9 +
 .../routes/(app)/jobs/run_eval_dialog.svelte  | 309 ++++++++++++++++++
 .../routes/(app)/jobs/run_eval_dialog.test.ts | 286 ++++++++++++++++
 .../routes/(app)/jobs/run_eval_job.test.ts    | 282 ++++++++++++++++
 .../src/routes/(app)/jobs/run_eval_job.ts     | 171 ++++++++++
 5 files changed, 1057 insertions(+)
 create mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte
 create mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts
 create mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts
 create mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_job.ts

diff --git a/app/web_ui/src/routes/(app)/jobs/+page.svelte b/app/web_ui/src/routes/(app)/jobs/+page.svelte
index bbfd10191..dc4792fe3 100644
--- a/app/web_ui/src/routes/(app)/jobs/+page.svelte
+++ b/app/web_ui/src/routes/(app)/jobs/+page.svelte
@@ -1,6 +1,7 @@
 <script lang="ts">
   import AppPage from "../app_page.svelte"
   import Dialog from "$lib/ui/dialog.svelte"
+  import RunEvalDialog from "./run_eval_dialog.svelte"
   import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
   import { jobs, synced, connection } from "$lib/stores/jobs_store"
   import {
@@ -64,7 +65,13 @@
     }
   }
 
+  let run_eval_dialog: RunEvalDialog
+
   $: action_buttons = [
+    {
+      label: "Run eval",
+      handler: () => run_eval_dialog?.show(),
+    },
     {
       label: creating_test_job ? "Starting…" : "Start test job",
       handler: start_test_job,
@@ -360,3 +367,5 @@
     <p class="text-sm text-gray-500">No result available.</p>
   {/if}
 </Dialog>
+
+<RunEvalDialog bind:this={run_eval_dialog} />
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte b/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte
new file mode 100644
index 000000000..2b24f0820
--- /dev/null
+++ b/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte
@@ -0,0 +1,309 @@
+<script lang="ts">
+  import Dialog from "$lib/ui/dialog.svelte"
+  import FormElement from "$lib/utils/form_element.svelte"
+  import { client } from "$lib/api_client"
+  import { KilnError, createKilnError } from "$lib/utils/error_handlers"
+  import { create_job } from "$lib/stores/jobs_api"
+  import {
+    ui_state,
+    model_info,
+    load_model_info,
+    load_task,
+    get_task_composite_id,
+  } from "$lib/stores"
+  import {
+    load_task_run_configs,
+    run_configs_by_task_composite_id,
+  } from "$lib/stores/run_configs_store"
+  import type { Eval, EvalConfig } from "$lib/types"
+  import {
+    can_submit_run_eval,
+    eval_config_options,
+    load_eval_judges,
+    run_config_options,
+    start_eval_job,
+  } from "./run_eval_job"
+
+  let dialog: Dialog | null = null
+
+  $: project_id = $ui_state.current_project_id
+  $: task_id = $ui_state.current_task_id
+  $: has_task = !!project_id && !!task_id
+
+  let evals: Eval[] | null = null
+  let evals_loading = false
+  let evals_error: KilnError | null = null
+  // Bindable so tests can drive the eval-selection reactive path (FancySelect
+  // can't be opened in jsdom).
+  export let selected_eval_id: string | null = null
+
+  let eval_configs: EvalConfig[] | null = null
+  let eval_configs_loading = false
+  let eval_configs_error: KilnError | null = null
+  let default_eval_config_id: string | null = null
+  let selected_eval_config_id: string | null = null
+
+  let run_configs_loading = false
+  let run_configs_error: KilnError | null = null
+  let default_run_config_id: string | null = null
+  let selected_run_config_id: string | null = null
+
+  let submitting = false
+  let submit_error: KilnError | null = null
+
+  $: current_run_configs = task_id
+    ? $run_configs_by_task_composite_id[
+        get_task_composite_id(project_id ?? "", task_id)
+      ] || null
+    : null
+
+  $: judge_select_options = eval_config_options(
+    eval_configs,
+    default_eval_config_id,
+    $model_info,
+  )
+  $: run_config_select_options = run_config_options(
+    current_run_configs,
+    default_run_config_id,
+    $model_info,
+  )
+
+  $: eval_select_options = evals
+    ? [
+        {
+          label: "Evals",
+          options: evals.map((e) => ({ value: e.id, label: e.name })),
+        },
+      ]
+    : []
+
+  $: submit_disabled =
+    !has_task ||
+    submitting ||
+    !can_submit_run_eval({
+      project_id,
+      task_id,
+      eval_id: selected_eval_id,
+      eval_config_id: selected_eval_config_id,
+      run_config_id: selected_run_config_id,
+    })
+
+  export function show() {
+    submit_error = null
+    dialog?.show()
+    void on_open()
+  }
+
+  async function on_open() {
+    // Reset selections each time the dialog opens.
+    selected_eval_id = null
+    eval_configs = null
+    selected_eval_config_id = null
+    default_eval_config_id = null
+    default_run_config_id = null
+    selected_run_config_id = null
+    eval_configs_error = null
+    run_configs_error = null
+    if (!has_task) {
+      return
+    }
+    void load_model_info()
+    await Promise.all([load_evals(), load_run_configs()])
+  }
+
+  async function load_evals() {
+    if (!project_id || !task_id) {
+      return
+    }
+    evals = null
+    evals_error = null
+    evals_loading = true
+    try {
+      const { data, error } = await client.GET(
+        "/api/projects/{project_id}/tasks/{task_id}/evals",
+        { params: { path: { project_id, task_id } } },
+      )
+      if (error) {
+        throw error
+      }
+      evals = data
+    } catch (e) {
+      evals_error = createKilnError(e)
+    } finally {
+      evals_loading = false
+    }
+  }
+
+  async function load_run_configs() {
+    if (!project_id || !task_id) {
+      return
+    }
+    run_configs_error = null
+    run_configs_loading = true
+    try {
+      await load_task_run_configs(project_id, task_id)
+      const task = await load_task(project_id, task_id)
+      default_run_config_id = task?.default_run_config_id ?? null
+      if (!selected_run_config_id && default_run_config_id) {
+        selected_run_config_id = default_run_config_id
+      }
+    } catch (e) {
+      run_configs_error = createKilnError(e)
+    } finally {
+      run_configs_loading = false
+    }
+  }
+
+  // When an eval is chosen, load it (for its default judge) and its judges.
+  $: void on_eval_selected(selected_eval_id)
+  async function on_eval_selected(eval_id: string | null) {
+    eval_configs = null
+    selected_eval_config_id = null
+    default_eval_config_id = null
+    eval_configs_error = null
+    if (!eval_id || !project_id || !task_id) {
+      return
+    }
+    eval_configs_loading = true
+    try {
+      // Bail out if the user switched evals while the GETs were in flight, so a
+      // stale response can't clobber the newer eval's judge state.
+      const result = await load_eval_judges(
+        client.GET,
+        { project_id, task_id, eval_id },
+        () => selected_eval_id === eval_id,
+      )
+      if (result.stale) {
+        return
+      }
+      eval_configs = result.eval_configs
+      default_eval_config_id = result.default_eval_config_id
+      selected_eval_config_id = result.selected_eval_config_id
+    } catch (e) {
+      if (selected_eval_id !== eval_id) {
+        return
+      }
+      eval_configs_error = createKilnError(e)
+    } finally {
+      if (selected_eval_id === eval_id) {
+        eval_configs_loading = false
+      }
+    }
+  }
+
+  async function submit() {
+    submit_error = null
+    submitting = true
+    try {
+      const started = await start_eval_job(create_job, {
+        project_id,
+        task_id,
+        eval_id: selected_eval_id,
+        eval_config_id: selected_eval_config_id,
+        run_config_id: selected_run_config_id,
+      })
+      if (started) {
+        dialog?.close()
+      }
+    } catch (e) {
+      submit_error = createKilnError(e)
+    } finally {
+      submitting = false
+    }
+  }
+</script>
+
+<Dialog bind:this={dialog} title="Run an Eval">
+  {#if !has_task}
+    <p class="text-sm text-gray-500">
+      Select a task first to run an eval as a background job.
+    </p>
+  {:else}
+    <div class="flex flex-col gap-4">
+      <div>
+        <FormElement
+          id="run_eval_eval_select"
+          label="Eval"
+          description="Choose the eval to run."
+          inputType="fancy_select"
+          bind:value={selected_eval_id}
+          fancy_select_options={eval_select_options}
+          empty_label="Select an eval"
+          empty_state_message="No evals for this task yet"
+          disabled={evals_loading}
+        />
+        {#if evals_loading}
+          <div class="text-xs text-gray-500 mt-1">Loading evals…</div>
+        {:else if evals_error}
+          <div class="text-error text-sm mt-1">
+            {evals_error.getMessage() || "Could not load evals."}
+          </div>
+        {/if}
+      </div>
+
+      {#if selected_eval_id}
+        <div>
+          <FormElement
+            id="run_eval_judge_select"
+            label="Judge"
+            description="Select the judge used to score outputs."
+            inputType="fancy_select"
+            bind:value={selected_eval_config_id}
+            fancy_select_options={judge_select_options}
+            empty_label="Select a judge"
+            empty_state_message="No judges for this eval yet"
+            disabled={eval_configs_loading}
+          />
+          {#if eval_configs_loading}
+            <div class="text-xs text-gray-500 mt-1">Loading judges…</div>
+          {:else if eval_configs_error}
+            <div class="text-error text-sm mt-1">
+              {eval_configs_error.getMessage() || "Could not load judges."}
+            </div>
+          {/if}
+        </div>
+      {/if}
+
+      <div>
+        <FormElement
+          id="run_eval_run_config_select"
+          label="Run Method"
+          description="Select the run configuration to evaluate."
+          inputType="fancy_select"
+          bind:value={selected_run_config_id}
+          fancy_select_options={run_config_select_options}
+          empty_label="Select a run method"
+          empty_state_message="No run methods for this task yet"
+          disabled={run_configs_loading}
+        />
+        {#if run_configs_loading}
+          <div class="text-xs text-gray-500 mt-1">Loading run methods…</div>
+        {:else if run_configs_error}
+          <div class="text-error text-sm mt-1">
+            {run_configs_error.getMessage() || "Could not load run methods."}
+          </div>
+        {/if}
+      </div>
+
+      {#if submit_error}
+        <div role="alert" class="alert alert-error text-sm">
+          <span>{submit_error.getMessage() || "Could not start the eval."}</span
+          >
+        </div>
+      {/if}
+
+      <div class="flex flex-row justify-end mt-2">
+        <button
+          class="btn btn-sm h-10 min-w-24 btn-primary"
+          disabled={submit_disabled}
+          on:click={submit}
+        >
+          {#if submitting}
+            <div class="loading loading-spinner loading-sm"></div>
+          {/if}
+          Run eval
+        </button>
+      </div>
+    </div>
+  {/if}
+</Dialog>
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts b/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts
new file mode 100644
index 000000000..b2de70918
--- /dev/null
+++ b/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts
@@ -0,0 +1,286 @@
+// @vitest-environment jsdom
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"
+import { render, waitFor, cleanup } from "@testing-library/svelte"
+import { tick } from "svelte"
+import { client } from "$lib/api_client"
+import { ui_state, default_ui_state } from "$lib/stores"
+import { run_configs_by_task_composite_id } from "$lib/stores/run_configs_store"
+import RunEvalDialog from "./run_eval_dialog.svelte"
+
+vi.mock("$lib/api_client", () => ({
+  client: { GET: vi.fn(), POST: vi.fn(), DELETE: vi.fn() },
+  base_url: "http://localhost:8757",
+}))
+
+vi.mock("$lib/stores/jobs_api", () => ({
+  create_job: vi.fn(),
+}))
+
+// FancySelect relies on @floating-ui/dom, which is unavailable in jsdom.
+vi.mock("@floating-ui/dom", () => ({
+  computePosition: vi.fn().mockResolvedValue({ x: 0, y: 0 }),
+  autoUpdate: vi.fn(() => () => {}),
+  offset: vi.fn(),
+}))
+
+// HTMLDialogElement methods are not implemented in jsdom.
+beforeEach(() => {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  ;(HTMLDialogElement.prototype as any).showModal = vi.fn()
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  ;(HTMLDialogElement.prototype as any).close = vi.fn()
+})
+
+const mockGET = client.GET as unknown as ReturnType<typeof vi.fn>
+
+function set_task() {
+  ui_state.set({
+    ...default_ui_state,
+    current_project_id: "p_1",
+    current_task_id: "t_1",
+  })
+}
+
+function set_no_task() {
+  ui_state.set({ ...default_ui_state })
+}
+
+// Routes each GET to the right fixture based on its URL template.
+function stub_endpoints() {
+  run_configs_by_task_composite_id.set({})
+  mockGET.mockImplementation((url: string) => {
+    if (url.endsWith("/evals")) {
+      return Promise.resolve({
+        data: [{ id: "e_1", name: "Quality Eval" }],
+        error: undefined,
+      })
+    }
+    if (url.endsWith("/eval_configs")) {
+      return Promise.resolve({
+        data: [{ id: "ec_1", name: "Judge One" }],
+        error: undefined,
+      })
+    }
+    if (url.endsWith("/evals/{eval_id}")) {
+      return Promise.resolve({
+        data: { id: "e_1", name: "Quality Eval", current_config_id: "ec_1" },
+        error: undefined,
+      })
+    }
+    if (url.endsWith("/run_configs")) {
+      return Promise.resolve({
+        data: [
+          {
+            id: "rc_1",
+            name: "Default Run",
+            run_config_properties: { type: "mcp" },
+          },
+        ],
+        error: undefined,
+      })
+    }
+    if (url.endsWith("/tasks/{task_id}")) {
+      return Promise.resolve({
+        data: { id: "t_1", default_run_config_id: "rc_1" },
+        error: undefined,
+      })
+    }
+    return Promise.resolve({ data: null, error: undefined })
+  })
+}
+
+function submit_button(): HTMLButtonElement {
+  const btn = Array.from(document.body.querySelectorAll("button")).find((b) =>
+    b.textContent?.includes("Run eval"),
+  )
+  if (!btn) throw new Error("Run eval button not rendered")
+  return btn as HTMLButtonElement
+}
+
+afterEach(() => {
+  cleanup()
+  vi.clearAllMocks()
+  set_no_task()
+})
+
+describe("RunEvalDialog", () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+  })
+
+  it("shows a 'select a task first' message and no submit when no task is selected", async () => {
+    set_no_task()
+    const { component } = render(RunEvalDialog)
+    component.show()
+    await tick()
+    expect(document.body.textContent).toContain("Select a task first")
+    expect(
+      Array.from(document.body.querySelectorAll("button")).some((b) =>
+        b.textContent?.includes("Run eval"),
+      ),
+    ).toBe(false)
+    // No data should be fetched when there is no task.
+    expect(mockGET).not.toHaveBeenCalled()
+  })
+
+  it("keeps submit disabled until the eval is chosen (judge + run method default automatically)", async () => {
+    set_task()
+    stub_endpoints()
+    const { component } = render(RunEvalDialog)
+    component.show()
+    // The default run method resolves automatically and renders its label.
+    await waitFor(() =>
+      expect(document.body.textContent).toContain("Default Run"),
+    )
+    // The eval picker is still empty, so the job cannot be started yet.
+    expect(submit_button().disabled).toBe(true)
+  })
+
+  // A controllable promise so a test can resolve responses out of order.
+  function deferred<T>() {
+    let resolve!: (value: T) => void
+    const promise = new Promise<T>((r) => {
+      resolve = r
+    })
+    return { promise, resolve }
+  }
+
+  // The judge picker's closed trigger renders the selected config's name (via
+  // formatEvalConfigName, which starts with the config name).
+  function judge_response(eval_id: string | undefined) {
+    return eval_id === "e_a"
+      ? [{ id: "ec_a1", name: "Judge A1" }]
+      : [{ id: "ec_b1", name: "Judge B1" }]
+  }
+
+  // FancySelect cannot be opened in jsdom, so we drive the eval-selection
+  // reactive path via the bindable `selected_eval_id` prop instead.
+  it("resets the judge selection to the new eval's configs when the eval changes", async () => {
+    set_task()
+    run_configs_by_task_composite_id.set({})
+    mockGET.mockImplementation(
+      (url: string, opts?: { params?: { path?: { eval_id?: string } } }) => {
+        const eval_id = opts?.params?.path?.eval_id
+        if (url.endsWith("/evals")) {
+          return Promise.resolve({
+            data: [
+              { id: "e_a", name: "Eval A" },
+              { id: "e_b", name: "Eval B" },
+            ],
+            error: undefined,
+          })
+        }
+        if (url.endsWith("/evals/{eval_id}/eval_configs")) {
+          return Promise.resolve({
+            data: judge_response(eval_id),
+            error: undefined,
+          })
+        }
+        if (url.endsWith("/evals/{eval_id}")) {
+          return Promise.resolve({
+            data: {
+              id: eval_id,
+              current_config_id: eval_id === "e_a" ? "ec_a1" : "ec_b1",
+            },
+            error: undefined,
+          })
+        }
+        if (url.endsWith("/run_configs")) {
+          return Promise.resolve({ data: [], error: undefined })
+        }
+        if (url.endsWith("/tasks/{task_id}")) {
+          return Promise.resolve({ data: { id: "t_1" }, error: undefined })
+        }
+        return Promise.resolve({ data: null, error: undefined })
+      },
+    )
+
+    const { component } = render(RunEvalDialog)
+    component.show()
+    await tick()
+
+    // Select eval A: judge A1 populates and is shown as selected.
+    component.$set({ selected_eval_id: "e_a" })
+    await waitFor(() => expect(document.body.textContent).toContain("Judge A1"))
+    expect(document.body.textContent).not.toContain("Judge B1")
+
+    // Switch to eval B: the judge list/selection resets to B's config.
+    component.$set({ selected_eval_id: "e_b" })
+    await waitFor(() => expect(document.body.textContent).toContain("Judge B1"))
+    expect(document.body.textContent).not.toContain("Judge A1")
+  })
+
+  it("ignores a delayed eval-A response that resolves after switching to eval B (race guard)", async () => {
+    set_task()
+    run_configs_by_task_composite_id.set({})
+
+    // Hold eval A's GETs open so they can resolve AFTER we switch to eval B.
+    const a_eval = deferred<unknown>()
+    const a_configs = deferred<unknown>()
+
+    mockGET.mockImplementation(
+      (url: string, opts?: { params?: { path?: { eval_id?: string } } }) => {
+        const eval_id = opts?.params?.path?.eval_id
+        if (url.endsWith("/evals")) {
+          return Promise.resolve({
+            data: [
+              { id: "e_a", name: "Eval A" },
+              { id: "e_b", name: "Eval B" },
+            ],
+            error: undefined,
+          })
+        }
+        if (url.endsWith("/evals/{eval_id}/eval_configs")) {
+          if (eval_id === "e_a") return a_configs.promise
+          return Promise.resolve({
+            data: judge_response(eval_id),
+            error: undefined,
+          })
+        }
+        if (url.endsWith("/evals/{eval_id}")) {
+          if (eval_id === "e_a") return a_eval.promise
+          return Promise.resolve({
+            data: { id: eval_id, current_config_id: "ec_b1" },
+            error: undefined,
+          })
+        }
+        if (url.endsWith("/run_configs")) {
+          return Promise.resolve({ data: [], error: undefined })
+        }
+        if (url.endsWith("/tasks/{task_id}")) {
+          return Promise.resolve({ data: { id: "t_1" }, error: undefined })
+        }
+        return Promise.resolve({ data: null, error: undefined })
+      },
+    )
+
+    const { component } = render(RunEvalDialog)
+    component.show()
+    await tick()
+
+    // Pick eval A — its GETs are pending. Then quickly switch to eval B, whose
+    // GETs resolve immediately and populate Judge B1.
+    component.$set({ selected_eval_id: "e_a" })
+    await tick()
+    component.$set({ selected_eval_id: "e_b" })
+    await waitFor(() => expect(document.body.textContent).toContain("Judge B1"))
+
+    // Now let eval A's stale responses resolve. They must NOT clobber B's
+    // state. Without the guard, A's late response would overwrite the judge
+    // list back to "Judge A1".
+    a_eval.resolve({
+      data: { id: "e_a", current_config_id: "ec_a1" },
+      error: undefined,
+    })
+    a_configs.resolve({ data: judge_response("e_a"), error: undefined })
+    // Flush A's full promise chain (two awaits + the state assignment) and the
+    // resulting reactive updates so a regression would actually surface.
+    for (let i = 0; i < 5; i++) {
+      await Promise.resolve()
+      await tick()
+    }
+
+    expect(document.body.textContent).toContain("Judge B1")
+    expect(document.body.textContent).not.toContain("Judge A1")
+  })
+})
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts b/app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts
new file mode 100644
index 000000000..18b930603
--- /dev/null
+++ b/app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts
@@ -0,0 +1,282 @@
+import { describe, it, expect, vi } from "vitest"
+import {
+  build_run_eval_params,
+  can_submit_run_eval,
+  eval_config_options,
+  load_eval_judges,
+  run_config_options,
+  start_eval_job,
+  type RunEvalSelection,
+} from "./run_eval_job"
+import type { EvalConfig, TaskRunConfig } from "$lib/types"
+import type { create_job } from "$lib/stores/jobs_api"
+import type { client } from "$lib/api_client"
+
+const complete: RunEvalSelection = {
+  project_id: "p_1",
+  task_id: "t_1",
+  eval_id: "e_1",
+  eval_config_id: "ec_1",
+  run_config_id: "rc_1",
+}
+
+describe("build_run_eval_params", () => {
+  it("returns the create_job payload when all selections are present", () => {
+    expect(build_run_eval_params(complete)).toEqual({
+      project_id: "p_1",
+      task_id: "t_1",
+      eval_id: "e_1",
+      eval_config_id: "ec_1",
+      run_config_id: "rc_1",
+    })
+  })
+
+  it("returns null when the task is not selected", () => {
+    expect(build_run_eval_params({ ...complete, task_id: null })).toBeNull()
+    expect(build_run_eval_params({ ...complete, project_id: null })).toBeNull()
+  })
+
+  it("returns null until every picker has a value", () => {
+    expect(build_run_eval_params({ ...complete, eval_id: null })).toBeNull()
+    expect(
+      build_run_eval_params({ ...complete, eval_config_id: null }),
+    ).toBeNull()
+    expect(
+      build_run_eval_params({ ...complete, run_config_id: null }),
+    ).toBeNull()
+  })
+})
+
+describe("can_submit_run_eval", () => {
+  it("is true only when the selection is complete", () => {
+    expect(can_submit_run_eval(complete)).toBe(true)
+  })
+
+  it("is false when no task is selected", () => {
+    expect(
+      can_submit_run_eval({ ...complete, project_id: null, task_id: null }),
+    ).toBe(false)
+  })
+
+  it("is false until eval, judge, and run config are all chosen", () => {
+    expect(can_submit_run_eval({ ...complete, eval_id: null })).toBe(false)
+    expect(can_submit_run_eval({ ...complete, eval_config_id: null })).toBe(
+      false,
+    )
+    expect(can_submit_run_eval({ ...complete, run_config_id: null })).toBe(
+      false,
+    )
+  })
+})
+
+describe("start_eval_job", () => {
+  it("calls create_job with the eval type, selected params, and project_id", async () => {
+    const create_job_fn = vi.fn().mockResolvedValue({
+      job_id: "j_1",
+      status: "pending",
+    }) as unknown as typeof create_job
+    const started = await start_eval_job(create_job_fn, complete)
+    expect(started).toBe(true)
+    expect(create_job_fn).toHaveBeenCalledTimes(1)
+    expect(create_job_fn).toHaveBeenCalledWith(
+      "eval",
+      {
+        project_id: "p_1",
+        task_id: "t_1",
+        eval_id: "e_1",
+        eval_config_id: "ec_1",
+        run_config_id: "rc_1",
+      },
+      null,
+      "p_1",
+    )
+  })
+
+  it("does not call create_job when the selection is incomplete", async () => {
+    const create_job_fn = vi.fn() as unknown as typeof create_job
+    const started = await start_eval_job(create_job_fn, {
+      ...complete,
+      eval_config_id: null,
+    })
+    expect(started).toBe(false)
+    expect(create_job_fn).not.toHaveBeenCalled()
+  })
+
+  it("does not call create_job when no task is selected", async () => {
+    const create_job_fn = vi.fn() as unknown as typeof create_job
+    const started = await start_eval_job(create_job_fn, {
+      ...complete,
+      project_id: null,
+      task_id: null,
+    })
+    expect(started).toBe(false)
+    expect(create_job_fn).not.toHaveBeenCalled()
+  })
+})
+
+describe("eval_config_options", () => {
+  const configs = [
+    { id: "ec_2", name: "Beta" },
+    { id: "ec_1", name: "Alpha" },
+  ] as unknown as EvalConfig[]
+
+  it("returns an empty list when there are no configs", () => {
+    expect(eval_config_options(null, "ec_1", null)).toEqual([])
+    expect(eval_config_options([], "ec_1", null)).toEqual([])
+  })
+
+  it("places the default judge first and badges it", () => {
+    const groups = eval_config_options(configs, "ec_1", null)
+    expect(groups).toHaveLength(1)
+    const options = groups[0].options
+    expect(options[0].value).toBe("ec_1")
+    expect(options[0].badge).toBe("Default")
+    expect(options[1].value).toBe("ec_2")
+    expect(options[1].badge).toBeUndefined()
+  })
+})
+
+describe("run_config_options", () => {
+  const configs = [
+    {
+      id: "rc_2",
+      name: "Zeta",
+      run_config_properties: { type: "mcp" },
+    },
+    {
+      id: "rc_1",
+      name: "Alpha",
+      run_config_properties: { type: "mcp" },
+    },
+  ] as unknown as TaskRunConfig[]
+
+  it("returns an empty list when there are no configs", () => {
+    expect(run_config_options(null, "rc_1", null)).toEqual([])
+    expect(run_config_options([], "rc_1", null)).toEqual([])
+  })
+
+  it("places the default run config first, badges it, then sorts by name", () => {
+    const groups = run_config_options(configs, "rc_1", null)
+    expect(groups).toHaveLength(1)
+    const options = groups[0].options
+    expect(options[0].value).toBe("rc_1")
+    expect(options[0].badge).toBe("Default")
+    expect(options[1].value).toBe("rc_2")
+  })
+
+  it("sorts by name when there is no default", () => {
+    const groups = run_config_options(configs, null, null)
+    const options = groups[0].options
+    expect(options.map((o) => o.value)).toEqual(["rc_1", "rc_2"])
+  })
+})
+
+describe("load_eval_judges", () => {
+  // A controllable promise so a test can resolve responses out of order.
+  function deferred<T>() {
+    let resolve!: (value: T) => void
+    const promise = new Promise<T>((r) => {
+      resolve = r
+    })
+    return { promise, resolve }
+  }
+
+  const params = { project_id: "p_1", task_id: "t_1", eval_id: "e_1" }
+
+  function stub_get(responses: {
+    evaluator: unknown
+    configs: unknown
+  }): typeof client.GET {
+    return vi.fn((url: string) => {
+      if (url.endsWith("/eval_configs")) {
+        return Promise.resolve(responses.configs)
+      }
+      return Promise.resolve(responses.evaluator)
+    }) as unknown as typeof client.GET
+  }
+
+  it("returns the eval's default judge and selects it", async () => {
+    const get = stub_get({
+      evaluator: {
+        data: { id: "e_1", current_config_id: "ec_2" },
+        error: undefined,
+      },
+      configs: {
+        data: [{ id: "ec_1" }, { id: "ec_2" }],
+        error: undefined,
+      },
+    })
+    const result = await load_eval_judges(get, params, () => true)
+    expect(result.stale).toBe(false)
+    if (result.stale) throw new Error("unexpected stale")
+    expect(result.default_eval_config_id).toBe("ec_2")
+    expect(result.selected_eval_config_id).toBe("ec_2")
+    expect(result.eval_configs.map((c) => c.id)).toEqual(["ec_1", "ec_2"])
+  })
+
+  it("falls back to the first judge when the eval has no default", async () => {
+    const get = stub_get({
+      evaluator: {
+        data: { id: "e_1", current_config_id: null },
+        error: undefined,
+      },
+      configs: {
+        data: [{ id: "ec_1" }, { id: "ec_2" }],
+        error: undefined,
+      },
+    })
+    const result = await load_eval_judges(get, params, () => true)
+    if (result.stale) throw new Error("unexpected stale")
+    expect(result.default_eval_config_id).toBeNull()
+    expect(result.selected_eval_config_id).toBe("ec_1")
+  })
+
+  it("bails out as stale when the eval changes during the first GET", async () => {
+    let is_current = true
+    const evaluator = deferred<unknown>()
+    const get = vi.fn(() => evaluator.promise) as unknown as typeof client.GET
+    const pending = load_eval_judges(get, params, () => is_current)
+    // User switches evals before the first response resolves.
+    is_current = false
+    evaluator.resolve({
+      data: { id: "e_1", current_config_id: "ec_2" },
+      error: undefined,
+    })
+    const result = await pending
+    expect(result.stale).toBe(true)
+    // The second GET must not even be issued once we know we are stale.
+    expect(get).toHaveBeenCalledTimes(1)
+  })
+
+  it("bails out as stale when the eval changes during the configs GET", async () => {
+    let is_current = true
+    const configs = deferred<unknown>()
+    const get = vi.fn((url: string) => {
+      if (url.endsWith("/eval_configs")) return configs.promise
+      return Promise.resolve({
+        data: { id: "e_1", current_config_id: "ec_2" },
+        error: undefined,
+      })
+    }) as unknown as typeof client.GET
+    const pending = load_eval_judges(get, params, () => is_current)
+    // Let the first (evaluator) GET resolve. The configs GET must then be in
+    // flight, otherwise this would only re-test the first staleness check.
+    await Promise.resolve()
+    await Promise.resolve()
+    expect(get).toHaveBeenCalledTimes(2)
+    is_current = false
+    configs.resolve({ data: [{ id: "ec_1" }], error: undefined })
+    const result = await pending
+    expect(result.stale).toBe(true)
+  })
+
+  it("throws when an in-flight (still current) request errors", async () => {
+    const get = stub_get({
+      evaluator: { data: undefined, error: { message: "boom" } },
+      configs: { data: [], error: undefined },
+    })
+    await expect(load_eval_judges(get, params, () => true)).rejects.toEqual({
+      message: "boom",
+    })
+  })
+})
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_job.ts b/app/web_ui/src/routes/(app)/jobs/run_eval_job.ts
new file mode 100644
index 000000000..c904f90c9
--- /dev/null
+++ b/app/web_ui/src/routes/(app)/jobs/run_eval_job.ts
@@ -0,0 +1,171 @@
+import type { EvalConfig, TaskRunConfig } from "$lib/types"
+import type { OptionGroup } from "$lib/ui/fancy_select_types"
+import { formatEvalConfigName } from "$lib/utils/formatters"
+import { getRunConfigModelDisplayName } from "$lib/utils/run_config_formatters"
+import type { ProviderModels } from "$lib/types"
+import type { create_job } from "$lib/stores/jobs_api"
+import type { client } from "$lib/api_client"
+
+export type RunEvalSelection = {
+  project_id: string | null
+  task_id: string | null
+  eval_id: string | null
+  eval_config_id: string | null
+  run_config_id: string | null
+}
+
+export type RunEvalJobParams = {
+  project_id: string
+  task_id: string
+  eval_id: string
+  eval_config_id: string
+  run_config_id: string
+}
+
+// True only when project, task, eval, judge, and run config are all selected.
+export function can_submit_run_eval(selection: RunEvalSelection): boolean {
+  return build_run_eval_params(selection) !== null
+}
+
+// Builds the create_job params for an eval run. Every id must be set (non-null,
+// non-empty); if any is missing the selection is incomplete and null is returned.
+export function build_run_eval_params(
+  selection: RunEvalSelection,
+): RunEvalJobParams | null {
+  const { project_id, task_id, eval_id, eval_config_id, run_config_id } =
+    selection
+  // No current project/task.
+  if (!project_id || !task_id) {
+    return null
+  }
+  // At least one picker (eval, judge, run config) is still empty.
+  if (!eval_id || !eval_config_id || !run_config_id) {
+    return null
+  }
+  return { project_id, task_id, eval_id, eval_config_id, run_config_id }
+}
+
+// Creates the eval background job when the selection is complete. Resolves to
+// true once create_job_fn has resolved, or false (no call made) when incomplete.
+export async function start_eval_job(
+  create_job_fn: typeof create_job,
+  selection: RunEvalSelection,
+): Promise<boolean> {
+  const job_params = build_run_eval_params(selection)
+  if (job_params !== null) {
+    // project_id travels both inside the params and as its own argument.
+    await create_job_fn("eval", { ...job_params }, null, job_params.project_id)
+  }
+  return job_params !== null
+}
+
+// Judge picker options: the default judge comes first and is badged "Default"
+// (matching the compare_run_configs picker); the rest keep their given order.
+export function eval_config_options(
+  configs: EvalConfig[] | null,
+  default_eval_config_id: string | null | undefined,
+  model_info: ProviderModels | null,
+): OptionGroup[] {
+  if (!configs?.length) return []
+  const is_default = (c: EvalConfig) => c.id === default_eval_config_id
+  // Partition rather than sort: default judge up front, the others untouched.
+  const ordered = [
+    ...configs.filter(is_default),
+    ...configs.filter((config) => !is_default(config)),
+  ]
+  return [
+    {
+      label: "Judges",
+      options: ordered.map((config) => ({
+        value: config.id,
+        label: formatEvalConfigName(config, model_info),
+        badge: is_default(config) ? "Default" : undefined,
+      })),
+    },
+  ]
+}
+
+// Resolved judge state for an eval, or STALE when the request was superseded.
+export type LoadEvalJudgesResult =
+  | {
+      stale: false
+      eval_configs: EvalConfig[] // the eval's judge list
+      default_eval_config_id: string | null // the eval's current_config_id
+      selected_eval_config_id: string | null // default, else first judge
+    }
+  | { stale: true }
+
+const STALE: LoadEvalJudgesResult = { stale: true } // shared sentinel
+
+// Loads an eval's default judge, then its judge list (two sequential GETs).
+// `is_current` is checked after each await: a superseded request returns STALE
+// (even if it errored) rather than clobbering newer state; else errors throw.
+export async function load_eval_judges(
+  get: typeof client.GET,
+  params: { project_id: string; task_id: string; eval_id: string },
+  is_current: () => boolean,
+): Promise<LoadEvalJudgesResult> {
+  const { project_id, task_id, eval_id } = params
+
+  const evaluator_resp = await get(
+    "/api/projects/{project_id}/tasks/{task_id}/evals/{eval_id}",
+    { params: { path: { project_id, task_id, eval_id } } },
+  )
+  if (!is_current()) { // checked before .error: a stale failure is dropped
+    return STALE
+  }
+  if (evaluator_resp.error) {
+    throw evaluator_resp.error
+  }
+  const default_eval_config_id = evaluator_resp.data.current_config_id ?? null
+
+  const configs_resp = await get(
+    "/api/projects/{project_id}/tasks/{task_id}/evals/{eval_id}/eval_configs",
+    { params: { path: { project_id, task_id, eval_id } } },
+  )
+  if (!is_current()) { // same rule for the second GET
+    return STALE
+  }
+  if (configs_resp.error) {
+    throw configs_resp.error
+  }
+  const eval_configs = configs_resp.data
+  const selected_eval_config_id =
+    default_eval_config_id ?? eval_configs[0]?.id ?? null // default, else first
+
+  return {
+    stale: false,
+    eval_configs,
+    default_eval_config_id,
+    selected_eval_config_id,
+  }
+}
+
+// Run-method picker options: the default run config leads (badged "Default"),
+// the others follow alphabetically by name — mirrors the eval table.
+export function run_config_options(
+  configs: TaskRunConfig[] | null,
+  default_run_config_id: string | null | undefined,
+  model_info: ProviderModels | null,
+): OptionGroup[] {
+  if (!configs?.length) return []
+  const is_default = (c: TaskRunConfig) => c.id === default_run_config_id
+  // Rank 0 for the default, 1 otherwise; ties fall through to the name compare.
+  const rank = (c: TaskRunConfig) => (is_default(c) ? 0 : 1)
+  const ordered = [...configs].sort(
+    (a, b) => rank(a) - rank(b) || a.name.localeCompare(b.name),
+  )
+  return [
+    {
+      label: "Run Methods",
+      options: ordered.map((config) => {
+        const model_name = getRunConfigModelDisplayName(config, model_info)
+        const label = model_name
+          ? `${config.name} — ${model_name}`
+          : config.name
+        const badge = is_default(config) ? "Default" : undefined
+        return { value: config.id, label, badge }
+      }),
+    },
+  ]
+}

From aed9af93fad32a3d4de1ee66f35f4dd713ae7968 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 28 May 2026 19:54:37 +0800
Subject: [PATCH 03/26] chore: build annotations

---
 .../agent_checks/annotations/delete_api_jobs_id.json     | 9 +++++++++
 .../utils/agent_checks/annotations/get_api_jobs.json     | 8 ++++++++
 .../agent_checks/annotations/get_api_jobs_events.json    | 8 ++++++++
 .../utils/agent_checks/annotations/get_api_jobs_id.json  | 8 ++++++++
 .../agent_checks/annotations/get_api_jobs_id_errors.json | 8 ++++++++
 .../agent_checks/annotations/get_api_jobs_id_result.json | 8 ++++++++
 .../annotations/post_api_jobs_id_cancel.json             | 9 +++++++++
 .../agent_checks/annotations/post_api_jobs_id_pause.json | 9 +++++++++
 .../annotations/post_api_jobs_id_resume.json             | 9 +++++++++
 .../agent_checks/annotations/post_api_jobs_type.json     | 8 ++++++++
 10 files changed, 84 insertions(+)
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/delete_api_jobs_id.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_events.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_errors.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_result.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_cancel.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_pause.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_resume.json
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_type.json

diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/delete_api_jobs_id.json b/libs/server/kiln_server/utils/agent_checks/annotations/delete_api_jobs_id.json
new file mode 100644
index 000000000..32f0fd78e
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/delete_api_jobs_id.json
@@ -0,0 +1,9 @@
+{
+  "method": "delete",
+  "path": "/api/jobs/{id}",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": true,
+    "approval_description": "Allow agent to control background jobs (pause, resume, cancel, delete)?"
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs.json b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs.json
new file mode 100644
index 000000000..e7f0de246
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs.json
@@ -0,0 +1,8 @@
+{
+  "method": "get",
+  "path": "/api/jobs",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_events.json b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_events.json
new file mode 100644
index 000000000..dfb42f0e7
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_events.json
@@ -0,0 +1,8 @@
+{
+  "method": "get",
+  "path": "/api/jobs/events",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id.json b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id.json
new file mode 100644
index 000000000..858ca1150
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id.json
@@ -0,0 +1,8 @@
+{
+  "method": "get",
+  "path": "/api/jobs/{id}",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_errors.json b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_errors.json
new file mode 100644
index 000000000..9df668758
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_errors.json
@@ -0,0 +1,8 @@
+{
+  "method": "get",
+  "path": "/api/jobs/{id}/errors",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_result.json b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_result.json
new file mode 100644
index 000000000..c9384bc76
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_result.json
@@ -0,0 +1,8 @@
+{
+  "method": "get",
+  "path": "/api/jobs/{id}/result",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_cancel.json b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_cancel.json
new file mode 100644
index 000000000..f48df2706
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_cancel.json
@@ -0,0 +1,9 @@
+{
+  "method": "post",
+  "path": "/api/jobs/{id}/cancel",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": true,
+    "approval_description": "Allow agent to control background jobs (pause, resume, cancel, delete)?"
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_pause.json b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_pause.json
new file mode 100644
index 000000000..bbb24cb9c
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_pause.json
@@ -0,0 +1,9 @@
+{
+  "method": "post",
+  "path": "/api/jobs/{id}/pause",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": true,
+    "approval_description": "Allow agent to control background jobs (pause, resume, cancel, delete)?"
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_resume.json b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_resume.json
new file mode 100644
index 000000000..0291b1c58
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_id_resume.json
@@ -0,0 +1,9 @@
+{
+  "method": "post",
+  "path": "/api/jobs/{id}/resume",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": true,
+    "approval_description": "Allow agent to control background jobs (pause, resume, cancel, delete)?"
+  }
+}
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_type.json b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_type.json
new file mode 100644
index 000000000..7583bb379
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/post_api_jobs_type.json
@@ -0,0 +1,8 @@
+{
+  "method": "post",
+  "path": "/api/jobs/{type}",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}

From 509e16076490db649657fba52f0f4aa7227879e8 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 28 May 2026 21:19:48 +0800
Subject: [PATCH 04/26] feat: support waiting for job

---
 app/desktop/studio_server/jobs/api.py         | 58 ++++++++++++-
 app/desktop/studio_server/jobs/registry.py    | 33 ++++++++
 app/desktop/studio_server/jobs/test_api.py    | 76 +++++++++++++++++
 .../studio_server/jobs/test_registry.py       | 83 +++++++++++++++++++
 app/web_ui/src/lib/api_schema.d.ts            | 68 ++++++++++++++-
 app/web_ui/src/lib/stores/jobs_api.ts         |  5 +-
 app/web_ui/src/routes/(app)/jobs/+page.svelte |  4 +
 .../annotations/get_api_jobs_id_wait.json     |  8 ++
 8 files changed, 330 insertions(+), 5 deletions(-)
 create mode 100644 libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_wait.json

diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py
index ec66c6fcd..ec862e7a3 100644
--- a/app/desktop/studio_server/jobs/api.py
+++ b/app/desktop/studio_server/jobs/api.py
@@ -164,12 +164,28 @@ async def list_jobs(
         summary="Create Job",
         tags=["Jobs"],
         status_code=201,
+        response_model=CreateJobResponse | JobRecord,
         openapi_extra=ALLOW_AGENT,
     )
     async def create_job(
         type: Annotated[str, Path(description="The registered job type to run.")],
         request: CreateJobRequest,
-    ) -> CreateJobResponse:
+        wait: Annotated[
+            bool,
+            Query(
+                description="When true, block until the job reaches a terminal "
+                "state and return the full JobRecord instead of CreateJobResponse."
+            ),
+        ] = False,
+        timeout: Annotated[
+            float | None,
+            Query(
+                ge=0,
+                description="Seconds to wait when wait=true (504 on timeout). "
+                "Omit to wait indefinitely.",
+            ),
+        ] = None,
+    ) -> CreateJobResponse | JobRecord:
         try:
             worker = job_registry.worker_for(type)
         except JobOperationError:
@@ -186,7 +202,14 @@ async def create_job(
             project_id=request.project_id or _project_id_from_params(validated),
             metadata=request.metadata,
         )
-        return CreateJobResponse(job_id=job.id, status=job.status)
+        if not wait:
+            return CreateJobResponse(job_id=job.id, status=job.status)
+        try:
+            return await job_registry.wait(job.id, timeout=timeout)
+        except asyncio.TimeoutError:
+            raise HTTPException(
+                status_code=504, detail="Job did not complete within the timeout."
+            )
 
     @app.get(
         "/api/jobs/{id}",
@@ -220,6 +243,37 @@ async def get_job_result(
             )
         return job.result
 
+    @app.get(
+        "/api/jobs/{id}/wait",
+        summary="Wait For Job",
+        tags=["Jobs"],
+        openapi_extra=ALLOW_AGENT,
+    )
+    async def wait_for_job(
+        id: Annotated[str, Path(description="The job id.")],
+        timeout: Annotated[
+            float | None,
+            Query(
+                ge=0,
+                description="Seconds to wait before giving up (504 on timeout). "
+                "Omit to wait indefinitely.",
+            ),
+        ] = None,
+    ) -> JobRecord:
+        """Block until the job reaches a terminal state, then return its record.
+
+        A pure observer, like the SSE stream: if the client disconnects, uvicorn
+        cancels this handler (and its wait() await) — the job keeps running.
+        Responds 404 if the job id is unknown, 504 if `timeout` elapses first."""
+        try:
+            return await job_registry.wait(id, timeout=timeout)
+        except JobNotFoundError:
+            raise HTTPException(status_code=404, detail=f"Job not found: {id}")
+        except asyncio.TimeoutError:  # from asyncio.wait_for in registry.wait()
+            raise HTTPException(
+                status_code=504, detail="Job did not complete within the timeout."
+            )
+
     @app.get(
         "/api/jobs/{id}/errors",
         summary="Get Job Errors",
diff --git a/app/desktop/studio_server/jobs/registry.py b/app/desktop/studio_server/jobs/registry.py
index e8d37d55b..b85521f0c 100644
--- a/app/desktop/studio_server/jobs/registry.py
+++ b/app/desktop/studio_server/jobs/registry.py
@@ -86,6 +86,12 @@ def __init__(self, max_concurrent: int | None = None) -> None:
         # normally — the former must transition to paused/cancelled, the latter
         # must keep its succeeded result.
         self._cancel_delivered: set[str] = set()
+        # Per-job completion events for awaiters (registry.wait). Created lazily
+        # by wait(); set by _emit() on the terminal transition; reclaimed in
+        # delete(). Bounded to one event per waited job, tracking the same
+        # lifecycle as the JobRecord. Shared across all awaiters of a job so one
+        # awaiter cancelling its wait() leaves the event (and the task) untouched.
+        self._completion_events: dict[str, asyncio.Event] = {}
         self._running_count = 0
         self.events = JobEventBus(snapshot_provider=self._snapshot)
 
@@ -383,6 +389,7 @@ async def delete(self, job_id: str) -> None:
             )
         self._jobs.pop(job_id, None)
         self._remove_pending(job_id)
+        self._completion_events.pop(job_id, None)
         if job.run_id is not None:
             error_log.delete_errors(job.run_id)
         self.events.publish_deleted(job_id, job.type, job.project_id)
@@ -474,6 +481,32 @@ def _touch(self, job: JobRecord) -> None:
 
     def _emit(self, job: JobRecord) -> None:
         self.events.publish_job(job)
+        if job.status.is_terminal:  # terminal: wake any wait() callers
+            ev = self._completion_events.get(job.id)  # None if never awaited
+            if ev is not None:
+                ev.set()
+
+    # -- await completion ----------------------------------------------------
+
+    async def wait(self, job_id: str, timeout: float | None = None) -> JobRecord:
+        """Observe a job until it reaches a terminal state, then return its record.
+
+        A pure observer, mirroring the SSE stream's decoupling: cancelling this
+        await (caller drops off / client disconnects) tears down only the awaiter
+        — the job's supervising task is owned by the registry and keeps running.
+        Multi-waiter safe: all awaiters of a job share one Event. timeout is in
+        seconds (None waits indefinitely); on expiry asyncio.TimeoutError from
+        asyncio.wait_for propagates. Unknown job ids raise JobNotFoundError.
+        """
+        job = self._require(job_id)
+        # Register the event before the terminal check so there's no race window:
+        # there is no await between setdefault and the check, so no other task
+        # can run in between, and _emit only sets an event that already exists.
+        ev = self._completion_events.setdefault(job_id, asyncio.Event())
+        if job.status.is_terminal:
+            return job
+        await asyncio.wait_for(ev.wait(), timeout)
+        return job
 
 
 job_registry = JobRegistry()
diff --git a/app/desktop/studio_server/jobs/test_api.py b/app/desktop/studio_server/jobs/test_api.py
index 9e5429b91..e7c66a4e1 100644
--- a/app/desktop/studio_server/jobs/test_api.py
+++ b/app/desktop/studio_server/jobs/test_api.py
@@ -529,6 +529,82 @@ async def test_delete_unknown_404(client):
     assert resp.status_code == 404
 
 
+# -- wait --------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_wait_endpoint_200_terminal_record(client):
+    resp = await client.post(
+        "/api/jobs/noop", json={"params": {"steps": 3, "sleep_per_step_seconds": 0.02}}
+    )
+    job_id = resp.json()["job_id"]
+    got = await client.get(f"/api/jobs/{job_id}/wait", timeout=10.0)
+    assert got.status_code == 200, got.text
+    body = got.json()
+    assert body["id"] == job_id
+    assert body["status"] == "succeeded"
+    assert body["result"] == {"completed_steps": 3}
+
+
+@pytest.mark.asyncio
+async def test_wait_endpoint_404_unknown(client):
+    resp = await client.get("/api/jobs/j_missing/wait")
+    assert resp.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_wait_endpoint_504_on_timeout(client, registry):
+    job_id = await _create_noop(client, steps=50, sleep_per_step_seconds=0.05)
+    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
+    resp = await client.get(f"/api/jobs/{job_id}/wait", params={"timeout": 0.01})
+    assert resp.status_code == 504
+    await registry.cancel(job_id)
+
+
+@pytest.mark.asyncio
+async def test_create_wait_true_returns_terminal_record(client):
+    resp = await client.post(
+        "/api/jobs/noop",
+        params={"wait": "true"},
+        json={"params": {"steps": 3, "sleep_per_step_seconds": 0.02}},
+        timeout=10.0,
+    )
+    assert resp.status_code == 201, resp.text
+    body = resp.json()
+    assert body["id"].startswith("j_")
+    assert body["status"] == "succeeded"
+    assert body["result"] == {"completed_steps": 3}
+
+
+@pytest.mark.asyncio
+async def test_create_wait_false_returns_create_response(client, registry):
+    resp = await client.post(
+        "/api/jobs/noop",
+        params={"wait": "false"},
+        json={"params": {"steps": 50, "sleep_per_step_seconds": 0.05}},
+    )
+    assert resp.status_code == 201
+    body = resp.json()
+    assert body["job_id"].startswith("j_")
+    assert body["status"] in ("pending", "running")
+    assert "result" not in body
+    await registry.cancel(body["job_id"])
+
+
+@pytest.mark.asyncio
+async def test_create_wait_true_timeout_504(client, registry):
+    resp = await client.post(
+        "/api/jobs/noop",
+        params={"wait": "true", "timeout": 0.01},
+        json={"params": {"steps": 50, "sleep_per_step_seconds": 0.05}},
+    )
+    assert resp.status_code == 504
+    # The job was still created and keeps running despite the awaiter timing out.
+    running = [r for r in registry.list_jobs() if not r.status.is_terminal]
+    assert len(running) == 1
+    await registry.cancel(running[0].id)
+
+
 # -- wiring ------------------------------------------------------------------
 
 
diff --git a/app/desktop/studio_server/jobs/test_registry.py b/app/desktop/studio_server/jobs/test_registry.py
index 2dab8909c..d31235d63 100644
--- a/app/desktop/studio_server/jobs/test_registry.py
+++ b/app/desktop/studio_server/jobs/test_registry.py
@@ -709,6 +709,89 @@ async def collect():
     assert any(e.data["status"] == "succeeded" for e in job_events)
 
 
+# -- wait --------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_wait_returns_immediately_for_terminal_job(registry):
+    job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED)
+    awaited = await asyncio.wait_for(registry.wait(job.id), timeout=1.0)
+    assert awaited.id == job.id
+    assert awaited.status == BackgroundJobStatus.SUCCEEDED
+    assert awaited.result == {"completed_steps": 2}
+
+
+@pytest.mark.asyncio
+async def test_wait_blocks_then_returns_terminal_record(registry):
+    job = await registry.create("noop", {"steps": 4, "sleep_per_step_seconds": 0.03})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    awaited = await asyncio.wait_for(registry.wait(job.id), timeout=3.0)
+    assert awaited.status == BackgroundJobStatus.SUCCEEDED
+    assert awaited.result == {"completed_steps": 4}
+
+
+@pytest.mark.asyncio
+async def test_wait_unknown_raises(registry):
+    with pytest.raises(JobNotFoundError):
+        await registry.wait("j_doesnotexist")
+
+
+@pytest.mark.asyncio
+async def test_wait_times_out(registry):
+    job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    with pytest.raises(asyncio.TimeoutError):
+        await registry.wait(job.id, timeout=0.01)
+    await registry.cancel(job.id)
+
+
+@pytest.mark.asyncio
+async def test_wait_cancellation_leaves_job_running(registry):
+    # The load-bearing decoupling invariant: abandoning a wait() must NOT stop
+    # the job. A second concurrent waiter still resolves to the terminal record.
+    job = await registry.create("noop", {"steps": 6, "sleep_per_step_seconds": 0.05})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+
+    abandoned = asyncio.create_task(registry.wait(job.id))
+    survivor = asyncio.create_task(registry.wait(job.id))
+    # Let both awaiters reach their await point, then abandon the first.
+    await asyncio.sleep(0.02)
+    abandoned.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await abandoned
+
+    # The job keeps running and the surviving waiter resolves to its terminal
+    # record — the supervising task was untouched by the cancelled awaiter.
+    result = await asyncio.wait_for(survivor, timeout=3.0)
+    assert result.status == BackgroundJobStatus.SUCCEEDED
+    assert result.result == {"completed_steps": 6}
+
+
+@pytest.mark.asyncio
+async def test_wait_multiple_waiters_both_resolve(registry):
+    job = await registry.create("noop", {"steps": 4, "sleep_per_step_seconds": 0.03})
+    await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING)
+    first = asyncio.create_task(registry.wait(job.id))
+    second = asyncio.create_task(registry.wait(job.id))
+    one, two = await asyncio.wait_for(asyncio.gather(first, second), timeout=3.0)
+    assert one.status == BackgroundJobStatus.SUCCEEDED
+    assert two.status == BackgroundJobStatus.SUCCEEDED
+    assert one.result == two.result == {"completed_steps": 4}
+
+
+@pytest.mark.asyncio
+async def test_delete_removes_completion_event(registry):
+    job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01})
+    # wait() lazily creates the completion event; it survives to the terminal set.
+    awaited = await asyncio.wait_for(registry.wait(job.id), timeout=3.0)
+    assert awaited.status == BackgroundJobStatus.SUCCEEDED
+    assert job.id in registry._completion_events
+
+    await registry.delete(job.id)
+    assert job.id not in registry._completion_events
+
+
 # -- not found ---------------------------------------------------------------
 
 
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
index 2828ba19f..398401400 100644
--- a/app/web_ui/src/lib/api_schema.d.ts
+++ b/app/web_ui/src/lib/api_schema.d.ts
@@ -3166,6 +3166,30 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/jobs/{id}/wait": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /**
+         * Wait For Job
+         * @description Block until the job reaches a terminal state, then return its record.
+         *
+         *     A pure observer, like the SSE stream: if the client disconnects, uvicorn
+         *     cancels this handler coroutine, which cancels the wait() await and tears
+         *     down only the awaiter — the job's supervising task keeps running.
+         */
+        get: operations["wait_for_job_api_jobs__id__wait_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
     "/api/jobs/{id}/errors": {
         parameters: {
             query?: never;
@@ -17875,7 +17899,12 @@ export interface operations {
     };
     create_job_api_jobs__type__post: {
         parameters: {
-            query?: never;
+            query?: {
+                /** @description When true, block until the job reaches a terminal state and return the full JobRecord instead of CreateJobResponse. */
+                wait?: boolean;
+                /** @description Seconds to wait when wait=true (504 on timeout). Omit to wait indefinitely. */
+                timeout?: number | null;
+            };
             header?: never;
             path: {
                 /** @description The registered job type to run. */
@@ -17895,7 +17924,7 @@ export interface operations {
                     [name: string]: unknown;
                 };
                 content: {
-                    "application/json": components["schemas"]["CreateJobResponse"];
+                    "application/json": components["schemas"]["CreateJobResponse"] | components["schemas"]["JobRecord"];
                 };
             };
             /** @description Validation Error */
@@ -18005,6 +18034,41 @@ export interface operations {
             };
         };
     };
+    wait_for_job_api_jobs__id__wait_get: {
+        parameters: {
+            query?: {
+                /** @description Seconds to wait before giving up (504 on timeout). Omit to wait indefinitely. */
+                timeout?: number | null;
+            };
+            header?: never;
+            path: {
+                /** @description The job id. */
+                id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["JobRecord"];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
     get_job_errors_api_jobs__id__errors_get: {
         parameters: {
             query?: {
diff --git a/app/web_ui/src/lib/stores/jobs_api.ts b/app/web_ui/src/lib/stores/jobs_api.ts
index d05993011..d0070c53a 100644
--- a/app/web_ui/src/lib/stores/jobs_api.ts
+++ b/app/web_ui/src/lib/stores/jobs_api.ts
@@ -45,7 +45,10 @@ export async function create_job(
   params: Record<string, unknown> = {},
   metadata: Record<string, unknown> | null = null,
   project_id: string | null = null,
-): Promise<components["schemas"]["CreateJobResponse"]> {
+): Promise<
+  | components["schemas"]["CreateJobResponse"]
+  | components["schemas"]["JobRecord"]
+> {
   const { data, error } = await client.POST("/api/jobs/{type}", {
     params: { path: { type } },
     body: { params, metadata, project_id },
diff --git a/app/web_ui/src/routes/(app)/jobs/+page.svelte b/app/web_ui/src/routes/(app)/jobs/+page.svelte
index dc4792fe3..636807666 100644
--- a/app/web_ui/src/routes/(app)/jobs/+page.svelte
+++ b/app/web_ui/src/routes/(app)/jobs/+page.svelte
@@ -222,6 +222,7 @@
       <table class="table">
         <thead>
           <tr>
+            <th>ID</th>
             <th>Type</th>
             <th>Status</th>
             <th>Progress</th>
@@ -233,6 +234,9 @@
         <tbody>
           {#each $jobs as job (job.id)}
             <tr>
+              <td class="font-mono text-xs text-gray-500 whitespace-nowrap"
+                >{job.id}</td
+              >
               <td class="font-medium">{job_type_display(job.type)}</td>
               <td>
                 <span class="badge {job_status_badge_class(job.status)}">
diff --git a/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_wait.json b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_wait.json
new file mode 100644
index 000000000..f13245b8c
--- /dev/null
+++ b/libs/server/kiln_server/utils/agent_checks/annotations/get_api_jobs_id_wait.json
@@ -0,0 +1,8 @@
+{
+  "method": "get",
+  "path": "/api/jobs/{id}/wait",
+  "agent_policy": {
+    "permission": "allow",
+    "requires_approval": false
+  }
+}

From 18d5988e33bc3f757f7a332ec04947008014eace Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Fri, 29 May 2026 13:59:42 +0800
Subject: [PATCH 05/26] feat: job widget + dialog

---
 .../lib/components/SidebarJobsBadge.svelte    |  32 --
 .../lib/components/SidebarJobsBadge.test.ts   |  40 --
 .../components/SidebarJobsIndicator.svelte    |  60 +++
 .../components/SidebarJobsIndicator.test.ts   |  59 +++
 .../components/jobs_dialog.component.test.ts  | 118 ++++++
 .../src/lib/components/jobs_dialog.svelte     |  28 ++
 .../src/lib/components/jobs_table.svelte      | 360 ++++++++++++++++++
 .../src/lib/components/jobs_table.test.ts     | 142 +++++++
 app/web_ui/src/lib/stores/job_status.test.ts  |  34 ++
 app/web_ui/src/lib/stores/job_status.ts       |  28 ++
 app/web_ui/src/lib/stores/jobs_dialog.test.ts |  14 +
 app/web_ui/src/lib/stores/jobs_dialog.ts      |  22 ++
 app/web_ui/src/routes/(app)/+layout.svelte    |  16 +-
 app/web_ui/src/routes/(app)/jobs/+page.svelte | 306 +--------------
 .../src/routes/(app)/sidebar_rail.svelte      |  11 +-
 .../src/routes/(app)/sidebar_rail_item.svelte |  61 ++-
 .../routes/(app)/sidebar_rail_item.test.ts    |  13 +
 17 files changed, 944 insertions(+), 400 deletions(-)
 delete mode 100644 app/web_ui/src/lib/components/SidebarJobsBadge.svelte
 delete mode 100644 app/web_ui/src/lib/components/SidebarJobsBadge.test.ts
 create mode 100644 app/web_ui/src/lib/components/SidebarJobsIndicator.svelte
 create mode 100644 app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts
 create mode 100644 app/web_ui/src/lib/components/jobs_dialog.component.test.ts
 create mode 100644 app/web_ui/src/lib/components/jobs_dialog.svelte
 create mode 100644 app/web_ui/src/lib/components/jobs_table.svelte
 create mode 100644 app/web_ui/src/lib/components/jobs_table.test.ts
 create mode 100644 app/web_ui/src/lib/stores/jobs_dialog.test.ts
 create mode 100644 app/web_ui/src/lib/stores/jobs_dialog.ts

diff --git a/app/web_ui/src/lib/components/SidebarJobsBadge.svelte b/app/web_ui/src/lib/components/SidebarJobsBadge.svelte
deleted file mode 100644
index af843392e..000000000
--- a/app/web_ui/src/lib/components/SidebarJobsBadge.svelte
+++ /dev/null
@@ -1,32 +0,0 @@
-<script lang="ts">
-  import { active_jobs_count } from "$lib/stores/jobs_store"
-
-  // "rail" overlays the count on a sidebar icon (absolute, top-right).
-  // "inline" sits next to a label in the wide drawer.
-  export let variant: "rail" | "inline" = "inline"
-
-  // Defaults to the live active-jobs count, but accepts an override so the
-  // component is render-testable in isolation.
-  export let count: number | undefined = undefined
-
-  $: resolved = count ?? $active_jobs_count
-  $: label = resolved > 99 ? "99+" : `${resolved}`
-</script>
-
-{#if resolved > 0}
-  {#if variant === "rail"}
-    <span
-      class="absolute -top-1 -right-1 min-w-4 h-4 px-1 rounded-full bg-primary text-primary-content text-[10px] leading-4 font-medium text-center"
-      aria-label={`${resolved} active jobs`}
-    >
-      {label}
-    </span>
-  {:else}
-    <span
-      class="badge badge-sm badge-primary"
-      aria-label={`${resolved} active jobs`}
-    >
-      {label}
-    </span>
-  {/if}
-{/if}
diff --git a/app/web_ui/src/lib/components/SidebarJobsBadge.test.ts b/app/web_ui/src/lib/components/SidebarJobsBadge.test.ts
deleted file mode 100644
index 7873285ec..000000000
--- a/app/web_ui/src/lib/components/SidebarJobsBadge.test.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-// @vitest-environment jsdom
-import { describe, it, expect, vi } from "vitest"
-import { render } from "@testing-library/svelte"
-import { writable } from "svelte/store"
-
-vi.mock("$lib/api_client", () => ({
-  base_url: "http://localhost:8757",
-  client: {},
-}))
-
-vi.mock("$lib/stores", () => ({
-  ui_state: writable({ current_project_id: null }),
-}))
-
-const SidebarJobsBadge = (await import("./SidebarJobsBadge.svelte")).default
-
-describe("SidebarJobsBadge", () => {
-  it("renders the count when greater than zero", () => {
-    const { getByText } = render(SidebarJobsBadge, { props: { count: 3 } })
-    expect(getByText("3")).not.toBeNull()
-  })
-
-  it("renders nothing when count is zero", () => {
-    const { container } = render(SidebarJobsBadge, { props: { count: 0 } })
-    expect(container.textContent?.trim()).toBe("")
-  })
-
-  it("caps the displayed count at 99+", () => {
-    const { getByText } = render(SidebarJobsBadge, { props: { count: 150 } })
-    expect(getByText("99+")).not.toBeNull()
-  })
-
-  it("uses the rail variant styling when requested", () => {
-    const { container } = render(SidebarJobsBadge, {
-      props: { count: 2, variant: "rail" },
-    })
-    const span = container.querySelector("span")
-    expect(span?.className).toContain("absolute")
-  })
-})
diff --git a/app/web_ui/src/lib/components/SidebarJobsIndicator.svelte b/app/web_ui/src/lib/components/SidebarJobsIndicator.svelte
new file mode 100644
index 000000000..78bb13d83
--- /dev/null
+++ b/app/web_ui/src/lib/components/SidebarJobsIndicator.svelte
@@ -0,0 +1,60 @@
+<script lang="ts">
+  import { active_jobs_count, jobs } from "$lib/stores/jobs_store"
+  import { jobs_indicator } from "$lib/stores/job_status"
+
+  // "rail" overlays the indicator on a sidebar icon (absolute, top-right).
+  // "inline" sits next to a label in the wide drawer.
+  export let variant: "rail" | "inline" = "inline"
+
+  // Default to the live counts, but accept overrides so the component is
+  // render-testable in isolation.
+  export let active_count: number | undefined = undefined
+  export let total_count: number | undefined = undefined
+
+  $: indicator = jobs_indicator(
+    active_count ?? $active_jobs_count,
+    total_count ?? $jobs.length,
+  )
+  $: label =
+    indicator.kind === "hidden"
+      ? ""
+      : indicator.count > 99
+        ? "99+"
+        : `${indicator.count}`
+  $: aria_label =
+    indicator.kind === "spinner"
+      ? `${indicator.count} active ${indicator.count === 1 ? "job" : "jobs"}`
+      : indicator.kind === "static"
+        ? `${indicator.count} ${indicator.count === 1 ? "job" : "jobs"}`
+        : ""
+</script>
+
+{#if indicator.kind !== "hidden"}
+  {#if variant === "rail"}
+    <span
+      class="absolute -top-1 -right-1 flex items-center gap-0.5 min-w-4 h-4 px-1 rounded-full text-[10px] leading-4 font-medium text-center {indicator.kind ===
+      'spinner'
+        ? 'bg-primary text-primary-content'
+        : 'bg-base-300 text-base-content/70'}"
+      aria-label={aria_label}
+    >
+      {#if indicator.kind === "spinner"}
+        <span class="loading loading-spinner w-2 h-2" aria-hidden="true"></span>
+      {/if}
+      {label}
+    </span>
+  {:else}
+    <span
+      class="badge badge-sm inline-flex items-center gap-1 {indicator.kind ===
+      'spinner'
+        ? 'badge-primary'
+        : 'badge-ghost text-base-content/70'}"
+      aria-label={aria_label}
+    >
+      {#if indicator.kind === "spinner"}
+        <span class="loading loading-spinner w-3 h-3" aria-hidden="true"></span>
+      {/if}
+      {label}
+    </span>
+  {/if}
+{/if}
diff --git a/app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts b/app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts
new file mode 100644
index 000000000..352ffd3ee
--- /dev/null
+++ b/app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts
@@ -0,0 +1,59 @@
+// @vitest-environment jsdom
+import { describe, it, expect, afterEach, vi } from "vitest"
+import { render, cleanup } from "@testing-library/svelte"
+import { readable } from "svelte/store"
+
+// The real jobs_store opens an EventSource on subscribe; mock it with plain
+// stores so the indicator can render in isolation. The component takes
+// active_count / total_count overrides for the actual assertions below.
+vi.mock("$lib/stores/jobs_store", () => ({
+  active_jobs_count: readable(0),
+  jobs: readable([]),
+}))
+
+const SidebarJobsIndicator = (await import("./SidebarJobsIndicator.svelte"))
+  .default
+
+describe("SidebarJobsIndicator", () => {
+  afterEach(() => {
+    cleanup()
+  })
+
+  it("shows a spinner and active count when jobs are active", () => {
+    const { container, getByText } = render(SidebarJobsIndicator, {
+      props: { active_count: 3, total_count: 5 },
+    })
+    expect(getByText("3")).not.toBeNull()
+    expect(container.querySelector(".loading-spinner")).not.toBeNull()
+  })
+
+  it("shows a static muted count without a spinner when none are active", () => {
+    const { container, getByText } = render(SidebarJobsIndicator, {
+      props: { active_count: 0, total_count: 4 },
+    })
+    expect(getByText("4")).not.toBeNull()
+    expect(container.querySelector(".loading-spinner")).toBeNull()
+  })
+
+  it("renders nothing when there are no jobs", () => {
+    const { container } = render(SidebarJobsIndicator, {
+      props: { active_count: 0, total_count: 0 },
+    })
+    expect(container.textContent?.trim()).toBe("")
+  })
+
+  it("caps the displayed count at 99+", () => {
+    const { getByText } = render(SidebarJobsIndicator, {
+      props: { active_count: 150, total_count: 150 },
+    })
+    expect(getByText("99+")).not.toBeNull()
+  })
+
+  it("uses the rail variant styling when requested", () => {
+    const { container } = render(SidebarJobsIndicator, {
+      props: { active_count: 2, total_count: 2, variant: "rail" },
+    })
+    const span = container.querySelector("span")
+    expect(span?.className).toContain("absolute")
+  })
+})
diff --git a/app/web_ui/src/lib/components/jobs_dialog.component.test.ts b/app/web_ui/src/lib/components/jobs_dialog.component.test.ts
new file mode 100644
index 000000000..73f346a6b
--- /dev/null
+++ b/app/web_ui/src/lib/components/jobs_dialog.component.test.ts
@@ -0,0 +1,118 @@
+// @vitest-environment jsdom
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"
+import { render, cleanup } from "@testing-library/svelte"
+import { tick } from "svelte"
+import { writable } from "svelte/store"
+import type { JobRecord } from "$lib/stores/jobs_api"
+import { jobs_dialog } from "$lib/stores/jobs_dialog"
+
+// The dialog hosts JobsTable, which subscribes to the job stream. Mock the
+// stores/api so the table renders an inert empty state.
+const jobs = writable<JobRecord[]>([])
+const synced = writable(true)
+const connection = writable<"idle" | "connecting" | "open" | "errored">("open")
+
+vi.mock("$lib/stores/jobs_store", () => ({
+  jobs,
+  synced,
+  connection,
+}))
+
+vi.mock("$lib/stores/jobs_api", () => ({
+  pause_job: vi.fn().mockResolvedValue(undefined),
+  resume_job: vi.fn().mockResolvedValue(undefined),
+  cancel_job: vi.fn().mockResolvedValue(undefined),
+  delete_job: vi.fn().mockResolvedValue(undefined),
+  get_job_errors: vi.fn().mockResolvedValue([]),
+  get_job_result: vi.fn().mockResolvedValue({}),
+}))
+
+const JobsDialog = (await import("./jobs_dialog.svelte")).default
+
+// jsdom doesn't implement HTMLDialogElement.showModal/close; emulate them so
+// the `open` property reflects the real show()/close() calls the component makes.
+beforeEach(() => {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  ;(HTMLDialogElement.prototype as any).showModal = function (
+    this: HTMLDialogElement,
+  ) {
+    this.setAttribute("open", "")
+  }
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  ;(HTMLDialogElement.prototype as any).close = function (
+    this: HTMLDialogElement,
+  ) {
+    this.removeAttribute("open")
+  }
+  jobs.set([])
+  synced.set(true)
+  connection.set("open")
+})
+
+afterEach(() => {
+  cleanup()
+})
+
+function jobsDialogEl(container: HTMLElement): HTMLDialogElement {
+  // The first dialog in the tree is the Jobs dialog (the errors/result
+  // sub-dialogs live inside JobsTable and follow it).
+  const el = container.querySelector("dialog")
+  expect(el).not.toBeNull()
+  return el as HTMLDialogElement
+}
+
+describe("JobsDialog open signal", () => {
+  it("stays closed on mount even if the signal has already advanced", async () => {
+    // Advance the module-level signal before the component mounts.
+    jobs_dialog.open()
+    jobs_dialog.open()
+
+    const { container } = render(JobsDialog)
+    await tick()
+
+    expect(jobsDialogEl(container).open).toBe(false)
+  })
+
+  it("opens when jobs_dialog.open() is called", async () => {
+    const { container } = render(JobsDialog)
+    await tick()
+    expect(jobsDialogEl(container).open).toBe(false)
+
+    jobs_dialog.open()
+    await tick()
+
+    expect(jobsDialogEl(container).open).toBe(true)
+  })
+
+  it("re-opens after being closed", async () => {
+    const { container } = render(JobsDialog)
+    await tick()
+
+    jobs_dialog.open()
+    await tick()
+    expect(jobsDialogEl(container).open).toBe(true)
+
+    // Close it the way the user would (the dialog's own close()).
+    jobsDialogEl(container).close()
+    expect(jobsDialogEl(container).open).toBe(false)
+
+    jobs_dialog.open()
+    await tick()
+    expect(jobsDialogEl(container).open).toBe(true)
+  })
+
+  it("does not reopen on an unrelated reactive update", async () => {
+    const { container } = render(JobsDialog)
+    await tick()
+    expect(jobsDialogEl(container).open).toBe(false)
+
+    // Mutate unrelated reactive inputs the dialog/table read; none of these
+    // touch the open signal, so the dialog must remain closed.
+    jobs.set([])
+    synced.set(true)
+    connection.set("open")
+    await tick()
+
+    expect(jobsDialogEl(container).open).toBe(false)
+  })
+})
diff --git a/app/web_ui/src/lib/components/jobs_dialog.svelte b/app/web_ui/src/lib/components/jobs_dialog.svelte
new file mode 100644
index 000000000..d5811d0ae
--- /dev/null
+++ b/app/web_ui/src/lib/components/jobs_dialog.svelte
@@ -0,0 +1,28 @@
+<script lang="ts">
+  import { get } from "svelte/store"
+  import Dialog from "$lib/ui/dialog.svelte"
+  import JobsTable from "./jobs_table.svelte"
+  import { jobs_dialog } from "$lib/stores/jobs_dialog"
+
+  let dialog: Dialog
+
+  const jobs_dialog_open_signal = jobs_dialog.open_signal
+
+  // Open whenever the cross-component signal changes. Seed from the current
+  // value so the dialog stays closed on mount even if the signal has already
+  // advanced (e.g. a future conditional remount).
+  let last_signal = get(jobs_dialog_open_signal)
+  $: if ($jobs_dialog_open_signal !== last_signal) {
+    last_signal = $jobs_dialog_open_signal
+    dialog?.show()
+  }
+</script>
+
+<Dialog bind:this={dialog} title="Jobs" width="wide">
+  <p class="text-sm font-light mb-4">
+    <a href="/jobs" class="link" on:click={() => dialog?.close()}
+      >View full page →</a
+    >
+  </p>
+  <JobsTable />
+</Dialog>
diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte
new file mode 100644
index 000000000..73437cc94
--- /dev/null
+++ b/app/web_ui/src/lib/components/jobs_table.svelte
@@ -0,0 +1,360 @@
+<script lang="ts">
+  import Dialog from "$lib/ui/dialog.svelte"
+  import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
+  import CloseIcon from "$lib/ui/icons/close_icon.svelte"
+  import { jobs, synced, connection } from "$lib/stores/jobs_store"
+  import {
+    available_actions,
+    completed_jobs,
+    is_terminal,
+    job_status_badge_class,
+    job_status_display,
+    progress_label,
+    progress_percent,
+    type JobAction,
+  } from "$lib/stores/job_status"
+  import {
+    cancel_job,
+    delete_job,
+    get_job_errors,
+    get_job_result,
+    pause_job,
+    resume_job,
+    type JobError,
+    type JobErrorEntry,
+    type JobRecord,
+  } from "$lib/stores/jobs_api"
+  import { formatDate, capitalize } from "$lib/utils/formatters"
+  import { KilnError, createKilnError } from "$lib/utils/error_handlers"
+
+  let action_error: KilnError | null = null // last failed action; shown in the alert banner
+  let in_flight: Record<string, boolean> = {} // job id -> true while run_action awaits its request
+  let clearing_completed = false // true while clear_completed() awaits its deletes
+
+  $: completed = completed_jobs($jobs) // terminal jobs, recomputed when $jobs changes
+
+  const action_runners: Record<JobAction, (id: string) => Promise<void>> = {
+    pause: pause_job,
+    resume: resume_job,
+    cancel: cancel_job,
+    delete: delete_job,
+  }
+
+  const action_labels: Record<JobAction, string> = {
+    pause: "Pause",
+    resume: "Resume",
+    cancel: "Cancel",
+    delete: "Delete",
+  }
+
+  async function run_action(action: JobAction, id: string) {
+    const perform = action_runners[action]
+    action_error = null
+    in_flight = { ...in_flight, [id]: true }
+    try {
+      await perform(id) // no local mutation: the SSE stream reflects the result
+    } catch (err) {
+      action_error = createKilnError(err)
+    } finally {
+      in_flight = { ...in_flight, [id]: false }
+    }
+  }
+
+  // Deletes every terminal job in parallel, best effort: a failed delete never
+  // blocks the rest, only the first failure is shown; SSE removes the rows.
+  async function clear_completed() {
+    action_error = null
+    clearing_completed = true
+    try {
+      const deletions = completed.map((job) => delete_job(job.id))
+      for (const outcome of await Promise.allSettled(deletions)) {
+        if (outcome.status === "rejected") {
+          action_error = createKilnError(outcome.reason)
+          break
+        }
+      }
+    } finally {
+      clearing_completed = false
+    }
+  }
+
+  // Label shown in the Type column.
+  // "noop" gets the hand-written label "No-op"; every other type is passed
+  // through capitalize().
+  function job_type_display(type: string): string {
+    return type === "noop" ? "No-op" : capitalize(type)
+  }
+
+  function has_errors(job: JobRecord): boolean {
+    return job.status === "failed" || (job.progress?.error ?? 0) > 0
+  }
+
+  // A result is offered only for terminal jobs: a non-null `result` on a job
+  // that is still running would be partial and misleading.
+  function has_result(job: JobRecord): boolean {
+    return is_terminal(job.status) ? job.result != null : false
+  }
+
+  // Failure summary from the job record; null unless the job has failed.
+  function failure_error(job: JobRecord): JobError | null {
+    return job.status !== "failed" ? null : job.error ?? null
+  }
+
+  // Errors dialog state: one dialog shared by all rows, refilled on each open.
+  let errors_dialog: Dialog
+  let errors_loading = false
+  let errors_load_error: KilnError | null = null
+  let error_entries: JobErrorEntry[] = [] // entries returned by get_job_errors
+  let errors_summary: JobError | null = null // failure_error(job) of the opened job
+  // NOTE(review): a late response from an earlier open could overwrite a newer one's state; confirm.
+  async function open_errors(job: JobRecord) {
+    error_entries = [] // clear the previously opened job's data first
+    errors_load_error = null
+    errors_summary = failure_error(job)
+    errors_loading = true
+    errors_dialog?.show() // shown before the fetch, while errors_loading is true
+    try {
+      error_entries = await get_job_errors(job.id)
+    } catch (e) {
+      errors_load_error = createKilnError(e)
+    } finally {
+      errors_loading = false
+    }
+  }
+
+  // Result dialog state: one dialog shared by all rows, refilled on each open.
+  let result_dialog: Dialog
+  let result_loading = false
+  let result_load_error: KilnError | null = null
+  let result_data: Record<string, unknown> | null = null // value returned by get_job_result
+  // NOTE(review): a late response from an earlier open could overwrite a newer one's state; confirm.
+  async function open_result(job: JobRecord) {
+    result_data = null // clear the previously opened job's result first
+    result_load_error = null
+    result_loading = true
+    result_dialog?.show() // shown before the fetch, while result_loading is true
+    try {
+      result_data = await get_job_result(job.id)
+    } catch (e) {
+      result_load_error = createKilnError(e)
+    } finally {
+      result_loading = false
+    }
+  }
+</script>
+
+{#if action_error}
+  <div role="alert" class="alert alert-error text-sm mb-4">
+    <span>{action_error.getMessage() || "An action failed."}</span>
+  </div>
+{/if}
+
+{#if !$synced && $connection === "errored"}
+  <div
+    class="flex flex-col items-center justify-center min-h-[50vh] text-center max-w-md mx-auto"
+  >
+    <div class="text-gray-400 mb-3">
+      <span class="loading loading-spinner loading-md"></span>
+    </div>
+    <h3 class="text-lg font-medium">Can't connect to the job stream</h3>
+    <p class="text-sm text-gray-500 mt-2">
+      We lost the connection to the background job updates and are retrying
+      automatically. Jobs keep running in the background — this page will
+      refresh once the connection is restored.
+    </p>
+  </div>
+{:else if !$synced}
+  <div class="w-full min-h-[50vh] flex justify-center items-center">
+    <div class="loading loading-spinner loading-lg"></div>
+  </div>
+{:else if $jobs.length === 0}
+  <div
+    class="flex flex-col items-center justify-center min-h-[55vh] text-center max-w-md mx-auto"
+  >
+    <div class="w-12 h-12 text-gray-400 mb-4" aria-hidden="true">
+      <JobsIcon />
+    </div>
+    <h3 class="text-lg font-medium">No jobs yet</h3>
+    <p class="text-sm text-gray-500 mt-2">
+      Long-running work like eval runs shows up here. Jobs run in the background
+      — you can leave this page and they'll keep going. Come back any time to
+      check progress, pause, or cancel them.
+    </p>
+  </div>
+{:else}
+  <div class="flex flex-row justify-end mb-3">
+    <button
+      class="btn btn-xs btn-ghost"
+      disabled={clearing_completed || completed.length === 0}
+      on:click={clear_completed}
+    >
+      {#if clearing_completed}
+        <span class="loading loading-spinner loading-xs"></span>
+      {/if}
+      Clear completed
+    </button>
+  </div>
+  <div class="overflow-x-auto rounded-lg border">
+    <table class="table">
+      <thead>
+        <tr>
+          <th>ID</th>
+          <th>Type</th>
+          <th>Status</th>
+          <th>Progress</th>
+          <th>Message</th>
+          <th>Created</th>
+          <th class="text-right">Actions</th>
+        </tr>
+      </thead>
+      <tbody>
+        {#each $jobs as job (job.id)}
+          <tr>
+            <td class="font-mono text-xs text-gray-500 whitespace-nowrap"
+              >{job.id}</td
+            >
+            <td class="font-medium">{job_type_display(job.type)}</td>
+            <td>
+              <span class="badge {job_status_badge_class(job.status)}">
+                {job_status_display(job.status)}
+              </span>
+            </td>
+            <td>
+              <div class="flex flex-col gap-1 min-w-32">
+                <span class="text-sm">{progress_label(job.progress)}</span>
+                {#if job.progress?.total}
+                  <progress
+                    class="progress progress-primary w-32 h-1.5"
+                    value={progress_percent(job.progress)}
+                    max="100"
+                  ></progress>
+                {/if}
+              </div>
+            </td>
+            <td class="text-sm text-gray-500 max-w-48">
+              {#if failure_error(job)?.error}
+                <span
+                  class="text-error block truncate"
+                  title={failure_error(job)?.error}
+                  >{failure_error(job)?.error}</span
+                >
+              {:else}
+                <span class="block truncate">{job.progress?.message || ""}</span
+                >
+              {/if}
+            </td>
+            <td class="text-sm text-gray-500 whitespace-nowrap">
+              {formatDate(job.created_at)}
+            </td>
+            <td>
+              <div
+                class="flex flex-row gap-1 justify-end flex-wrap items-center"
+              >
+                {#if has_result(job)}
+                  <button
+                    class="btn btn-xs btn-ghost"
+                    on:click={() => open_result(job)}
+                  >
+                    Result
+                  </button>
+                {/if}
+                {#if has_errors(job)}
+                  <button
+                    class="btn btn-xs btn-ghost"
+                    on:click={() => open_errors(job)}
+                  >
+                    Errors
+                  </button>
+                {/if}
+                {#each available_actions(job) as action}
+                  {#if action === "delete"}
+                    <button
+                      class="btn btn-xs btn-ghost btn-square text-error"
+                      disabled={in_flight[job.id]}
+                      aria-label="Dismiss job"
+                      title="Dismiss job"
+                      on:click={() => run_action(action, job.id)}
+                    >
+                      <span class="w-4 h-4 block"><CloseIcon /></span>
+                    </button>
+                  {:else}
+                    <button
+                      class="btn btn-xs {action === 'cancel'
+                        ? 'btn-ghost text-error'
+                        : 'btn-ghost'}"
+                      disabled={in_flight[job.id]}
+                      on:click={() => run_action(action, job.id)}
+                    >
+                      {action_labels[action]}
+                    </button>
+                  {/if}
+                {/each}
+              </div>
+            </td>
+          </tr>
+        {/each}
+      </tbody>
+    </table>
+  </div>
+{/if}
+
+<Dialog bind:this={errors_dialog} title="Job Errors" width="wide">
+  {#if errors_summary?.error}
+    <div
+      role="alert"
+      class="alert alert-error text-sm mb-4 flex flex-col items-start gap-1"
+    >
+      <span class="font-medium break-words">{errors_summary.error}</span>
+      {#if errors_summary.detail}
+        <pre
+          class="text-xs w-full bg-base-200 text-base-content rounded-md p-2 overflow-x-auto max-h-48">{JSON.stringify(
+            errors_summary.detail,
+            null,
+            2,
+          )}</pre>
+      {/if}
+    </div>
+  {/if}
+  {#if errors_loading}
+    <div class="flex justify-center py-8">
+      <div class="loading loading-spinner loading-lg"></div>
+    </div>
+  {:else if errors_load_error}
+    <div class="text-error text-sm">
+      {errors_load_error.getMessage() || "Could not load errors."}
+    </div>
+  {:else if error_entries.length === 0}
+    <p class="text-sm text-gray-500">
+      No error messages recorded for this job.
+    </p>
+  {:else}
+    <ul class="flex flex-col gap-2 max-h-[60vh] overflow-y-auto">
+      {#each error_entries as entry, index (index)}
+        <li class="text-sm bg-base-200 rounded-md p-3 font-mono break-words">
+          {entry.error_message || JSON.stringify(entry)}
+        </li>
+      {/each}
+    </ul>
+  {/if}
+</Dialog>
+
+<Dialog bind:this={result_dialog} title="Job Result" width="wide">
+  {#if result_loading}
+    <div class="flex justify-center py-8">
+      <div class="loading loading-spinner loading-lg"></div>
+    </div>
+  {:else if result_load_error}
+    <div class="text-error text-sm">
+      {result_load_error.getMessage() || "Could not load result."}
+    </div>
+  {:else if result_data}
+    <pre
+      class="text-xs bg-base-200 rounded-md p-3 overflow-x-auto max-h-[60vh]">{JSON.stringify(
+        result_data,
+        null,
+        2,
+      )}</pre>
+  {:else}
+    <p class="text-sm text-gray-500">No result available.</p>
+  {/if}
+</Dialog>
diff --git a/app/web_ui/src/lib/components/jobs_table.test.ts b/app/web_ui/src/lib/components/jobs_table.test.ts
new file mode 100644
index 000000000..52c76ac06
--- /dev/null
+++ b/app/web_ui/src/lib/components/jobs_table.test.ts
@@ -0,0 +1,142 @@
+// @vitest-environment jsdom
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"
+import { render, fireEvent, waitFor, cleanup } from "@testing-library/svelte"
+import { writable } from "svelte/store"
+import type { JobRecord } from "$lib/stores/jobs_api"
+
+// Stand-ins for the jobs_store stores the table reads; reset in beforeEach.
+const jobs = writable<JobRecord[]>([])
+const synced = writable(true)
+const connection = writable<"idle" | "connecting" | "open" | "errored">("open")
+
+vi.mock("$lib/stores/jobs_store", () => ({
+  jobs,
+  synced,
+  connection,
+}))
+
+const api = {
+  pause_job: vi.fn().mockResolvedValue(undefined),
+  resume_job: vi.fn().mockResolvedValue(undefined),
+  cancel_job: vi.fn().mockResolvedValue(undefined),
+  delete_job: vi.fn().mockResolvedValue(undefined),
+  get_job_errors: vi.fn().mockResolvedValue([]),
+  get_job_result: vi.fn().mockResolvedValue({}),
+}
+vi.mock("$lib/stores/jobs_api", () => api)
+// Imported dynamically so the component is loaded only after the stores and spies above have been created.
+const JobsTable = (await import("./jobs_table.svelte")).default
+
+function makeJob(overrides: Partial<JobRecord> = {}): JobRecord {
+  const defaults: JobRecord = {
+    id: "j_1",
+    type: "noop",
+    status: "running",
+    supports_pause: false,
+    created_at: "2024-01-01T00:00:00Z",
+  }
+  return { ...defaults, ...overrides }
+}
+
+describe("JobsTable", () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+    synced.set(true)
+    connection.set("open")
+    jobs.set([])
+  })
+
+  afterEach(() => {
+    cleanup()
+  })
+
+  it("Clear completed deletes exactly the terminal jobs", async () => {
+    jobs.set([
+      makeJob({ id: "running", status: "running" }),
+      makeJob({ id: "succeeded", status: "succeeded" }),
+      makeJob({ id: "pending", status: "pending" }),
+      makeJob({ id: "failed", status: "failed" }),
+      makeJob({ id: "cancelled", status: "cancelled" }),
+    ])
+    const { getByText } = render(JobsTable)
+
+    await fireEvent.click(getByText("Clear completed"))
+
+    await waitFor(() => {
+      expect(api.delete_job).toHaveBeenCalledTimes(3)
+    })
+    const deleted = api.delete_job.mock.calls.map((c) => c[0]).sort()
+    expect(deleted).toEqual(["cancelled", "failed", "succeeded"])
+    // It must not touch the active jobs.
+    expect(deleted).not.toContain("running")
+    expect(deleted).not.toContain("pending")
+  })
+
+  it("Clear completed surfaces an error when a delete fails", async () => {
+    jobs.set([makeJob({ id: "failed", status: "failed" })])
+    api.delete_job.mockRejectedValueOnce(new Error("boom"))
+    const { getByText, getByRole } = render(JobsTable)
+
+    await fireEvent.click(getByText("Clear completed"))
+
+    await waitFor(() => {
+      expect(getByRole("alert").textContent).toContain("boom")
+    })
+  })
+
+  it("renders a dismiss button (not a Delete label) for terminal rows", () => {
+    jobs.set([makeJob({ id: "succeeded", status: "succeeded" })])
+    const { getByLabelText, queryByText } = render(JobsTable)
+    expect(getByLabelText("Dismiss job")).not.toBeNull()
+    expect(queryByText("Delete")).toBeNull()
+  })
+
+  it("gates row actions on status: running with pause shows Pause + Cancel", () => {
+    jobs.set([
+      makeJob({ id: "running", status: "running", supports_pause: true }),
+    ])
+    const { getByText, queryByLabelText } = render(JobsTable)
+    expect(getByText("Pause")).not.toBeNull()
+    expect(getByText("Cancel")).not.toBeNull()
+    expect(queryByLabelText("Dismiss job")).toBeNull()
+  })
+
+  it("gates row actions on status: paused shows Resume + Cancel", () => {
+    jobs.set([makeJob({ id: "paused", status: "paused" })])
+    const { getByText } = render(JobsTable)
+    expect(getByText("Resume")).not.toBeNull()
+    expect(getByText("Cancel")).not.toBeNull()
+  })
+
+  it("gates row actions on status: pending shows only Cancel", () => {
+    jobs.set([makeJob({ id: "pending", status: "pending" })])
+    const { getByText, queryByText, queryByLabelText } = render(JobsTable)
+    expect(getByText("Cancel")).not.toBeNull()
+    expect(queryByText("Pause")).toBeNull()
+    expect(queryByText("Resume")).toBeNull()
+    expect(queryByLabelText("Dismiss job")).toBeNull()
+  })
+
+  it("shows the loading spinner before the first sync", () => {
+    synced.set(false)
+    connection.set("connecting")
+    const { container, queryByText } = render(JobsTable)
+    expect(container.querySelector(".loading.loading-spinner")).not.toBeNull()
+    // Neither the table nor the empty state should render while syncing.
+    expect(queryByText("No jobs yet")).toBeNull()
+    expect(container.querySelector("table")).toBeNull()
+  })
+
+  it("shows the empty state when there are no jobs", () => {
+    jobs.set([])
+    const { getByText } = render(JobsTable)
+    expect(getByText("No jobs yet")).not.toBeNull()
+  })
+
+  it("shows the connection-error state when errored before first sync", () => {
+    synced.set(false)
+    connection.set("errored")
+    const { getByText } = render(JobsTable)
+    expect(getByText("Can't connect to the job stream")).not.toBeNull()
+  })
+})
diff --git a/app/web_ui/src/lib/stores/job_status.test.ts b/app/web_ui/src/lib/stores/job_status.test.ts
index 4e6f91ce4..01bf1d708 100644
--- a/app/web_ui/src/lib/stores/job_status.test.ts
+++ b/app/web_ui/src/lib/stores/job_status.test.ts
@@ -1,10 +1,12 @@
 import { describe, it, expect } from "vitest"
 import {
   available_actions,
+  completed_jobs,
   is_active,
   is_terminal,
   job_status_badge_class,
   job_status_display,
+  jobs_indicator,
   progress_label,
   progress_percent,
 } from "./job_status"
@@ -126,3 +128,35 @@ describe("progress_percent", () => {
     expect(progress_percent({ success: 8, error: 2, total: 10 })).toBe(100)
   })
 })
+
+describe("completed_jobs", () => {
+  it("returns exactly the terminal jobs", () => {
+    const jobs = [
+      makeJob({ id: "a", status: "running" }),
+      makeJob({ id: "b", status: "succeeded" }),
+      makeJob({ id: "c", status: "pending" }),
+      makeJob({ id: "d", status: "failed" }),
+      makeJob({ id: "e", status: "paused" }),
+      makeJob({ id: "f", status: "cancelled" }),
+    ]
+    expect(completed_jobs(jobs).map((j) => j.id)).toEqual(["b", "d", "f"])
+  })
+
+  it("returns an empty array when nothing is terminal", () => {
+    expect(completed_jobs([makeJob({ status: "running" })])).toEqual([])
+  })
+})
+
+describe("jobs_indicator", () => {
+  it("shows a spinner with the active count when any job is active", () => {
+    expect(jobs_indicator(2, 5)).toEqual({ kind: "spinner", count: 2 })
+  })
+
+  it("shows a static total count when none active but jobs remain", () => {
+    expect(jobs_indicator(0, 3)).toEqual({ kind: "static", count: 3 })
+  })
+
+  it("is hidden when there are no jobs at all", () => {
+    expect(jobs_indicator(0, 0)).toEqual({ kind: "hidden" })
+  })
+})
diff --git a/app/web_ui/src/lib/stores/job_status.ts b/app/web_ui/src/lib/stores/job_status.ts
index 9d6cdfd7c..9003ab315 100644
--- a/app/web_ui/src/lib/stores/job_status.ts
+++ b/app/web_ui/src/lib/stores/job_status.ts
@@ -107,3 +107,31 @@ export function progress_percent(progress: JobProgress | undefined): number {
   const processed = (progress?.success ?? 0) + (progress?.error ?? 0)
   return Math.max(0, Math.min(100, Math.round((processed / total) * 100)))
 }
+
+// Every job in a terminal state, i.e. exactly what "Clear completed" removes.
+export function completed_jobs(jobs: JobRecord[]): JobRecord[] {
+  return jobs.filter(({ status }) => is_terminal(status))
+}
+
+// Pure mapping from the live job counts to what the sidebar Jobs indicator
+// renders, kept free of component code so it can be unit-tested directly:
+//   - "spinner": at least one active job; show a subtle spinner + active count.
+//   - "static": no active jobs but some still exist; show a muted total count.
+//   - "hidden": no jobs at all; show no indicator.
+export type JobsIndicator =
+  | { kind: "spinner"; count: number }
+  | { kind: "static"; count: number }
+  | { kind: "hidden" }
+
+export function jobs_indicator(
+  active_count: number,
+  total_count: number,
+): JobsIndicator {
+  const has_active = active_count > 0
+  if (!has_active && !(total_count > 0)) {
+    return { kind: "hidden" }
+  }
+  return has_active
+    ? { kind: "spinner", count: active_count }
+    : { kind: "static", count: total_count }
+}
diff --git a/app/web_ui/src/lib/stores/jobs_dialog.test.ts b/app/web_ui/src/lib/stores/jobs_dialog.test.ts
new file mode 100644
index 000000000..f36289de4
--- /dev/null
+++ b/app/web_ui/src/lib/stores/jobs_dialog.test.ts
@@ -0,0 +1,14 @@
+import { describe, it, expect } from "vitest"
+import { get } from "svelte/store"
+import { jobs_dialog } from "./jobs_dialog"
+
+describe("jobs_dialog", () => {
+  it("bumps the open signal each time open() is called", () => {
+    const before = get(jobs_dialog.open_signal)
+    jobs_dialog.open()
+    const afterOne = get(jobs_dialog.open_signal)
+    expect(afterOne).toBe(before + 1)
+    jobs_dialog.open()
+    expect(get(jobs_dialog.open_signal)).toBe(before + 2)
+  })
+})
diff --git a/app/web_ui/src/lib/stores/jobs_dialog.ts b/app/web_ui/src/lib/stores/jobs_dialog.ts
new file mode 100644
index 000000000..43ea3b13d
--- /dev/null
+++ b/app/web_ui/src/lib/stores/jobs_dialog.ts
@@ -0,0 +1,22 @@
+import { writable } from "svelte/store"
+
+// Cross-component channel for opening the global jobs dialog. The dialog itself
+// is mounted once in (app)/+layout.svelte and subscribes here; any component
+// (e.g. the sidebar Jobs widget) can trigger it via `jobs_dialog.open()`.
+function createJobsDialog() {
+  // Counter, not a boolean flag: each open() yields a new value, so the
+  // layout-mounted dialog (which shows itself on every change) re-shows on
+  // repeated opens where a flag's value would not have changed.
+  const { subscribe, update } = writable(0)
+
+  const open = () => {
+    update((count) => count + 1)
+  }
+
+  return {
+    open_signal: { subscribe },
+    open,
+  }
+}
+
+export const jobs_dialog = createJobsDialog()
diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index 3c1c66e92..6b6f6bb2f 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -19,7 +19,9 @@
   import ChatBar from "./chat_bar.svelte"
   import ChatIcon from "$lib/ui/icons/chat_icon.svelte"
   import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
-  import SidebarJobsBadge from "$lib/components/SidebarJobsBadge.svelte"
+  import SidebarJobsIndicator from "$lib/components/SidebarJobsIndicator.svelte"
+  import JobsDialog from "$lib/components/jobs_dialog.svelte"
+  import { jobs_dialog } from "$lib/stores/jobs_dialog"
   import { Section } from "$lib/ui/section"
   import Dialog from "$lib/ui/dialog.svelte"
   import SidebarRail from "./sidebar_rail.svelte"
@@ -281,13 +283,17 @@
         </li>
 
         <li class="menu-sm">
-          <a href="/jobs" class={section == Section.Jobs ? "active" : ""}>
+          <button
+            type="button"
+            class="w-full {section == Section.Jobs ? 'active' : ''}"
+            on:click={() => jobs_dialog.open()}
+          >
             <div class="sidebar-icon">
               <JobsIcon />
             </div>
             Jobs
-            <SidebarJobsBadge variant="inline" />
-          </a>
+            <SidebarJobsIndicator variant="inline" />
+          </button>
         </li>
 
         <li class="menu-sm">
@@ -487,6 +493,8 @@
   <SelectTasksMenu on:dismiss={() => taskDialog?.close()} />
 </Dialog>
 
+<JobsDialog />
+
 <style>
   :global(ul > li.menu-nested) {
     padding: 0;
diff --git a/app/web_ui/src/routes/(app)/jobs/+page.svelte b/app/web_ui/src/routes/(app)/jobs/+page.svelte
index 636807666..ff3dd28c2 100644
--- a/app/web_ui/src/routes/(app)/jobs/+page.svelte
+++ b/app/web_ui/src/routes/(app)/jobs/+page.svelte
@@ -1,33 +1,9 @@
 <script lang="ts">
   import AppPage from "../app_page.svelte"
-  import Dialog from "$lib/ui/dialog.svelte"
+  import JobsTable from "$lib/components/jobs_table.svelte"
   import RunEvalDialog from "./run_eval_dialog.svelte"
-  import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
-  import { jobs, synced, connection } from "$lib/stores/jobs_store"
-  import {
-    available_actions,
-    is_terminal,
-    job_status_badge_class,
-    job_status_display,
-    progress_label,
-    progress_percent,
-    type JobAction,
-  } from "$lib/stores/job_status"
-  import {
-    cancel_job,
-    create_job,
-    delete_job,
-    get_job_errors,
-    get_job_result,
-    pause_job,
-    resume_job,
-    type JobError,
-    type JobErrorEntry,
-    type JobRecord,
-  } from "$lib/stores/jobs_api"
-  import { formatDate } from "$lib/utils/formatters"
+  import { create_job } from "$lib/stores/jobs_api"
   import { KilnError, createKilnError } from "$lib/utils/error_handlers"
-  import { capitalize } from "$lib/utils/formatters"
   import { agentInfo } from "$lib/agent"
   import { ui_state } from "$lib/stores"
 
@@ -38,7 +14,6 @@
   })
 
   let action_error: KilnError | null = null
-  let in_flight: Record<string, boolean> = {}
   let creating_test_job = false
 
   // Kicks off a no-op job: a simulated long-running task (sleeps per step,
@@ -80,97 +55,6 @@
       disabled: creating_test_job,
     },
   ]
-
-  const action_runners: Record<JobAction, (id: string) => Promise<void>> = {
-    pause: pause_job,
-    resume: resume_job,
-    cancel: cancel_job,
-    delete: delete_job,
-  }
-
-  const action_labels: Record<JobAction, string> = {
-    pause: "Pause",
-    resume: "Resume",
-    cancel: "Cancel",
-    delete: "Delete",
-  }
-
-  async function run_action(action: JobAction, id: string) {
-    action_error = null
-    in_flight = { ...in_flight, [id]: true }
-    try {
-      await action_runners[action](id)
-      // The SSE stream reflects the resulting transition; no local mutation.
-    } catch (e) {
-      action_error = createKilnError(e)
-    } finally {
-      in_flight = { ...in_flight, [id]: false }
-    }
-  }
-
-  function job_type_display(type: string): string {
-    if (type === "noop") {
-      return "No-op"
-    }
-    return capitalize(type)
-  }
-
-  function has_errors(job: JobRecord): boolean {
-    return (job.progress?.error ?? 0) > 0 || job.status === "failed"
-  }
-
-  // Only show a result once the job is in a terminal state — a non-null
-  // `result` mid-run would be partial and misleading.
-  function has_result(job: JobRecord): boolean {
-    return is_terminal(job.status) && job.result != null
-  }
-
-  // Surface the record's failure summary inline for failed jobs.
-  function failure_error(job: JobRecord): JobError | null {
-    return job.status === "failed" ? job.error ?? null : null
-  }
-
-  // Errors dialog state
-  let errors_dialog: Dialog
-  let errors_loading = false
-  let errors_load_error: KilnError | null = null
-  let error_entries: JobErrorEntry[] = []
-  let errors_summary: JobError | null = null
-
-  async function open_errors(job: JobRecord) {
-    error_entries = []
-    errors_load_error = null
-    errors_summary = failure_error(job)
-    errors_loading = true
-    errors_dialog?.show()
-    try {
-      error_entries = await get_job_errors(job.id)
-    } catch (e) {
-      errors_load_error = createKilnError(e)
-    } finally {
-      errors_loading = false
-    }
-  }
-
-  // Result dialog state
-  let result_dialog: Dialog
-  let result_loading = false
-  let result_load_error: KilnError | null = null
-  let result_data: Record<string, unknown> | null = null
-
-  async function open_result(job: JobRecord) {
-    result_data = null
-    result_load_error = null
-    result_loading = true
-    result_dialog?.show()
-    try {
-      result_data = await get_job_result(job.id)
-    } catch (e) {
-      result_load_error = createKilnError(e)
-    } finally {
-      result_loading = false
-    }
-  }
 </script>
 
 <AppPage
@@ -185,191 +69,7 @@
     </div>
   {/if}
 
-  {#if !$synced && $connection === "errored"}
-    <div
-      class="flex flex-col items-center justify-center min-h-[50vh] text-center max-w-md mx-auto"
-    >
-      <div class="text-gray-400 mb-3">
-        <span class="loading loading-spinner loading-md"></span>
-      </div>
-      <h3 class="text-lg font-medium">Can't connect to the job stream</h3>
-      <p class="text-sm text-gray-500 mt-2">
-        We lost the connection to the background job updates and are retrying
-        automatically. Jobs keep running in the background — this page will
-        refresh once the connection is restored.
-      </p>
-    </div>
-  {:else if !$synced}
-    <div class="w-full min-h-[50vh] flex justify-center items-center">
-      <div class="loading loading-spinner loading-lg"></div>
-    </div>
-  {:else if $jobs.length === 0}
-    <div
-      class="flex flex-col items-center justify-center min-h-[55vh] text-center max-w-md mx-auto"
-    >
-      <div class="w-12 h-12 text-gray-400 mb-4" aria-hidden="true">
-        <JobsIcon />
-      </div>
-      <h3 class="text-lg font-medium">No jobs yet</h3>
-      <p class="text-sm text-gray-500 mt-2">
-        Long-running work like eval runs shows up here. Jobs run in the
-        background — you can leave this page and they'll keep going. Come back
-        any time to check progress, pause, or cancel them.
-      </p>
-    </div>
-  {:else}
-    <div class="overflow-x-auto rounded-lg border">
-      <table class="table">
-        <thead>
-          <tr>
-            <th>ID</th>
-            <th>Type</th>
-            <th>Status</th>
-            <th>Progress</th>
-            <th>Message</th>
-            <th>Created</th>
-            <th class="text-right">Actions</th>
-          </tr>
-        </thead>
-        <tbody>
-          {#each $jobs as job (job.id)}
-            <tr>
-              <td class="font-mono text-xs text-gray-500 whitespace-nowrap"
-                >{job.id}</td
-              >
-              <td class="font-medium">{job_type_display(job.type)}</td>
-              <td>
-                <span class="badge {job_status_badge_class(job.status)}">
-                  {job_status_display(job.status)}
-                </span>
-              </td>
-              <td>
-                <div class="flex flex-col gap-1 min-w-32">
-                  <span class="text-sm">{progress_label(job.progress)}</span>
-                  {#if job.progress?.total}
-                    <progress
-                      class="progress progress-primary w-32 h-1.5"
-                      value={progress_percent(job.progress)}
-                      max="100"
-                    ></progress>
-                  {/if}
-                </div>
-              </td>
-              <td class="text-sm text-gray-500 max-w-48">
-                {#if failure_error(job)?.error}
-                  <span
-                    class="text-error block truncate"
-                    title={failure_error(job)?.error}
-                    >{failure_error(job)?.error}</span
-                  >
-                {:else}
-                  <span class="block truncate"
-                    >{job.progress?.message || ""}</span
-                  >
-                {/if}
-              </td>
-              <td class="text-sm text-gray-500 whitespace-nowrap">
-                {formatDate(job.created_at)}
-              </td>
-              <td>
-                <div class="flex flex-row gap-1 justify-end flex-wrap">
-                  {#if has_result(job)}
-                    <button
-                      class="btn btn-xs btn-ghost"
-                      on:click={() => open_result(job)}
-                    >
-                      Result
-                    </button>
-                  {/if}
-                  {#if has_errors(job)}
-                    <button
-                      class="btn btn-xs btn-ghost"
-                      on:click={() => open_errors(job)}
-                    >
-                      Errors
-                    </button>
-                  {/if}
-                  {#each available_actions(job) as action}
-                    <button
-                      class="btn btn-xs {action === 'delete' ||
-                      action === 'cancel'
-                        ? 'btn-ghost text-error'
-                        : 'btn-ghost'}"
-                      disabled={in_flight[job.id]}
-                      on:click={() => run_action(action, job.id)}
-                    >
-                      {action_labels[action]}
-                    </button>
-                  {/each}
-                </div>
-              </td>
-            </tr>
-          {/each}
-        </tbody>
-      </table>
-    </div>
-  {/if}
+  <JobsTable />
 </AppPage>
 
-<Dialog bind:this={errors_dialog} title="Job Errors" width="wide">
-  {#if errors_summary?.error}
-    <div
-      role="alert"
-      class="alert alert-error text-sm mb-4 flex flex-col items-start gap-1"
-    >
-      <span class="font-medium break-words">{errors_summary.error}</span>
-      {#if errors_summary.detail}
-        <pre
-          class="text-xs w-full bg-base-200 text-base-content rounded-md p-2 overflow-x-auto max-h-48">{JSON.stringify(
-            errors_summary.detail,
-            null,
-            2,
-          )}</pre>
-      {/if}
-    </div>
-  {/if}
-  {#if errors_loading}
-    <div class="flex justify-center py-8">
-      <div class="loading loading-spinner loading-lg"></div>
-    </div>
-  {:else if errors_load_error}
-    <div class="text-error text-sm">
-      {errors_load_error.getMessage() || "Could not load errors."}
-    </div>
-  {:else if error_entries.length === 0}
-    <p class="text-sm text-gray-500">
-      No error messages recorded for this job.
-    </p>
-  {:else}
-    <ul class="flex flex-col gap-2 max-h-[60vh] overflow-y-auto">
-      {#each error_entries as entry, index (index)}
-        <li class="text-sm bg-base-200 rounded-md p-3 font-mono break-words">
-          {entry.error_message || JSON.stringify(entry)}
-        </li>
-      {/each}
-    </ul>
-  {/if}
-</Dialog>
-
-<Dialog bind:this={result_dialog} title="Job Result" width="wide">
-  {#if result_loading}
-    <div class="flex justify-center py-8">
-      <div class="loading loading-spinner loading-lg"></div>
-    </div>
-  {:else if result_load_error}
-    <div class="text-error text-sm">
-      {result_load_error.getMessage() || "Could not load result."}
-    </div>
-  {:else if result_data}
-    <pre
-      class="text-xs bg-base-200 rounded-md p-3 overflow-x-auto max-h-[60vh]">{JSON.stringify(
-        result_data,
-        null,
-        2,
-      )}</pre>
-  {:else}
-    <p class="text-sm text-gray-500">No result available.</p>
-  {/if}
-</Dialog>
-
 <RunEvalDialog bind:this={run_eval_dialog} />
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail.svelte b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
index 05332dd44..ac9f40aae 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
@@ -9,7 +9,8 @@
   import ChatIcon from "$lib/ui/icons/chat_icon.svelte"
   import EvalIcon from "$lib/ui/icons/eval_icon.svelte"
   import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
-  import SidebarJobsBadge from "$lib/components/SidebarJobsBadge.svelte"
+  import SidebarJobsIndicator from "$lib/components/SidebarJobsIndicator.svelte"
+  import { jobs_dialog } from "$lib/stores/jobs_dialog"
 
   export let section: Section = Section.None
   export let openTaskDialog: () => void
@@ -111,10 +112,14 @@
     </div>
   </SidebarRailItem>
 
-  <SidebarRailItem href="/jobs" active={section === Section.Jobs} label="Jobs">
+  <SidebarRailItem
+    on_click={() => jobs_dialog.open()}
+    active={section === Section.Jobs}
+    label="Jobs"
+  >
     <div slot="icon" class="w-full h-full relative">
       <JobsIcon />
-      <SidebarJobsBadge variant="rail" />
+      <SidebarJobsIndicator variant="rail" />
     </div>
   </SidebarRailItem>
 
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail_item.svelte b/app/web_ui/src/routes/(app)/sidebar_rail_item.svelte
index 3dd541343..92d9ebc61 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail_item.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail_item.svelte
@@ -1,31 +1,56 @@
 <script lang="ts">
   import SidebarRailTooltip from "./sidebar_rail_tooltip.svelte"
 
-  export let href: string
+  // Either a navigation target (`href`) or a click handler (`on_click`). When
+  // `on_click` is set the item renders as a button instead of a link (used by
+  // the Jobs entry, which opens a dialog rather than navigating).
+  export let href: string | undefined = undefined
+  export let on_click: (() => void) | undefined = undefined
   export let active: boolean = false
   export let label: string
 
   let hovered = false
   let focused = false
   $: show_tooltip = hovered || focused
+
+  $: item_class = `relative flex items-center justify-center w-10 h-8 xl:h-9 rounded-md ${
+    active ? "bg-base-300" : "hover:bg-base-300/50"
+  }`
 </script>
 
 <div class="flex justify-center">
-  <a
-    {href}
-    class="relative flex items-center justify-center w-10 h-8 xl:h-9 rounded-md {active
-      ? 'bg-base-300'
-      : 'hover:bg-base-300/50'}"
-    aria-label={label}
-    aria-current={active ? "page" : undefined}
-    on:mouseenter={() => (hovered = true)}
-    on:mouseleave={() => (hovered = false)}
-    on:focus={() => (focused = true)}
-    on:blur={() => (focused = false)}
-  >
-    <span class="w-5 h-5 block">
-      <slot name="icon" />
-    </span>
-    <SidebarRailTooltip show={show_tooltip}>{label}</SidebarRailTooltip>
-  </a>
+  {#if on_click}
+    <button
+      type="button"
+      class={item_class}
+      aria-label={label}
+      aria-current={active ? "page" : undefined}
+      on:click={on_click}
+      on:mouseenter={() => (hovered = true)}
+      on:mouseleave={() => (hovered = false)}
+      on:focus={() => (focused = true)}
+      on:blur={() => (focused = false)}
+    >
+      <span class="w-5 h-5 block">
+        <slot name="icon" />
+      </span>
+      <SidebarRailTooltip show={show_tooltip}>{label}</SidebarRailTooltip>
+    </button>
+  {:else}
+    <a
+      {href}
+      class={item_class}
+      aria-label={label}
+      aria-current={active ? "page" : undefined}
+      on:mouseenter={() => (hovered = true)}
+      on:mouseleave={() => (hovered = false)}
+      on:focus={() => (focused = true)}
+      on:blur={() => (focused = false)}
+    >
+      <span class="w-5 h-5 block">
+        <slot name="icon" />
+      </span>
+      <SidebarRailTooltip show={show_tooltip}>{label}</SidebarRailTooltip>
+    </a>
+  {/if}
 </div>
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail_item.test.ts b/app/web_ui/src/routes/(app)/sidebar_rail_item.test.ts
index 427831532..fc0182ef6 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail_item.test.ts
+++ b/app/web_ui/src/routes/(app)/sidebar_rail_item.test.ts
@@ -70,6 +70,19 @@ describe("SidebarRailItem", () => {
     expect(anchor?.className).toContain("hover:bg-base-300/50")
   })
 
+  it("renders a button that fires on_click when no href is given", async () => {
+    let clicked = 0
+    const { container } = render(SidebarRailItem, {
+      props: { on_click: () => (clicked += 1), label: "Jobs" },
+    })
+    expect(container.querySelector("a")).toBeNull()
+    const button = container.querySelector("button") as HTMLElement
+    expect(button).not.toBeNull()
+    expect(button.getAttribute("aria-label")).toBe("Jobs")
+    await fireEvent.click(button)
+    expect(clicked).toBe(1)
+  })
+
   it("keeps the visible tooltip pointer-events-none so clicks do not regress", async () => {
     // Regression guard: pre-portal the tooltip is a DOM descendant of the <a>,
     // so it must remain non-interactive or it could swallow clicks on the link.

From 45c17ece80f7188e56c2170ada81c30fae24c9c6 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Sat, 30 May 2026 01:35:29 +0800
Subject: [PATCH 06/26] refactor: link in sidebar rail

---
 app/web_ui/src/lib/ui/section.ts              |  1 -
 app/web_ui/src/routes/(app)/+layout.svelte    | 30 +++++++++----------
 .../src/routes/(app)/sidebar_rail.svelte      | 18 +++++------
 3 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/app/web_ui/src/lib/ui/section.ts b/app/web_ui/src/lib/ui/section.ts
index 3fbeccf63..0dd772847 100644
--- a/app/web_ui/src/lib/ui/section.ts
+++ b/app/web_ui/src/lib/ui/section.ts
@@ -12,6 +12,5 @@ export enum Section {
   Skills,
   Optimize,
   Assistant,
-  Jobs,
   None,
 }
diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index 6b6f6bb2f..928aa23d4 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -112,8 +112,6 @@
       section = Section.Specs
     } else if (path_start("/optimize", $page.url.pathname)) {
       section = Section.Optimize
-    } else if (path_start("/jobs", $page.url.pathname)) {
-      section = Section.Jobs
     } else if (path_start("/assistant", $page.url.pathname)) {
       section = Section.Assistant
     } else {
@@ -282,20 +280,6 @@
           >
         </li>
 
-        <li class="menu-sm">
-          <button
-            type="button"
-            class="w-full {section == Section.Jobs ? 'active' : ''}"
-            on:click={() => jobs_dialog.open()}
-          >
-            <div class="sidebar-icon">
-              <JobsIcon />
-            </div>
-            Jobs
-            <SidebarJobsIndicator variant="inline" />
-          </button>
-        </li>
-
         <li class="menu-sm">
           <a
             href={`/optimize/${$ui_state.current_project_id}/${$ui_state.current_task_id}`}
@@ -444,6 +428,20 @@
         <li class="mt-auto pt-2 bg-transparent">
           <ProgressWidget />
         </li>
+        <li class="menu-sm">
+          <button
+            type="button"
+            class="text-xs text-base-content/60"
+            on:click={() => jobs_dialog.open()}
+            aria-label="Background jobs"
+          >
+            <div class="sidebar-icon opacity-60">
+              <JobsIcon />
+            </div>
+            Jobs
+            <SidebarJobsIndicator variant="inline" />
+          </button>
+        </li>
         {#if $update_info.update_result && $update_info.update_result.has_update}
           <li class="menu-sm mt-2">
             <a
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail.svelte b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
index ac9f40aae..be1795d27 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
@@ -112,22 +112,18 @@
     </div>
   </SidebarRailItem>
 
-  <SidebarRailItem
-    on_click={() => jobs_dialog.open()}
-    active={section === Section.Jobs}
-    label="Jobs"
-  >
-    <div slot="icon" class="w-full h-full relative">
-      <JobsIcon />
-      <SidebarJobsIndicator variant="rail" />
-    </div>
-  </SidebarRailItem>
-
   <SidebarRailOptimizeGroup {section} />
 
   <div class="flex-1"></div>
 
   <SidebarRailProgress />
 
+  <SidebarRailItem on_click={() => jobs_dialog.open()} label="Jobs">
+    <div slot="icon" class="w-full h-full relative">
+      <JobsIcon />
+      <SidebarJobsIndicator variant="rail" />
+    </div>
+  </SidebarRailItem>
+
   <SidebarRailSettings active={section === Section.Settings} />
 </nav>

From fc5f6f6c0a38ca48976d4f8a6f14217a487da5cf Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Sat, 30 May 2026 01:41:40 +0800
Subject: [PATCH 07/26] refactor: job entry in sidebar

---
 app/web_ui/src/lib/components/jobs_dialog.svelte |  2 +-
 app/web_ui/src/lib/ui/dialog.svelte              | 12 +++++++++---
 app/web_ui/src/routes/(app)/+layout.svelte       |  2 +-
 app/web_ui/src/routes/(app)/sidebar_rail.svelte  |  2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/app/web_ui/src/lib/components/jobs_dialog.svelte b/app/web_ui/src/lib/components/jobs_dialog.svelte
index d5811d0ae..fcdf1ccfd 100644
--- a/app/web_ui/src/lib/components/jobs_dialog.svelte
+++ b/app/web_ui/src/lib/components/jobs_dialog.svelte
@@ -18,7 +18,7 @@
   }
 </script>
 
-<Dialog bind:this={dialog} title="Jobs" width="wide">
+<Dialog bind:this={dialog} title="Jobs" width="extra-wide">
   <p class="text-sm font-light mb-4">
     <a href="/jobs" class="link" on:click={() => dialog?.close()}
       >View full page →</a
diff --git a/app/web_ui/src/lib/ui/dialog.svelte b/app/web_ui/src/lib/ui/dialog.svelte
index bef5490e8..28cc0fbf8 100644
--- a/app/web_ui/src/lib/ui/dialog.svelte
+++ b/app/web_ui/src/lib/ui/dialog.svelte
@@ -11,7 +11,7 @@
   export let sub_subtitle: string | null = null
   export let sub_subtitle_link: string | null = null
   export let blur_background: boolean = false
-  export let width: "normal" | "wide" = "normal"
+  export let width: "normal" | "wide" | "extra-wide" = "normal"
   const id: string = "dialog-" + Math.random().toString(36)
   type ActionButton = {
     label: string
@@ -89,9 +89,15 @@
   on:close={() => dispatch("close")}
   on:cancel={(e) => dispatch("cancel", e)}
 >
-  <div class="modal-box {width === 'wide' ? 'w-11/12 max-w-3xl' : ''}">
+  <div
+    class="modal-box {width === 'extra-wide'
+      ? 'w-11/12 max-w-7xl'
+      : width === 'wide'
+        ? 'w-11/12 max-w-3xl'
+        : ''}"
+  >
     <!-- Hidden div to force the compiler to find these classes -->
-    <div class="hidden w-11/12 max-w-3xl"></div>
+    <div class="hidden w-11/12 max-w-3xl max-w-7xl"></div>
     <div class="flex flex-row gap-2 items-start">
       <div
         class="grow flex flex-col {center_content
diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index 928aa23d4..1802cc887 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -438,7 +438,7 @@
             <div class="sidebar-icon opacity-60">
               <JobsIcon />
             </div>
-            Jobs
+            In progress
             <SidebarJobsIndicator variant="inline" />
           </button>
         </li>
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail.svelte b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
index be1795d27..dad1dc4b4 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
@@ -118,7 +118,7 @@
 
   <SidebarRailProgress />
 
-  <SidebarRailItem on_click={() => jobs_dialog.open()} label="Jobs">
+  <SidebarRailItem on_click={() => jobs_dialog.open()} label="In progress">
     <div slot="icon" class="w-full h-full relative">
       <JobsIcon />
       <SidebarJobsIndicator variant="rail" />

From 34f10db7df919af8a9f218904f00967d433f810b Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Fri, 5 Jun 2026 02:01:24 +0800
Subject: [PATCH 08/26] fix: guard task.uncancel() for Python 3.10 in job
 registry tests

asyncio.Task.uncancel() was added in Python 3.11. The SwallowCancelWorker
and TotalThenNoneWorker test workers called it unconditionally inside their
CancelledError handler, so on Python 3.10 the call raised AttributeError,
which propagated out of the worker and drove the job to 'failed' instead of
'paused'/'cancelled'. Guard the call with hasattr so 3.10 simply swallows the
cancellation, still exercising the worst-case 'swallows cancel' path the tests
validate.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/desktop/studio_server/jobs/test_registry.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/app/desktop/studio_server/jobs/test_registry.py b/app/desktop/studio_server/jobs/test_registry.py
index d31235d63..d179e83d7 100644
--- a/app/desktop/studio_server/jobs/test_registry.py
+++ b/app/desktop/studio_server/jobs/test_registry.py
@@ -152,7 +152,10 @@ async def run(self, params, ctx):
             await type(self).gate.wait()
         except asyncio.CancelledError:
             task = asyncio.current_task()
-            if task is not None:
+            # task.uncancel() was added in Python 3.11; on 3.10 simply
+            # swallowing the CancelledError exercises the same worst-case
+            # "swallows cancel and returns normally" path.
+            if task is not None and hasattr(task, "uncancel"):
                 task.uncancel()
         return _EmptyResult()
 
@@ -180,7 +183,10 @@ async def run(self, params, ctx):
             await type(self).gate.wait()
         except asyncio.CancelledError:
             task = asyncio.current_task()
-            if task is not None:
+            # task.uncancel() was added in Python 3.11; on 3.10 simply
+            # swallowing the CancelledError exercises the same worst-case
+            # "swallows cancel and returns normally" path.
+            if task is not None and hasattr(task, "uncancel"):
                 task.uncancel()
         return _EmptyResult()
 

From 589d8ddc90ba381536235d530c4dd9c6ce71b9e7 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Fri, 12 Jun 2026 19:34:01 +0800
Subject: [PATCH 09/26] fix: address PR review on background job system

- SSE payload: set ensure_ascii=False on json.dumps (project standard)
- functional_spec: document that pause/cancel return 202 after the
  transition settles (awaits supervising task), reconciling the spec
  with the deliberate implementation behavior

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/desktop/studio_server/jobs/api.py                   | 4 +++-
 specs/projects/background_job_system/functional_spec.md | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py
index ec862e7a3..0255b33f5 100644
--- a/app/desktop/studio_server/jobs/api.py
+++ b/app/desktop/studio_server/jobs/api.py
@@ -63,7 +63,9 @@ def _project_id_from_params(validated_params: BaseModel) -> str | None:
 
 
 def _format_sse(event: JobEvent) -> str:
-    return f"event: {event.event}\ndata: {json.dumps(event.data)}\n\n"
+    return (
+        f"event: {event.event}\ndata: {json.dumps(event.data, ensure_ascii=False)}\n\n"
+    )
 
 
 async def _event_stream(
diff --git a/specs/projects/background_job_system/functional_spec.md b/specs/projects/background_job_system/functional_spec.md
index f833ec409..479874878 100644
--- a/specs/projects/background_job_system/functional_spec.md
+++ b/specs/projects/background_job_system/functional_spec.md
@@ -196,7 +196,7 @@ All endpoints live under `/api/jobs`. Authentication piggybacks on whatever the
 | `DELETE` | `/api/jobs/{id}` | — | `204` / `409` | 409 if still in-flight. Drops the in-memory record and best-effort removes the run's error log file(s). |
 | `GET` | `/api/jobs/events` | — | `200 text/event-stream` | SSE; see §6. |
 
-All state-changing endpoints (pause/resume/cancel) are async-effecting: they return immediately with `202 Accepted`; the actual state transition is published via the SSE stream.
+All state-changing endpoints (pause/resume/cancel) return `202 Accepted` once the transition has settled. For pause and cancel this means the handler awaits the supervising task's cancellation/cleanup before responding, so the slot is reclaimed and the terminal result is recorded deterministically (no lost cancellation, no double-release). For our current workers cleanup lands at the next `await`, so this is effectively instant; a future worker with slow cancel-cleanup would hold the connection for that cleanup. The resulting state is also published via the SSE stream for any observers.
 
 Error envelopes follow the existing local-server convention (`{ "detail": "..." }`).
 

From 1b19939cdf8b77901bdb4184259cadd5e83ae9fb Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Sat, 13 Jun 2026 01:55:07 +0800
Subject: [PATCH 10/26] feat: typed per-worker progress payload

Adds a typed home for rich per-worker progress, replacing the freeform
metadata escape hatch:

- JobWorker.progress_model declares an optional per-worker model.
- JobContext.report_progress_detail(model) validates against it and
  stamps JobRecord.progress_detail (a wrong-type model fails the job
  loudly rather than storing garbage).
- Kept as a dict on the wire so the core stays worker-agnostic; the
  frontend casts progress_detail to the worker's exported model.

The RAG worker will adopt this (metadata.rag_progress -> progress_detail)
once this change is merged into its branch. Adds registry tests for the
progress_detail stamping and the wrong-model guard; regenerates the web client schema.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/desktop/studio_server/jobs/models.py      | 24 +++++++
 app/desktop/studio_server/jobs/registry.py    | 27 +++++++-
 .../studio_server/jobs/test_registry.py       | 62 +++++++++++++++++++
 app/web_ui/src/lib/api_schema.d.ts            |  4 ++
 4 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/app/desktop/studio_server/jobs/models.py b/app/desktop/studio_server/jobs/models.py
index 7262934a4..3d5404f64 100644
--- a/app/desktop/studio_server/jobs/models.py
+++ b/app/desktop/studio_server/jobs/models.py
@@ -79,6 +79,12 @@ class JobRecord(BaseModel):
     status: BackgroundJobStatus
     run_id: str | None = None
     progress: JobProgress = Field(default_factory=JobProgress)
+    # Typed, per-worker progress detail (validated against the worker's
+    # `progress_model`). The generic `progress` above is the universal counter;
+    # this carries the rich per-kind shape a worker needs the UI to render
+    # (e.g. RAG's four-phase breakdown). Kept as a dict on the wire so the core
+    # stays worker-agnostic; the frontend casts it to the worker's model.
+    progress_detail: dict[str, Any] | None = None
     params: dict[str, Any] = Field(default_factory=dict)
     result: dict[str, Any] | None = None
     error: JobError | None = None
@@ -92,6 +98,7 @@ class JobRecord(BaseModel):
 
 
 ReportProgress = Callable[["JobProgressUpdate"], Awaitable[None]]
+ReportProgressDetail = Callable[[BaseModel], Awaitable[None]]
 ReportError = Callable[[str, dict[str, Any]], Awaitable[None]]
 
 
@@ -114,11 +121,13 @@ def __init__(
         job_id: str,
         run_id: str,
         report_progress: ReportProgress,
+        report_progress_detail: ReportProgressDetail,
         report_error: ReportError,
     ) -> None:
         self.job_id = job_id
         self.run_id = run_id
         self._report_progress = report_progress
+        self._report_progress_detail = report_progress_detail
         self._report_error = report_error
 
     async def report_progress(
@@ -142,6 +151,17 @@ async def report_progress(
             )
         )
 
+    async def report_progress_detail(self, detail: BaseModel) -> None:
+        """Stamp the job's typed `progress_detail` with a worker-specific model.
+
+        For rich per-kind progress the generic counter can't carry (e.g. RAG's
+        per-phase breakdown). `detail` must be an instance of the worker's
+        declared `progress_model`; the registry validates and serializes it.
+        A UI-smoothing signal only — authoritative progress comes from
+        compute_state(). Cheap to call often.
+        """
+        await self._report_progress_detail(detail)
+
     async def report_error(self, error_message: str, **extra: Any) -> None:
         """Append one structured error entry to this run's error log.
 
@@ -160,6 +180,10 @@ class JobWorker(Generic[TParams, TResult]):
     type_name: ClassVar[str]
     params_model: ClassVar[type[BaseModel]]
     result_model: ClassVar[type[BaseModel]]
+    # Optional typed model for rich per-worker progress reported via
+    # JobContext.report_progress_detail(); stamped on JobRecord.progress_detail.
+    # Leave None for workers whose generic count progress is enough.
+    progress_model: ClassVar[type[BaseModel] | None] = None
     supports_pause: ClassVar[bool] = False
 
     async def compute_state(self, params: TParams) -> JobDerivedState | None:
diff --git a/app/desktop/studio_server/jobs/registry.py b/app/desktop/studio_server/jobs/registry.py
index b85521f0c..b76012724 100644
--- a/app/desktop/studio_server/jobs/registry.py
+++ b/app/desktop/studio_server/jobs/registry.py
@@ -222,7 +222,7 @@ async def _supervise(self, job_id: str, worker: JobWorker, run_id: str) -> None:
         if job is None:
             return
         params = worker.params_model.model_validate(job.params)
-        ctx = self._build_context(job_id, run_id)
+        ctx = self._build_context(job_id, run_id, worker)
         try:
             try:
                 await self._reconcile(job, emit_on_change=True)
@@ -249,7 +249,9 @@ async def _supervise(self, job_id: str, worker: JobWorker, run_id: str) -> None:
         finally:
             self._release_slot(job_id)
 
-    def _build_context(self, job_id: str, run_id: str) -> JobContext:
+    def _build_context(
+        self, job_id: str, run_id: str, worker: JobWorker
+    ) -> JobContext:
         async def report_progress(update: JobProgressUpdate) -> None:
             job = self._jobs.get(job_id)
             if job is None or job.run_id != run_id:
@@ -265,10 +267,29 @@ async def report_progress(update: JobProgressUpdate) -> None:
             self._touch(job)
             self._emit(job)
 
+        async def report_progress_detail(detail: BaseModel) -> None:
+            job = self._jobs.get(job_id)
+            if job is None or job.run_id != run_id:
+                return
+            # Guard the worker's contract: when the worker declared a
+            # progress_model, the detail must be an instance of it, so
+            # progress_detail's shape is predictable for the frontend.
+            expected = worker.progress_model
+            if expected is not None and not isinstance(detail, expected):
+                raise TypeError(
+                    f"report_progress_detail expected {expected.__name__}, "
+                    f"got {type(detail).__name__}"
+                )
+            job.progress_detail = detail.model_dump(mode="json")
+            self._touch(job)
+            self._emit(job)
+
         async def report_error(message: str, extra: dict[str, Any]) -> None:
             error_log.append_error(run_id, {"error_message": message, **extra})
 
-        return JobContext(job_id, run_id, report_progress, report_error)
+        return JobContext(
+            job_id, run_id, report_progress, report_progress_detail, report_error
+        )
 
     def _finish_succeeded(self, job: JobRecord, result: BaseModel) -> None:
         job.status = BackgroundJobStatus.SUCCEEDED
diff --git a/app/desktop/studio_server/jobs/test_registry.py b/app/desktop/studio_server/jobs/test_registry.py
index d179e83d7..71a46a5bd 100644
--- a/app/desktop/studio_server/jobs/test_registry.py
+++ b/app/desktop/studio_server/jobs/test_registry.py
@@ -810,3 +810,65 @@ async def test_get_unknown_returns_none(registry):
 async def test_lifecycle_op_unknown_raises(registry):
     with pytest.raises(JobNotFoundError):
         await registry.cancel("j_doesnotexist")
+
+
+# -- typed progress detail ---------------------------------------------------
+
+
+class DetailModel(BaseModel):
+    phase: str
+    done: int
+
+
+class DetailWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    type_name = "detail"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    progress_model = DetailModel
+    gate: asyncio.Event
+
+    async def run(self, params, ctx):
+        await ctx.report_progress_detail(DetailModel(phase="extract", done=3))
+        await type(self).gate.wait()
+        return _EmptyResult()
+
+
+@pytest.mark.asyncio
+async def test_report_progress_detail_stamps_typed_payload():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(DetailWorker)
+    DetailWorker.gate = asyncio.Event()
+    job = await reg.create("detail", {})
+    await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING)
+    # Give the worker a tick to report the detail.
+    for _ in range(50):
+        if reg._jobs[job.id].progress_detail is not None:
+            break
+        await asyncio.sleep(0.01)
+    assert reg._jobs[job.id].progress_detail == {"phase": "extract", "done": 3}
+    DetailWorker.gate.set()
+
+
+class WrongModel(BaseModel):
+    other: str
+
+
+class BadDetailWorker(JobWorker[_EmptyParams, _EmptyResult]):
+    type_name = "bad_detail"
+    params_model = _EmptyParams
+    result_model = _EmptyResult
+    progress_model = DetailModel
+
+    async def run(self, params, ctx):
+        await ctx.report_progress_detail(WrongModel(other="x"))
+        return _EmptyResult()
+
+
+@pytest.mark.asyncio
+async def test_report_progress_detail_rejects_wrong_model():
+    reg = JobRegistry(max_concurrent=2)
+    reg.register_type(BadDetailWorker)
+    job = await reg.create("bad_detail", {})
+    # The type guard raises inside run(), routing the job to FAILED.
+    await wait_for_status(reg, job.id, BackgroundJobStatus.FAILED)
+    assert reg._jobs[job.id].error is not None
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
index 398401400..71248e5e2 100644
--- a/app/web_ui/src/lib/api_schema.d.ts
+++ b/app/web_ui/src/lib/api_schema.d.ts
@@ -7022,6 +7022,10 @@ export interface components {
             /** Run Id */
             run_id?: string | null;
             progress?: components["schemas"]["JobProgress"];
+            /** Progress Detail */
+            progress_detail?: {
+                [key: string]: unknown;
+            } | null;
             /** Params */
             params?: {
                 [key: string]: unknown;

From a14d73f4f0c3edf9d36e8713fa2eeac1b0e042f6 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Mon, 15 Jun 2026 18:35:21 +0800
Subject: [PATCH 11/26] chore: fmt

---
 app/desktop/studio_server/jobs/registry.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/app/desktop/studio_server/jobs/registry.py b/app/desktop/studio_server/jobs/registry.py
index b76012724..3fe914a9b 100644
--- a/app/desktop/studio_server/jobs/registry.py
+++ b/app/desktop/studio_server/jobs/registry.py
@@ -249,9 +249,7 @@ async def _supervise(self, job_id: str, worker: JobWorker, run_id: str) -> None:
         finally:
             self._release_slot(job_id)
 
-    def _build_context(
-        self, job_id: str, run_id: str, worker: JobWorker
-    ) -> JobContext:
+    def _build_context(self, job_id: str, run_id: str, worker: JobWorker) -> JobContext:
         async def report_progress(update: JobProgressUpdate) -> None:
             job = self._jobs.get(job_id)
             if job is None or job.run_id != run_id:

From 8b9199f21bba6f5f78dfaf9702acbd6fb11e07e1 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Tue, 16 Jun 2026 13:59:41 +0800
Subject: [PATCH 12/26] feat: rename sidebar "In progress" to "Jobs" with
 active-state styling

Rename the sidebar entry to "Jobs" to match the dialog it opens, only grey
it out when no jobs are active, and swap the briefcase icon for a task-list icon.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 app/web_ui/src/lib/ui/icons/jobs_icon.svelte    | 12 +++++++++---
 app/web_ui/src/routes/(app)/+layout.svelte      | 12 ++++++++----
 app/web_ui/src/routes/(app)/sidebar_rail.svelte |  2 +-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/app/web_ui/src/lib/ui/icons/jobs_icon.svelte b/app/web_ui/src/lib/ui/icons/jobs_icon.svelte
index 065ddaac4..8e6f82c3f 100644
--- a/app/web_ui/src/lib/ui/icons/jobs_icon.svelte
+++ b/app/web_ui/src/lib/ui/icons/jobs_icon.svelte
@@ -3,19 +3,25 @@
   viewBox="0 0 24 24"
   fill="none"
   xmlns="http://www.w3.org/2000/svg"
+  aria-hidden="true"
 >
+  <circle cx="4.5" cy="6" r="1.25" fill="currentColor" />
+  <circle cx="4.5" cy="12" r="1.25" fill="currentColor" />
+  <circle cx="4.5" cy="18" r="1.25" fill="currentColor" />
   <path
-    d="M2 14C2 11.1716 2 9.75736 2.87868 8.87868C3.75736 8 5.17157 8 8 8H16C18.8284 8 20.2426 8 21.1213 8.87868C22 9.75736 22 11.1716 22 14C22 16.8284 22 18.2426 21.1213 19.1213C20.2426 20 18.8284 20 16 20H8C5.17157 20 3.75736 20 2.87868 19.1213C2 18.2426 2 16.8284 2 14Z"
+    d="M9 6H20"
     stroke="currentColor"
     stroke-width="1.5"
+    stroke-linecap="round"
   />
   <path
-    d="M16 8V7C16 5.11438 16 4.17157 15.4142 3.58579C14.8284 3 13.8856 3 12 3C10.1144 3 9.17157 3 8.58579 3.58579C8 4.17157 8 5.11438 8 7V8"
+    d="M9 12H20"
     stroke="currentColor"
     stroke-width="1.5"
+    stroke-linecap="round"
   />
   <path
-    d="M2 13H22"
+    d="M9 18H16"
     stroke="currentColor"
     stroke-width="1.5"
     stroke-linecap="round"
diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index 1802cc887..e798a93fe 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -22,6 +22,7 @@
   import SidebarJobsIndicator from "$lib/components/SidebarJobsIndicator.svelte"
   import JobsDialog from "$lib/components/jobs_dialog.svelte"
   import { jobs_dialog } from "$lib/stores/jobs_dialog"
+  import { active_jobs_count } from "$lib/stores/jobs_store"
   import { Section } from "$lib/ui/section"
   import Dialog from "$lib/ui/dialog.svelte"
   import SidebarRail from "./sidebar_rail.svelte"
@@ -431,14 +432,17 @@
         <li class="menu-sm">
           <button
             type="button"
-            class="text-xs text-base-content/60"
+            class="text-xs {$active_jobs_count > 0
+              ? 'text-base-content'
+              : 'text-base-content/60'}"
             on:click={() => jobs_dialog.open()}
-            aria-label="Background jobs"
           >
-            <div class="sidebar-icon opacity-60">
+            <div
+              class="sidebar-icon {$active_jobs_count > 0 ? '' : 'opacity-60'}"
+            >
               <JobsIcon />
             </div>
-            In progress
+            Jobs
             <SidebarJobsIndicator variant="inline" />
           </button>
         </li>
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail.svelte b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
index dad1dc4b4..be1795d27 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
@@ -118,7 +118,7 @@
 
   <SidebarRailProgress />
 
-  <SidebarRailItem on_click={() => jobs_dialog.open()} label="In progress">
+  <SidebarRailItem on_click={() => jobs_dialog.open()} label="Jobs">
     <div slot="icon" class="w-full h-full relative">
       <JobsIcon />
       <SidebarJobsIndicator variant="rail" />

From ad9c241167f6e93fdb508ef7ec693a6911d0ab09 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Tue, 16 Jun 2026 14:15:59 +0800
Subject: [PATCH 13/26] fix: harden job reconcile + offload eval compute_state
 IO

Address PR #1435 review feedback:
- registry: wrap worker.compute_state() in _reconcile with try/except so a
  deleted or transiently-unavailable entity falls back to the last known
  in-memory state instead of 500-ing GET /api/jobs/{id} (or crashing the
  supervisor's initial reconcile).
- eval worker: run compute_state's blocking runs(readonly=True) filesystem
  scans via asyncio.to_thread (_compute_state_sync) so large eval-run
  directories don't stall the event loop / SSE progress updates.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/desktop/studio_server/jobs/registry.py     | 10 +++++++++-
 app/desktop/studio_server/jobs/workers/eval.py |  9 +++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/app/desktop/studio_server/jobs/registry.py b/app/desktop/studio_server/jobs/registry.py
index 3fe914a9b..00ddff357 100644
--- a/app/desktop/studio_server/jobs/registry.py
+++ b/app/desktop/studio_server/jobs/registry.py
@@ -461,7 +461,15 @@ async def _reconcile(self, job: JobRecord, emit_on_change: bool) -> bool:
         if worker is None:
             return False
         params = worker.params_model.model_validate(job.params)
-        derived = await worker.compute_state(params)
+        try:
+            derived = await worker.compute_state(params)
+        except Exception:
+            # compute_state may touch on-disk entities (project/task/eval) that
+            # could be deleted or transiently unavailable. A failure here must
+            # not 500 the GET /api/jobs/{id} read or crash _supervise's initial
+            # reconcile — fall back to the last known in-memory state.
+            logger.exception("Failed to compute state for job %s", job.id)
+            return False
         if derived is None:
             return False
         changed = self._apply_derived(job, derived)
diff --git a/app/desktop/studio_server/jobs/workers/eval.py b/app/desktop/studio_server/jobs/workers/eval.py
index 89f540fa6..6c1a4ab53 100644
--- a/app/desktop/studio_server/jobs/workers/eval.py
+++ b/app/desktop/studio_server/jobs/workers/eval.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import asyncio
+
 from app.desktop.git_sync.save_context import save_context_for_project
 from kiln_ai.adapters.eval.eval_runner import EvalRunner
 from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
@@ -40,6 +42,13 @@ class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
     supports_pause = True
 
     async def compute_state(self, params: EvalJobParams) -> JobDerivedState:
+        # _compute_state_sync loads entities and enumerates runs/ directories
+        # (os.scandir + open/read/json.loads per child) synchronously. The
+        # registry awaits this on the event loop, so offload the blocking IO to
+        # a thread to keep progress/SSE updates flowing for large eval sets.
+        return await asyncio.to_thread(self._compute_state_sync, params)
+
+    def _compute_state_sync(self, params: EvalJobParams) -> JobDerivedState:
         eval_config = eval_config_from_id(
             params.project_id,
             params.task_id,

From 824edec78faad57ac52f6f8f09ea25c3f463d00a Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 17 Jun 2026 00:30:39 +0800
Subject: [PATCH 14/26] fix: keep jobs SSE stream alive past keepalive + close
 it on shutdown

Address PR #1435 review (gemini): the SSE keepalive was implemented as
asyncio.wait_for(subscription.__anext__(), timeout). Cancelling __anext__()
on timeout finalizes the async generator, so the next __anext__() raises
StopAsyncIteration and the stream dies after the first 15s ping. Move the
timeout INTO subscribe(): it now takes a timeout and yields a 'ping' JobEvent
on queue-get timeout, so the generator is never cancelled from outside.
_event_stream maps 'ping' events to ': ping' SSE comments.

Also add JobEventBus.shutdown(): pushes a close sentinel to every subscriber so
open SSE generators return promptly, wired into the desktop lifespan. Because
uvicorn only runs lifespan shutdown AFTER its graceful-shutdown wait, the dev
server also sets timeout_graceful_shutdown=1 so a UI holding the jobs stream
open can't block a hot reload.

Tests: httpx ASGITransport buffers the whole body and can't stream an infinite
SSE response (the old tests passed only because the buggy stream
self-terminated), so drive _event_stream/subscribe directly for
content/filter/keepalive/disconnect, keep one HTTP-level test that ends via
shutdown(), and cover shutdown() at the bus level.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/desktop/desktop_server.py                 |   7 +
 app/desktop/dev_server.py                     |   6 +
 app/desktop/studio_server/jobs/api.py         |  18 +-
 app/desktop/studio_server/jobs/events.py      |  57 +++++-
 app/desktop/studio_server/jobs/test_api.py    | 167 +++++++++---------
 app/desktop/studio_server/jobs/test_events.py |  54 ++++++
 6 files changed, 209 insertions(+), 100 deletions(-)

diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py
index 639bbe27e..52aa2fd0e 100644
--- a/app/desktop/desktop_server.py
+++ b/app/desktop/desktop_server.py
@@ -34,6 +34,7 @@
 from app.desktop.studio_server.finetune_api import connect_fine_tune_api
 from app.desktop.studio_server.import_api import connect_import_api
 from app.desktop.studio_server.jobs.api import connect_jobs_api
+from app.desktop.studio_server.jobs.registry import job_registry
 from app.desktop.studio_server.prompt_api import connect_prompt_api
 from app.desktop.studio_server.prompt_optimization_job_api import (
     connect_prompt_optimization_job_api,
@@ -112,6 +113,12 @@ async def lifespan(app: FastAPI):
         await _start_background_syncs()
         yield
     finally:
+        # End open SSE subscriptions so a UI holding the jobs stream open can't
+        # keep the worker alive (e.g. block a dev-server hot reload). Pure
+        # observer teardown — jobs keep running. Note uvicorn only reaches
+        # lifespan shutdown after its graceful-shutdown wait, so the dev server
+        # also sets timeout_graceful_shutdown to bound that wait.
+        job_registry.events.shutdown()
         try:
             await _stop_background_syncs()
         finally:
diff --git a/app/desktop/dev_server.py b/app/desktop/dev_server.py
index 2b8804b9c..514ab013a 100644
--- a/app/desktop/dev_server.py
+++ b/app/desktop/dev_server.py
@@ -36,4 +36,10 @@
         reload=True,
         # Debounce when changing many files (changing branch)
         reload_delay=0.1,
+        # Bound the graceful-shutdown wait on reload. The UI holds the jobs SSE
+        # stream open; uvicorn waits for in-flight requests to finish BEFORE it
+        # runs lifespan shutdown (which closes the stream), so without a bound a
+        # reload would hang on the open SSE. After this many seconds uvicorn
+        # cancels the lingering request task instead.
+        timeout_graceful_shutdown=1,
     )
diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py
index 0255b33f5..85962efbf 100644
--- a/app/desktop/studio_server/jobs/api.py
+++ b/app/desktop/studio_server/jobs/api.py
@@ -80,23 +80,21 @@ async def _event_stream(
     (client disconnect, via CancellableStreamingResponse) only unsubscribes from
     the bus — it never touches any job's supervising task. Jobs keep running.
     """
+    # The keepalive timeout lives inside subscribe() (yields a "ping" event),
+    # NOT here via asyncio.wait_for on __anext__(): cancelling the generator's
+    # __anext__() finalizes it, so the stream would die after the first ping.
     subscription: AsyncGenerator[JobEvent, None] = job_registry.events.subscribe(
         job_id=job_id,
         type_name=type_name,
         project_id=project_id,
+        timeout=KEEPALIVE_SECONDS,
     )
     try:
-        while True:
-            try:
-                event = await asyncio.wait_for(
-                    subscription.__anext__(), timeout=KEEPALIVE_SECONDS
-                )
-            except asyncio.TimeoutError:
+        async for event in subscription:
+            if event.event == "ping":
                 yield ": ping\n\n"
-                continue
-            except StopAsyncIteration:
-                break
-            yield _format_sse(event)
+            else:
+                yield _format_sse(event)
     finally:
         await subscription.aclose()
 
diff --git a/app/desktop/studio_server/jobs/events.py b/app/desktop/studio_server/jobs/events.py
index b85f0f2d1..dacff3d48 100644
--- a/app/desktop/studio_server/jobs/events.py
+++ b/app/desktop/studio_server/jobs/events.py
@@ -11,10 +11,19 @@
 class JobEvent(BaseModel):
     """A single bus event. Per-job events carry the full record (idempotent snapshot)."""
 
-    event: Literal["snapshot", "job", "deleted"]
+    event: Literal["snapshot", "job", "deleted", "ping"]
     data: dict[str, Any]
 
 
+class _CloseSentinel:
+    """Pushed onto a subscriber's queue by ``shutdown()`` to end its stream
+    promptly (e.g. so a dev-server hot reload isn't blocked by open SSE
+    connections), distinct from a normal ``JobEvent``."""
+
+
+_CLOSE = _CloseSentinel()
+
+
 class _Subscriber:
     def __init__(
         self,
@@ -22,7 +31,7 @@ def __init__(
         type_name: str | None,
         project_id: str | None,
     ) -> None:
-        self.queue: asyncio.Queue[JobEvent] = asyncio.Queue()
+        self.queue: asyncio.Queue[JobEvent | _CloseSentinel] = asyncio.Queue()
         self.job_id = job_id
         self.type_name = type_name
         self.project_id = project_id
@@ -55,6 +64,7 @@ class JobEventBus:
     def __init__(self, snapshot_provider: SnapshotProvider | None = None) -> None:
         self._subscribers: set[_Subscriber] = set()
         self._snapshot_provider = snapshot_provider
+        self._closed = False
 
     def set_snapshot_provider(self, provider: SnapshotProvider) -> None:
         self._snapshot_provider = provider
@@ -73,7 +83,23 @@ async def subscribe(
         job_id: str | None = None,
         type_name: str | None = None,
         project_id: str | None = None,
+        timeout: float | None = None,
     ) -> AsyncGenerator[JobEvent, None]:
+        """Yield the initial snapshot then per-job events.
+
+        When ``timeout`` is set, a ``ping`` event is yielded after that many
+        seconds without a real event. The timeout MUST live here, inside the
+        generator: cancelling ``subscribe().__anext__()`` from the outside (e.g.
+        ``asyncio.wait_for``) throws CancelledError into the suspended generator,
+        runs its ``finally``, and finalizes it — so the very next ``__anext__``
+        would raise StopAsyncIteration and kill the stream after one ping.
+
+        The generator ends (returns) when ``shutdown()`` has been called: either
+        immediately if the bus is already closed, or as soon as the close
+        sentinel reaches the head of the queue.
+        """
+        if self._closed:
+            return
         subscriber = _Subscriber(job_id, type_name, project_id)
         self._subscribers.add(subscriber)
         try:
@@ -83,10 +109,35 @@ async def subscribe(
                 data={"jobs": [r.model_dump(mode="json") for r in snapshot]},
             )
             while True:
-                yield await subscriber.queue.get()
+                if timeout is None:
+                    item = await subscriber.queue.get()
+                else:
+                    try:
+                        item = await asyncio.wait_for(
+                            subscriber.queue.get(), timeout=timeout
+                        )
+                    except asyncio.TimeoutError:
+                        yield JobEvent(event="ping", data={})
+                        continue
+                if isinstance(item, _CloseSentinel):
+                    return
+                yield item
         finally:
             self._subscribers.discard(subscriber)
 
+    def shutdown(self) -> None:
+        """End every open subscription and reject new ones.
+
+        Pushes a close sentinel onto each subscriber's queue so its
+        ``subscribe()`` generator returns promptly. Used on server shutdown so a
+        long-lived SSE connection (the jobs stream the UI holds open) doesn't
+        keep the worker alive — e.g. blocking a dev-server hot reload. A pure
+        observer teardown: it never touches any job's supervising task.
+        """
+        self._closed = True
+        for subscriber in self._subscribers:
+            subscriber.queue.put_nowait(_CLOSE)
+
     def publish_job(self, record: JobRecord) -> None:
         event = JobEvent(event="job", data=record.model_dump(mode="json"))
         for subscriber in self._subscribers:
diff --git a/app/desktop/studio_server/jobs/test_api.py b/app/desktop/studio_server/jobs/test_api.py
index e7c66a4e1..46e5b8053 100644
--- a/app/desktop/studio_server/jobs/test_api.py
+++ b/app/desktop/studio_server/jobs/test_api.py
@@ -114,15 +114,6 @@ def registry(monkeypatch):
     return reg
 
 
-@pytest.fixture
-def fast_keepalive(monkeypatch):
-    # httpx's ASGITransport batches the SSE generator's output and only surfaces
-    # buffered lines once the next chunk (here, the keepalive ping) forces a
-    # flush. Shortening the keepalive makes that flush — and stream teardown —
-    # prompt in tests. Production keeps the 15s default.
-    monkeypatch.setattr(jobs_api, "KEEPALIVE_SECONDS", 0.1)
-
-
 @pytest.fixture
 def app(registry):
     app = FastAPI()
@@ -666,111 +657,113 @@ def _parse_sse_block(block: str) -> tuple[str | None, dict | None]:
     return event_name, data
 
 
-async def _read_until_event(line_iter, target: str, timeout: float = 3.0) -> dict:
-    """Read SSE blocks from a shared line iterator until one matches the target
-    event name; return its data. httpx allows streaming the body only once, so a
-    single iterator must be threaded through all reads on a response."""
-    buffer = ""
-    while True:
-        line = await asyncio.wait_for(line_iter.__anext__(), timeout=timeout)
-        if line == "":
-            event_name, data = _parse_sse_block(buffer)
-            buffer = ""
-            if event_name == target and data is not None:
-                return data
-        else:
-            buffer += line + "\n"
+# The SSE endpoint is now a correctly *infinite* stream (it pings forever until
+# the client disconnects or the bus shuts down). httpx's ASGITransport runs the
+# app to completion and buffers the whole body before returning a response, and
+# its `receive()` only yields http.disconnect once the response is complete — so
+# it cannot exercise an open-ended stream incrementally or simulate a mid-stream
+# disconnect. We therefore drive `_event_stream` / `subscribe` directly for the
+# streaming-content behavior, and keep one HTTP-level test that ends the stream
+# via `events.shutdown()` so ASGITransport can return the buffered response.
 
 
-@pytest.mark.asyncio
-async def test_sse_empty_snapshot(app, fast_keepalive):
-    # Connecting with no jobs yields an empty snapshot. (httpx's ASGITransport
-    # sends http.disconnect right after the GET body, so we only assert the
-    # initial snapshot here; live-event delivery is covered below with a job
-    # that is already running before we connect.)
-    transport = httpx.ASGITransport(app=app)
-    async with httpx.AsyncClient(
-        transport=transport, base_url="http://test"
-    ) as http_client:
-        async with http_client.stream("GET", "/api/jobs/events") as response:
-            assert response.status_code == 200
-            assert response.headers["content-type"].startswith("text/event-stream")
-            snapshot = await _read_until_event(response.aiter_lines(), "snapshot")
-            assert snapshot == {"jobs": []}
+async def _read_stream_until(stream, target: str, timeout: float = 3.0) -> dict:
+    """Pull SSE blocks from the `_event_stream` async generator until one
+    matches `target`; return its parsed data. `timeout` bounds the whole read."""
+    deadline = asyncio.get_running_loop().time() + timeout
+    while (remaining := deadline - asyncio.get_running_loop().time()) > 0:
+        chunk = await asyncio.wait_for(stream.__anext__(), timeout=remaining)
+        event_name, data = _parse_sse_block(chunk)
+        if event_name == target and data is not None:
+            return data
+    raise AssertionError(f"did not see event '{target}' within {timeout}s")
 
 
-@pytest.mark.asyncio
-async def test_sse_snapshot_then_job_event(app, registry, fast_keepalive):
-    # Start a long-running job first, so it appears in the snapshot and keeps
-    # emitting live `job` progress events while we observe the stream.
-    job = await registry.create("noop", {"steps": 40, "sleep_per_step_seconds": 0.05})
+def _parse_sse_body(body: str) -> list[tuple[str | None, dict | None]]:
+    return [_parse_sse_block(b) for b in body.split("\n\n") if b.strip()]
+
 
+@pytest.mark.asyncio
+async def test_sse_endpoint_returns_event_stream_and_ends_on_shutdown(app, registry):
+    # Full HTTP path: correct status + content-type and an initial snapshot.
+    # The stream is infinite, and ASGITransport buffers the whole body, so we
+    # end it with events.shutdown() (the same hook the server uses on reload)
+    # to let the buffered response come back.
     transport = httpx.ASGITransport(app=app)
     async with httpx.AsyncClient(
         transport=transport, base_url="http://test"
     ) as http_client:
-        async with http_client.stream("GET", "/api/jobs/events") as response:
-            assert response.status_code == 200
-            assert response.headers["content-type"].startswith("text/event-stream")
-            lines = response.aiter_lines()
+        get = asyncio.ensure_future(http_client.get("/api/jobs/events"))
+        # Wait until the endpoint's subscription is registered, then shut the
+        # bus so the (otherwise infinite) stream returns.
+        for _ in range(300):
+            if registry.events._subscribers:
+                break
+            await asyncio.sleep(0.01)
+        else:
+            raise AssertionError("SSE subscription never registered")
+        registry.events.shutdown()
 
-            snapshot = await _read_until_event(lines, "snapshot")
-            assert [j["id"] for j in snapshot["jobs"]] == [job.id]
+        response = await asyncio.wait_for(get, timeout=3.0)
+        assert response.status_code == 200
+        assert response.headers["content-type"].startswith("text/event-stream")
+        blocks = _parse_sse_body(response.text)
+        assert ("snapshot", {"jobs": []}) in blocks
 
-            data = await _read_until_event(lines, "job")
-            assert data["id"] == job.id
-            assert data["type"] == "noop"
 
-    await _safe_cancel(registry, job.id)
+@pytest.mark.asyncio
+async def test_event_stream_emits_keepalive_ping(registry, monkeypatch):
+    # The keepalive is the regression we fixed: a timeout must yield a `: ping`
+    # comment WITHOUT finalizing the generator, so MANY pings arrive over time.
+    monkeypatch.setattr(jobs_api, "KEEPALIVE_SECONDS", 0.05)
+    stream = jobs_api._event_stream(job_id=None, type_name=None, project_id=None)
+    try:
+        first = await asyncio.wait_for(stream.__anext__(), timeout=3.0)
+        assert first.startswith("event: snapshot\n")
+        # Two consecutive pings prove the stream survives repeated timeouts.
+        for _ in range(2):
+            chunk = await asyncio.wait_for(stream.__anext__(), timeout=3.0)
+            assert chunk == ": ping\n\n"
+    finally:
+        await stream.aclose()
 
 
 @pytest.mark.asyncio
-async def test_sse_filters_by_job_id(app, registry, fast_keepalive):
-    # Both jobs run; only `target`'s events should reach a job_id-filtered stream.
+async def test_event_stream_filters_by_job_id(registry):
+    # Both jobs run; only `target`'s events reach a job_id-filtered stream.
     other = await registry.create("noop", {"steps": 40, "sleep_per_step_seconds": 0.05})
     target = await registry.create(
         "noop", {"steps": 40, "sleep_per_step_seconds": 0.05}
     )
-
-    transport = httpx.ASGITransport(app=app)
-    async with httpx.AsyncClient(
-        transport=transport, base_url="http://test"
-    ) as http_client:
-        async with http_client.stream(
-            "GET", "/api/jobs/events", params={"job_id": target.id}
-        ) as response:
-            lines = response.aiter_lines()
-            snapshot = await _read_until_event(lines, "snapshot")
-            snapshot_ids = {j["id"] for j in snapshot["jobs"]}
-            assert target.id in snapshot_ids
-            assert other.id not in snapshot_ids
-
-            # The progress event that arrives is for the target, never `other`.
-            data = await _read_until_event(lines, "job")
-            assert data["id"] == target.id
-
+    stream = jobs_api._event_stream(job_id=target.id, type_name=None, project_id=None)
+    try:
+        snapshot = await _read_stream_until(stream, "snapshot")
+        snapshot_ids = {j["id"] for j in snapshot["jobs"]}
+        assert target.id in snapshot_ids
+        assert other.id not in snapshot_ids
+
+        # The first live `job` event that arrives is for the target, never `other`.
+        data = await _read_stream_until(stream, "job")
+        assert data["id"] == target.id
+    finally:
+        await stream.aclose()
     await _safe_cancel(registry, other.id)
     await _safe_cancel(registry, target.id)
 
 
 @pytest.mark.asyncio
-async def test_sse_disconnect_leaves_job_running(app, registry, fast_keepalive):
+async def test_event_stream_disconnect_leaves_job_running(registry):
     """The decoupling guarantee: dropping the SSE stream mid-run must NOT stop
-    the job. Only explicit cancel/pause stops a job."""
+    the job. Only explicit cancel/pause stops a job. Closing the generator is
+    exactly what CancellableStreamingResponse does on a real client disconnect."""
     job = await registry.create("noop", {"steps": 6, "sleep_per_step_seconds": 0.05})
 
-    transport = httpx.ASGITransport(app=app)
-    async with httpx.AsyncClient(
-        transport=transport, base_url="http://test"
-    ) as http_client:
-        async with http_client.stream("GET", "/api/jobs/events") as response:
-            lines = response.aiter_lines()
-            await _read_until_event(lines, "snapshot")
-            # Observe at least one live job event so we know the run is underway.
-            await _read_until_event(lines, "job")
-        # Exiting the `stream` context drops the client connection, which cancels
-        # the SSE subscription generator (CancellableStreamingResponse). The job
-        # task lives in the registry and must keep running.
+    stream = jobs_api._event_stream(job_id=None, type_name=None, project_id=None)
+    await _read_stream_until(stream, "snapshot")
+    # Observe at least one live job event so we know the run is underway.
+    await _read_stream_until(stream, "job")
+    # Simulate the client disconnecting mid-stream.
+    await stream.aclose()
 
     assert registry._jobs[job.id].status in (
         BackgroundJobStatus.RUNNING,
diff --git a/app/desktop/studio_server/jobs/test_events.py b/app/desktop/studio_server/jobs/test_events.py
index 2a60e3f2f..95eb19b2c 100644
--- a/app/desktop/studio_server/jobs/test_events.py
+++ b/app/desktop/studio_server/jobs/test_events.py
@@ -88,3 +88,57 @@ async def test_filter_by_type_and_job_id():
     assert event.data["id"] == "j_target000001"
 
     await gen.aclose()
+
+
+@pytest.mark.asyncio
+async def test_keepalive_ping_does_not_finalize_generator():
+    # Regression: the timeout must yield `ping` events from inside the generator
+    # and keep it alive, not finalize it after the first one.
+    bus = JobEventBus(snapshot_provider=lambda: [])
+    gen = bus.subscribe(timeout=0.02)
+    assert (await _next_event(gen)).event == "snapshot"
+    assert (await _next_event(gen)).event == "ping"
+    assert (await _next_event(gen)).event == "ping"
+
+    # A real event still flows after pings.
+    bus.publish_job(_record("j_after000001"))
+    event = await _next_event(gen)
+    assert event.event == "job"
+    assert event.data["id"] == "j_after000001"
+
+    await gen.aclose()
+
+
+@pytest.mark.asyncio
+async def test_shutdown_ends_open_stream_and_rejects_new_ones():
+    bus = JobEventBus(snapshot_provider=lambda: [])
+    gen = bus.subscribe()
+    assert (await _next_event(gen)).event == "snapshot"
+
+    # shutdown() pushes a close sentinel so the open generator returns.
+    bus.shutdown()
+    with pytest.raises(StopAsyncIteration):
+        await _next_event(gen)
+
+    # A subscription opened after shutdown ends immediately (no snapshot).
+    gen2 = bus.subscribe()
+    with pytest.raises(StopAsyncIteration):
+        await _next_event(gen2)
+
+
+@pytest.mark.asyncio
+async def test_shutdown_unblocks_subscriber_waiting_without_timeout():
+    # With no keepalive timeout the subscriber blocks on queue.get(); shutdown()
+    # must still wake it so a hot reload isn't held open.
+    bus = JobEventBus(snapshot_provider=lambda: [])
+    gen = bus.subscribe()  # timeout=None
+    assert (await _next_event(gen)).event == "snapshot"
+
+    async def _drain():
+        with pytest.raises(StopAsyncIteration):
+            await gen.__anext__()
+
+    waiter = asyncio.ensure_future(_drain())
+    await asyncio.sleep(0)  # let the waiter block on queue.get()
+    bus.shutdown()
+    await asyncio.wait_for(waiter, timeout=1.0)

From 619a915a5597699d00f4d6b77b8ab93251acf1dd Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 18 Jun 2026 00:21:16 +0800
Subject: [PATCH 15/26] refactor: ui redesign for jobs

---
 app/desktop/studio_server/jobs/api.py         | 11 +--
 .../src/lib/components/jobs_dialog.svelte     | 13 +--
 .../src/lib/components/jobs_table.svelte      | 86 ++++++++++---------
 app/web_ui/src/lib/stores/job_status.test.ts  | 68 +++++++++++++--
 app/web_ui/src/lib/stores/job_status.ts       | 34 ++++++--
 app/web_ui/src/lib/ui/dialog.svelte           | 12 +--
 6 files changed, 149 insertions(+), 75 deletions(-)

diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py
index 85962efbf..e2479f475 100644
--- a/app/desktop/studio_server/jobs/api.py
+++ b/app/desktop/studio_server/jobs/api.py
@@ -16,11 +16,7 @@
 from . import error_log
 from .events import JobEvent
 from .models import BackgroundJobStatus, JobRecord
-from .registry import (
-    JobNotFoundError,
-    JobOperationError,
-    job_registry,
-)
+from .registry import JobNotFoundError, JobOperationError, job_registry
 from .workers.eval import EvalJobWorker
 from .workers.noop import NoopJobWorker
 
@@ -80,9 +76,8 @@ async def _event_stream(
     (client disconnect, via CancellableStreamingResponse) only unsubscribes from
     the bus — it never touches any job's supervising task. Jobs keep running.
     """
-    # The keepalive timeout lives inside subscribe() (yields a "ping" event),
-    # NOT here via asyncio.wait_for on __anext__(): cancelling the generator's
-    # __anext__() finalizes it, so the stream would die after the first ping.
+    # subscribe() handles the keepalive itself, yielding a "ping" event after
+    # `timeout` idle seconds.
     subscription: AsyncGenerator[JobEvent, None] = job_registry.events.subscribe(
         job_id=job_id,
         type_name=type_name,
diff --git a/app/web_ui/src/lib/components/jobs_dialog.svelte b/app/web_ui/src/lib/components/jobs_dialog.svelte
index fcdf1ccfd..65fcb54d9 100644
--- a/app/web_ui/src/lib/components/jobs_dialog.svelte
+++ b/app/web_ui/src/lib/components/jobs_dialog.svelte
@@ -18,11 +18,12 @@
   }
 </script>
 
-<Dialog bind:this={dialog} title="Jobs" width="extra-wide">
-  <p class="text-sm font-light mb-4">
-    <a href="/jobs" class="link" on:click={() => dialog?.close()}
-      >View full page →</a
-    >
-  </p>
+<Dialog
+  bind:this={dialog}
+  title="Jobs"
+  width="wide"
+  sub_subtitle="View full page →"
+  sub_subtitle_link="/jobs"
+>
   <JobsTable />
 </Dialog>
diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte
index 73437cc94..7919f31f4 100644
--- a/app/web_ui/src/lib/components/jobs_table.svelte
+++ b/app/web_ui/src/lib/components/jobs_table.svelte
@@ -1,14 +1,13 @@
 <script lang="ts">
   import Dialog from "$lib/ui/dialog.svelte"
   import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
-  import CloseIcon from "$lib/ui/icons/close_icon.svelte"
   import { jobs, synced, connection } from "$lib/stores/jobs_store"
   import {
     available_actions,
     completed_jobs,
     is_terminal,
-    job_status_badge_class,
-    job_status_display,
+    job_status_display_badge_class,
+    job_status_display_label,
     progress_label,
     progress_percent,
     type JobAction,
@@ -198,54 +197,61 @@
     <table class="table">
       <thead>
         <tr>
-          <th>ID</th>
-          <th>Type</th>
+          <th>Details</th>
           <th>Status</th>
-          <th>Progress</th>
-          <th>Message</th>
-          <th>Created</th>
-          <th class="text-right">Actions</th>
+          <th class="text-right"></th>
         </tr>
       </thead>
       <tbody>
         {#each $jobs as job (job.id)}
           <tr>
-            <td class="font-mono text-xs text-gray-500 whitespace-nowrap"
-              >{job.id}</td
-            >
-            <td class="font-medium">{job_type_display(job.type)}</td>
-            <td>
-              <span class="badge {job_status_badge_class(job.status)}">
-                {job_status_display(job.status)}
-              </span>
+            <td class="whitespace-nowrap">
+              <div class="flex flex-col gap-1">
+                <span class="font-medium">{job_type_display(job.type)}</span>
+                <span class="font-mono text-xs text-gray-500">{job.id}</span>
+                <span class="text-xs text-gray-500"
+                  >{formatDate(job.created_at)}</span
+                >
+              </div>
             </td>
             <td>
-              <div class="flex flex-col gap-1 min-w-32">
-                <span class="text-sm">{progress_label(job.progress)}</span>
-                {#if job.progress?.total}
+              <div class="flex flex-col gap-2 w-full max-w-[360px] min-w-48">
+                <span
+                  class="badge px-3 py-1 self-start {job_status_display_badge_class(
+                    job,
+                  )}"
+                >
+                  {job_status_display_label(job)}
+                </span>
+                <div class="flex items-center justify-between text-gray-500">
+                  {#if job.status === "running"}
+                    <span>{progress_percent(job.progress)}% Complete</span>
+                  {/if}
+                  {#if job.progress?.total}
+                    <span>{progress_label(job.progress)}</span>
+                  {/if}
+                </div>
+                {#if progress_percent(job.progress) < 100}
                   <progress
-                    class="progress progress-primary w-32 h-1.5"
+                    class="progress progress-primary bg-base-200 w-full h-2"
                     value={progress_percent(job.progress)}
                     max="100"
                   ></progress>
+                  {#if failure_error(job)?.error}
+                    <span
+                      class="font-mono text-sm text-error block truncate"
+                      title={failure_error(job)?.error}
+                      >{failure_error(job)?.error}</span
+                    >
+                  {:else if job.progress?.message}
+                    <span
+                      class="font-mono text-sm text-gray-500 block truncate"
+                      title={job.progress.message}>{job.progress.message}</span
+                    >
+                  {/if}
                 {/if}
               </div>
             </td>
-            <td class="text-sm text-gray-500 max-w-48">
-              {#if failure_error(job)?.error}
-                <span
-                  class="text-error block truncate"
-                  title={failure_error(job)?.error}
-                  >{failure_error(job)?.error}</span
-                >
-              {:else}
-                <span class="block truncate">{job.progress?.message || ""}</span
-                >
-              {/if}
-            </td>
-            <td class="text-sm text-gray-500 whitespace-nowrap">
-              {formatDate(job.created_at)}
-            </td>
             <td>
               <div
                 class="flex flex-row gap-1 justify-end flex-wrap items-center"
@@ -255,7 +261,7 @@
                     class="btn btn-xs btn-ghost"
                     on:click={() => open_result(job)}
                   >
-                    Result
+                    View results
                   </button>
                 {/if}
                 {#if has_errors(job)}
@@ -263,19 +269,19 @@
                     class="btn btn-xs btn-ghost"
                     on:click={() => open_errors(job)}
                   >
-                    Errors
+                    View errors
                   </button>
                 {/if}
                 {#each available_actions(job) as action}
                   {#if action === "delete"}
                     <button
-                      class="btn btn-xs btn-ghost btn-square text-error"
+                      class="btn btn-xs btn-ghost"
                       disabled={in_flight[job.id]}
                       aria-label="Dismiss job"
                       title="Dismiss job"
                       on:click={() => run_action(action, job.id)}
                     >
-                      <span class="w-4 h-4 block"><CloseIcon /></span>
+                      Clear
                     </button>
                   {:else}
                     <button
diff --git a/app/web_ui/src/lib/stores/job_status.test.ts b/app/web_ui/src/lib/stores/job_status.test.ts
index 01bf1d708..8b8a5d0dc 100644
--- a/app/web_ui/src/lib/stores/job_status.test.ts
+++ b/app/web_ui/src/lib/stores/job_status.test.ts
@@ -4,8 +4,11 @@ import {
   completed_jobs,
   is_active,
   is_terminal,
+  job_completed_with_errors,
   job_status_badge_class,
   job_status_display,
+  job_status_display_badge_class,
+  job_status_display_label,
   jobs_indicator,
   progress_label,
   progress_percent,
@@ -81,12 +84,12 @@ describe("available_actions", () => {
 
 describe("job_status_display / job_status_badge_class", () => {
   const cases: [BackgroundJobStatus, string, string][] = [
-    ["pending", "Pending", "badge-ghost"],
-    ["running", "Running", "badge-info"],
-    ["paused", "Paused", "badge-warning"],
-    ["succeeded", "Succeeded", "badge-success"],
-    ["failed", "Failed", "badge-error"],
-    ["cancelled", "Cancelled", "badge-ghost"],
+    ["pending", "Pending", "badge-outline"],
+    ["running", "Running", "badge-outline badge-success"],
+    ["paused", "Paused", "badge-outline badge-warning"],
+    ["succeeded", "Succeeded", "badge-outline badge-primary"],
+    ["failed", "Failed", "badge-outline badge-error"],
+    ["cancelled", "Cancelled", "badge-outline"],
   ]
   it.each(cases)("maps %s", (status, label, badge) => {
     expect(job_status_display(status)).toBe(label)
@@ -94,6 +97,59 @@ describe("job_status_display / job_status_badge_class", () => {
   })
 })
 
+describe("job_completed_with_errors / display helpers", () => {
+  it("is true only when succeeded with a positive error count", () => {
+    expect(
+      job_completed_with_errors(
+        makeJob({ status: "succeeded", progress: { success: 8, error: 2 } }),
+      ),
+    ).toBe(true)
+  })
+
+  it("is false when succeeded without errors", () => {
+    expect(
+      job_completed_with_errors(
+        makeJob({ status: "succeeded", progress: { success: 10, error: 0 } }),
+      ),
+    ).toBe(false)
+  })
+
+  it("is false for non-succeeded statuses even with errors", () => {
+    expect(
+      job_completed_with_errors(
+        makeJob({ status: "running", progress: { success: 1, error: 3 } }),
+      ),
+    ).toBe(false)
+    expect(
+      job_completed_with_errors(
+        makeJob({ status: "failed", progress: { success: 1, error: 3 } }),
+      ),
+    ).toBe(false)
+  })
+
+  it("derives label and badge for completed-with-errors", () => {
+    const job = makeJob({
+      status: "succeeded",
+      progress: { success: 8, error: 2 },
+    })
+    expect(job_status_display_label(job)).toBe("Completed with errors")
+    expect(job_status_display_badge_class(job)).toBe(
+      "badge-outline badge-error",
+    )
+  })
+
+  it("falls back to plain status display when there are no errors", () => {
+    const job = makeJob({
+      status: "succeeded",
+      progress: { success: 10, error: 0 },
+    })
+    expect(job_status_display_label(job)).toBe("Succeeded")
+    expect(job_status_display_badge_class(job)).toBe(
+      "badge-outline badge-primary",
+    )
+  })
+})
+
 describe("progress_label", () => {
   it("shows count only when total is null", () => {
     expect(progress_label({ success: 3, error: 0 })).toBe("3")
diff --git a/app/web_ui/src/lib/stores/job_status.ts b/app/web_ui/src/lib/stores/job_status.ts
index 9003ab315..6e58dfc92 100644
--- a/app/web_ui/src/lib/stores/job_status.ts
+++ b/app/web_ui/src/lib/stores/job_status.ts
@@ -44,17 +44,17 @@ export function job_status_display(status: BackgroundJobStatus): string {
 export function job_status_badge_class(status: BackgroundJobStatus): string {
   switch (status) {
     case "running":
-      return "badge-info"
+      return "badge-outline badge-success"
     case "succeeded":
-      return "badge-success"
+      return "badge-outline badge-primary"
     case "failed":
-      return "badge-error"
+      return "badge-outline badge-error"
     case "paused":
-      return "badge-warning"
+      return "badge-outline badge-warning"
     case "pending":
-      return "badge-ghost"
+      return "badge-outline"
     case "cancelled":
-      return "badge-ghost"
+      return "badge-outline"
     default: {
       const exhaustive: never = status
       return exhaustive
@@ -62,6 +62,28 @@ export function job_status_badge_class(status: BackgroundJobStatus): string {
   }
 }
 
+// A job that finished successfully but logged one or more non-fatal per-item
+// errors. Like RAG's `completed_with_errors`, this is a frontend-derived display
+// state only — the backend status stays `succeeded` and the error detail lives
+// in the per-run error log. No worker/backend change is needed.
+export function job_completed_with_errors(job: JobRecord): boolean {
+  return job.status === "succeeded" && (job.progress?.error ?? 0) > 0
+}
+
+export function job_status_display_label(job: JobRecord): string {
+  if (job_completed_with_errors(job)) {
+    return "Completed with errors"
+  }
+  return job_status_display(job.status)
+}
+
+export function job_status_display_badge_class(job: JobRecord): string {
+  if (job_completed_with_errors(job)) {
+    return "badge-outline badge-error"
+  }
+  return job_status_badge_class(job.status)
+}
+
 export type JobAction = "pause" | "resume" | "cancel" | "delete"
 
 // The set of lifecycle actions valid for a job given its status and whether
diff --git a/app/web_ui/src/lib/ui/dialog.svelte b/app/web_ui/src/lib/ui/dialog.svelte
index 28cc0fbf8..bef5490e8 100644
--- a/app/web_ui/src/lib/ui/dialog.svelte
+++ b/app/web_ui/src/lib/ui/dialog.svelte
@@ -11,7 +11,7 @@
   export let sub_subtitle: string | null = null
   export let sub_subtitle_link: string | null = null
   export let blur_background: boolean = false
-  export let width: "normal" | "wide" | "extra-wide" = "normal"
+  export let width: "normal" | "wide" = "normal"
   const id: string = "dialog-" + Math.random().toString(36)
   type ActionButton = {
     label: string
@@ -89,15 +89,9 @@
   on:close={() => dispatch("close")}
   on:cancel={(e) => dispatch("cancel", e)}
 >
-  <div
-    class="modal-box {width === 'extra-wide'
-      ? 'w-11/12 max-w-7xl'
-      : width === 'wide'
-        ? 'w-11/12 max-w-3xl'
-        : ''}"
-  >
+  <div class="modal-box {width === 'wide' ? 'w-11/12 max-w-3xl' : ''}">
     <!-- Hidden div to force the compiler to find these classes -->
-    <div class="hidden w-11/12 max-w-3xl max-w-7xl"></div>
+    <div class="hidden w-11/12 max-w-3xl"></div>
     <div class="flex flex-row gap-2 items-start">
       <div
         class="grow flex flex-col {center_content

From a440f19ee22d112e8a0a43ac679fb4b00f4ab192 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 18 Jun 2026 00:32:17 +0800
Subject: [PATCH 16/26] fix: drop unsatisfiable networkidle wait in
 docs-library e2e test

The always-on jobs SSE stream (/api/jobs/events) keeps a network
connection open for the page lifetime, so waitForLoadState("networkidle")
never settles. Remove it; the breadcrumb assertion already auto-waits.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/web_ui/tests/e2e/act/discover/docs-library.spec.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/app/web_ui/tests/e2e/act/discover/docs-library.spec.ts b/app/web_ui/tests/e2e/act/discover/docs-library.spec.ts
index e02cd3c1f..77fa9a18b 100644
--- a/app/web_ui/tests/e2e/act/discover/docs-library.spec.ts
+++ b/app/web_ui/tests/e2e/act/discover/docs-library.spec.ts
@@ -388,8 +388,10 @@ test.describe("Document library page", () => {
     const { project } = seededProjectWithTask
 
     await page.goto(`/docs/library/${project.id}`)
-    await page.waitForLoadState("networkidle")
 
+    // Note: don't wait for "networkidle" here — the app holds an always-on jobs
+    // SSE stream (/api/jobs/events) open for the lifetime of the page, so the
+    // network never goes idle. The assertion below auto-waits regardless.
     const breadcrumb = page
       .locator(".breadcrumbs")
       .getByRole("link", { name: "Docs & Search", exact: true })

From 406ce320d6019cd7fefc39a99ae209e97f2aad10 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Thu, 18 Jun 2026 00:34:28 +0800
Subject: [PATCH 17/26] chore: delete spec project

---
 .../background_job_system/architecture.md     | 108 ------
 .../background_job_system/functional_spec.md  | 350 ------------------
 .../implementation_plan.md                    |  14 -
 .../phase_plans/phase_1.md                    | 186 ----------
 .../phase_plans/phase_2.md                    | 169 ---------
 .../phase_plans/phase_3.md                    | 133 -------
 .../phase_plans/phase_4.md                    | 200 ----------
 .../background_job_system/project_overview.md |  48 ---
 8 files changed, 1208 deletions(-)
 delete mode 100644 specs/projects/background_job_system/architecture.md
 delete mode 100644 specs/projects/background_job_system/functional_spec.md
 delete mode 100644 specs/projects/background_job_system/implementation_plan.md
 delete mode 100644 specs/projects/background_job_system/phase_plans/phase_1.md
 delete mode 100644 specs/projects/background_job_system/phase_plans/phase_2.md
 delete mode 100644 specs/projects/background_job_system/phase_plans/phase_3.md
 delete mode 100644 specs/projects/background_job_system/phase_plans/phase_4.md
 delete mode 100644 specs/projects/background_job_system/project_overview.md

diff --git a/specs/projects/background_job_system/architecture.md b/specs/projects/background_job_system/architecture.md
deleted file mode 100644
index 1d3b4ad9d..000000000
--- a/specs/projects/background_job_system/architecture.md
+++ /dev/null
@@ -1,108 +0,0 @@
----
-status: complete
----
-
-# Architecture: Background Job System
-
-Internal mechanics. The externally observable surface (record shape, REST API, SSE events, state machine, pause/resume semantics, worker contract) is in `functional_spec.md`. This doc covers state management, concurrency, the (non-)recovery story, code layout, and open items to verify during implementation.
-
-## 1. JobRegistry
-
-Singleton per process. Responsibilities:
-
-- Type registration (`register_type(WorkerClass)`).
-- In-memory index `{job_id → JobRecord}` — the only store. Starts empty on each process boot.
-- Supervising asyncio task per running job (`asyncio.Task` tracked in a dict). Its lifetime is owned entirely by the registry and is **decoupled from any HTTP request or SSE connection** — created at `create`/`resume`, ended only by completion or an explicit `cancel`/`pause`. Closing the web UI or dropping the SSE stream has no effect on it.
-- Global semaphore for max-concurrent `running` jobs (configurable; default 10).
-- Pub/sub bus that feeds the SSE endpoint.
-- Progress coalescing: rapid `report_progress` calls update the in-memory record freely but may be throttled before emitting an SSE `job` event (so a 500-item eval doesn't flood subscribers). Status transitions emit immediately.
-- **Reconciliation:** at every lifecycle transition (start, pause, resume) and on status reads (`GET /api/jobs/{id}`), call the worker's `compute_state(params)`, reconcile the in-memory snapshot against the derived truth, and emit a `job` event if it changed. This is what keeps the believed state honest without persistence. If `compute_state` returns `None` (fixture with no source of truth), keep the believed snapshot.
-- **Per-run identity & error log:** mint a fresh `run_id` (UUID) on each `run()` invocation and stamp it on the record. Route `ctx.report_error(...)` calls (and the final exception on a failed run) to an append-only JSON file keyed by that `run_id` in the OS temp dir. All file IO here is best-effort — a failed write or missing file never propagates.
-- Lifecycle methods: `create`, `pause`, `resume`, `cancel`, `delete`.
-
-## 2. State management (no persistence)
-
-There is no disk persistence. The in-memory index is the registry's entire store, and it is never the source of truth — it is a best-effort view of operations whose authoritative state lives in the Kiln project entities they touch (eval runs, task runs, etc.).
-
-```
-source of truth   →  Kiln project entities (eval runs, task runs, ...)
-                        │  worker.compute_state(params) reads these
-                        ▼
-registry view      →  in-memory {job_id → JobRecord}, reconciled against
-                       compute_state at transitions / status reads; lost on restart
-```
-
-**Why no files.** Job records are transient visibility/control data. Persisting them would create a second, drifting copy of state that we'd then have to reconcile against the real entities. Instead we lean on the idempotency contract and `compute_state`: every worker can re-derive "what's done" from the project, so the registry never needs to remember anything across a restart — and the in-memory snapshot self-corrects whenever it's recomputed.
-
-- **Project scope.** The record carries `project_id` purely for filtering (`GET /api/jobs?project_id=`, SSE filter). It does not dictate any storage location, because there is no storage.
-- **Result.** The `result` field holds a small in-memory summary; the actual output already lives in the entities the worker wrote. No sibling result file, no size threshold.
-- **Coalescing, not flushing.** Any debouncing applies only to SSE emission frequency — there are no disk writes to debounce.
-
-**State vs. diagnostics — the one allowed file.** The "no persistence" rule is about *state*: status/progress must stay derivable, never copied to disk. Error *messages* are not state — they're diagnostic spillover with no representation in the Kiln entities. Keeping them in the long-lived registry forever would leak memory, so they spool to an ephemeral, per-`run_id` JSON file in the OS temp dir (`{tempfile.gettempdir()}/kiln_jobs/{run_id}.json`). This doesn't reintroduce a competing source of truth: the file is non-authoritative, the OS may delete it, and every reader treats "missing" as "empty." It's the single deliberate exception, scoped to bulky diagnostics that can't live in memory.
-
-## 3. Concurrency
-
-- One global asyncio semaphore caps `running` jobs (default 10, configurable via env var, e.g. `KILN_JOBS_MAX_CONCURRENT=10`).
-- Excess jobs stay in `pending` until a slot frees. Order: FIFO by `created_at`.
-- Per-type caps are not in v1 but the registry should keep the door open (`{type: semaphore}` map ready to grow).
-- Cancellation = `asyncio.Task.cancel()` from outside; the registry transitions state in-memory and emits the SSE event. `pause` and `cancel` share the same cancellation mechanism, differing only in the resulting state.
-
-## 4. Restart behavior (no recovery)
-
-There is nothing to recover. On process restart the in-memory index starts empty, so every prior job record is simply gone — including any that were `running` or `paused`. There is no orphan scan, no `interrupted` state, no rehydration step.
-
-This is safe precisely because of the idempotency contract: the operation's true state still lives in the Kiln entities. To "recover," the user just re-triggers the job; on start the registry calls `compute_state` to seed the real progress, and `run()` continues from where the project actually left off, without duplicating completed work.
-
-If cross-restart *visibility* into past jobs is ever wanted, it should be reconstructed by querying the Kiln entities (e.g. "show me recent eval runs"), not by persisting job records — that keeps a single source of truth.
-
-## 5. Code layout (suggested)
-
-```
-Kiln/app/desktop/studio_server/jobs/
-  __init__.py
-  registry.py       # JobRegistry singleton: in-memory index, semaphore, supervising tasks, lifecycle, reconciliation
-  models.py         # JobRecord, JobProgress, JobDerivedState, JobStatus, JobContext, JobWorker base
-  events.py         # in-process pub/sub bus
-  error_log.py      # per-run error log: append / read / delete by run_id, in the OS temp dir
-  api.py            # FastAPI router: create/list/get/result/errors/pause/resume/cancel/delete + SSE
-  workers/
-    __init__.py
-    noop.py         # NoopJobWorker
-    eval.py         # EvalJobWorker
-```
-
-No `persistence.py` — the registry is purely in-memory. `error_log.py` is the one module that touches disk, and only for ephemeral, best-effort diagnostic logs (§2), never for state.
-
-Registration happens once at server startup (alongside the existing route registration), e.g.:
-
-```python
-job_registry.register_type(NoopJobWorker)
-job_registry.register_type(EvalJobWorker)
-```
-
-Frontend (Svelte) — out of strict scope for this spec, but the natural shape:
-
-```
-Kiln/app/web_ui/src/lib/jobs/
-  jobs_store.ts          # subscribes to /api/jobs/events
-  api.ts                 # thin REST client
-Kiln/app/web_ui/src/routes/(app)/jobs/+page.svelte    # jobs panel
-Kiln/app/web_ui/src/lib/components/SidebarJobsBadge.svelte
-```
-
-## 6. Open items — verify during implementation
-
-Sensible defaults are listed; flip them if the code disagrees.
-
-1. **`EvalRunner` idempotency — CONFIRMED.** Verified against the code: `EvalRunner.collect_tasks_for_task_run_eval()` builds an `already_run` set from existing `EvalRun` children (keyed by `(eval_config_id, task_run_config_id, dataset_id)`) and excludes already-run triples (`libs/core/kiln_ai/adapters/eval/eval_runner.py` ~L147–173). So re-running skips completed items and never writes duplicate result entities. EvalJob is therefore idempotent → **`supports_pause = True`**. Pause is a hard task-cancel mid-run; resume re-invokes `run()` and EvalRunner re-collects only the unfinished items. This is the *same* cancellation the legacy `run_comparison` endpoint already performs on client disconnect, so it carries no new corruption risk. `compute_state` counts `EvalRun`s whose `task_run_config_id` matches, against the eval's dataset-filter size for `total`. (Runtime errors aren't persisted as entities — a failed item simply isn't saved — so derived `error` is 0; the live `error` count comes from `Progress.errors` during the run.)
-2. **Multi-project scope — RESOLVED.** Nothing is persisted, so there's no startup scan. A single in-session registry tracks every job regardless of project; `project_id` is an optional filter on list/SSE. For eval jobs `project_id` comes from `EvalJobParams.project_id`; for noop it's `null`.
-3. **Active-project hook — RESOLVED.** The local server has **no** server-side "active project" to default to (confirmed: `project_id` is always an explicit identifier; the active project is frontend UI state `$ui_state.current_project_id`). Don't invent one. `?project_id=` is a plain optional filter (omitted = all jobs); the frontend passes its current `$ui_state.current_project_id`.
-4. **Auth — RESOLVED.** Studio-server routes use no FastAPI auth dependency; they mark agent-callability via `openapi_extra` policy constants (`ALLOW_AGENT`, etc.) as `eval_api.py` does. Mirror that convention; introduce no new scheme.
-5. **Max-concurrent default.** Set to 10; expose as env var `KILN_JOBS_MAX_CONCURRENT`. Revisit if mixed job types (e.g. evals + future bandwidth-heavy syncs) starve each other; per-type caps then.
-6. **Job ID format.** `j_{12-char-base32-lowercase}` (e.g. `j_a1b2c3d4e5f6`). Compact, grep-friendly, collision space is fine for local-only.
-7. **Delete policy.** Allowed only on terminal status (`succeeded`, `failed`, `cancelled`). Paused jobs must be cancelled or resumed-then-terminal first. (No `interrupted` state exists.)
-8. **`compute_state` read cost.** Reconciliation calls `compute_state` on every status read, which reads Kiln entities from disk. For a frequently-polled jobs panel this could get expensive. Default: recompute on lifecycle transitions and on explicit `GET /api/jobs/{id}`, but let the SSE stream ride on `report_progress` deltas between recomputations (don't recompute per progress tick). If polling proves hot, add a short TTL cache on the derived state. Confirm `compute_state` for `EvalJob` is cheap enough (a count query, not a full re-score).
-9. **SSE keepalive / heartbeat.** Match whatever the existing chat / eval SSE endpoints do. If unclear, send a `: ping\n\n` comment every 15s to keep proxies happy.
-10. **Error-message capture for `EvalJob`.** The error *count* is easy (`progress.errors`). Whether we can capture per-item error *messages* via `report_error` depends on whether `EvalRunner` surfaces individual failures (vs. just counting them). If it doesn't, the `/errors` endpoint stays empty for evals until `EvalRunner` exposes them — acceptable for v1; the `NoopJob` fixture still exercises the full error-log path. Tie-in with open item #1.
-11. **Error-log file format & cleanup.** Default to append-friendly JSON Lines internally (`{tempdir}/kiln_jobs/{run_id}.json`, one error object per line), parsed into a JSON array on read. `DELETE /api/jobs/{id}` best-effort removes the current run's file; past-run files in `/tmp` are left to the OS to reap. Confirm the temp subdir is created lazily and writes never block the worker (consider a background writer if `report_error` volume is high).
-12. **Git-sync for background eval jobs — RESOLVED via option C.** Tension surfaced during Phase 3: the legacy eval-run endpoint lives under `/api/projects/...`, where `GitSyncMiddleware` + `build_save_context(request)` wrap each `EvalRun` save in `manager.atomic_write` (git commit/push). Background jobs are deliberately request-decoupled (a core design goal) and write `EvalRun`s from a registry-owned task, so the original worker passed `save_context=None` and those writes were **not** committed/pushed (and could be stashed away by a later `ensure_clean`). **Resolution:** `app/desktop/git_sync/save_context.py` adds a request-free `save_context_for_project(project_id, context) -> SaveContext | None` (and `get_manager_for_project`) that mirrors the middleware's `_get_manager_for_request` resolution (config keyed by `project_path`, manager by `clone_path`, via the shared `GitSyncRegistry.get_or_create`), returning `None` for every "not auto-sync" branch. `EvalJobWorker._build_eval_runner` passes this through, so each `EvalRun` is committed/pushed per item — converging to the same behavior as the legacy SSE path (which already runs at concurrency 25 through the same non-reentrant per-project lock; contention, not deadlock). For non-auto-sync projects (the default) it stays a no-op, identical to before. The resolution logic is intentionally duplicated from the middleware (a clean delegating refactor would break the middleware's test patches); both copies carry a "keep in sync" note.
diff --git a/specs/projects/background_job_system/functional_spec.md b/specs/projects/background_job_system/functional_spec.md
deleted file mode 100644
index 479874878..000000000
--- a/specs/projects/background_job_system/functional_spec.md
+++ /dev/null
@@ -1,350 +0,0 @@
----
-status: complete
----
-
-# Functional Spec: Background Job System
-
-This doc captures the externally observable behavior of the job system: the job record shape, the worker contract, the state machine, the REST API, and the SSE stream. Internal mechanics (concurrency primitives, code layout) live in `architecture.md`.
-
-**Core principle.** A job record is ephemeral, in-memory bookkeeping — for visibility and control only. It is **never** a source of truth and is never persisted to disk. The authoritative state of whatever the job is doing lives in the Kiln project entities it reads/writes (eval runs, task runs, etc.). Workers must be idempotent (see §2).
-
-The believed status/progress in the record is **recomputed from source of truth**, not accumulated from deltas. Each worker exposes a `compute_state(params)` method (§2) that reads the relevant Kiln entities and returns the operation's true progress and whether it's complete. The registry calls it at every lifecycle transition (start, pause, resume) and on status reads, then reconciles the in-memory snapshot against the result and emits an updated event if anything changed. Live `report_progress` calls during a run are just a smoothing layer on top for the UI between recomputations — they never override the derived truth. A snapshot may still briefly lag the true state, and the worker remains responsible for its own consistency.
-
-## 1. Job record (base shape)
-
-Lives in the registry's in-memory index; serialized to JSON only for HTTP/SSE responses (not to disk).
-
-```jsonc
-{
-  "id": "j_a1b2c3d4e5f6",
-  "type": "eval",
-  "status": "running",
-  "run_id": "8f3c1e0a-...-uuid",   /* UUID of the current/most-recent run() invocation */
-  "progress": {
-    "total":   50,
-    "success": 11,                 /* items completed without error */
-    "error":   1,                  /* items that errored (count only; messages in the error log) */
-    "message": "scoring item 12",
-    "updated_at": "2026-05-28T12:34:56Z"
-  },
-  "params":   { /* type-specific opaque JSON, validated against the type's params_model */ },
-  "result":   null,                /* small summary populated on success; detail lives in Kiln entities */
-  "error":    null,                /* populated on failure; short string + optional structured detail */
-  "metadata": {},                  /* free-form pass-through from caller; this layer never interprets it */
-  "project_id":      "p_abc",
-  "supports_pause":  true,         /* stamped at creation from the worker class */
-  "created_at":      "2026-05-28T12:30:00Z",
-  "updated_at":      "2026-05-28T12:34:56Z",
-  "started_at":      "2026-05-28T12:30:01Z",
-  "ended_at":        null
-}
-```
-
-- `type` is the discriminator. Each registered type declares typed `params_model` and `result_model` (pydantic). The base record stores them as plain JSON.
-- `progress` reports counts, not a single cursor: `total`, `success` (completed without error), `error` (errored count), and a free-text `message`. Processed = `success + error`; remaining = `total − success − error`. The `error` field is just a count — the actual error *messages* live in the per-run error log (see §1.1 below).
-- `run_id` is a fresh UUID assigned on each `run()` invocation (first run and every resume/re-trigger). It keys the per-run error log so messages from different runs of the same job don't collide. `null` before the first run.
-- `metadata` is a free-form pass-through: callers may attach arbitrary attribution (any JSON object). This layer never reads, writes, or interprets it — it just stores and returns it verbatim.
-- `result` is a **small summary** (counts, status, references to the Kiln entities that hold the detail). It is not a place to stash large blobs — the real output already lives in the project entities the worker wrote. There is no sibling result file and no size threshold.
-- No `schema_version`, no checkpoint file, no persisted *state* of any kind — records exist only while the process is alive. (The error log in §1.1 is diagnostic spillover, not state.)
-
-### 1.1 Per-run error log
-
-Error *counts* live in `progress`; error *messages* would bloat the in-memory record if kept forever, so they spill to an ephemeral file instead — never to a Kiln entity (they aren't source of truth).
-
-- **Location.** A file in the OS temp dir, keyed by `run_id` — e.g. `{tempdir}/kiln_jobs/{run_id}.json` (`tempdir` = `tempfile.gettempdir()`, so typically `/tmp/kiln_jobs/…` on Linux and a per-user `$TMPDIR` path on macOS; portable to Windows). Temp storage is deliberately non-authoritative: the OS may clear it, and that's fine.
-- **Shape.** An array of objects, each at minimum `{ "error_message": "..." }`. Objects (not bare strings) so we can add fields later (`item_id`, `timestamp`, `traceback_ref`, …) without a format break.
-- **Writing.** Workers append via `ctx.report_error(...)` (§2) for non-fatal per-item errors; the registry also appends the final exception when a `run()` raises. Append-only, so it survives a crash mid-run.
-- **Reading.** `GET /api/jobs/{id}/errors` (§5) returns the array for the job's current `run_id`. **If the file is gone, return `[]` with `200` — never an error.** This keeps the feature best-effort: logs are a debugging convenience, not a guarantee.
-
-## 2. Worker contract
-
-A worker is two methods: `compute_state()` — a pure read that derives true state from source of truth — and `run()` — the idempotent do-the-work method. There is no `resume()` and no checkpoint: pause is task cancellation, and resume is just a fresh `run()` (see §4) that re-orients itself via `compute_state()`.
-
-```python
-class JobDerivedState(BaseModel):
-    """A worker's view of the operation's true state, read from source-of-truth entities."""
-    total: int | None = None
-    success: int = 0          # completed without error
-    error: int = 0            # errored count
-    is_complete: bool = False
-    message: str | None = None
-
-
-class JobContext:
-    """Provided to the worker by JobRegistry during run()."""
-    job_id: str
-
-    async def report_progress(
-        self,
-        success: int,
-        error: int = 0,
-        total: int | None = None,
-        message: str | None = None,
-    ) -> None:
-        """Update the registry's in-memory progress snapshot and emit an SSE event.
-        Cheap to call often; a UI-smoothing signal only — the authoritative progress
-        comes from compute_state(). The registry may coalesce rapid calls before emitting."""
-
-    async def report_error(self, error_message: str, **extra) -> None:
-        """Append one structured error entry — {"error_message": ..., **extra} — to this
-        run's error log (a JSON file in the OS temp dir, keyed by run_id; see §1.1).
-        For non-fatal per-item errors that don't stop the run. Best-effort: a failed
-        write is swallowed, never propagated to the worker. Does not itself bump the
-        progress `error` count — report that via report_progress."""
-
-    # Cancellation is just asyncio.CancelledError on the supervising task —
-    # workers may catch it for cleanup, but the transition is unconditional. A worker
-    # must leave any in-flight atomic unit of work consistent before returning.
-
-
-class JobWorker(Generic[TParams, TResult]):
-    type_name: ClassVar[str]                  # discriminator value
-    params_model: ClassVar[type[BaseModel]]   # pydantic model for params
-    result_model: ClassVar[type[BaseModel]]   # pydantic model for result
-    supports_pause: ClassVar[bool] = False    # worker is idempotent & safe to cancel-and-re-run
-
-    async def compute_state(self, params: TParams) -> JobDerivedState | None:
-        """Read source-of-truth Kiln entities and return the operation's true progress
-        and whether it's already complete. MUST be a pure read — no side effects,
-        idempotent, safe to call any time (before start, while paused, on a status read).
-        This is the authority; the in-memory snapshot is reconciled against it.
-
-        Return None only when the worker has no backing entity to consult (e.g. the
-        NoopJob fixture); the registry then keeps the last believed snapshot. Real
-        workers must override this."""
-        return None
-
-    async def run(self, params: TParams, ctx: JobContext) -> TResult:
-        """MUST be idempotent. Should call compute_state() to learn what's already done,
-        then perform only the remaining work, reporting progress as it goes. This single
-        method covers both first run and resume — the registry calls run() again to resume
-        a paused job; the worker re-orients via compute_state(), not a handed-in checkpoint."""
-```
-
-**The idempotency contract is the load-bearing invariant of this system.** Because nothing is persisted and resume is just a re-run, every worker author must guarantee that calling `run()` twice (or after an interruption) does not double-write, duplicate rows, or otherwise corrupt the project. `compute_state()` is how a worker stays honest: it derives status from the project rather than trusting in-memory deltas, so the system self-corrects after interruptions, restarts, or concurrent edits. `supports_pause` advertises that a worker meets this bar *and* is safe to cancel mid-flight and re-run; default `False` is conservative.
-
-## 3. State machine
-
-```
-                    ┌─────────────┐
-                    │   pending   │
-                    └─────┬───────┘
-                          ▼
-                    ┌─────────────┐
-   ┌───────────────►│   running   │
-   │ resume         └──┬───┬───┬──┘
-   │ (re-run)          │   │   │
-   │                   ▼   ▼   ▼
-┌──┴───────┐      terminal states
-│  paused  │   ┌──────────┬─────────┬──────────┐
-└────▲─────┘   │succeeded │ failed  │cancelled │
-     │         └──────────┴─────────┴──────────┘
-     │  pause
-     └─ (cancel task,
-        keep resumable)
-```
-
-There is no `interrupted` state. Records are in-memory only, so a process restart simply drops every record — there are no orphans to recover and nothing to flip.
-
-Transitions:
-
-| From → To | Trigger |
-|---|---|
-| `pending → running` | semaphore slot frees, worker task started |
-| `pending → cancelled` | cancel before run started |
-| `running → succeeded` | worker returns normally |
-| `running → failed` | worker raises (other than `CancelledError`) |
-| `running → cancelled` | `cancel` issued; `asyncio.Task.cancel()`, `CancelledError` reaches worker |
-| `running → paused` | `pause` issued; same task cancellation, but marked resumable |
-| `paused → running` | `resume` called; a fresh `run()` task is started |
-| `paused → cancelled` | cancel from paused state |
-| `succeeded / failed / cancelled → (deleted)` | explicit DELETE |
-
-`pending → paused` is not allowed (pausing a not-yet-started job = cancel + recreate). `pause` and `cancel` both cancel the supervising task; they differ only in the resulting state and whether resume is permitted.
-
-## 4. Pause / resume semantics
-
-Non-cooperative and checkpoint-free. Pause is task cancellation; resume is a fresh `run()`. The worker's idempotency is what makes this safe — on resume it reads source-of-truth Kiln entities and continues from wherever the project state left off.
-
-**Pause flow.**
-1. Client calls `POST /api/jobs/{id}/pause`. The registry responds `202` only after steps 2–3 below have settled (see §5).
-2. Registry calls `asyncio.Task.cancel()` on the supervising task. The worker receives `CancelledError` at its next `await`; it should finish or unwind its current atomic unit so the project is left consistent.
-3. Once the task has settled, the registry calls `compute_state(params)` to record the true progress as of the pause (rather than the last reported delta), transitions `running → paused`, and emits an event. (Distinguished from `cancel` only by the target state.)
-
-**Resume flow.**
-1. Client calls `POST /api/jobs/{id}/resume`. Registry returns `202`.
-2. Registry calls `compute_state(params)` to re-seed the progress snapshot. If it reports `is_complete`, the job goes straight to `succeeded` without re-running. Otherwise the registry schedules a new task.
-3. The registry calls `run(params, ctx)` again — there is no separate `resume()` method and no checkpoint is handed in.
-4. The worker calls `compute_state()` itself to determine what is already done and continues. A re-run must not duplicate completed work (idempotency contract, §2).
-
-Workers that don't support pause: `supports_pause = False`. Pause endpoint returns `409 Conflict`. Cancel still works (it's terminal and doesn't require re-runnability).
-
-## 5. REST API
-
-All endpoints live under `/api/jobs`. Authentication piggybacks on whatever the local server uses today (don't introduce a new scheme).
-
-| Method | Path | Body | Response | Notes |
-|---|---|---|---|---|
-| `POST` | `/api/jobs/{type}` | `{ params: <type-specific> }` | `201 { job_id, status }` | `type` must be registered. `params` validated against `params_model`. Job starts as `pending`, runs as soon as semaphore allows. |
-| `GET` | `/api/jobs` | — | `200 [ <record>, ... ]` | Filters: `?status=`, `?type=`, `?project_id=`, `?since=<iso8601>`, `?limit=`. Default sort: `created_at desc`. |
-| `GET` | `/api/jobs/{id}` | — | `200 <record>` | 404 if unknown. Recomputes status via the worker's `compute_state` (source of truth), reconciles the in-memory snapshot, and emits a `job` event if it changed before returning. |
-| `GET` | `/api/jobs/{id}/result` | — | `200 <result summary>` | 404 if not terminal or no result. Returns the small in-memory summary; detail lives in the Kiln entities the job wrote. |
-| `GET` | `/api/jobs/{id}/errors` | — | `200 [ { "error_message": "...", ... }, ... ]` | Error log for the job's current `run_id` (§1.1). Optional `?run_id=<uuid>` for a specific past run. **Always `200`; returns `[]` if the file is missing/unreadable** — never errors. |
-| `POST` | `/api/jobs/{id}/pause` | — | `202` / `409` | 409 if not running, or worker doesn't support pause. |
-| `POST` | `/api/jobs/{id}/resume` | — | `202` / `409` | 409 if not paused. |
-| `POST` | `/api/jobs/{id}/cancel` | — | `202` / `409` | 409 if already terminal. Idempotent for `pending`. |
-| `DELETE` | `/api/jobs/{id}` | — | `204` / `409` | 409 if still in-flight. Drops the in-memory record and best-effort removes the run's error log file(s). |
-| `GET` | `/api/jobs/events` | — | `200 text/event-stream` | SSE; see §6. |
-
-All state-changing endpoints (pause/resume/cancel) return `202 Accepted` once the transition has settled. For pause and cancel this means the handler awaits the supervising task's cancellation/cleanup before responding, so the slot is reclaimed and the terminal result is recorded deterministically (no lost cancellation, no double-release). For our current workers cleanup lands at the next `await`, so this is effectively instant; a future worker with slow cancel-cleanup would hold the connection for that cleanup. The resulting state is also published via the SSE stream for any observers.
-
-Error envelopes follow the existing local-server convention (`{ "detail": "..." }`).
-
-## 6. SSE stream
-
-`GET /api/jobs/events?job_id=&type=&project_id=` — all filters optional, combinable.
-
-**The stream is a pure observer — jobs run independently of it.** This is the critical difference from today's eval flow. The existing blocking `run_comparison` SSE endpoint runs the eval *inside the request*, so `CancellableStreamingResponse` cancelling on client disconnect also cancels the eval. Here, the job is a supervising task owned by the registry; the SSE endpoint only subscribes to the event bus and forwards snapshots. A client disconnecting (closing the tab, even quitting the whole web UI) must tear down **only** the subscription — never the job. Jobs keep running, and a later reconnect resyncs via the `snapshot` event. The *only* things that stop a job are explicit `POST /api/jobs/{id}/cancel` or `/pause`.
-
-Implementation: reuse the `CancellableStreamingResponse` pattern from `Kiln/app/desktop/studio_server/eval_api.py`, but scope its cancellation to the **subscription generator** (unsubscribe from the bus, stop the keepalive) — do not let it reach into any job task. Don't create the supervising task inside the request handler; it lives in the registry, created at `create`/`resume`, with a lifetime decoupled from any HTTP connection.
-
-Events are **idempotent snapshots, not deltas.** Every per-job event carries the full current record; the client keeps a map keyed by `id` and upserts. There is no `from`/`to` transition payload to apply in order — a client that drops or reorders events still converges as long as it processes the latest snapshot per id. A snapshot reflects the registry's *believed* state at emit time and may briefly lag the worker's true state (e.g. under concurrent edits); the worker owns its own consistency.
-
-Event types:
-
-```
-event: snapshot
-data: { "jobs": [ <record>, ... ] }
-```
-Sent once on connect with the full current set of jobs matching the filter. Lets the UI sync without a parallel GET.
-
-```
-event: job
-data: <record>
-```
-Emitted on every change to a single job — creation, status transition, and progress update all use this one event, each carrying the complete record (including the latest `status` and `progress` with its `success`/`error` counts). The registry may coalesce rapid progress updates before emitting so a 500-item eval doesn't flood subscribers. Error *messages* are not streamed — the snapshot carries only the `error` count; clients fetch messages on demand via `GET /api/jobs/{id}/errors`.
-
-```
-event: deleted
-data: { "id": "j_..." }
-```
-A tombstone — the only non-snapshot event, since a deleted record has no state to send.
-
-One stream serves the sidebar badge, jobs panel, and any future in-chat widget. Clients reconnect on disconnect; the fresh `snapshot` event resyncs them. No need for `Last-Event-ID` replay in v1 — snapshots are self-healing.
-
-Why SSE over Socket.IO: matches every other streaming endpoint in the codebase (chat, eval, calibration); no new dependency; no client-to-server streaming need.
-
-## 7. Worker implementations
-
-### Reference: `NoopJob` (validation / smoke test)
-
-```python
-class NoopJobParams(BaseModel):
-    steps: int = 10
-    sleep_per_step_seconds: float = 0.5
-    fail_at_step: int | None = None         # fatal: raises (tests the failed path)
-    error_at_steps: list[int] = []          # non-fatal: logs an error, keeps going
-
-class NoopJobResult(BaseModel):
-    completed_steps: int
-
-class NoopJobWorker(JobWorker[NoopJobParams, NoopJobResult]):
-    type_name = "noop"
-    params_model = NoopJobParams
-    result_model = NoopJobResult
-    supports_pause = True
-
-    async def compute_state(self, params):
-        return None  # no backing entity — registry keeps the believed snapshot
-
-    async def run(self, params, ctx):
-        success = error = 0
-        for i in range(params.steps):
-            await asyncio.sleep(params.sleep_per_step_seconds)
-            if params.fail_at_step == i:
-                raise RuntimeError(f"intentional fail at step {i}")
-            if i in params.error_at_steps:
-                error += 1
-                await ctx.report_error(f"intentional error at step {i}", step=i)
-            else:
-                success += 1
-            await ctx.report_progress(
-                success=success,
-                error=error,
-                total=params.steps,
-                message=f"step {i+1}/{params.steps}",
-            )
-        return NoopJobResult(completed_steps=success + error)
-```
-
-`NoopJob` is the canary: it exercises pause / resume / cancel / error-log capture end to end without needing real LLM calls or `EvalRunner`. `error_at_steps` exercises the non-fatal `report_error` path (errors accumulate in the log and the `error` count without stopping the run); `fail_at_step` exercises the fatal path. It has **no** backing Kiln entity, so `compute_state` returns `None` and its `run()` simply restarts from step 0 on resume. That's an honest limitation of a source-of-truth-free fixture and is fine: the canary's purpose is to exercise lifecycle transitions and the error log, not work-skipping. Real workers derive their state instead of restarting.
-
-### `EvalJob` (first real consumer)
-
-```python
-class EvalJobParams(BaseModel):
-    project_id: str
-    task_id: str
-    eval_id: str
-    eval_config_id: str
-    run_config_id: str
-
-class EvalJobResult(BaseModel):
-    total: int
-    success: int
-    error: int
-    # just a summary — per-row results live in the eval run entity (source of truth)
-
-class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
-    type_name = "eval"
-    params_model = EvalJobParams
-    result_model = EvalJobResult
-    supports_pause = True   # EvalRunner is confirmed idempotent: collect_tasks excludes
-                            # already-run (eval_config, run_config, dataset_id) triples,
-                            # so pause (cancel) + resume (re-run) skips completed items
-                            # and writes no duplicates. See architecture.md open item #1.
-
-    async def compute_state(self, params):
-        # Source of truth: EvalRun entities, intersected with the eval-set filter so we
-        # count exactly the candidate set EvalRunner.collect_tasks would (open item #1).
-        in_filter_ids = dataset_ids_passing_eval_filter(params)          # task runs in the eval set
-        scored_ids    = scored_dataset_ids(params, params.run_config_id) # existing EvalRuns
-        success = len(scored_ids & in_filter_ids)
-        total   = len(in_filter_ids)
-        # Runtime errors aren't persisted as entities (a failed item simply isn't saved),
-        # so derived error is 0; the live error count comes from Progress.errors during run().
-        return JobDerivedState(total=total, success=success, error=0,
-                               is_complete=(success >= total))
-
-    async def run(self, params, ctx):
-        # EvalRunner.collect_tasks excludes already-scored items, so Progress counts only the
-        # REMAINING work (Progress.total = full − already_done, Progress.complete starts at 0).
-        # Add the already-done baseline so progress/result are on the full-set scale.
-        baseline = (await self.compute_state(params)).success
-        eval_runner = build_eval_runner(params)  # same construction as eval_api.py uses today
-        progress = None
-        async for progress in eval_runner.run():
-            await ctx.report_progress(
-                success=baseline + progress.complete,
-                error=progress.errors,
-                total=baseline + progress.total,   # baseline + remaining = full eval-set size
-            )
-        return EvalJobResult(
-            total=baseline + (progress.total if progress else 0),
-            success=baseline + (progress.complete if progress else 0),
-            error=(progress.errors if progress else 0),
-        )
-```
-
-`EvalRunner` is unchanged. Internally it still uses `AsyncJobRunner` for per-item parallelism. The translation is `Progress → JobContext.report_progress()` for counts. Capturing individual eval error *messages* via `report_error` depends on whether `EvalRunner` surfaces per-item failures (see open item #1); if it only exposes an error count, the messages endpoint stays empty for evals until that's wired up.
-
-The idempotency contract bears directly on this worker: a paused-then-resumed (or re-triggered) eval re-invokes `run()`, which re-invokes `EvalRunner.run()`. This is confirmed safe — `EvalRunner.collect_tasks` excludes already-run `(eval_config, run_config, dataset_id)` triples, so completed items are skipped and no duplicate `EvalRun` entities are written (architecture.md open item #1). Hence `supports_pause = True`.
-
-## 8. What's NOT in this spec
-
-- Full per-job log capture / streaming. Error *messages* are collected per run (§1.1) and fetched via `GET /api/jobs/{id}/errors`, but general stdout/stderr/log streaming is out — workers still use the standard logger for that.
-- Job dependencies / DAGs. One job, one task.
-- Retries at the job level. `AsyncJobRunner` already retries individual sub-tasks for workers that use it; whole-job retry is the caller's problem (or a future feature).
-- Cross-project listings. Records carry `project_id`; the SSE/list endpoints can filter by it, but there's no global "all jobs everywhere" view.
-- Multi-machine / remote execution. All jobs are local asyncio tasks. Cloud Run is GEPA's path and isn't generalized here.
-- Pre-run approval / authorization gates. The endpoints follow whatever auth the local server has; no new approval scheme.
diff --git a/specs/projects/background_job_system/implementation_plan.md b/specs/projects/background_job_system/implementation_plan.md
deleted file mode 100644
index 77f163bf6..000000000
--- a/specs/projects/background_job_system/implementation_plan.md
+++ /dev/null
@@ -1,14 +0,0 @@
----
-status: complete
----
-
-# Implementation Plan: Background Job System
-
-Derived from the "Quick start" section of the original spec, lightly re-split so each phase is one CR-sized chunk.
-
-## Phases
-
-- [x] Phase 1: Core layer + NoopJob (no HTTP yet) — `models.py` (incl. `JobDerivedState`, `JobProgress` with success/error counts), `registry.py` (in-memory index, semaphore, supervising tasks, lifecycle, per-run `run_id`, `compute_state` reconciliation at transitions/status reads), `events.py`, `error_log.py` (append/read/delete by `run_id` in the OS temp dir, all best-effort), `workers/noop.py`. No persistence layer for state. Verify the full lifecycle (`create / pause / resume / cancel / delete`) via Python tests against `NoopJobWorker`, including pause = task-cancel → `paused`, resume = fresh `run()`, reconciliation when `compute_state` returns `None`, and error-log capture (`error_at_steps` non-fatal + `fail_at_step` fatal), including graceful `[]` when the file is missing.
-- [x] Phase 2: REST API + SSE — `api.py` (FastAPI router, incl. `GET /api/jobs/{id}/errors`), wired into the local server alongside existing routes. Idempotent-snapshot events (`snapshot` / `job` / `deleted`). Reuse `CancellableStreamingResponse` from `eval_api.py`, but scope its cancellation to the subscription generator only. Verify via curl + the SSE stream against `NoopJob` — **including the decoupling test: start a long `NoopJob`, connect then disconnect the SSE stream, and confirm the job keeps running to completion (only explicit cancel/pause stops it).**
-- [x] Phase 3: `EvalJobWorker` — wraps existing `EvalRunner` unchanged, plus `compute_state` that counts `EvalRun`s with matching `task_run_config_id` (idempotency confirmed → `supports_pause = True`; see architecture open item #1). Wire `report_error` to per-item failures if `EvalRunner` surfaces them (open item #10; otherwise the `/errors` endpoint stays empty for evals in v1). `POST /api/jobs/eval` returns a job_id and runs in the background, alongside the legacy blocking eval-run SSE endpoint. Confirm progress (success/error counts) flows correctly.
-- [ ] Phase 4: Frontend — `jobs_store.ts` (subscribes to `/api/jobs/events`, upserts by id), `api.ts`, jobs panel at `/jobs`, sidebar badge component.
diff --git a/specs/projects/background_job_system/phase_plans/phase_1.md b/specs/projects/background_job_system/phase_plans/phase_1.md
deleted file mode 100644
index bfd98c1dd..000000000
--- a/specs/projects/background_job_system/phase_plans/phase_1.md
+++ /dev/null
@@ -1,186 +0,0 @@
----
-status: complete
----
-
-# Phase 1: Core layer + NoopJob (no HTTP)
-
-## Overview
-
-Build the in-memory core of the background job system inside a new package
-`app/desktop/studio_server/jobs/`. This phase delivers the data models, the
-worker contract, the in-process event bus, the per-run error log, and the
-`JobRegistry` singleton that owns the full job lifecycle. No FastAPI router and
-no SSE endpoint — those land in Phase 2. The only consumer wired up here is the
-`NoopJobWorker` fixture, which is exercised end-to-end by Python tests.
-
-The design follows `functional_spec.md` and `architecture.md` exactly:
-
-- Job records are ephemeral, in-memory only. No disk persistence of state.
-- Status/progress is reconciled against `worker.compute_state(params)` at every
-  lifecycle transition (start, pause, resume) and on `get`. `None` means keep
-  the believed snapshot.
-- The supervising `asyncio.Task` per running job is owned by the registry and
-  decoupled from any HTTP connection.
-- A fresh `run_id` (uuid4) is minted per `run()` invocation. Error messages
-  (`report_error` + the fatal exception of a failed run) spill to a best-effort
-  per-`run_id` JSON file in the OS temp dir.
-- Pause = `task.cancel()` -> `paused`; resume = a fresh `run()`. No
-  `interrupted` state, no checkpoints, no `resume()` method.
-
-## Steps
-
-1. `jobs/__init__.py` — empty package marker.
-
-2. `jobs/models.py` — pydantic v2 models and the worker contract.
-   - `JobStatus(str, Enum)`: `PENDING="pending"`, `RUNNING="running"`,
-     `PAUSED="paused"`, `SUCCEEDED="succeeded"`, `FAILED="failed"`,
-     `CANCELLED="cancelled"`. Add a `terminal` helper / set
-     `{SUCCEEDED, FAILED, CANCELLED}`.
-   - `JobProgress(BaseModel)`: `total: int | None = None`, `success: int = 0`,
-     `error: int = 0`, `message: str | None = None`,
-     `updated_at: datetime` (default factory utc now).
-   - `JobDerivedState(BaseModel)`: `total: int | None = None`, `success: int = 0`,
-     `error: int = 0`, `is_complete: bool = False`, `message: str | None = None`.
-   - `JobError(BaseModel)`: `error: str | None = None`,
-     `detail: dict | None = None` — small failure summary on the record.
-   - `JobRecord(BaseModel)`: fields per functional_spec §1 — `id`, `type`,
-     `status: JobStatus`, `run_id: str | None`, `progress: JobProgress`,
-     `params: dict`, `result: dict | None`, `error: JobError | None`,
-     `metadata: dict`, `project_id: str | None`, `supports_pause: bool`,
-     `created_at`, `updated_at`, `started_at: datetime | None`,
-     `ended_at: datetime | None`.
-   - `JobContext`: holds `job_id`, `run_id`, and references to the registry's
-     progress-reporting + error-logging callbacks. Async methods:
-     `report_progress(success, error=0, total=None, message=None)` and
-     `report_error(error_message, **extra)`. Implemented as a small class taking
-     two async callables so the registry can inject behavior without a circular
-     import.
-   - `JobWorker(Generic[TParams, TResult])`: classvars `type_name`,
-     `params_model`, `result_model`, `supports_pause: bool = False`. Methods
-     `async def compute_state(self, params) -> JobDerivedState | None` (default
-     returns `None`) and `async def run(self, params, ctx) -> TResult` (raises
-     `NotImplementedError`).
-
-3. `jobs/events.py` — in-process async pub/sub bus.
-   - `JobEvent` union shape: emit dataclass/pydantic events of kind
-     `snapshot` / `job` / `deleted`. Keep it simple: a small `JobEvent` model
-     with `event: Literal["snapshot","job","deleted"]` and a `data` payload.
-   - `JobEventBus`: holds a set of subscriber `asyncio.Queue`s. `subscribe()`
-     is an async generator / context that registers a queue, immediately yields
-     a `snapshot` event (built from a snapshot provider callback) filtered by
-     `job_id` / `type` / `project_id`, then yields subsequent matching events.
-   - `publish_job(record)` / `publish_deleted(job_id, project_id, type_name)`
-     fan out to all subscriber queues, applying each subscriber's filter.
-   - Filtering helper that matches a record against optional `job_id`, `type`,
-     `project_id`.
-   - Unsubscribe removes the queue (used by Phase 2's SSE teardown). For Phase 1
-     this is tested directly without HTTP.
-
-4. `jobs/error_log.py` — per-`run_id` best-effort error log.
-   - Dir: `{tempfile.gettempdir()}/kiln_jobs`. Path helper
-     `error_log_path(run_id)`.
-   - `append_error(run_id, entry: dict)` — JSON-lines append; create dir lazily;
-     swallow all exceptions.
-   - `read_errors(run_id) -> list[dict]` — read JSON-lines, skip unparsable
-     lines; missing/unreadable file -> `[]`. Never raises.
-   - `delete_errors(run_id)` — best-effort unlink; swallow exceptions.
-
-5. `jobs/registry.py` — `JobRegistry`.
-   - `__init__(max_concurrent: int | None = None)`: semaphore sized from arg or
-     env `KILN_JOBS_MAX_CONCURRENT` (default 10); in-memory
-     `dict[str, JobRecord]`; `dict[str, JobWorker]` type map;
-     `dict[str, asyncio.Task]` supervising tasks; FIFO `pending` queue of job
-     ids; a `JobEventBus`.
-   - `register_type(worker_cls)`: instantiate and index by `type_name`.
-   - `_new_job_id()`: `j_` + 12 lowercase base32 chars (from `secrets`/`uuid4`
-     bytes, mapped to `abcdefghijklmnopqrstuvwxyz234567`).
-   - `create(type_name, params, project_id=None, metadata=None) -> JobRecord`:
-     validate params against `params_model`, build a `pending` record stamped
-     with `supports_pause`, enqueue, emit a `job` event, then try to start
-     pending jobs (respecting the semaphore). Returns the record.
-   - `_try_start_pending()`: while semaphore slots available and FIFO queue
-     non-empty, pop next still-`pending` job and launch its supervising task.
-   - `_launch(job)`: mint `run_id`, set `running` + `started_at`, reconcile via
-     `compute_state` (if `is_complete` -> straight to `succeeded`), emit, then
-     create the supervising `asyncio.Task` running `_supervise`.
-   - `_supervise(job_id, params)`: acquire semaphore inside the task; build a
-     `JobContext`; call `worker.run`; on normal return set `succeeded` + store
-     result summary; on `CancelledError` honor the pending intent (pause ->
-     `paused` after `compute_state` reconcile, else `cancelled`); on other
-     exception set `failed`, append the exception to the error log, store a
-     `JobError`. Always release the slot and kick `_try_start_pending`.
-   - Progress callback: `report_progress` updates the record's `JobProgress`
-     and emits a `job` event (coalescing is a Phase-2 SSE concern; Phase 1 emits
-     per call). `report_error` callback writes to the error log via
-     `error_log.append_error(run_id, {...})`.
-   - `pause(job_id)`: only valid for `running` + `supports_pause`; flag intent
-     `paused`, cancel the task. (Not-running or not-pausable raises a clear
-     error -> Phase 2 maps to 409.)
-   - `resume(job_id)`: only valid for `paused`; reconcile via `compute_state`
-     (if `is_complete` -> `succeeded`), else set back to `pending`/enqueue and
-     `_try_start_pending` (fresh `run()` / fresh `run_id`).
-   - `cancel(job_id)`: `pending` -> `cancelled` immediately (dequeue);
-     `running`/`paused` -> flag intent `cancelled`, cancel task; terminal ->
-     raise.
-   - `delete(job_id)`: terminal only (else raise); drop record, best-effort
-     delete error-log file for its `run_id`, emit a `deleted` event.
-   - `get(job_id) -> JobRecord | None`: reconcile via `compute_state` and emit
-     `job` if changed, then return the record.
-   - `list_jobs(status=None, type=None, project_id=None, since=None, limit=None)`:
-     filter + sort `created_at desc`.
-   - `_reconcile(job, derived)`: when `derived` is not `None`, update progress
-     counts/total/message and, if `is_complete` on a non-terminal job, mark
-     `succeeded`. Returns whether anything changed.
-   - Reconciliation correctly keeps the believed snapshot when `compute_state`
-     returns `None` (the Noop case).
-   - Provide a module-level `job_registry` singleton plus the class so tests can
-     instantiate fresh isolated registries.
-
-6. `jobs/workers/__init__.py` — package marker.
-
-7. `jobs/workers/noop.py` — `NoopJobParams`, `NoopJobResult`, `NoopJobWorker`
-   exactly per functional_spec §7 (`steps`, `sleep_per_step_seconds`,
-   `fail_at_step`, `error_at_steps`; `compute_state` -> `None`; `run` reports
-   success/error counts and calls `report_error` for `error_at_steps`).
-
-## Tests
-
-Tests live in `app/desktop/studio_server/jobs/` as `test_*.py`, async style
-(`@pytest.mark.asyncio`), using fresh `JobRegistry` instances and a short
-`sleep_per_step_seconds` for speed. Helper to poll until a job reaches a target
-status with a timeout.
-
-- `test_error_log.py`
-  - append + read round-trips a list of entries; entries preserve `**extra`.
-  - missing file -> `[]`; unreadable/garbage lines skipped -> partial list.
-  - delete removes the file; delete of missing file is a no-op.
-- `test_events.py`
-  - subscribe yields an initial `snapshot` containing current jobs.
-  - a subsequent `publish_job` is delivered as a `job` event.
-  - `publish_deleted` delivers a `deleted` tombstone with the id.
-  - filtering by `project_id` / `type` / `job_id` excludes non-matching events
-    and scopes the snapshot.
-- `test_registry.py`
-  - full lifecycle: create -> running -> succeeded; `result.completed_steps`
-    equals `steps`; `started_at`/`ended_at` populated.
-  - failure path: `fail_at_step` -> `failed`; `error` summary set; the fatal
-    exception is captured in the error log for the run.
-  - cancel from pending (job never started) -> `cancelled`, no task.
-  - cancel from running -> `cancelled`.
-  - pause running -> `paused`; resume -> running -> succeeded; a fresh `run_id`
-    is minted on resume (differs from the first run).
-  - pause rejected when `supports_pause = False` (use a tiny non-pausable test
-    worker) and when not running.
-  - delete on terminal succeeds and emits `deleted`; delete while running/pending
-    raises.
-  - error-log capture: `error_at_steps` entries are readable via the run's
-    `run_id` and the progress `error` count matches; missing file -> `[]`.
-  - `compute_state` returning `None` keeps the believed snapshot (Noop never
-    flips to complete early; progress comes from `report_progress`).
-  - `compute_state` returning `is_complete=True` (test worker) reconciles a job
-    to `succeeded` without running real work.
-  - semaphore caps concurrency: with `max_concurrent=2` and 4 long jobs, exactly
-    2 run while the other 2 stay `pending` (FIFO); as the first finish, pending
-    ones start.
-  - registry emits bus events: subscribing then creating/finishing a job yields
-    `snapshot` + `job` events; deleting yields `deleted`.
diff --git a/specs/projects/background_job_system/phase_plans/phase_2.md b/specs/projects/background_job_system/phase_plans/phase_2.md
deleted file mode 100644
index 94ee0c555..000000000
--- a/specs/projects/background_job_system/phase_plans/phase_2.md
+++ /dev/null
@@ -1,169 +0,0 @@
----
-status: complete
----
-
-# Phase 2: REST API + SSE
-
-## Overview
-
-Phase 1 built the in-memory `JobRegistry` (lifecycle, semaphore, supervising
-tasks, reconciliation, per-run error log) plus the `NoopJobWorker`. Phase 2
-exposes that registry over HTTP without changing it: a FastAPI router
-(`api.py`) covering create / list / get / result / errors / pause / resume /
-cancel / delete, plus an SSE stream (`/api/jobs/events`).
-
-The load-bearing requirement is SSE decoupling: the stream is a pure observer
-of the Phase 1 event bus. A client disconnect tears down only the subscription
-(unsubscribe + stop keepalive); it must never cancel, pause, or otherwise touch
-a job's supervising task. Jobs keep running; only explicit `cancel`/`pause`
-stops them.
-
-Follows functional_spec §5 (REST) and §6 (SSE) exactly. Paths are `/api/jobs/...`
-(not project-scoped). Auth mirrors the studio convention (`openapi_extra`
-policy constants, no FastAPI auth dependency). Error envelope is the existing
-convention (`HTTPException(detail=...)`).
-
-## Steps
-
-1. **`app/desktop/studio_server/jobs/api.py`** — new module exposing the
-   process-singleton `job_registry` over HTTP via `connect_jobs_api(app: FastAPI)`.
-
-   - Request/response models:
-     - `CreateJobRequest(BaseModel)`: `params: dict[str, Any]`,
-       `metadata: dict[str, Any] | None = None`. (`project_id` is derived from
-       params when the params model carries one, not from the request body.)
-     - `CreateJobResponse(BaseModel)`: `job_id: str`, `status: JobStatus`.
-   - Helper `_project_id_from_params(worker, validated_params) -> str | None`:
-     returns `getattr(validated, "project_id", None)` so eval jobs get a
-     `project_id` and noop jobs get `null`. (Open item #2/#3: plain optional
-     filter, no server-side active project.)
-   - Helper `_record_json(record: JobRecord) -> dict`: `record.model_dump(mode="json")`.
-
-   Route ordering (declared before the `{id}`/`{type}` catch-alls so they are
-   not shadowed):
-   - `GET /api/jobs/events` — SSE (declared first).
-   - `GET /api/jobs` — list with filters.
-   - Then the dynamic routes. POST uses `{type}`; GET/DELETE use `{id}`. They do
-     not collide: they share one path shape but differ by HTTP method
-     (`POST /api/jobs/{type}` vs `GET /api/jobs/{id}` etc.), and the sub-action
-     routes (`/{id}/result`, `/{id}/errors`, `/{id}/pause|resume|cancel`) have
-     an extra path segment.
-
-   Endpoints:
-   - `POST /api/jobs/{type}` (`openapi_extra=ALLOW_AGENT`): validate the type is
-     registered (unknown type → 404) and `params`
-     against `params_model` (pydantic `ValidationError` → 422). Derive
-     `project_id`. `await job_registry.create(...)`. Return
-     `201 CreateJobResponse`.
-     - Unknown type → 404. Implementation: check `type in registry workers`
-       before validating; raise `HTTPException(404)`.
-     - Invalid params → 422 (raise `RequestValidationError`/`HTTPException(422)`
-       from the caught pydantic `ValidationError`).
-   - `GET /api/jobs` (`ALLOW_AGENT`): query params `status`, `type`,
-     `project_id`, `since` (iso8601 datetime), `limit` (int). Maps to
-     `registry.list_jobs(...)`. Returns `200 list[JobRecord]` (serialized),
-     default sort `created_at desc` (registry already does this).
-   - `GET /api/jobs/{id}` (`ALLOW_AGENT`): `await registry.get(id)` (reconciles +
-     emits). 404 if `None`. Returns `200 <record>`.
-   - `GET /api/jobs/{id}/result` (`ALLOW_AGENT`): get record (no reconcile
-     needed beyond `get`); 404 if unknown, 404 if not terminal or `result is
-     None`. Returns `200 <result dict>`.
-   - `GET /api/jobs/{id}/errors` (`ALLOW_AGENT`): optional `run_id` query.
-     Resolve the run_id (query param if given, else the record's current
-     `run_id`). ALWAYS `200`. Returns `error_log.read_errors(run_id)` or `[]`
-     (also `[]` when the job is unknown or has no run_id — never errors).
-   - `POST /api/jobs/{id}/pause` (mutation policy mirroring eval mutations →
-     `agent_policy_require_approval(...)`): `await registry.pause(id)`;
-     `JobNotFoundError` → 404, `JobOperationError` → 409. Return `202` (empty
-     body, `status_code=202`).
-   - `POST /api/jobs/{id}/resume`: same pattern, `registry.resume`. 202 / 404 / 409.
-   - `POST /api/jobs/{id}/cancel`: same pattern, `registry.cancel`. 202 / 404 / 409.
-   - `DELETE /api/jobs/{id}`: `await registry.delete(id)`; 404 / 409. Return
-     `204` (`status_code=204`, no body).
-   - `GET /api/jobs/events` (`ALLOW_AGENT`): query `job_id`, `type`, `project_id`.
-     Returns `CancellableStreamingResponse(content=_event_stream(...),
-     media_type="text/event-stream")`.
-
-   SSE generator `_event_stream(job_id, type_name, project_id)`:
-   - `subscription = job_registry.events.subscribe(job_id, type_name, project_id)`.
-   - Loop: `event = await asyncio.wait_for(subscription.__anext__(), timeout=KEEPALIVE_SECONDS)`;
-     on success `yield _format_sse(event)`; on `asyncio.TimeoutError` `yield ": ping\n\n"`.
-   - `finally: await subscription.aclose()` (unsubscribe via the generator's
-     `finally`). Cancelling the generator (client disconnect via
-     `CancellableStreamingResponse`) only closes the subscription — the registry
-     and its supervising tasks are untouched.
-   - `_format_sse(event: JobEvent) -> str`: `f"event: {event.event}\n"` +
-     `f"data: {json.dumps(event.data)}\n\n"` (matches the `event:`/`data:` wire
-     format; snapshot/job/deleted carry their `data` dict as built by the bus).
-   - `KEEPALIVE_SECONDS = 15` (open item #9).
-
-2. **Wire into `desktop_server.py`** — add `connect_jobs_api(app)` in
-   `make_app()` alongside the other `connect_*_api(app)` calls, before
-   `connect_webhost(app)` (which stays last). The `connect_jobs_api` function
-   registers `NoopJobWorker` on the singleton `job_registry` (idempotent: guard
-   against double-registration of the same type so repeated `make_app()` calls
-   in tests don't error). Do NOT register `EvalJobWorker` (Phase 3). The
-   registry creates asyncio tasks lazily inside `create`, which runs within a
-   request's running loop, so no special lifespan startup is needed (registration
-   is pure dict mutation, loop-safe).
-
-3. **Regenerate the OpenAPI client schema** — after the API is in, run
-   `app/web_ui/src/lib/generate_schema.sh` so `api_schema.d.ts` reflects the new
-   endpoints and `check_schema.sh` passes. Leave the regenerated file in the
-   working tree (do not commit).
-
-## Tests
-
-`app/desktop/studio_server/jobs/test_api.py` using FastAPI `TestClient` (sync
-endpoints) and `httpx.AsyncClient` + `ASGITransport` for the streaming
-decoupling test. A fresh `JobRegistry` is patched in per test (module-level
-`job_registry` reference) so tests are isolated; `NoopJobWorker` registered.
-`temp_error_log_dir` autouse fixture (monkeypatch tempdir) mirrors
-`test_registry.py`.
-
-- `test_create_returns_201_and_pending` — `POST /api/jobs/noop` with valid
-  params returns 201, body has `job_id` + `status` in {pending, running}.
-- `test_create_unknown_type_404` — `POST /api/jobs/nope` → 404.
-- `test_create_invalid_params_422` — `POST /api/jobs/noop` with `steps:"abc"` → 422.
-- `test_list_empty` — `GET /api/jobs` → 200 `[]`.
-- `test_list_returns_jobs_sorted_desc` — create two jobs, list returns newest first.
-- `test_list_filter_by_status_and_type` — filters narrow results.
-- `test_list_filter_by_project_id` — only matching project_id returned (uses a
-  worker whose params carry project_id, or asserts noop → null filtered out).
-- `test_list_since_and_limit` — `since` excludes older, `limit` caps count.
-- `test_get_returns_record` — `GET /api/jobs/{id}` → 200 with full record.
-- `test_get_unknown_404` — `GET /api/jobs/j_missing` → 404.
-- `test_get_reconciles` — a worker whose compute_state flips to complete is
-  reconciled to succeeded on GET (mirrors registry reconcile test via a stub
-  worker registered on the test registry).
-- `test_result_returns_200_when_terminal` — succeeded noop → 200 result dict
-  `{"completed_steps": n}`.
-- `test_result_404_when_not_terminal` — running job → 404.
-- `test_result_404_unknown` — unknown id → 404.
-- `test_errors_returns_array` — job with `error_at_steps` → 200 list of error
-  objects with `error_message`.
-- `test_errors_empty_when_none` — succeeded clean job → 200 `[]`.
-- `test_errors_unknown_job_returns_empty_200` — unknown id → 200 `[]` (never 404).
-- `test_errors_specific_run_id` — `?run_id=` reads that run's log.
-- `test_pause_then_resume` — pause running → 202, status paused; resume → 202.
-- `test_pause_409_when_not_running` — pause terminal → 409.
-- `test_pause_409_when_unsupported` — non-pausable worker → 409.
-- `test_resume_409_when_not_paused` — resume running → 409.
-- `test_cancel_202` — cancel running → 202, becomes cancelled.
-- `test_cancel_409_when_terminal` — cancel succeeded → 409.
-- `test_cancel_unknown_404` — cancel unknown → 404.
-- `test_delete_204_when_terminal` — delete succeeded → 204, gone from list.
-- `test_delete_409_when_in_flight` — delete running → 409.
-- `test_delete_unknown_404` — delete unknown → 404.
-- SSE:
-  - `test_sse_snapshot_then_job_event` — async client streams `/api/jobs/events`,
-    first event is `snapshot` (empty), then create a noop and observe a `job`
-    event carrying the record.
-  - `test_sse_disconnect_leaves_job_running` (DECOUPLING) — start a long noop,
-    connect + read snapshot/a job event, disconnect the stream mid-run, then
-    assert via the registry that the job continues and reaches succeeded. Proves
-    the stream is a pure observer.
-  - `test_sse_filters_by_job_id` — subscribing with `?job_id=` only sees that
-    job's events.
-- `test_connect_jobs_api_registers_noop_idempotently` — calling
-  `connect_jobs_api` twice does not raise (guard) and registers noop.
diff --git a/specs/projects/background_job_system/phase_plans/phase_3.md b/specs/projects/background_job_system/phase_plans/phase_3.md
deleted file mode 100644
index 5e4142d8a..000000000
--- a/specs/projects/background_job_system/phase_plans/phase_3.md
+++ /dev/null
@@ -1,133 +0,0 @@
----
-status: draft
----
-
-# Phase 3: EvalJobWorker (first real consumer)
-
-## Overview
-
-Add the first real background worker, `EvalJobWorker`, that wraps the existing
-`EvalRunner` unchanged and plugs it into the Phase 1/2 job system. The worker:
-
-- Derives true progress from source-of-truth `EvalRun` entities via
-  `compute_state` (a pure read), so resume/re-trigger reconciles honestly.
-- Runs the eval in the background by streaming `EvalRunner.run()`'s `Progress`
-  yields into `ctx.report_progress`, returning a small `EvalJobResult` summary.
-- Advertises `supports_pause = True` because `EvalRunner.collect_tasks_for_task_run_eval`
-  excludes already-run `(eval_config, run_config, dataset)` triples — cancel +
-  re-run skips completed items and writes no duplicate `EvalRun`s (architecture
-  open item #1, CONFIRMED).
-
-No new endpoint is needed: the generic `POST /api/jobs/{type}` from Phase 2
-dispatches to it once `EvalJobWorker` is registered alongside `NoopJobWorker`.
-
-## Key design decisions (verified against current code)
-
-- **`save_context = None` (KNOWN OPEN ITEM — not equivalent to the request path
-  for git-sync-enabled projects).** `build_save_context(request)` reads
-  `request.state.git_sync_manager` and returns `None` only when git sync isn't
-  active; when it IS active it returns a context that wraps each save in
-  `manager.atomic_write(...)` (commit + push). A background worker has no request
-  and passes `save_context=None`, so `EvalRunner` falls back to
-  `default_save_context` (a no-op). This is identical to the request path ONLY
-  for projects that do NOT have git sync in `auto` mode. For a git-sync-enabled
-  project, background-eval `EvalRun` writes do NOT participate in request-scoped
-  git-sync: they are written to disk but are NOT committed or pushed by the job,
-  unlike the legacy SSE eval endpoint under `/api/projects/...` (which goes
-  through `GitSyncMiddleware` + `build_save_context`). The uncommitted writes sit
-  dirty in the working tree until the next write-locked request triggers
-  `GitSyncManager.ensure_clean()`, whose crash-recovery path stashes dirty files
-  (and hard-resets unpushed commits) — so the background-eval results can be
-  swept out of the working tree into a stash with no UI to recover them, and are
-  never backed up to the remote. We are keeping `save_context=None` for v1; this
-  is a known open item pending a design decision (do not treat it as safe/
-  equivalent for git-sync projects).
-- **Entity loading.** Reuse `eval_config_from_id` / `task_run_config_from_id`
-  from `eval_api.py`. They take only string IDs (resolve the project via
-  `project_from_id` → `task_from_id`), need no `Request`, and raise
-  `HTTPException(404)` on missing entities. In `run()` that surfaces as a normal
-  exception → the registry marks the job `failed` (acceptable). `compute_state`
-  loads the same way; a missing entity there propagates out of reconciliation
-  (the registry only swallows `None`, not exceptions) so the failure is visible
-  rather than silently treated as "no progress".
-- **`compute_state` counts.** `total` = task runs matching
-  `dataset_filter_from_id(eval.eval_set_filter_id)`. `success` = `EvalRun`
-  children of the eval_config whose `task_run_config_id == run_config_id`.
-  `error = 0` — failed items aren't persisted as entities; the live error count
-  comes from `Progress.errors` during the run only. `is_complete = success >= total`.
-- **Errors (open item #10).** `Progress` exposes only an error *count*, not
-  per-item messages, so `report_progress(error=...)` carries the count and the
-  `/errors` endpoint stays empty for evals in v1. No `report_error` wiring and
-  no change to `EvalRunner`.
-
-## Steps
-
-1. Add `app/desktop/studio_server/jobs/workers/eval.py`:
-
-   ```python
-   class EvalJobParams(BaseModel):
-       project_id: str
-       task_id: str
-       eval_id: str
-       eval_config_id: str
-       run_config_id: str
-
-   class EvalJobResult(BaseModel):
-       total: int
-       success: int
-       error: int
-
-   class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
-       type_name = "eval"
-       params_model = EvalJobParams
-       result_model = EvalJobResult
-       supports_pause = True
-
-       async def compute_state(self, params) -> JobDerivedState: ...
-       async def run(self, params, ctx) -> EvalJobResult: ...
-   ```
-
-   - A private `_build_eval_runner(params) -> EvalRunner` helper that loads the
-     eval_config + run_config and constructs
-     `EvalRunner(eval_configs=[eval_config], run_configs=[run_config],
-     eval_run_type="task_run_eval", save_context=None)` — mirroring
-     `run_eval_config` in `eval_api.py`.
-   - `compute_state` loads the eval_config (and its parent eval), counts
-     filtered task runs for `total`, counts matching `EvalRun`s for `success`,
-     returns `JobDerivedState(total, success, error=0, is_complete=success>=total)`.
-   - `run` builds the runner, iterates `async for progress in eval_runner.run():`
-     calling `await ctx.report_progress(success=progress.complete,
-     error=progress.errors, total=progress.total)`, and returns
-     `EvalJobResult(total, success, error)` from the last `progress`.
-     `EvalRunner.run()` always yields at least an initial `Progress`, so a
-     `last_progress` is guaranteed; default to a zero summary defensively.
-
-2. Register the worker in `connect_jobs_api` (`api.py`) next to
-   `NoopJobWorker`: `job_registry.register_type(EvalJobWorker)`.
-
-3. Verify the OpenAPI schema is unchanged (no new route — the generic create
-   route already exists) via `check_schema.sh`.
-
-## Tests
-
-`app/desktop/studio_server/jobs/workers/test_eval.py`, mirroring the entity
-fixtures from `test_eval_api.py` / `test_eval_runner.py` (Project/Task/Eval/
-EvalConfig/TaskRunConfig/TaskRun in `tmp_path`, pre-seeded `EvalRun`s), patching
-`project_from_id` so the on-disk project resolves.
-
-- `compute_state` with no `EvalRun`s: `total` = number of filtered task runs,
-  `success = 0`, `error = 0`, `is_complete = False`.
-- `compute_state` counts already-scored items: seed `EvalRun`s with matching
-  `task_run_config_id`; `success` equals the seeded count; `is_complete` flips
-  true only when `success >= total`.
-- `compute_state` ignores `EvalRun`s with a different `task_run_config_id`
-  (doesn't over-count).
-- `run` maps `Progress` → `report_progress` and returns the right
-  `EvalJobResult`: patch/stub `EvalRunner.run` to yield canned `Progress`
-  objects, assert the recorded `report_progress` calls and the returned result.
-- Idempotent re-run: seed some `EvalRun`s, run a real `EvalRunner` whose
-  `run_job` is stubbed to write an `EvalRun` per remaining item, assert only the
-  not-yet-scored items are processed and no duplicate `EvalRun`s are written.
-- End-to-end via the registry: `registry.register_type(EvalJobWorker)`,
-  `registry.create("eval", params)` with `EvalRunner.run` stubbed, drive to
-  `succeeded`, assert the final `result` summary and progress counts.
diff --git a/specs/projects/background_job_system/phase_plans/phase_4.md b/specs/projects/background_job_system/phase_plans/phase_4.md
deleted file mode 100644
index 0d531286c..000000000
--- a/specs/projects/background_job_system/phase_plans/phase_4.md
+++ /dev/null
@@ -1,200 +0,0 @@
----
-status: draft
----
-
-# Phase 4: Frontend — jobs store, REST client, jobs panel, sidebar badge
-
-## Overview
-
-The final phase. Phases 1–3 built the in-memory `JobRegistry`, the `/api/jobs`
-REST + SSE surface, and the first real worker (`EvalJobWorker`). This phase is the
-Svelte UI that consumes that surface:
-
-- A store backed by the `GET /api/jobs/events` SSE stream that holds a live,
-  keyed `Map<id, JobRecord>`, handling the three named events (`snapshot`,
-  `job`, `deleted`) per functional_spec §6.
-- A thin typed REST client over the generated OpenAPI `client` for the
-  create/list/get/result/errors/pause/resume/cancel/delete endpoints.
-- A jobs panel at `/jobs` listing jobs with per-job lifecycle actions (only the
-  ones valid for the current status + `supports_pause`), plus drill-in for the
-  per-run error log and the result summary.
-- A small sidebar badge showing the count of active (`pending` / `running` /
-  `paused`) jobs, driven by the same store.
-- A nav entry into both the icon rail (`sidebar_rail.svelte`) and the wide
-  drawer sidebar (`(app)/+layout.svelte`).
-
-### Key design decisions (resolved from the integration map)
-
-- **Store location.** The repo's strong convention is `src/lib/stores/` (every
-  other store + its `*.test.ts` lives there). The spec *suggests* `lib/jobs/`.
-  We follow the repo: `jobs_store.ts` and `jobs_api.ts` live in
-  `src/lib/stores/`. (The architecture doc's `lib/jobs/` path is explicitly
-  "out of strict scope … the natural shape", so matching the repo wins.)
-- **SSE named events.** The jobs stream uses `event: snapshot|job|deleted`
-  (confirmed in `app/desktop/studio_server/jobs/api.py::_format_sse`). So we use
-  `addEventListener('snapshot'|'job'|'deleted', …)`, not the single `onmessage`
-  the extractor store uses.
-- **Pure observer.** The store opens one `EventSource`, reconnects on error, and
-  closes it only when the last subscriber unsubscribes (ref-counted). No job
-  action is ever tied to connection lifecycle. A fresh `snapshot` re-syncs the
-  map on reconnect (no `Last-Event-ID`).
-- **Project filter.** The store opens the stream with
-  `?project_id=$ui_state.current_project_id` when one is set; it re-opens the
-  stream when the active project changes (the badge / panel are project-scoped,
-  matching `?project_id=` list semantics). NoopJobs (no project) only show when
-  no project filter is active — acceptable; the panel is project-scoped.
-- **Reconnect URL is the schema path constant** but `EventSource` needs a raw
-  URL, so we build it from `base_url` (mirroring `extractor_progress_store`),
-  not the openapi-fetch `client` (which can't do SSE).
-
-## Steps
-
-1. **`src/lib/stores/jobs_api.ts`** — thin REST client. Re-export the generated
-   record type for convenience and wrap each endpoint:
-
-   ```ts
-   import { client } from "$lib/api_client"
-   import type { components } from "$lib/api_schema"
-
-   export type JobRecord = components["schemas"]["JobRecord"]
-   export type BackgroundJobStatus = components["schemas"]["BackgroundJobStatus"]
-   export type JobError = components["schemas"]["JobError"]
-
-   export async function list_jobs(query?: {...}): Promise<JobRecord[]>
-   export async function get_job(id: string): Promise<JobRecord>
-   export async function create_job(type, params, metadata?): Promise<{job_id, status}>
-   export async function get_job_result(id): Promise<Record<string, unknown>>
-   export async function get_job_errors(id, run_id?): Promise<Array<{error_message?: string} & Record<string, unknown>>>
-   export async function pause_job(id): Promise<void>
-   export async function resume_job(id): Promise<void>
-   export async function cancel_job(id): Promise<void>
-   export async function delete_job(id): Promise<void>
-   ```
-
-   Each unwraps `{ data, error }` from openapi-fetch and throws `error` when set
-   (so callers can wrap with `createKilnError`). Lifecycle calls (`pause` etc.)
-   return `void` (the backend returns `202`/`204` with no useful body).
-
-2. **`src/lib/stores/jobs_store.ts`** — the SSE-backed store.
-
-   - Internal `writable<Map<string, JobRecord>>`.
-   - `connect()`: builds `${base_url}/api/jobs/events` with optional
-     `?project_id=`, opens an `EventSource`, registers listeners:
-     - `snapshot`: `JSON.parse(data).jobs` → replace the whole map.
-     - `job`: `JSON.parse(data)` (a full `JobRecord`) → upsert by `id`.
-     - `deleted`: `JSON.parse(data).id` → delete by `id`.
-     - `onerror`: close + schedule a reconnect (small backoff); the next
-       `snapshot` re-syncs.
-   - Ref-counted lifecycle: `subscribe` increments a counter and `connect()`s on
-     first subscriber; the returned unsubscribe decrements and `disconnect()`s
-     (closes the `EventSource`, cancels any pending reconnect) when it hits zero.
-     Closing never touches a job — pure observer.
-   - Re-open on project change: subscribe to `ui_state`; when
-     `current_project_id` changes while connected, tear down and reconnect with
-     the new filter. (Implemented with an exposed `set_project(id)` the module
-     wires to `ui_state`, kept testable by allowing an injected project id.)
-   - Derived exports:
-     - `jobs`: a `Readable<JobRecord[]>` sorted by `created_at desc` (matches the
-       REST default sort) for the panel.
-     - `active_jobs_count`: a `Readable<number>` counting
-       `pending|running|paused` for the badge.
-   - Export an `ACTIVE_STATUSES` set and a helper `is_active(status)` so the
-     badge logic is unit-testable without the DOM.
-   - To make `EventSource` injectable for tests, read the constructor from
-     `globalThis.EventSource` at connect time (tests install a fake on
-     `globalThis`).
-
-3. **`src/lib/stores/job_status.ts`** (small helpers, colocated) — pure
-   functions used by both the panel and tests:
-   - `job_status_display(status)`: human label.
-   - `job_status_badge_class(status)`: DaisyUI badge color class
-     (`badge-info` running, `badge-success` succeeded, `badge-error` failed,
-     `badge-warning` paused, `badge-ghost` pending, neutral cancelled).
-   - `available_actions(job)`: returns which of
-     `pause|resume|cancel|delete` are valid given `status` + `supports_pause`,
-     per state machine (§3) + delete policy (open item #7: terminal only):
-     - `running`: cancel; pause iff `supports_pause`.
-     - `paused`: resume, cancel.
-     - `pending`: cancel.
-     - terminal (`succeeded|failed|cancelled`): delete.
-   - `progress_label(progress)`: `"{success} / {total}"` (+ error count when > 0),
-     and `progress_percent(progress)` for the bar.
-
-4. **`src/lib/components/SidebarJobsBadge.svelte`** — count bubble. Renders the
-   `active_jobs_count`; shows a small primary pill with the number when > 0,
-   nothing when 0. Designed to overlay the rail icon (absolute, top-right) and to
-   sit inline in the wide drawer. Accept a `count` prop (default: subscribe to
-   the store) so it's render-testable in isolation; expose a `variant`
-   (`rail` | `inline`) for placement styling.
-
-5. **`src/routes/(app)/jobs/+page.svelte`** — the panel. Uses `AppPage`
-   (`../../app_page.svelte`) with title "Jobs" and a short subtitle. Subscribes
-   to `jobs`. States:
-   - Loading: spinner until the first `snapshot` arrives (track a
-     `connected/received-snapshot` flag on the store).
-   - Empty: educational empty state (icon + heading + one-liner explaining that
-     background jobs like evals run here and keep running even if you navigate
-     away). No destructive CTA.
-   - List: a table (`bg-base-200` header, matching the app's table style) with
-     columns: Type, Status (colored badge), Progress (`success/total`, error
-     count, thin progress bar), Message, Created, and an Actions cell.
-     - Actions render only `available_actions(job)`; each calls the matching
-       `jobs_api` fn, wrapped in try/catch → toast/inline error. Optimistic UI
-       is unnecessary — the SSE event will reflect the real transition.
-     - "Errors" button (always available when `progress.error > 0` or status is
-       `failed`) opens a `Dialog` that lazy-loads `get_job_errors(id)` and lists
-       `error_message` rows; "Result" button (when terminal + has result) opens a
-       `Dialog` showing the result summary JSON in a `<pre>`.
-   - Use `formatDate` from `$lib/utils/formatters` for timestamps.
-
-6. **`src/lib/ui/section.ts`** — add `Jobs` to the `Section` enum.
-
-7. **`src/routes/(app)/sidebar_rail.svelte`** — add a `SidebarRailItem`
-   `href="/jobs"` with a briefcase/stack icon and an overlaid `SidebarJobsBadge`
-   (rail variant). Place it after Evals / before the optimize group.
-
-8. **`src/routes/(app)/+layout.svelte`** —
-   - Add the `/jobs` → `Section.Jobs` branch to the section reactive block.
-   - Add a wide-drawer `<li>` nav entry mirroring the Evals entry, with the
-     inline badge.
-   - Import `SidebarJobsBadge`.
-
-## Tests
-
-`src/lib/stores/jobs_store.test.ts` (jsdom, fake `EventSource` installed on
-`globalThis`):
-
-- **snapshot replace**: dispatch a `snapshot` with two jobs → `jobs` has both;
-  dispatch a second `snapshot` with one different job → map fully replaced.
-- **job upsert (insert)**: `job` event for a new id adds it.
-- **job upsert (status transition + progress update)**: `job` event for an
-  existing id with a new `status`/`progress` replaces the stored record (counts
-  reflect the latest snapshot, not accumulated).
-- **deleted removal**: `deleted` event removes the id; deleting an unknown id is
-  a no-op.
-- **reconnect re-sync**: trigger `onerror` → the fake records that `close()` was
-  called and a new `EventSource` is constructed after the backoff; a fresh
-  `snapshot` repopulates the map (stale entries from before are gone).
-- **active count derivation**: a mix of statuses → `active_jobs_count` counts
-  only `pending|running|paused`.
-- **pure-observer teardown**: last unsubscribe closes the `EventSource`; a job
-  action is never invoked by the store (assert no fetch/callback fired on close).
-- **project filter**: connecting with a project id builds the URL with
-  `?project_id=`; changing the project closes the old source and opens a new one
-  with the new filter.
-
-`src/lib/stores/job_status.test.ts` (pure, no DOM):
-
-- `available_actions` returns the correct sets for each status (running w/ &
-  w/o `supports_pause`, paused, pending, each terminal).
-- `job_status_badge_class` / `job_status_display` map each status.
-- `progress_label` / `progress_percent` for total=null, zero, partial, full.
-
-`src/lib/stores/jobs_api.test.ts` (mock `$lib/api_client`'s `client`):
-
-- each wrapper calls the expected client method + path with the right
-  params, and throws when the client returns `error`.
-
-`src/lib/components/SidebarJobsBadge.test.ts` (jsdom, render):
-
-- renders the count when > 0; renders nothing when 0.
diff --git a/specs/projects/background_job_system/project_overview.md b/specs/projects/background_job_system/project_overview.md
deleted file mode 100644
index f06ef0e8c..000000000
--- a/specs/projects/background_job_system/project_overview.md
+++ /dev/null
@@ -1,48 +0,0 @@
----
-status: complete
----
-
-# Background Job System
-
-A generic background-job layer for the local Kiln app (FastAPI on `:8757`). Provides tracked, controllable jobs that run as asyncio tasks in-process; exposes lifecycle (list / get / pause / resume / cancel / delete) and progress (SSE) over HTTP.
-
-Job records are **in-memory only** — they are ephemeral bookkeeping for visibility and control, never a source of truth. The authoritative state of any operation lives in the Kiln project entities it touches (eval runs, task runs, etc.). Every worker must be **idempotent**: it derives "what's already done" by reading those entities, so a re-run converges to the same end state without duplicating side effects. Because of this, nothing is persisted and there is nothing to recover at startup — re-triggering a job after a crash or restart is always safe.
-
-A standalone, general-purpose layer. It is intentionally generic (typed workers, opaque params/result, free-form `metadata`) so other features can build on it later, but this spec designs no integration with any specific consumer — future consumers adapt to this system, not the reverse.
-
-## Goal & scope
-
-**In scope.**
-- A generic `Job` shape: base record + per-type opaque payloads (params / result).
-- A `JobRegistry` that supervises asyncio tasks, tracks state in-memory, and emits events.
-- REST API for `create / list / get / result / errors / pause / resume / cancel / delete`.
-- SSE stream for live state and progress — success/error counts, idempotent snapshots, not deltas.
-- An idempotency contract on workers: each derives its true state from source-of-truth reads on Kiln entities, so re-runs (including pause→resume) converge without duplicating side effects.
-- Per-run error-message capture: errors spool to an ephemeral, best-effort JSON file in the OS temp dir, keyed by a per-run UUID, retrievable on demand and gracefully empty if gone.
-- A reference `NoopJob` worker for end-to-end validation.
-- An `EvalJob` worker that wraps the existing `EvalRunner` (which internally uses `AsyncJobRunner`). No changes to `EvalRunner` or `AsyncJobRunner`.
-
-**Out of scope (deferred).**
-- Any assistant / orchestration layer that consumes this system — separate, future work, not designed for here.
-- Cloud Run remoting / surviving the desktop-app process.
-- Full per-job log capture / streaming / replay (beyond the per-run error-message capture above).
-- Job dependencies / DAGs.
-- Plan-style multi-job approval.
-
-## Positioning vs. `AsyncJobRunner`
-
-`AsyncJobRunner` (`Kiln/libs/core/kiln_ai/utils/async_job_runner.py`) is a low-level worker pool that parallelizes "do N similar things" inside a single domain operation. It is in-memory, has no lifecycle beyond `.run()` returning, and is consumed by `EvalRunner`, `ExtractorRunner`, RAG runners, etc.
-
-This new layer sits **above** `AsyncJobRunner`. It does not replace it. The composition is:
-
-```
-JobRegistry              (new — tracked lifecycle, in-memory, HTTP, SSE)
-  └─ EvalJobWorker       (new — one tracked job per eval invocation)
-       └─ EvalRunner     (existing — unchanged)
-            └─ AsyncJobRunner   (existing — unchanged)
-                 └─ N parallel eval calls
-```
-
-Existing adapters keep using `AsyncJobRunner` internally. What changes for evals is the *HTTP entry point and tracking*: a new `POST /api/jobs/eval` returns a job_id and runs in the background, alongside the existing blocking SSE `GET /api/.../run_comparison` which stays for the legacy browser flow.
-
-A defining difference: the legacy blocking endpoint runs the eval *inside the HTTP request*, so closing the browser cancels it. A job in the new system runs independently of any connection — the user can close the web UI entirely and the job keeps running; the SSE stream only *observes* it (see functional spec §6).

From 922eff498ebe52bc89c76f4edbce6a7bf39acd7b Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 18:41:35 +0800
Subject: [PATCH 18/26] refactor: use intro component for job table

---
 .../src/lib/components/jobs_table.svelte      | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte
index 7919f31f4..a02e04b83 100644
--- a/app/web_ui/src/lib/components/jobs_table.svelte
+++ b/app/web_ui/src/lib/components/jobs_table.svelte
@@ -1,5 +1,6 @@
 <script lang="ts">
   import Dialog from "$lib/ui/dialog.svelte"
+  import Intro from "$lib/ui/intro.svelte"
   import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
   import { jobs, synced, connection } from "$lib/stores/jobs_store"
   import {
@@ -167,18 +168,17 @@
     <div class="loading loading-spinner loading-lg"></div>
   </div>
 {:else if $jobs.length === 0}
-  <div
-    class="flex flex-col items-center justify-center min-h-[55vh] text-center max-w-md mx-auto"
-  >
-    <div class="w-12 h-12 text-gray-400 mb-4" aria-hidden="true">
-      <JobsIcon />
-    </div>
-    <h3 class="text-lg font-medium">No jobs yet</h3>
-    <p class="text-sm text-gray-500 mt-2">
-      Long-running work like eval runs shows up here. Jobs run in the background
-      — you can leave this page and they'll keep going. Come back any time to
-      check progress, pause, or cancel them.
-    </p>
+  <div class="flex justify-center items-center min-h-[55vh]">
+    <Intro
+      title="No jobs yet"
+      description_paragraphs={[
+        "Long-running work like eval runs shows up here. Jobs keep running in the background, even if you leave this page.",
+      ]}
+    >
+      <div slot="icon" class="w-12 h-12 text-gray-400" aria-hidden="true">
+        <JobsIcon />
+      </div>
+    </Intro>
   </div>
 {:else}
   <div class="flex flex-row justify-end mb-3">

From 47b24c4e20d0acbfa9cf8c5973eae43b62ffefbd Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 18:45:05 +0800
Subject: [PATCH 19/26] refactor: three dots dropdown instead of buttons

---
 .../src/lib/components/jobs_table.svelte      | 66 +++++++------------
 1 file changed, 23 insertions(+), 43 deletions(-)

diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte
index a02e04b83..69976d69f 100644
--- a/app/web_ui/src/lib/components/jobs_table.svelte
+++ b/app/web_ui/src/lib/components/jobs_table.svelte
@@ -1,6 +1,8 @@
 <script lang="ts">
   import Dialog from "$lib/ui/dialog.svelte"
   import Intro from "$lib/ui/intro.svelte"
+  import TableActionMenu from "$lib/ui/table_action_menu.svelte"
+  import type { FloatingMenuItem } from "$lib/ui/floating_menu_types"
   import JobsIcon from "$lib/ui/icons/jobs_icon.svelte"
   import { jobs, synced, connection } from "$lib/stores/jobs_store"
   import {
@@ -141,6 +143,24 @@
       result_loading = false
     }
   }
+
+  // The row's overflow menu: view actions first, then lifecycle actions.
+  function row_menu_items(job: JobRecord): FloatingMenuItem[] {
+    const items: FloatingMenuItem[] = []
+    if (has_result(job)) {
+      items.push({ label: "View results", onclick: () => open_result(job) })
+    }
+    if (has_errors(job)) {
+      items.push({ label: "View errors", onclick: () => open_errors(job) })
+    }
+    for (const action of available_actions(job)) {
+      items.push({
+        label: action === "delete" ? "Clear" : action_labels[action],
+        onclick: () => run_action(action, job.id),
+      })
+    }
+    return items
+  }
 </script>
 
 {#if action_error}
@@ -252,49 +272,9 @@
                 {/if}
               </div>
             </td>
-            <td>
-              <div
-                class="flex flex-row gap-1 justify-end flex-wrap items-center"
-              >
-                {#if has_result(job)}
-                  <button
-                    class="btn btn-xs btn-ghost"
-                    on:click={() => open_result(job)}
-                  >
-                    View results
-                  </button>
-                {/if}
-                {#if has_errors(job)}
-                  <button
-                    class="btn btn-xs btn-ghost"
-                    on:click={() => open_errors(job)}
-                  >
-                    View errors
-                  </button>
-                {/if}
-                {#each available_actions(job) as action}
-                  {#if action === "delete"}
-                    <button
-                      class="btn btn-xs btn-ghost"
-                      disabled={in_flight[job.id]}
-                      aria-label="Dismiss job"
-                      title="Dismiss job"
-                      on:click={() => run_action(action, job.id)}
-                    >
-                      Clear
-                    </button>
-                  {:else}
-                    <button
-                      class="btn btn-xs {action === 'cancel'
-                        ? 'btn-ghost text-error'
-                        : 'btn-ghost'}"
-                      disabled={in_flight[job.id]}
-                      on:click={() => run_action(action, job.id)}
-                    >
-                      {action_labels[action]}
-                    </button>
-                  {/if}
-                {/each}
+            <td class="align-top">
+              <div class="flex flex-row justify-end items-start">
+                <TableActionMenu items={row_menu_items(job)} />
               </div>
             </td>
           </tr>

From aa9de8b82a33a3daeacae1724dca7c6ac5cdc420 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 18:47:56 +0800
Subject: [PATCH 20/26] refactor: one string subtitle in modal

---
 app/web_ui/src/lib/components/jobs_dialog.svelte | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app/web_ui/src/lib/components/jobs_dialog.svelte b/app/web_ui/src/lib/components/jobs_dialog.svelte
index 65fcb54d9..71f6f3040 100644
--- a/app/web_ui/src/lib/components/jobs_dialog.svelte
+++ b/app/web_ui/src/lib/components/jobs_dialog.svelte
@@ -22,6 +22,7 @@
   bind:this={dialog}
   title="Jobs"
   width="wide"
+  subtitle="Background work for the current project. Jobs keep running even if you leave this page."
   sub_subtitle="View full page →"
   sub_subtitle_link="/jobs"
 >

From 424e4fbf3d818ac8f609e7c4a36b118f67be330f Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 18:51:08 +0800
Subject: [PATCH 21/26] refactor: move jobs entry to below App Update Available
 in sidebar

---
 app/web_ui/src/routes/(app)/+layout.svelte | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index e798a93fe..0be206166 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -429,6 +429,17 @@
         <li class="mt-auto pt-2 bg-transparent">
           <ProgressWidget />
         </li>
+        {#if $update_info.update_result && $update_info.update_result.has_update}
+          <li class="menu-sm mt-2">
+            <a
+              href="/settings/check_for_update"
+              class="px-4 text-xs font-medium"
+            >
+              <span class="bg-primary rounded-full w-2 h-2 mr-1"></span>App
+              Update Available</a
+            >
+          </li>
+        {/if}
         <li class="menu-sm">
           <button
             type="button"
@@ -446,17 +457,6 @@
             <SidebarJobsIndicator variant="inline" />
           </button>
         </li>
-        {#if $update_info.update_result && $update_info.update_result.has_update}
-          <li class="menu-sm mt-2">
-            <a
-              href="/settings/check_for_update"
-              class="px-4 text-xs font-medium"
-            >
-              <span class="bg-primary rounded-full w-2 h-2 mr-1"></span>App
-              Update Available</a
-            >
-          </li>
-        {/if}
         <li class="menu-sm">
           <a
             href="/settings"

From b2040c398563eb172cff0388116a856e656a8b90 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 18:52:20 +0800
Subject: [PATCH 22/26] fix: maybe fix wrapping badge

---
 app/web_ui/src/lib/components/jobs_table.svelte | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte
index 69976d69f..17c22ef9d 100644
--- a/app/web_ui/src/lib/components/jobs_table.svelte
+++ b/app/web_ui/src/lib/components/jobs_table.svelte
@@ -237,7 +237,7 @@
             <td>
               <div class="flex flex-col gap-2 w-full max-w-[360px] min-w-48">
                 <span
-                  class="badge px-3 py-1 self-start {job_status_display_badge_class(
+                  class="badge h-auto px-3 py-1 self-start whitespace-normal text-center leading-tight {job_status_display_badge_class(
                     job,
                   )}"
                 >

From 97cf7007adcb15a1804eaf51683d0a0f1b4be02e Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 18:57:45 +0800
Subject: [PATCH 23/26] refactor: tighten up jobs empty-state copy

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 app/web_ui/src/lib/components/jobs_table.svelte | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte
index 17c22ef9d..bc801a350 100644
--- a/app/web_ui/src/lib/components/jobs_table.svelte
+++ b/app/web_ui/src/lib/components/jobs_table.svelte
@@ -192,7 +192,7 @@
     <Intro
       title="No jobs yet"
       description_paragraphs={[
-        "Long-running work like eval runs shows up here. Jobs keep running in the background, even if you leave this page.",
+        "Long-running workloads show up here. Manage them from this page, or leave — they'll keep running in the background.",
       ]}
     >
       <div slot="icon" class="w-12 h-12 text-gray-400" aria-hidden="true">

From d978693494d007ec356eede8af65f3cebc017f41 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 19:16:26 +0800
Subject: [PATCH 24/26] feat: gate Jobs sidebar behind PUBLIC_ENABLE_JOBS flag

- Add PUBLIC_ENABLE_JOBS env flag gating the Jobs sidebar entries (full
  + rail) and the JobsDialog mount; documented in .env.example
- Reorder full sidebar so Jobs sits below the App Update Available entry
- Keep the Jobs nav entry at normal color even when there are no jobs
- Mark /jobs as a temporary test page in its title/subtitle

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01AWp8vw86zgAH8acGJfB24q
---
 app/web_ui/.env.example                       |  4 ++
 app/web_ui/src/routes/(app)/+layout.svelte    | 46 +++++++++++--------
 app/web_ui/src/routes/(app)/jobs/+page.svelte |  5 +-
 .../src/routes/(app)/sidebar_rail.svelte      | 15 +++---
 4 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/app/web_ui/.env.example b/app/web_ui/.env.example
index c966be48d..5bd044fe1 100644
--- a/app/web_ui/.env.example
+++ b/app/web_ui/.env.example
@@ -12,5 +12,9 @@
 # Useful for debugging agent behavior during development.
 # PUBLIC_SHOW_TOOL_CALL_DETAILS=true
 
+# Background Jobs UI — set to "true" to show the Jobs entry in the sidebar and
+# enable the jobs dialog. When unset or set to any other value, the feature is hidden.
+# PUBLIC_ENABLE_JOBS=true
+
 # Sentry — set the DSN to enable client-side error reporting. Unset = no-op.
 # VITE_KILN_SENTRY_DSN=https://...@o.../...
diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte
index 0be206166..26e510c06 100644
--- a/app/web_ui/src/routes/(app)/+layout.svelte
+++ b/app/web_ui/src/routes/(app)/+layout.svelte
@@ -22,7 +22,6 @@
   import SidebarJobsIndicator from "$lib/components/SidebarJobsIndicator.svelte"
   import JobsDialog from "$lib/components/jobs_dialog.svelte"
   import { jobs_dialog } from "$lib/stores/jobs_dialog"
-  import { active_jobs_count } from "$lib/stores/jobs_store"
   import { Section } from "$lib/ui/section"
   import Dialog from "$lib/ui/dialog.svelte"
   import SidebarRail from "./sidebar_rail.svelte"
@@ -30,6 +29,11 @@
   import { chatBarExpanded } from "$lib/stores/chat_ui_state"
   import { derived } from "svelte/store"
   import DatabaseIcon from "$lib/ui/icons/database_icon.svelte"
+  import { env } from "$env/dynamic/public"
+
+  // Feature flag: the background Jobs UI (sidebar entry + dialog) only renders
+  // when PUBLIC_ENABLE_JOBS is explicitly "true". See .env.example.
+  const jobs_enabled = env.PUBLIC_ENABLE_JOBS === "true"
 
   // Rail-eligibility predicate: lg breakpoint, narrow viewport (< 1550px),
   // and chat bar expanded. See functional_spec.md "Trigger".
@@ -166,7 +170,11 @@
     ></label>
 
     {#if showRail}
-      <SidebarRail {section} openTaskDialog={() => taskDialog?.show()} />
+      <SidebarRail
+        {section}
+        {jobs_enabled}
+        openTaskDialog={() => taskDialog?.show()}
+      />
     {:else}
       <ul
         class="sidebar-menu menu bg-base-200 text-base-content w-72 md:w-52 2xl:w-56 p-3 pt-1 lg:pt-3 min-h-full text-xs"
@@ -440,23 +448,21 @@
             >
           </li>
         {/if}
-        <li class="menu-sm">
-          <button
-            type="button"
-            class="text-xs {$active_jobs_count > 0
-              ? 'text-base-content'
-              : 'text-base-content/60'}"
-            on:click={() => jobs_dialog.open()}
-          >
-            <div
-              class="sidebar-icon {$active_jobs_count > 0 ? '' : 'opacity-60'}"
+        {#if jobs_enabled}
+          <li class="menu-sm">
+            <button
+              type="button"
+              class="text-xs text-base-content"
+              on:click={() => jobs_dialog.open()}
             >
-              <JobsIcon />
-            </div>
-            Jobs
-            <SidebarJobsIndicator variant="inline" />
-          </button>
-        </li>
+              <div class="sidebar-icon">
+                <JobsIcon />
+              </div>
+              Jobs
+              <SidebarJobsIndicator variant="inline" />
+            </button>
+          </li>
+        {/if}
         <li class="menu-sm">
           <a
             href="/settings"
@@ -495,7 +501,9 @@
   <SelectTasksMenu on:dismiss={() => taskDialog?.close()} />
 </Dialog>
 
-<JobsDialog />
+{#if jobs_enabled}
+  <JobsDialog />
+{/if}
 
 <style>
   :global(ul > li.menu-nested) {
diff --git a/app/web_ui/src/routes/(app)/jobs/+page.svelte b/app/web_ui/src/routes/(app)/jobs/+page.svelte
index ff3dd28c2..257c640f9 100644
--- a/app/web_ui/src/routes/(app)/jobs/+page.svelte
+++ b/app/web_ui/src/routes/(app)/jobs/+page.svelte
@@ -58,9 +58,8 @@
 </script>
 
 <AppPage
-  title="Jobs"
-  subtitle="Background work for the current project."
-  sub_subtitle="Jobs keep running even if you navigate away or close this panel."
+  title="Jobs (temporary test page)"
+  subtitle="This page is a placeholder test to trigger jobs - will be removed before merging"
   {action_buttons}
 >
   {#if action_error}
diff --git a/app/web_ui/src/routes/(app)/sidebar_rail.svelte b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
index be1795d27..7aca02253 100644
--- a/app/web_ui/src/routes/(app)/sidebar_rail.svelte
+++ b/app/web_ui/src/routes/(app)/sidebar_rail.svelte
@@ -14,6 +14,7 @@
 
   export let section: Section = Section.None
   export let openTaskDialog: () => void
+  export let jobs_enabled: boolean = false
 </script>
 
 <nav
@@ -118,12 +119,14 @@
 
   <SidebarRailProgress />
 
-  <SidebarRailItem on_click={() => jobs_dialog.open()} label="Jobs">
-    <div slot="icon" class="w-full h-full relative">
-      <JobsIcon />
-      <SidebarJobsIndicator variant="rail" />
-    </div>
-  </SidebarRailItem>
+  {#if jobs_enabled}
+    <SidebarRailItem on_click={() => jobs_dialog.open()} label="Jobs">
+      <div slot="icon" class="w-full h-full relative">
+        <JobsIcon />
+        <SidebarJobsIndicator variant="rail" />
+      </div>
+    </SidebarRailItem>
+  {/if}
 
   <SidebarRailSettings active={section === Section.Settings} />
 </nav>

From 53d75bdeb607245b2f6b9155d1d8347284b6945b Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 19:33:09 +0800
Subject: [PATCH 25/26] chore: remove Run eval example job from jobs system

The run-eval-as-a-job feature was only an example of how the background
job system works; drop it from this PR.

- Backend: delete EvalJobWorker (workers/eval.py + tests), unregister it
  in jobs/api.py
- Frontend: delete run_eval_dialog + run_eval_job (and tests); remove the
  "Run eval" button/dialog from the temporary /jobs test page

The noop worker and "Start test job" button remain for exercising the
panel end-to-end.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01AWp8vw86zgAH8acGJfB24q
---
 app/desktop/studio_server/jobs/api.py         |   2 -
 .../studio_server/jobs/workers/eval.py        | 145 -----
 .../studio_server/jobs/workers/test_eval.py   | 535 ------------------
 app/web_ui/src/routes/(app)/jobs/+page.svelte |  11 +-
 .../routes/(app)/jobs/run_eval_dialog.svelte  | 309 ----------
 .../routes/(app)/jobs/run_eval_dialog.test.ts | 286 ----------
 .../routes/(app)/jobs/run_eval_job.test.ts    | 282 ---------
 .../src/routes/(app)/jobs/run_eval_job.ts     | 171 ------
 8 files changed, 1 insertion(+), 1740 deletions(-)
 delete mode 100644 app/desktop/studio_server/jobs/workers/eval.py
 delete mode 100644 app/desktop/studio_server/jobs/workers/test_eval.py
 delete mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte
 delete mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts
 delete mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts
 delete mode 100644 app/web_ui/src/routes/(app)/jobs/run_eval_job.ts

diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py
index e2479f475..554470940 100644
--- a/app/desktop/studio_server/jobs/api.py
+++ b/app/desktop/studio_server/jobs/api.py
@@ -17,7 +17,6 @@
 from .events import JobEvent
 from .models import BackgroundJobStatus, JobRecord
 from .registry import JobNotFoundError, JobOperationError, job_registry
-from .workers.eval import EvalJobWorker
 from .workers.noop import NoopJobWorker
 
 KEEPALIVE_SECONDS = 15.0
@@ -98,7 +97,6 @@ def connect_jobs_api(app: FastAPI) -> None:
     # Register the workers this server exposes. register_type overwrites by
     # type_name, so repeated calls (e.g. multiple make_app() in tests) are safe.
     job_registry.register_type(NoopJobWorker)
-    job_registry.register_type(EvalJobWorker)
 
     @app.get(
         "/api/jobs/events",
diff --git a/app/desktop/studio_server/jobs/workers/eval.py b/app/desktop/studio_server/jobs/workers/eval.py
deleted file mode 100644
index 6c1a4ab53..000000000
--- a/app/desktop/studio_server/jobs/workers/eval.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-
-from app.desktop.git_sync.save_context import save_context_for_project
-from kiln_ai.adapters.eval.eval_runner import EvalRunner
-from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
-from kiln_ai.datamodel.eval import Eval, EvalConfig
-from kiln_ai.datamodel.task import Task
-from pydantic import BaseModel
-
-from ...eval_api import eval_config_from_id, task_run_config_from_id
-from ..models import JobContext, JobDerivedState, JobWorker
-
-
-class EvalJobParams(BaseModel):
-    project_id: str
-    task_id: str
-    eval_id: str
-    eval_config_id: str
-    run_config_id: str
-
-
-class EvalJobResult(BaseModel):
-    total: int
-    success: int
-    error: int
-
-
-class EvalJobWorker(JobWorker[EvalJobParams, EvalJobResult]):
-    """Background worker that runs an eval against a single run config.
-
-    Wraps the existing EvalRunner unchanged. Idempotent: EvalRunner excludes
-    already-run (eval_config, run_config, dataset) triples, so a paused-then-
-    resumed (or re-triggered) job skips completed items and writes no duplicate
-    EvalRun entities — hence supports_pause = True.
-    """
-
-    type_name = "eval"
-    params_model = EvalJobParams
-    result_model = EvalJobResult
-    supports_pause = True
-
-    async def compute_state(self, params: EvalJobParams) -> JobDerivedState:
-        # _compute_state_sync loads entities and enumerates runs/ directories
-        # (os.scandir + open/read/json.loads per child) synchronously. The
-        # registry awaits this on the event loop, so offload the blocking IO to
-        # a thread to keep progress/SSE updates flowing for large eval sets.
-        return await asyncio.to_thread(self._compute_state_sync, params)
-
-    def _compute_state_sync(self, params: EvalJobParams) -> JobDerivedState:
-        eval_config = eval_config_from_id(
-            params.project_id,
-            params.task_id,
-            params.eval_id,
-            params.eval_config_id,
-        )
-        eval, task = self._eval_and_task(eval_config)
-
-        # The eval-set filter defines the universe of dataset items in scope.
-        # EvalRunner only works items that BOTH pass this filter AND lack a
-        # matching EvalRun, so progress must be measured against this set.
-        filter = dataset_filter_from_id(eval.eval_set_filter_id)
-        in_filter_ids = {
-            task_run.id for task_run in task.runs(readonly=True) if filter(task_run)
-        }
-        total = len(in_filter_ids)
-
-        # Count only scored items that are still in the filter set. Items that
-        # were scored but later drifted out of the filter must not be counted,
-        # or success/is_complete would overcount and a resume could short-circuit
-        # to succeeded while real work remains.
-        scored_ids = {
-            run.dataset_id
-            for run in eval_config.runs(readonly=True)
-            if run.task_run_config_id == params.run_config_id
-        }
-        success = len(scored_ids & in_filter_ids)
-
-        return JobDerivedState(
-            total=total,
-            success=success,
-            error=0,
-            is_complete=success >= total,
-        )
-
-    async def run(self, params: EvalJobParams, ctx: JobContext) -> EvalJobResult:
-        # Baseline: items already scored (and still in-filter) before this run.
-        # EvalRunner only works the unfinished remainder, so its Progress counts
-        # are relative to that remainder. We add the baseline back so progress
-        # and the returned result are reported against the FULL eval-set size,
-        # not just the work left for this run.
-        baseline = await self.compute_state(params)
-        baseline_success = baseline.success
-
-        eval_runner = self._build_eval_runner(params)
-
-        success = baseline_success
-        total = baseline.total if baseline.total is not None else baseline_success
-        error = 0
-        async for progress in eval_runner.run():
-            # progress.total = full - baseline_success (the unfinished remainder),
-            # so baseline_success + progress.total = the full eval-set size.
-            success = baseline_success + progress.complete
-            total = baseline_success + progress.total
-            error = progress.errors
-            await ctx.report_progress(
-                success=success,
-                error=error,
-                total=total,
-            )
-
-        return EvalJobResult(total=total, success=success, error=error)
-
-    def _build_eval_runner(self, params: EvalJobParams) -> EvalRunner:
-        eval_config = eval_config_from_id(
-            params.project_id,
-            params.task_id,
-            params.eval_id,
-            params.eval_config_id,
-        )
-        run_config = task_run_config_from_id(
-            params.project_id,
-            params.task_id,
-            params.run_config_id,
-        )
-        save_context = save_context_for_project(
-            params.project_id,
-            context=f"eval job {params.eval_id}/{params.run_config_id}",
-        )
-        return EvalRunner(
-            eval_configs=[eval_config],
-            run_configs=[run_config],
-            eval_run_type="task_run_eval",
-            save_context=save_context,
-        )
-
-    def _eval_and_task(self, eval_config: EvalConfig) -> tuple[Eval, Task]:
-        eval = eval_config.parent_eval()
-        if eval is None:
-            raise ValueError("Eval config has no parent eval")
-        task = eval.parent_task()
-        if task is None:
-            raise ValueError("Eval has no parent task")
-        return eval, task
diff --git a/app/desktop/studio_server/jobs/workers/test_eval.py b/app/desktop/studio_server/jobs/workers/test_eval.py
deleted file mode 100644
index 0715344f9..000000000
--- a/app/desktop/studio_server/jobs/workers/test_eval.py
+++ /dev/null
@@ -1,535 +0,0 @@
-from __future__ import annotations
-
-from contextlib import contextmanager
-from typing import AsyncIterator
-from unittest.mock import patch
-
-import pytest
-from app.desktop.studio_server.jobs.models import BackgroundJobStatus
-from app.desktop.studio_server.jobs.registry import JobRegistry
-from app.desktop.studio_server.jobs.workers.eval import (
-    EvalJobParams,
-    EvalJobResult,
-    EvalJobWorker,
-)
-from kiln_ai.adapters.ml_model_list import ModelProviderName
-from kiln_ai.datamodel import (
-    DataSource,
-    DataSourceType,
-    Project,
-    Task,
-    TaskOutput,
-    TaskOutputRatingType,
-    TaskRun,
-)
-from kiln_ai.datamodel.eval import (
-    Eval,
-    EvalConfig,
-    EvalOutputScore,
-    EvalRun,
-)
-from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
-from kiln_ai.datamodel.task import StructuredOutputMode, TaskRunConfig
-from kiln_ai.utils.async_job_runner import Progress
-
-
-@pytest.fixture
-def project(tmp_path):
-    project = Project(
-        id="project1", name="Test Project", path=tmp_path / "project.kiln"
-    )
-    project.save_to_file()
-    return project
-
-
-@pytest.fixture
-def task(project):
-    task = Task(
-        id="task1",
-        name="Test Task",
-        description="test",
-        instruction="do the thing",
-        parent=project,
-    )
-    task.save_to_file()
-    return task
-
-
-@pytest.fixture
-def eval(task):
-    eval = Eval(
-        id="eval1",
-        name="Test Eval",
-        description="test",
-        eval_set_filter_id="tag::eval_set",
-        eval_configs_filter_id="tag::golden",
-        output_scores=[
-            EvalOutputScore(
-                name="Accuracy",
-                instruction="Check accuracy",
-                type=TaskOutputRatingType.pass_fail,
-            ),
-        ],
-        parent=task,
-    )
-    eval.save_to_file()
-    return eval
-
-
-@pytest.fixture
-def eval_config(eval):
-    eval_config = EvalConfig(
-        id="eval_config1",
-        name="Test Eval Config",
-        model_name="gpt-4",
-        model_provider="openai",
-        properties={"eval_steps": ["step1", "step2"]},
-        parent=eval,
-    )
-    eval_config.save_to_file()
-    return eval_config
-
-
-@pytest.fixture
-def run_config(task):
-    run_config = TaskRunConfig(
-        id="run_config1",
-        name="Test Run Config",
-        description="test",
-        run_config_properties=KilnAgentRunConfigProperties(
-            model_name="gpt-4",
-            model_provider_name=ModelProviderName.openai,
-            prompt_id="simple_prompt_builder",
-            structured_output_mode=StructuredOutputMode.json_schema,
-        ),
-        parent=task,
-    )
-    run_config.save_to_file()
-    return run_config
-
-
-@pytest.fixture
-def data_source():
-    return DataSource(
-        type=DataSourceType.synthetic,
-        properties={
-            "model_name": "gpt-4",
-            "model_provider": "openai",
-            "adapter_name": "test_adapter",
-        },
-    )
-
-
-@pytest.fixture
-def params():
-    return EvalJobParams(
-        project_id="project1",
-        task_id="task1",
-        eval_id="eval1",
-        eval_config_id="eval_config1",
-        run_config_id="run_config1",
-    )
-
-
-@pytest.fixture
-def resolve_project(project):
-    """Make the eval_api entity helpers resolve the on-disk project by id.
-
-    task_from_id binds project_from_id into kiln_server.task_api, so we patch it
-    there (the name as looked up), not at its definition site.
-    """
-    with patch("kiln_server.task_api.project_from_id", return_value=project):
-        yield project
-
-
-def _make_task_run(task, data_source, tag: str) -> TaskRun:
-    task_run = TaskRun(
-        parent=task,
-        input="test",
-        input_source=data_source,
-        tags=[tag],
-        output=TaskOutput(output="test"),
-    )
-    task_run.save_to_file()
-    return task_run
-
-
-def _make_eval_run(eval_config, dataset_id, run_config_id) -> EvalRun:
-    eval_run = EvalRun(
-        parent=eval_config,
-        dataset_id=dataset_id,
-        task_run_config_id=run_config_id,
-        input="test",
-        output="test",
-        scores={"accuracy": 1.0},
-    )
-    eval_run.save_to_file()
-    return eval_run
-
-
-@contextmanager
-def _stub_eval_runner_run(progresses: list[Progress]):
-    async def fake_run(self, concurrency: int = 25) -> AsyncIterator[Progress]:
-        for progress in progresses:
-            yield progress
-
-    with patch(
-        "kiln_ai.adapters.eval.eval_runner.EvalRunner.run",
-        new=fake_run,
-    ):
-        yield
-
-
-# -- compute_state -----------------------------------------------------------
-
-
-async def test_compute_state_no_eval_runs(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    for _ in range(3):
-        _make_task_run(task, data_source, "eval_set")
-    # A task run outside the eval-set filter must not be counted toward total.
-    _make_task_run(task, data_source, "other")
-
-    state = await EvalJobWorker().compute_state(params)
-
-    assert state.total == 3
-    assert state.success == 0
-    assert state.error == 0
-    assert state.is_complete is False
-
-
-async def test_compute_state_counts_already_scored(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(3)]
-    _make_eval_run(eval_config, task_runs[0].id, run_config.id)
-    _make_eval_run(eval_config, task_runs[1].id, run_config.id)
-
-    state = await EvalJobWorker().compute_state(params)
-
-    assert state.total == 3
-    assert state.success == 2
-    assert state.is_complete is False
-
-
-async def test_compute_state_is_complete(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(2)]
-    for task_run in task_runs:
-        _make_eval_run(eval_config, task_run.id, run_config.id)
-
-    state = await EvalJobWorker().compute_state(params)
-
-    assert state.total == 2
-    assert state.success == 2
-    assert state.is_complete is True
-
-
-async def test_compute_state_ignores_other_run_config(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(2)]
-    # Scored under a different run config — must not be counted.
-    _make_eval_run(eval_config, task_runs[0].id, "some_other_run_config")
-
-    state = await EvalJobWorker().compute_state(params)
-
-    assert state.total == 2
-    assert state.success == 0
-    assert state.is_complete is False
-
-
-async def test_compute_state_ignores_scored_items_out_of_filter(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    # Two items in the eval-set filter, both scored.
-    in_filter = [_make_task_run(task, data_source, "eval_set") for _ in range(2)]
-    for task_run in in_filter:
-        _make_eval_run(eval_config, task_run.id, run_config.id)
-
-    # An item that was scored under this run config but is NOT in the eval-set
-    # filter (e.g. it drifted out / was tagged differently). EvalRunner would
-    # never work it, so it must not count toward success or flip is_complete.
-    out_of_filter = _make_task_run(task, data_source, "other")
-    _make_eval_run(eval_config, out_of_filter.id, run_config.id)
-
-    state = await EvalJobWorker().compute_state(params)
-
-    # total reflects only in-filter items; the out-of-filter scored item is
-    # neither counted in total nor in success.
-    assert state.total == 2
-    assert state.success == 2
-    assert state.is_complete is True
-
-
-async def test_compute_state_out_of_filter_does_not_short_circuit(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    # Three in-filter items; only one scored. Two remain to be worked.
-    in_filter = [_make_task_run(task, data_source, "eval_set") for _ in range(3)]
-    _make_eval_run(eval_config, in_filter[0].id, run_config.id)
-
-    # Extra scored items that are out-of-filter. A naive count would inflate
-    # success to 3 and falsely report is_complete, short-circuiting a resume.
-    for _ in range(5):
-        out_of_filter = _make_task_run(task, data_source, "other")
-        _make_eval_run(eval_config, out_of_filter.id, run_config.id)
-
-    state = await EvalJobWorker().compute_state(params)
-
-    assert state.total == 3
-    assert state.success == 1
-    assert state.is_complete is False
-
-
-async def test_compute_state_missing_eval_config_raises(
-    resolve_project, task, run_config, data_source
-):
-    # No EvalConfig (or Eval) with this id exists on disk: the entity loader
-    # raises rather than silently reporting "no progress", so the failure is
-    # visible to the registry during reconciliation.
-    bad_params = EvalJobParams(
-        project_id="project1",
-        task_id="task1",
-        eval_id="missing_eval",
-        eval_config_id="missing_eval_config",
-        run_config_id="run_config1",
-    )
-
-    with pytest.raises(Exception):
-        await EvalJobWorker().compute_state(bad_params)
-
-
-# -- run ---------------------------------------------------------------------
-
-
-async def test_run_maps_progress_and_returns_result(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    progresses = [
-        Progress(complete=0, total=3, errors=0),
-        Progress(complete=1, total=3, errors=0),
-        Progress(complete=2, total=3, errors=1),
-    ]
-
-    reported: list[tuple[int, int, int | None]] = []
-
-    class FakeCtx:
-        job_id = "j_test"
-        run_id = "run_test"
-
-        async def report_progress(self, success, error=0, total=None, message=None):
-            reported.append((success, error, total))
-
-        async def report_error(self, error_message, **extra):
-            pass
-
-    with _stub_eval_runner_run(progresses):
-        result = await EvalJobWorker().run(params, FakeCtx())
-
-    assert reported == [(0, 0, 3), (1, 0, 3), (2, 1, 3)]
-    assert result == EvalJobResult(total=3, success=2, error=1)
-
-
-async def test_run_no_items_returns_zero_summary(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    class FakeCtx:
-        job_id = "j_test"
-        run_id = "run_test"
-
-        async def report_progress(self, success, error=0, total=None, message=None):
-            pass
-
-        async def report_error(self, error_message, **extra):
-            pass
-
-    # Real EvalRunner with an empty dataset yields only the initial Progress(0,0,0).
-    result = await EvalJobWorker().run(params, FakeCtx())
-
-    assert result == EvalJobResult(total=0, success=0, error=0)
-
-
-async def test_run_idempotent_skips_already_scored(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(3)]
-    # Two of three already scored.
-    _make_eval_run(eval_config, task_runs[0].id, run_config.id)
-    _make_eval_run(eval_config, task_runs[1].id, run_config.id)
-
-    processed_dataset_ids: list = []
-
-    async def fake_run_job(self, job) -> bool:
-        processed_dataset_ids.append(job.item.id)
-        EvalRun(
-            parent=job.eval_config,
-            dataset_id=job.item.id,
-            task_run_config_id=job.task_run_config.id,
-            input="test",
-            output="test",
-            scores={"accuracy": 1.0},
-        ).save_to_file()
-        return True
-
-    class FakeCtx:
-        job_id = "j_test"
-        run_id = "run_test"
-
-        async def report_progress(self, success, error=0, total=None, message=None):
-            pass
-
-        async def report_error(self, error_message, **extra):
-            pass
-
-    with patch(
-        "kiln_ai.adapters.eval.eval_runner.EvalRunner.run_job",
-        new=fake_run_job,
-    ):
-        result = await EvalJobWorker().run(params, FakeCtx())
-
-    # Only the single not-yet-scored item should have been processed.
-    assert processed_dataset_ids == [task_runs[2].id]
-    # Totals are reported against the FULL eval-set size (3), not just the work
-    # remaining for this run. Two were already scored (baseline), one processed.
-    assert result.total == 3
-    assert result.success == 3
-
-    # No duplicate EvalRuns: three task runs, three EvalRuns total.
-    assert len(eval_config.runs(readonly=True)) == 3
-
-
-async def test_run_reports_full_set_totals_on_partial_resume(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    # 5-item eval set, 2 already scored (baseline). The stubbed runner only sees
-    # the remaining 3 items, so its Progress.total is 3 — but the worker must add
-    # the baseline back and report against the full set of 5.
-    task_runs = [_make_task_run(task, data_source, "eval_set") for _ in range(5)]
-    _make_eval_run(eval_config, task_runs[0].id, run_config.id)
-    _make_eval_run(eval_config, task_runs[1].id, run_config.id)
-
-    # EvalRunner.run() yields counts relative to the unfinished remainder (3).
-    progresses = [
-        Progress(complete=0, total=3, errors=0),
-        Progress(complete=1, total=3, errors=0),
-        Progress(complete=2, total=3, errors=0),
-        Progress(complete=3, total=3, errors=0),
-    ]
-
-    reported: list[tuple[int, int, int | None]] = []
-
-    class FakeCtx:
-        job_id = "j_test"
-        run_id = "run_test"
-
-        async def report_progress(self, success, error=0, total=None, message=None):
-            reported.append((success, error, total))
-
-        async def report_error(self, error_message, **extra):
-            pass
-
-    with _stub_eval_runner_run(progresses):
-        result = await EvalJobWorker().run(params, FakeCtx())
-
-    # Reported success = baseline (2) + complete; total = baseline (2) + 3 = 5.
-    # The snapshot must not regress below the baseline of 2 already-scored items.
-    assert reported == [(2, 0, 5), (3, 0, 5), (4, 0, 5), (5, 0, 5)]
-    assert result == EvalJobResult(total=5, success=5, error=0)
-
-
-# -- save_context wiring -----------------------------------------------------
-
-
-def test_build_eval_runner_passes_save_context_when_git_sync_enabled(
-    resolve_project, task, eval_config, run_config, params
-):
-    sentinel = object()
-
-    with patch(
-        "app.desktop.studio_server.jobs.workers.eval.save_context_for_project",
-        return_value=sentinel,
-    ) as mock_helper:
-        runner = EvalJobWorker()._build_eval_runner(params)
-
-    mock_helper.assert_called_once_with(
-        params.project_id,
-        context=f"eval job {params.eval_id}/{params.run_config_id}",
-    )
-    # The helper's SaveContext is threaded straight into the runner.
-    assert runner._save_context is sentinel
-
-
-def test_build_eval_runner_defaults_to_noop_when_not_git_sync(
-    resolve_project, task, eval_config, run_config, params
-):
-    from kiln_ai.utils.git_sync_protocols import default_save_context
-
-    with patch(
-        "app.desktop.studio_server.jobs.workers.eval.save_context_for_project",
-        return_value=None,
-    ) as mock_helper:
-        runner = EvalJobWorker()._build_eval_runner(params)
-
-    mock_helper.assert_called_once()
-    # EvalRunner coalesces None to the no-op default_save_context.
-    assert runner._save_context is default_save_context
-
-
-# -- end-to-end via registry -------------------------------------------------
-
-
-async def test_eval_job_through_registry(
-    resolve_project, task, eval_config, run_config, data_source, params
-):
-    for _ in range(2):
-        _make_task_run(task, data_source, "eval_set")
-
-    progresses = [
-        Progress(complete=0, total=2, errors=0),
-        Progress(complete=1, total=2, errors=0),
-        Progress(complete=2, total=2, errors=0),
-    ]
-
-    registry = JobRegistry()
-    registry.register_type(EvalJobWorker)
-
-    with _stub_eval_runner_run(progresses):
-        job = await registry.create("eval", params, project_id=params.project_id)
-        task_handle = registry._tasks[job.id]
-        await task_handle
-
-    final = registry._jobs[job.id]
-    assert final.status == BackgroundJobStatus.SUCCEEDED
-    assert final.result == {"total": 2, "success": 2, "error": 0}
-    assert final.progress.success == 2
-    assert final.progress.total == 2
-    assert final.project_id == "project1"
-
-
-async def test_eval_job_missing_entity_marks_failed(
-    resolve_project, task, run_config, data_source
-):
-    # A job whose eval/eval_config does not exist: compute_state (run during
-    # reconciliation) raises, and the registry marks the job failed rather than
-    # treating the missing entity as "no progress".
-    bad_params = EvalJobParams(
-        project_id="project1",
-        task_id="task1",
-        eval_id="missing_eval",
-        eval_config_id="missing_eval_config",
-        run_config_id="run_config1",
-    )
-
-    registry = JobRegistry()
-    registry.register_type(EvalJobWorker)
-
-    job = await registry.create("eval", bad_params, project_id="project1")
-    task_handle = registry._tasks[job.id]
-    await task_handle
-
-    final = registry._jobs[job.id]
-    assert final.status == BackgroundJobStatus.FAILED
-    assert final.error is not None
diff --git a/app/web_ui/src/routes/(app)/jobs/+page.svelte b/app/web_ui/src/routes/(app)/jobs/+page.svelte
index 257c640f9..645011617 100644
--- a/app/web_ui/src/routes/(app)/jobs/+page.svelte
+++ b/app/web_ui/src/routes/(app)/jobs/+page.svelte
@@ -1,7 +1,6 @@
 <script lang="ts">
   import AppPage from "../app_page.svelte"
   import JobsTable from "$lib/components/jobs_table.svelte"
-  import RunEvalDialog from "./run_eval_dialog.svelte"
   import { create_job } from "$lib/stores/jobs_api"
   import { KilnError, createKilnError } from "$lib/utils/error_handlers"
   import { agentInfo } from "$lib/agent"
@@ -10,7 +9,7 @@
   agentInfo.set({
     name: "Background Jobs",
     description:
-      "Background job panel. Lists jobs (evals and others) with status, progress, and lifecycle controls.",
+      "Background job panel. Lists jobs with status, progress, and lifecycle controls.",
   })
 
   let action_error: KilnError | null = null
@@ -40,13 +39,7 @@
     }
   }
 
-  let run_eval_dialog: RunEvalDialog
-
   $: action_buttons = [
-    {
-      label: "Run eval",
-      handler: () => run_eval_dialog?.show(),
-    },
     {
       label: creating_test_job ? "Starting…" : "Start test job",
       handler: start_test_job,
@@ -70,5 +63,3 @@
 
   <JobsTable />
 </AppPage>
-
-<RunEvalDialog bind:this={run_eval_dialog} />
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte b/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte
deleted file mode 100644
index 2b24f0820..000000000
--- a/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.svelte
+++ /dev/null
@@ -1,309 +0,0 @@
-<script lang="ts">
-  import Dialog from "$lib/ui/dialog.svelte"
-  import FormElement from "$lib/utils/form_element.svelte"
-  import { client } from "$lib/api_client"
-  import { KilnError, createKilnError } from "$lib/utils/error_handlers"
-  import { create_job } from "$lib/stores/jobs_api"
-  import {
-    ui_state,
-    model_info,
-    load_model_info,
-    load_task,
-    get_task_composite_id,
-  } from "$lib/stores"
-  import {
-    load_task_run_configs,
-    run_configs_by_task_composite_id,
-  } from "$lib/stores/run_configs_store"
-  import type { Eval, EvalConfig } from "$lib/types"
-  import {
-    can_submit_run_eval,
-    eval_config_options,
-    load_eval_judges,
-    run_config_options,
-    start_eval_job,
-  } from "./run_eval_job"
-
-  let dialog: Dialog | null = null
-
-  $: project_id = $ui_state.current_project_id
-  $: task_id = $ui_state.current_task_id
-  $: has_task = !!project_id && !!task_id
-
-  let evals: Eval[] | null = null
-  let evals_loading = false
-  let evals_error: KilnError | null = null
-  // Bindable so tests can drive the eval-selection reactive path (FancySelect
-  // can't be opened in jsdom).
-  export let selected_eval_id: string | null = null
-
-  let eval_configs: EvalConfig[] | null = null
-  let eval_configs_loading = false
-  let eval_configs_error: KilnError | null = null
-  let default_eval_config_id: string | null = null
-  let selected_eval_config_id: string | null = null
-
-  let run_configs_loading = false
-  let run_configs_error: KilnError | null = null
-  let default_run_config_id: string | null = null
-  let selected_run_config_id: string | null = null
-
-  let submitting = false
-  let submit_error: KilnError | null = null
-
-  $: current_run_configs = task_id
-    ? $run_configs_by_task_composite_id[
-        get_task_composite_id(project_id ?? "", task_id)
-      ] || null
-    : null
-
-  $: judge_select_options = eval_config_options(
-    eval_configs,
-    default_eval_config_id,
-    $model_info,
-  )
-  $: run_config_select_options = run_config_options(
-    current_run_configs,
-    default_run_config_id,
-    $model_info,
-  )
-
-  $: eval_select_options = evals
-    ? [
-        {
-          label: "Evals",
-          options: evals.map((e) => ({ value: e.id, label: e.name })),
-        },
-      ]
-    : []
-
-  $: submit_disabled =
-    !has_task ||
-    submitting ||
-    !can_submit_run_eval({
-      project_id,
-      task_id,
-      eval_id: selected_eval_id,
-      eval_config_id: selected_eval_config_id,
-      run_config_id: selected_run_config_id,
-    })
-
-  export function show() {
-    submit_error = null
-    dialog?.show()
-    void on_open()
-  }
-
-  async function on_open() {
-    // Reset selections each time the dialog opens.
-    selected_eval_id = null
-    eval_configs = null
-    selected_eval_config_id = null
-    default_eval_config_id = null
-    default_run_config_id = null
-    selected_run_config_id = null
-    eval_configs_error = null
-    run_configs_error = null
-    if (!has_task) {
-      return
-    }
-    void load_model_info()
-    await Promise.all([load_evals(), load_run_configs()])
-  }
-
-  async function load_evals() {
-    if (!project_id || !task_id) {
-      return
-    }
-    evals = null
-    evals_error = null
-    evals_loading = true
-    try {
-      const { data, error } = await client.GET(
-        "/api/projects/{project_id}/tasks/{task_id}/evals",
-        { params: { path: { project_id, task_id } } },
-      )
-      if (error) {
-        throw error
-      }
-      evals = data
-    } catch (e) {
-      evals_error = createKilnError(e)
-    } finally {
-      evals_loading = false
-    }
-  }
-
-  async function load_run_configs() {
-    if (!project_id || !task_id) {
-      return
-    }
-    run_configs_error = null
-    run_configs_loading = true
-    try {
-      await load_task_run_configs(project_id, task_id)
-      const task = await load_task(project_id, task_id)
-      default_run_config_id = task?.default_run_config_id ?? null
-      if (!selected_run_config_id && default_run_config_id) {
-        selected_run_config_id = default_run_config_id
-      }
-    } catch (e) {
-      run_configs_error = createKilnError(e)
-    } finally {
-      run_configs_loading = false
-    }
-  }
-
-  // When an eval is chosen, load it (for its default judge) and its judges.
-  $: void on_eval_selected(selected_eval_id)
-  async function on_eval_selected(eval_id: string | null) {
-    eval_configs = null
-    selected_eval_config_id = null
-    default_eval_config_id = null
-    eval_configs_error = null
-    if (!eval_id || !project_id || !task_id) {
-      return
-    }
-    eval_configs_loading = true
-    try {
-      // Bail out if the user switched evals while the GETs were in flight, so a
-      // stale response can't clobber the newer eval's judge state.
-      const result = await load_eval_judges(
-        client.GET,
-        { project_id, task_id, eval_id },
-        () => selected_eval_id === eval_id,
-      )
-      if (result.stale) {
-        return
-      }
-      eval_configs = result.eval_configs
-      default_eval_config_id = result.default_eval_config_id
-      selected_eval_config_id = result.selected_eval_config_id
-    } catch (e) {
-      if (selected_eval_id !== eval_id) {
-        return
-      }
-      eval_configs_error = createKilnError(e)
-    } finally {
-      if (selected_eval_id === eval_id) {
-        eval_configs_loading = false
-      }
-    }
-  }
-
-  async function submit() {
-    submit_error = null
-    submitting = true
-    try {
-      const started = await start_eval_job(create_job, {
-        project_id,
-        task_id,
-        eval_id: selected_eval_id,
-        eval_config_id: selected_eval_config_id,
-        run_config_id: selected_run_config_id,
-      })
-      if (started) {
-        dialog?.close()
-      }
-    } catch (e) {
-      submit_error = createKilnError(e)
-    } finally {
-      submitting = false
-    }
-  }
-</script>
-
-<Dialog bind:this={dialog} title="Run an Eval">
-  {#if !has_task}
-    <p class="text-sm text-gray-500">
-      Select a task first to run an eval as a background job.
-    </p>
-  {:else}
-    <div class="flex flex-col gap-4">
-      <div>
-        <FormElement
-          id="run_eval_eval_select"
-          label="Eval"
-          description="Choose the eval to run."
-          inputType="fancy_select"
-          bind:value={selected_eval_id}
-          fancy_select_options={eval_select_options}
-          empty_label="Select an eval"
-          empty_state_message="No evals for this task yet"
-          disabled={evals_loading}
-        />
-        {#if evals_loading}
-          <div class="text-xs text-gray-500 mt-1">Loading evals…</div>
-        {:else if evals_error}
-          <div class="text-error text-sm mt-1">
-            {evals_error.getMessage() || "Could not load evals."}
-          </div>
-        {/if}
-      </div>
-
-      {#if selected_eval_id}
-        <div>
-          <FormElement
-            id="run_eval_judge_select"
-            label="Judge"
-            description="Select the judge used to score outputs."
-            inputType="fancy_select"
-            bind:value={selected_eval_config_id}
-            fancy_select_options={judge_select_options}
-            empty_label="Select a judge"
-            empty_state_message="No judges for this eval yet"
-            disabled={eval_configs_loading}
-          />
-          {#if eval_configs_loading}
-            <div class="text-xs text-gray-500 mt-1">Loading judges…</div>
-          {:else if eval_configs_error}
-            <div class="text-error text-sm mt-1">
-              {eval_configs_error.getMessage() || "Could not load judges."}
-            </div>
-          {/if}
-        </div>
-      {/if}
-
-      <div>
-        <FormElement
-          id="run_eval_run_config_select"
-          label="Run Method"
-          description="Select the run configuration to evaluate."
-          inputType="fancy_select"
-          bind:value={selected_run_config_id}
-          fancy_select_options={run_config_select_options}
-          empty_label="Select a run method"
-          empty_state_message="No run methods for this task yet"
-          disabled={run_configs_loading}
-        />
-        {#if run_configs_loading}
-          <div class="text-xs text-gray-500 mt-1">Loading run methods…</div>
-        {:else if run_configs_error}
-          <div class="text-error text-sm mt-1">
-            {run_configs_error.getMessage() || "Could not load run methods."}
-          </div>
-        {/if}
-      </div>
-
-      {#if submit_error}
-        <div role="alert" class="alert alert-error text-sm">
-          <span>{submit_error.getMessage() || "Could not start the eval."}</span
-          >
-        </div>
-      {/if}
-
-      <div class="flex flex-row justify-end mt-2">
-        <button
-          class="btn btn-sm h-10 min-w-24 btn-primary"
-          disabled={submit_disabled}
-          on:click={submit}
-        >
-          {#if submitting}
-            <div class="loading loading-spinner loading-sm"></div>
-          {/if}
-          Run eval
-        </button>
-      </div>
-    </div>
-  {/if}
-</Dialog>
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts b/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts
deleted file mode 100644
index b2de70918..000000000
--- a/app/web_ui/src/routes/(app)/jobs/run_eval_dialog.test.ts
+++ /dev/null
@@ -1,286 +0,0 @@
-// @vitest-environment jsdom
-import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"
-import { render, waitFor, cleanup } from "@testing-library/svelte"
-import { tick } from "svelte"
-import { client } from "$lib/api_client"
-import { ui_state, default_ui_state } from "$lib/stores"
-import { run_configs_by_task_composite_id } from "$lib/stores/run_configs_store"
-import RunEvalDialog from "./run_eval_dialog.svelte"
-
-vi.mock("$lib/api_client", () => ({
-  client: { GET: vi.fn(), POST: vi.fn(), DELETE: vi.fn() },
-  base_url: "http://localhost:8757",
-}))
-
-vi.mock("$lib/stores/jobs_api", () => ({
-  create_job: vi.fn(),
-}))
-
-// FancySelect relies on @floating-ui/dom, which is unavailable in jsdom.
-vi.mock("@floating-ui/dom", () => ({
-  computePosition: vi.fn().mockResolvedValue({ x: 0, y: 0 }),
-  autoUpdate: vi.fn(() => () => {}),
-  offset: vi.fn(),
-}))
-
-// HTMLDialogElement methods are not implemented in jsdom.
-beforeEach(() => {
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  ;(HTMLDialogElement.prototype as any).showModal = vi.fn()
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  ;(HTMLDialogElement.prototype as any).close = vi.fn()
-})
-
-const mockGET = client.GET as unknown as ReturnType<typeof vi.fn>
-
-function set_task() {
-  ui_state.set({
-    ...default_ui_state,
-    current_project_id: "p_1",
-    current_task_id: "t_1",
-  })
-}
-
-function set_no_task() {
-  ui_state.set({ ...default_ui_state })
-}
-
-// Routes each GET to the right fixture based on its URL template.
-function stub_endpoints() {
-  run_configs_by_task_composite_id.set({})
-  mockGET.mockImplementation((url: string) => {
-    if (url.endsWith("/evals")) {
-      return Promise.resolve({
-        data: [{ id: "e_1", name: "Quality Eval" }],
-        error: undefined,
-      })
-    }
-    if (url.endsWith("/eval_configs")) {
-      return Promise.resolve({
-        data: [{ id: "ec_1", name: "Judge One" }],
-        error: undefined,
-      })
-    }
-    if (url.endsWith("/evals/{eval_id}")) {
-      return Promise.resolve({
-        data: { id: "e_1", name: "Quality Eval", current_config_id: "ec_1" },
-        error: undefined,
-      })
-    }
-    if (url.endsWith("/run_configs")) {
-      return Promise.resolve({
-        data: [
-          {
-            id: "rc_1",
-            name: "Default Run",
-            run_config_properties: { type: "mcp" },
-          },
-        ],
-        error: undefined,
-      })
-    }
-    if (url.endsWith("/tasks/{task_id}")) {
-      return Promise.resolve({
-        data: { id: "t_1", default_run_config_id: "rc_1" },
-        error: undefined,
-      })
-    }
-    return Promise.resolve({ data: null, error: undefined })
-  })
-}
-
-function submit_button(): HTMLButtonElement {
-  const btn = Array.from(document.body.querySelectorAll("button")).find((b) =>
-    b.textContent?.includes("Run eval"),
-  )
-  if (!btn) throw new Error("Run eval button not rendered")
-  return btn as HTMLButtonElement
-}
-
-afterEach(() => {
-  cleanup()
-  vi.clearAllMocks()
-  set_no_task()
-})
-
-describe("RunEvalDialog", () => {
-  beforeEach(() => {
-    vi.clearAllMocks()
-  })
-
-  it("shows a 'select a task first' message and no submit when no task is selected", async () => {
-    set_no_task()
-    const { component } = render(RunEvalDialog)
-    component.show()
-    await tick()
-    expect(document.body.textContent).toContain("Select a task first")
-    expect(
-      Array.from(document.body.querySelectorAll("button")).some((b) =>
-        b.textContent?.includes("Run eval"),
-      ),
-    ).toBe(false)
-    // No data should be fetched when there is no task.
-    expect(mockGET).not.toHaveBeenCalled()
-  })
-
-  it("keeps submit disabled until the eval is chosen (judge + run method default automatically)", async () => {
-    set_task()
-    stub_endpoints()
-    const { component } = render(RunEvalDialog)
-    component.show()
-    // The default run method resolves automatically and renders its label.
-    await waitFor(() =>
-      expect(document.body.textContent).toContain("Default Run"),
-    )
-    // The eval picker is still empty, so the job cannot be started yet.
-    expect(submit_button().disabled).toBe(true)
-  })
-
-  // A controllable promise so a test can resolve responses out of order.
-  function deferred<T>() {
-    let resolve!: (value: T) => void
-    const promise = new Promise<T>((r) => {
-      resolve = r
-    })
-    return { promise, resolve }
-  }
-
-  // The judge picker's closed trigger renders the selected config's name (via
-  // formatEvalConfigName, which starts with the config name).
-  function judge_response(eval_id: string | undefined) {
-    return eval_id === "e_a"
-      ? [{ id: "ec_a1", name: "Judge A1" }]
-      : [{ id: "ec_b1", name: "Judge B1" }]
-  }
-
-  // FancySelect cannot be opened in jsdom, so we drive the eval-selection
-  // reactive path via the bindable `selected_eval_id` prop instead.
-  it("resets the judge selection to the new eval's configs when the eval changes", async () => {
-    set_task()
-    run_configs_by_task_composite_id.set({})
-    mockGET.mockImplementation(
-      (url: string, opts?: { params?: { path?: { eval_id?: string } } }) => {
-        const eval_id = opts?.params?.path?.eval_id
-        if (url.endsWith("/evals")) {
-          return Promise.resolve({
-            data: [
-              { id: "e_a", name: "Eval A" },
-              { id: "e_b", name: "Eval B" },
-            ],
-            error: undefined,
-          })
-        }
-        if (url.endsWith("/evals/{eval_id}/eval_configs")) {
-          return Promise.resolve({
-            data: judge_response(eval_id),
-            error: undefined,
-          })
-        }
-        if (url.endsWith("/evals/{eval_id}")) {
-          return Promise.resolve({
-            data: {
-              id: eval_id,
-              current_config_id: eval_id === "e_a" ? "ec_a1" : "ec_b1",
-            },
-            error: undefined,
-          })
-        }
-        if (url.endsWith("/run_configs")) {
-          return Promise.resolve({ data: [], error: undefined })
-        }
-        if (url.endsWith("/tasks/{task_id}")) {
-          return Promise.resolve({ data: { id: "t_1" }, error: undefined })
-        }
-        return Promise.resolve({ data: null, error: undefined })
-      },
-    )
-
-    const { component } = render(RunEvalDialog)
-    component.show()
-    await tick()
-
-    // Select eval A: judge A1 populates and is shown as selected.
-    component.$set({ selected_eval_id: "e_a" })
-    await waitFor(() => expect(document.body.textContent).toContain("Judge A1"))
-    expect(document.body.textContent).not.toContain("Judge B1")
-
-    // Switch to eval B: the judge list/selection resets to B's config.
-    component.$set({ selected_eval_id: "e_b" })
-    await waitFor(() => expect(document.body.textContent).toContain("Judge B1"))
-    expect(document.body.textContent).not.toContain("Judge A1")
-  })
-
-  it("ignores a delayed eval-A response that resolves after switching to eval B (race guard)", async () => {
-    set_task()
-    run_configs_by_task_composite_id.set({})
-
-    // Hold eval A's GETs open so they can resolve AFTER we switch to eval B.
-    const a_eval = deferred<unknown>()
-    const a_configs = deferred<unknown>()
-
-    mockGET.mockImplementation(
-      (url: string, opts?: { params?: { path?: { eval_id?: string } } }) => {
-        const eval_id = opts?.params?.path?.eval_id
-        if (url.endsWith("/evals")) {
-          return Promise.resolve({
-            data: [
-              { id: "e_a", name: "Eval A" },
-              { id: "e_b", name: "Eval B" },
-            ],
-            error: undefined,
-          })
-        }
-        if (url.endsWith("/evals/{eval_id}/eval_configs")) {
-          if (eval_id === "e_a") return a_configs.promise
-          return Promise.resolve({
-            data: judge_response(eval_id),
-            error: undefined,
-          })
-        }
-        if (url.endsWith("/evals/{eval_id}")) {
-          if (eval_id === "e_a") return a_eval.promise
-          return Promise.resolve({
-            data: { id: eval_id, current_config_id: "ec_b1" },
-            error: undefined,
-          })
-        }
-        if (url.endsWith("/run_configs")) {
-          return Promise.resolve({ data: [], error: undefined })
-        }
-        if (url.endsWith("/tasks/{task_id}")) {
-          return Promise.resolve({ data: { id: "t_1" }, error: undefined })
-        }
-        return Promise.resolve({ data: null, error: undefined })
-      },
-    )
-
-    const { component } = render(RunEvalDialog)
-    component.show()
-    await tick()
-
-    // Pick eval A — its GETs are pending. Then quickly switch to eval B, whose
-    // GETs resolve immediately and populate Judge B1.
-    component.$set({ selected_eval_id: "e_a" })
-    await tick()
-    component.$set({ selected_eval_id: "e_b" })
-    await waitFor(() => expect(document.body.textContent).toContain("Judge B1"))
-
-    // Now let eval A's stale responses resolve. They must NOT clobber B's
-    // state. Without the guard, A's late response would overwrite the judge
-    // list back to "Judge A1".
-    a_eval.resolve({
-      data: { id: "e_a", current_config_id: "ec_a1" },
-      error: undefined,
-    })
-    a_configs.resolve({ data: judge_response("e_a"), error: undefined })
-    // Flush A's full promise chain (two awaits + the state assignment) and the
-    // resulting reactive updates so a regression would actually surface.
-    for (let i = 0; i < 5; i++) {
-      await Promise.resolve()
-      await tick()
-    }
-
-    expect(document.body.textContent).toContain("Judge B1")
-    expect(document.body.textContent).not.toContain("Judge A1")
-  })
-})
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts b/app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts
deleted file mode 100644
index 18b930603..000000000
--- a/app/web_ui/src/routes/(app)/jobs/run_eval_job.test.ts
+++ /dev/null
@@ -1,282 +0,0 @@
-import { describe, it, expect, vi } from "vitest"
-import {
-  build_run_eval_params,
-  can_submit_run_eval,
-  eval_config_options,
-  load_eval_judges,
-  run_config_options,
-  start_eval_job,
-  type RunEvalSelection,
-} from "./run_eval_job"
-import type { EvalConfig, TaskRunConfig } from "$lib/types"
-import type { create_job } from "$lib/stores/jobs_api"
-import type { client } from "$lib/api_client"
-
-const complete: RunEvalSelection = {
-  project_id: "p_1",
-  task_id: "t_1",
-  eval_id: "e_1",
-  eval_config_id: "ec_1",
-  run_config_id: "rc_1",
-}
-
-describe("build_run_eval_params", () => {
-  it("returns the create_job payload when all selections are present", () => {
-    expect(build_run_eval_params(complete)).toEqual({
-      project_id: "p_1",
-      task_id: "t_1",
-      eval_id: "e_1",
-      eval_config_id: "ec_1",
-      run_config_id: "rc_1",
-    })
-  })
-
-  it("returns null when the task is not selected", () => {
-    expect(build_run_eval_params({ ...complete, task_id: null })).toBeNull()
-    expect(build_run_eval_params({ ...complete, project_id: null })).toBeNull()
-  })
-
-  it("returns null until every picker has a value", () => {
-    expect(build_run_eval_params({ ...complete, eval_id: null })).toBeNull()
-    expect(
-      build_run_eval_params({ ...complete, eval_config_id: null }),
-    ).toBeNull()
-    expect(
-      build_run_eval_params({ ...complete, run_config_id: null }),
-    ).toBeNull()
-  })
-})
-
-describe("can_submit_run_eval", () => {
-  it("is true only when the selection is complete", () => {
-    expect(can_submit_run_eval(complete)).toBe(true)
-  })
-
-  it("is false when no task is selected", () => {
-    expect(
-      can_submit_run_eval({ ...complete, project_id: null, task_id: null }),
-    ).toBe(false)
-  })
-
-  it("is false until eval, judge, and run config are all chosen", () => {
-    expect(can_submit_run_eval({ ...complete, eval_id: null })).toBe(false)
-    expect(can_submit_run_eval({ ...complete, eval_config_id: null })).toBe(
-      false,
-    )
-    expect(can_submit_run_eval({ ...complete, run_config_id: null })).toBe(
-      false,
-    )
-  })
-})
-
-describe("start_eval_job", () => {
-  it("calls create_job with the eval type, selected params, and project_id", async () => {
-    const create_job_fn = vi.fn().mockResolvedValue({
-      job_id: "j_1",
-      status: "pending",
-    }) as unknown as typeof create_job
-    const started = await start_eval_job(create_job_fn, complete)
-    expect(started).toBe(true)
-    expect(create_job_fn).toHaveBeenCalledTimes(1)
-    expect(create_job_fn).toHaveBeenCalledWith(
-      "eval",
-      {
-        project_id: "p_1",
-        task_id: "t_1",
-        eval_id: "e_1",
-        eval_config_id: "ec_1",
-        run_config_id: "rc_1",
-      },
-      null,
-      "p_1",
-    )
-  })
-
-  it("does not call create_job when the selection is incomplete", async () => {
-    const create_job_fn = vi.fn() as unknown as typeof create_job
-    const started = await start_eval_job(create_job_fn, {
-      ...complete,
-      eval_config_id: null,
-    })
-    expect(started).toBe(false)
-    expect(create_job_fn).not.toHaveBeenCalled()
-  })
-
-  it("does not call create_job when no task is selected", async () => {
-    const create_job_fn = vi.fn() as unknown as typeof create_job
-    const started = await start_eval_job(create_job_fn, {
-      ...complete,
-      project_id: null,
-      task_id: null,
-    })
-    expect(started).toBe(false)
-    expect(create_job_fn).not.toHaveBeenCalled()
-  })
-})
-
-describe("eval_config_options", () => {
-  const configs = [
-    { id: "ec_2", name: "Beta" },
-    { id: "ec_1", name: "Alpha" },
-  ] as unknown as EvalConfig[]
-
-  it("returns an empty list when there are no configs", () => {
-    expect(eval_config_options(null, "ec_1", null)).toEqual([])
-    expect(eval_config_options([], "ec_1", null)).toEqual([])
-  })
-
-  it("places the default judge first and badges it", () => {
-    const groups = eval_config_options(configs, "ec_1", null)
-    expect(groups).toHaveLength(1)
-    const options = groups[0].options
-    expect(options[0].value).toBe("ec_1")
-    expect(options[0].badge).toBe("Default")
-    expect(options[1].value).toBe("ec_2")
-    expect(options[1].badge).toBeUndefined()
-  })
-})
-
-describe("run_config_options", () => {
-  const configs = [
-    {
-      id: "rc_2",
-      name: "Zeta",
-      run_config_properties: { type: "mcp" },
-    },
-    {
-      id: "rc_1",
-      name: "Alpha",
-      run_config_properties: { type: "mcp" },
-    },
-  ] as unknown as TaskRunConfig[]
-
-  it("returns an empty list when there are no configs", () => {
-    expect(run_config_options(null, "rc_1", null)).toEqual([])
-    expect(run_config_options([], "rc_1", null)).toEqual([])
-  })
-
-  it("places the default run config first, badges it, then sorts by name", () => {
-    const groups = run_config_options(configs, "rc_1", null)
-    expect(groups).toHaveLength(1)
-    const options = groups[0].options
-    expect(options[0].value).toBe("rc_1")
-    expect(options[0].badge).toBe("Default")
-    expect(options[1].value).toBe("rc_2")
-  })
-
-  it("sorts by name when there is no default", () => {
-    const groups = run_config_options(configs, null, null)
-    const options = groups[0].options
-    expect(options.map((o) => o.value)).toEqual(["rc_1", "rc_2"])
-  })
-})
-
-describe("load_eval_judges", () => {
-  // A controllable promise so a test can resolve responses out of order.
-  function deferred<T>() {
-    let resolve!: (value: T) => void
-    const promise = new Promise<T>((r) => {
-      resolve = r
-    })
-    return { promise, resolve }
-  }
-
-  const params = { project_id: "p_1", task_id: "t_1", eval_id: "e_1" }
-
-  function stub_get(responses: {
-    evaluator: unknown
-    configs: unknown
-  }): typeof client.GET {
-    return vi.fn((url: string) => {
-      if (url.endsWith("/eval_configs")) {
-        return Promise.resolve(responses.configs)
-      }
-      return Promise.resolve(responses.evaluator)
-    }) as unknown as typeof client.GET
-  }
-
-  it("returns the eval's default judge and selects it", async () => {
-    const get = stub_get({
-      evaluator: {
-        data: { id: "e_1", current_config_id: "ec_2" },
-        error: undefined,
-      },
-      configs: {
-        data: [{ id: "ec_1" }, { id: "ec_2" }],
-        error: undefined,
-      },
-    })
-    const result = await load_eval_judges(get, params, () => true)
-    expect(result.stale).toBe(false)
-    if (result.stale) throw new Error("unexpected stale")
-    expect(result.default_eval_config_id).toBe("ec_2")
-    expect(result.selected_eval_config_id).toBe("ec_2")
-    expect(result.eval_configs.map((c) => c.id)).toEqual(["ec_1", "ec_2"])
-  })
-
-  it("falls back to the first judge when the eval has no default", async () => {
-    const get = stub_get({
-      evaluator: {
-        data: { id: "e_1", current_config_id: null },
-        error: undefined,
-      },
-      configs: {
-        data: [{ id: "ec_1" }, { id: "ec_2" }],
-        error: undefined,
-      },
-    })
-    const result = await load_eval_judges(get, params, () => true)
-    if (result.stale) throw new Error("unexpected stale")
-    expect(result.default_eval_config_id).toBeNull()
-    expect(result.selected_eval_config_id).toBe("ec_1")
-  })
-
-  it("bails out as stale when the eval changes during the first GET", async () => {
-    let is_current = true
-    const evaluator = deferred<unknown>()
-    const get = vi.fn(() => evaluator.promise) as unknown as typeof client.GET
-    const pending = load_eval_judges(get, params, () => is_current)
-    // User switches evals before the first response resolves.
-    is_current = false
-    evaluator.resolve({
-      data: { id: "e_1", current_config_id: "ec_2" },
-      error: undefined,
-    })
-    const result = await pending
-    expect(result.stale).toBe(true)
-    // The second GET must not even be issued once we know we are stale.
-    expect(get).toHaveBeenCalledTimes(1)
-  })
-
-  it("bails out as stale when the eval changes during the configs GET", async () => {
-    let is_current = true
-    const configs = deferred<unknown>()
-    const get = vi.fn((url: string) => {
-      if (url.endsWith("/eval_configs")) {
-        return configs.promise
-      }
-      return Promise.resolve({
-        data: { id: "e_1", current_config_id: "ec_2" },
-        error: undefined,
-      })
-    }) as unknown as typeof client.GET
-    const pending = load_eval_judges(get, params, () => is_current)
-    // Let the first (evaluator) GET resolve, then switch evals.
-    await Promise.resolve()
-    await Promise.resolve()
-    is_current = false
-    configs.resolve({ data: [{ id: "ec_1" }], error: undefined })
-    const result = await pending
-    expect(result.stale).toBe(true)
-  })
-
-  it("throws when an in-flight (still current) request errors", async () => {
-    const get = stub_get({
-      evaluator: { data: undefined, error: { message: "boom" } },
-      configs: { data: [], error: undefined },
-    })
-    await expect(load_eval_judges(get, params, () => true)).rejects.toEqual({
-      message: "boom",
-    })
-  })
-})
diff --git a/app/web_ui/src/routes/(app)/jobs/run_eval_job.ts b/app/web_ui/src/routes/(app)/jobs/run_eval_job.ts
deleted file mode 100644
index c904f90c9..000000000
--- a/app/web_ui/src/routes/(app)/jobs/run_eval_job.ts
+++ /dev/null
@@ -1,171 +0,0 @@
-import type { EvalConfig, TaskRunConfig } from "$lib/types"
-import type { OptionGroup } from "$lib/ui/fancy_select_types"
-import { formatEvalConfigName } from "$lib/utils/formatters"
-import { getRunConfigModelDisplayName } from "$lib/utils/run_config_formatters"
-import type { ProviderModels } from "$lib/types"
-import type { create_job } from "$lib/stores/jobs_api"
-import type { client } from "$lib/api_client"
-
-export type RunEvalSelection = {
-  project_id: string | null
-  task_id: string | null
-  eval_id: string | null
-  eval_config_id: string | null
-  run_config_id: string | null
-}
-
-export type RunEvalJobParams = {
-  project_id: string
-  task_id: string
-  eval_id: string
-  eval_config_id: string
-  run_config_id: string
-}
-
-// All four picks (plus a current task) are required before a job can start.
-export function can_submit_run_eval(selection: RunEvalSelection): boolean {
-  return build_run_eval_params(selection) !== null
-}
-
-// Returns the create_job param payload when the selection is complete, else null.
-export function build_run_eval_params(
-  selection: RunEvalSelection,
-): RunEvalJobParams | null {
-  const { project_id, task_id, eval_id, eval_config_id, run_config_id } =
-    selection
-  if (
-    !project_id ||
-    !task_id ||
-    !eval_id ||
-    !eval_config_id ||
-    !run_config_id
-  ) {
-    return null
-  }
-  return { project_id, task_id, eval_id, eval_config_id, run_config_id }
-}
-
-// Starts the eval background job for a complete selection. Returns true if a
-// job was started; false when the selection is incomplete (nothing to do).
-export async function start_eval_job(
-  create_job_fn: typeof create_job,
-  selection: RunEvalSelection,
-): Promise<boolean> {
-  const params = build_run_eval_params(selection)
-  if (!params) {
-    return false
-  }
-  await create_job_fn("eval", { ...params }, null, params.project_id)
-  return true
-}
-
-// Default judge first (badged), matching the compare_run_configs picker.
-export function eval_config_options(
-  configs: EvalConfig[] | null,
-  default_eval_config_id: string | null | undefined,
-  model_info: ProviderModels | null,
-): OptionGroup[] {
-  if (!configs || configs.length === 0) {
-    return []
-  }
-  const sorted = [...configs].sort((a, b) => {
-    if (a.id === default_eval_config_id) return -1
-    if (b.id === default_eval_config_id) return 1
-    return 0
-  })
-  return [
-    {
-      label: "Judges",
-      options: sorted.map((config) => ({
-        value: config.id,
-        label: formatEvalConfigName(config, model_info),
-        badge: config.id === default_eval_config_id ? "Default" : undefined,
-      })),
-    },
-  ]
-}
-
-// Resolved judge state for an eval, or STALE when the request was superseded.
-export type LoadEvalJudgesResult =
-  | {
-      stale: false
-      eval_configs: EvalConfig[]
-      default_eval_config_id: string | null
-      selected_eval_config_id: string | null
-    }
-  | { stale: true }
-
-const STALE: LoadEvalJudgesResult = { stale: true }
-
-// Loads an eval's default judge and its judge list. `is_current` is checked
-// after every await so a superseded request (the user switched evals while the
-// GETs were in flight) bails out instead of clobbering newer state.
-export async function load_eval_judges(
-  get: typeof client.GET,
-  params: { project_id: string; task_id: string; eval_id: string },
-  is_current: () => boolean,
-): Promise<LoadEvalJudgesResult> {
-  const { project_id, task_id, eval_id } = params
-
-  const evaluator_resp = await get(
-    "/api/projects/{project_id}/tasks/{task_id}/evals/{eval_id}",
-    { params: { path: { project_id, task_id, eval_id } } },
-  )
-  if (!is_current()) {
-    return STALE
-  }
-  if (evaluator_resp.error) {
-    throw evaluator_resp.error
-  }
-  const default_eval_config_id = evaluator_resp.data.current_config_id ?? null
-
-  const configs_resp = await get(
-    "/api/projects/{project_id}/tasks/{task_id}/evals/{eval_id}/eval_configs",
-    { params: { path: { project_id, task_id, eval_id } } },
-  )
-  if (!is_current()) {
-    return STALE
-  }
-  if (configs_resp.error) {
-    throw configs_resp.error
-  }
-  const eval_configs = configs_resp.data
-  const selected_eval_config_id =
-    default_eval_config_id ?? eval_configs[0]?.id ?? null
-
-  return {
-    stale: false,
-    eval_configs,
-    default_eval_config_id,
-    selected_eval_config_id,
-  }
-}
-
-// Default run config first (badged), then alphabetical — mirrors the eval table.
-export function run_config_options(
-  configs: TaskRunConfig[] | null,
-  default_run_config_id: string | null | undefined,
-  model_info: ProviderModels | null,
-): OptionGroup[] {
-  if (!configs || configs.length === 0) {
-    return []
-  }
-  const sorted = [...configs].sort((a, b) => {
-    if (a.id === default_run_config_id) return -1
-    if (b.id === default_run_config_id) return 1
-    return a.name.localeCompare(b.name)
-  })
-  return [
-    {
-      label: "Run Methods",
-      options: sorted.map((config) => {
-        const model_name = getRunConfigModelDisplayName(config, model_info)
-        return {
-          value: config.id,
-          label: model_name ? `${config.name} — ${model_name}` : config.name,
-          badge: config.id === default_run_config_id ? "Default" : undefined,
-        }
-      }),
-    },
-  ]
-}

From 887bcc067e60f10ef1f869f0a0654a1b07dc1198 Mon Sep 17 00:00:00 2001
From: "Leonard Q. Marcq" <marcqleonard@gmail.com>
Date: Wed, 24 Jun 2026 19:52:21 +0800
Subject: [PATCH 26/26] test: update jobs_table tests for 3-dot dropdown
 actions

Row actions moved from inline buttons into a TableActionMenu dropdown, so
the menu items only render once the menu is opened, and the terminal
"Dismiss job" button is now a "Clear" menu item. Open the dropdown before
asserting on the action labels.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01AWp8vw86zgAH8acGJfB24q
---
 .../src/lib/components/jobs_table.test.ts     | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/app/web_ui/src/lib/components/jobs_table.test.ts b/app/web_ui/src/lib/components/jobs_table.test.ts
index 52c76ac06..5cb27901d 100644
--- a/app/web_ui/src/lib/components/jobs_table.test.ts
+++ b/app/web_ui/src/lib/components/jobs_table.test.ts
@@ -84,37 +84,41 @@ describe("JobsTable", () => {
     })
   })
 
-  it("renders a dismiss button (not a Delete label) for terminal rows", () => {
+  it("offers a Clear action (not a Delete label) for terminal rows", async () => {
     jobs.set([makeJob({ id: "succeeded", status: "succeeded" })])
-    const { getByLabelText, queryByText } = render(JobsTable)
-    expect(getByLabelText("Dismiss job")).not.toBeNull()
+    const { getByLabelText, getByText, queryByText } = render(JobsTable)
+    await fireEvent.click(getByLabelText("More options"))
+    expect(getByText("Clear")).not.toBeNull()
     expect(queryByText("Delete")).toBeNull()
   })
 
-  it("gates row actions on status: running with pause shows Pause + Cancel", () => {
+  it("gates row actions on status: running with pause shows Pause + Cancel", async () => {
     jobs.set([
       makeJob({ id: "running", status: "running", supports_pause: true }),
     ])
-    const { getByText, queryByLabelText } = render(JobsTable)
+    const { getByLabelText, getByText, queryByText } = render(JobsTable)
+    await fireEvent.click(getByLabelText("More options"))
     expect(getByText("Pause")).not.toBeNull()
     expect(getByText("Cancel")).not.toBeNull()
-    expect(queryByLabelText("Dismiss job")).toBeNull()
+    expect(queryByText("Clear")).toBeNull()
   })
 
-  it("gates row actions on status: paused shows Resume + Cancel", () => {
+  it("gates row actions on status: paused shows Resume + Cancel", async () => {
     jobs.set([makeJob({ id: "paused", status: "paused" })])
-    const { getByText } = render(JobsTable)
+    const { getByLabelText, getByText } = render(JobsTable)
+    await fireEvent.click(getByLabelText("More options"))
     expect(getByText("Resume")).not.toBeNull()
     expect(getByText("Cancel")).not.toBeNull()
   })
 
-  it("gates row actions on status: pending shows only Cancel", () => {
+  it("gates row actions on status: pending shows only Cancel", async () => {
     jobs.set([makeJob({ id: "pending", status: "pending" })])
-    const { getByText, queryByText, queryByLabelText } = render(JobsTable)
+    const { getByLabelText, getByText, queryByText } = render(JobsTable)
+    await fireEvent.click(getByLabelText("More options"))
     expect(getByText("Cancel")).not.toBeNull()
     expect(queryByText("Pause")).toBeNull()
     expect(queryByText("Resume")).toBeNull()
-    expect(queryByLabelText("Dismiss job")).toBeNull()
+    expect(queryByText("Clear")).toBeNull()
   })
 
   it("shows the loading spinner before the first sync", () => {