diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py index b72c15704..9abe71c72 100644 --- a/app/desktop/desktop_server.py +++ b/app/desktop/desktop_server.py @@ -33,6 +33,8 @@ from app.desktop.studio_server.eval_api import connect_evals_api from app.desktop.studio_server.finetune_api import connect_fine_tune_api from app.desktop.studio_server.import_api import connect_import_api +from app.desktop.studio_server.jobs.api import connect_jobs_api +from app.desktop.studio_server.jobs.registry import job_registry from app.desktop.studio_server.prompt_api import connect_prompt_api from app.desktop.studio_server.prompt_optimization_job_api import ( connect_prompt_optimization_job_api, @@ -111,6 +113,12 @@ async def lifespan(app: FastAPI): await _start_background_syncs() yield finally: + # End open SSE subscriptions so a UI holding the jobs stream open can't + # keep the worker alive (e.g. block a dev-server hot reload). Pure + # observer teardown — jobs keep running. Note uvicorn only reaches + # lifespan shutdown after its graceful-shutdown wait, so the dev server + # also sets timeout_graceful_shutdown to bound that wait. + job_registry.events.shutdown() try: await _stop_background_syncs() finally: @@ -146,6 +154,7 @@ def make_app(tk_root: tk.Tk | None = None): connect_agent_api(app) connect_dev_tools(app) connect_chat_api(app) + connect_jobs_api(app) # Important: webhost must be last, it handles all other URLs connect_webhost(app) return app diff --git a/app/desktop/dev_server.py b/app/desktop/dev_server.py index 2b8804b9c..514ab013a 100644 --- a/app/desktop/dev_server.py +++ b/app/desktop/dev_server.py @@ -36,4 +36,10 @@ reload=True, # Debounce when changing many files (changing branch) reload_delay=0.1, + # Bound the graceful-shutdown wait on reload. 
The UI holds the jobs SSE + # stream open; uvicorn waits for in-flight requests to finish BEFORE it + # runs lifespan shutdown (which closes the stream), so without a bound a + # reload would hang on the open SSE. After this many seconds uvicorn + # cancels the lingering request task instead. + timeout_graceful_shutdown=1, ) diff --git a/app/desktop/git_sync/middleware.py b/app/desktop/git_sync/middleware.py index 36eba0e92..f4b234f06 100644 --- a/app/desktop/git_sync/middleware.py +++ b/app/desktop/git_sync/middleware.py @@ -357,7 +357,11 @@ def _resolve_endpoint(self, request: Request) -> Callable[..., Any] | None: return None def _get_manager_for_request(self, request: Request) -> GitSyncManager | None: - """Extract project_id from URL, resolve to path, return manager if auto-sync enabled.""" + """Extract project_id from URL, resolve to path, return manager if auto-sync enabled. + + Keep the project_id -> manager resolution below in sync with the request-free + copy in save_context.get_manager_for_project (used by background job workers). + """ match = PROJECT_ID_PATTERN.match(request.url.path) if match is None: return None diff --git a/app/desktop/git_sync/save_context.py b/app/desktop/git_sync/save_context.py new file mode 100644 index 000000000..5ce24bedd --- /dev/null +++ b/app/desktop/git_sync/save_context.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from pathlib import Path + +from kiln_ai.utils.git_sync_protocols import SaveContext + +from app.desktop.git_sync.config import get_git_sync_config, project_path_from_id +from app.desktop.git_sync.git_sync_manager import GitSyncManager +from app.desktop.git_sync.registry import GitSyncRegistry + + +def get_manager_for_project(project_id: str) -> GitSyncManager | None: + """Resolve a project_id to its GitSyncManager when auto-sync is active. + + Request-free mirror of GitSyncMiddleware._get_manager_for_request (minus the + URL parsing). 
Returns None for every "not active" branch: the project has no + path, no git-sync config, sync_mode is not "auto", or no clone_path is set. + + Config is keyed by project_path; the manager is keyed by clone_path. The + manager is always obtained via GitSyncRegistry.get_or_create so the single + per-clone-path manager (and its executor + non-reentrant write lock) is + shared with the HTTP path. + """ + project_path = project_path_from_id(project_id) + if project_path is None: + return None + + config = get_git_sync_config(project_path) + if config is None: + return None + + if config["sync_mode"] != "auto": + return None + + clone_path = config.get("clone_path") + if clone_path is None: + return None + + return GitSyncRegistry.get_or_create( + repo_path=Path(clone_path), + remote_name=config["remote_name"], + pat_token=config.get("pat_token"), + oauth_token=config.get("oauth_token"), + auth_mode=config["auth_mode"], + ) + + +def save_context_for_project(project_id: str, context: str) -> SaveContext | None: + """Return a SaveContext wrapping writes in manager.atomic_write(context=...), + or None when git sync is not active for this project. + + Mirrors build_save_context(request) for callers that have only a project_id + (e.g. background job workers). Runners coalesce None to a no-op context. 
+ """ + manager = get_manager_for_project(project_id) + if manager is None: + return None + + bg_sync = GitSyncRegistry.get_background_sync(manager.repo_path) + if bg_sync is not None: + bg_sync.notify_request() + + def factory(): + return manager.atomic_write(context=context) + + return factory diff --git a/app/desktop/git_sync/test_save_context.py b/app/desktop/git_sync/test_save_context.py new file mode 100644 index 000000000..a26d4590a --- /dev/null +++ b/app/desktop/git_sync/test_save_context.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from contextlib import ExitStack, asynccontextmanager +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from app.desktop.git_sync.config import GitSyncProjectConfig +from app.desktop.git_sync.save_context import ( + get_manager_for_project, + save_context_for_project, +) + +PROJECT_ID = "project_abc" +PROJECT_PATH = "/tmp/test/project.kiln" +CLONE_PATH = "/tmp/test/clone" + + +def _auto_config(clone_path: str | None = CLONE_PATH) -> GitSyncProjectConfig: + return GitSyncProjectConfig( + sync_mode="auto", + auth_mode="system_keys", + remote_name="origin", + branch="main", + clone_path=clone_path, + git_url=None, + pat_token=None, + oauth_token=None, + ) + + +def _manual_config() -> GitSyncProjectConfig: + return GitSyncProjectConfig( + sync_mode="manual", + auth_mode="system_keys", + remote_name="origin", + branch="main", + clone_path=CLONE_PATH, + git_url=None, + pat_token=None, + oauth_token=None, + ) + + +class _FakeManager: + """Minimal AtomicWriteCapable stand-in that records atomic_write calls.""" + + def __init__(self, repo_path: Path = Path(CLONE_PATH)): + self.repo_path = repo_path + self.calls: list[str] = [] + self.entered = False + + @asynccontextmanager + async def atomic_write(self, context: str): + self.calls.append(context) + self.entered = True + yield + + +def _patch_resolution(project_path, config, manager=None, bg_sync=None): + """Patch the config + 
registry calls used by the helper. + + project_path_from_id and get_git_sync_config are looked up in the + save_context module namespace, so patch them there. + """ + stack = ExitStack() + stack.enter_context( + patch( + "app.desktop.git_sync.save_context.project_path_from_id", + return_value=project_path, + ) + ) + stack.enter_context( + patch( + "app.desktop.git_sync.save_context.get_git_sync_config", + return_value=config, + ) + ) + stack.enter_context( + patch( + "app.desktop.git_sync.save_context.GitSyncRegistry.get_or_create", + return_value=manager, + ) + ) + stack.enter_context( + patch( + "app.desktop.git_sync.save_context.GitSyncRegistry.get_background_sync", + return_value=bg_sync, + ) + ) + return stack + + +# -- None branches ----------------------------------------------------------- + + +def test_returns_none_when_no_project_path(): + with _patch_resolution(project_path=None, config=None): + assert save_context_for_project(PROJECT_ID, context="ctx") is None + assert get_manager_for_project(PROJECT_ID) is None + + +def test_returns_none_when_no_git_sync_config(): + with _patch_resolution(project_path=PROJECT_PATH, config=None): + assert save_context_for_project(PROJECT_ID, context="ctx") is None + assert get_manager_for_project(PROJECT_ID) is None + + +def test_returns_none_when_sync_mode_not_auto(): + with _patch_resolution(project_path=PROJECT_PATH, config=_manual_config()): + assert save_context_for_project(PROJECT_ID, context="ctx") is None + assert get_manager_for_project(PROJECT_ID) is None + + +def test_returns_none_when_clone_path_missing(): + with _patch_resolution( + project_path=PROJECT_PATH, config=_auto_config(clone_path=None) + ): + assert save_context_for_project(PROJECT_ID, context="ctx") is None + assert get_manager_for_project(PROJECT_ID) is None + + +# -- active branches --------------------------------------------------------- + + +def test_get_manager_uses_registry_with_config_values(): + manager = _FakeManager() + with ( + patch( 
+ "app.desktop.git_sync.save_context.project_path_from_id", + return_value=PROJECT_PATH, + ), + patch( + "app.desktop.git_sync.save_context.get_git_sync_config", + return_value=_auto_config(), + ), + patch( + "app.desktop.git_sync.save_context.GitSyncRegistry.get_or_create", + return_value=manager, + ) as mock_get_or_create, + ): + result = get_manager_for_project(PROJECT_ID) + + assert result is manager + mock_get_or_create.assert_called_once_with( + repo_path=Path(CLONE_PATH), + remote_name="origin", + pat_token=None, + oauth_token=None, + auth_mode="system_keys", + ) + + +async def test_save_context_enters_atomic_write_with_label(): + manager = _FakeManager() + with _patch_resolution( + project_path=PROJECT_PATH, config=_auto_config(), manager=manager + ): + save_context = save_context_for_project(PROJECT_ID, context="eval job e1/r1") + + assert save_context is not None + assert manager.entered is False # built lazily, not yet entered + + async with save_context(): + pass + + assert manager.calls == ["eval job e1/r1"] + + +def test_save_context_notifies_background_sync(): + manager = _FakeManager() + bg_sync = MagicMock() + with _patch_resolution( + project_path=PROJECT_PATH, + config=_auto_config(), + manager=manager, + bg_sync=bg_sync, + ): + save_context = save_context_for_project(PROJECT_ID, context="ctx") + + assert save_context is not None + bg_sync.notify_request.assert_called_once() + + +def test_save_context_no_background_sync_is_fine(): + manager = _FakeManager() + with _patch_resolution( + project_path=PROJECT_PATH, + config=_auto_config(), + manager=manager, + bg_sync=None, + ): + save_context = save_context_for_project(PROJECT_ID, context="ctx") + + assert save_context is not None + + +# -- error propagation ------------------------------------------------------- + + +def test_propagates_when_config_lookup_raises(): + # A corrupt/raising config lookup must surface (failing the job) rather than + # be swallowed to None, which would silently skip 
commits for an auto-sync + # project — the very bug this resolver exists to prevent. + with ( + patch( + "app.desktop.git_sync.save_context.project_path_from_id", + return_value=PROJECT_PATH, + ), + patch( + "app.desktop.git_sync.save_context.get_git_sync_config", + side_effect=RuntimeError("corrupt config"), + ), + ): + with pytest.raises(RuntimeError, match="corrupt config"): + get_manager_for_project(PROJECT_ID) + with pytest.raises(RuntimeError, match="corrupt config"): + save_context_for_project(PROJECT_ID, context="ctx") diff --git a/app/desktop/studio_server/jobs/__init__.py b/app/desktop/studio_server/jobs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/app/desktop/studio_server/jobs/api.py b/app/desktop/studio_server/jobs/api.py new file mode 100644 index 000000000..554470940 --- /dev/null +++ b/app/desktop/studio_server/jobs/api.py @@ -0,0 +1,355 @@ +from __future__ import annotations + +import asyncio +import json +from datetime import datetime +from typing import Annotated, Any, AsyncGenerator + +from fastapi import FastAPI, HTTPException, Path, Query, Response +from kiln_server.cancellable_streaming_response import CancellableStreamingResponse +from kiln_server.utils.agent_checks.policy import ( + ALLOW_AGENT, + agent_policy_require_approval, +) +from pydantic import BaseModel, Field, ValidationError + +from . import error_log +from .events import JobEvent +from .models import BackgroundJobStatus, JobRecord +from .registry import JobNotFoundError, JobOperationError, job_registry +from .workers.noop import NoopJobWorker + +KEEPALIVE_SECONDS = 15.0 + +_JOB_MUTATION_APPROVAL = agent_policy_require_approval( + "Allow agent to control background jobs (pause, resume, cancel, delete)?" +) + + +class CreateJobRequest(BaseModel): + """Request body for creating a job. 
Params are validated per job type.""" + + params: dict[str, Any] = Field( + default_factory=dict, + description="Type-specific job parameters, validated against the type's params model.", + ) + project_id: str | None = Field( + default=None, + description="Project to scope this job to (for filtering/visibility). " + "Falls back to the params' project_id when omitted.", + ) + metadata: dict[str, Any] | None = Field( + default=None, + description="Free-form pass-through attribution, stored verbatim.", + ) + + +class CreateJobResponse(BaseModel): + """Response returned when a job is created.""" + + job_id: str = Field(description="The id of the newly created job.") + status: BackgroundJobStatus = Field( + description="The job's status immediately after creation." + ) + + +def _project_id_from_params(validated_params: BaseModel) -> str | None: + return getattr(validated_params, "project_id", None) + + +def _format_sse(event: JobEvent) -> str: + return ( + f"event: {event.event}\ndata: {json.dumps(event.data, ensure_ascii=False)}\n\n" + ) + + +async def _event_stream( + job_id: str | None, + type_name: str | None, + project_id: str | None, +): + """Pure-observer SSE generator. + + Subscribes to the registry event bus and forwards snapshot/job/deleted + events, injecting a keepalive comment between events. Closing this generator + (client disconnect, via CancellableStreamingResponse) only unsubscribes from + the bus — it never touches any job's supervising task. Jobs keep running. + """ + # subscribe() handles the keepalive itself, yielding a "ping" event after + # `timeout` idle seconds. 
+ subscription: AsyncGenerator[JobEvent, None] = job_registry.events.subscribe( + job_id=job_id, + type_name=type_name, + project_id=project_id, + timeout=KEEPALIVE_SECONDS, + ) + try: + async for event in subscription: + if event.event == "ping": + yield ": ping\n\n" + else: + yield _format_sse(event) + finally: + await subscription.aclose() + + +def connect_jobs_api(app: FastAPI) -> None: + # Register the workers this server exposes. register_type overwrites by + # type_name, so repeated calls (e.g. multiple make_app() in tests) are safe. + job_registry.register_type(NoopJobWorker) + + @app.get( + "/api/jobs/events", + summary="Stream Job Events", + tags=["Jobs"], + openapi_extra=ALLOW_AGENT, + ) + async def stream_job_events( + job_id: Annotated[ + str | None, Query(description="Only stream events for this job id.") + ] = None, + type: Annotated[ + str | None, Query(description="Only stream events for this job type.") + ] = None, + project_id: Annotated[ + str | None, Query(description="Only stream events for this project id.") + ] = None, + ) -> CancellableStreamingResponse: + """Server-sent events for jobs. Emits an initial `snapshot`, then per-job + `job` and `deleted` events. 
A pure observer: disconnecting never stops a job.""" + return CancellableStreamingResponse( + content=_event_stream(job_id, type, project_id), + media_type="text/event-stream", + ) + + @app.get( + "/api/jobs", + summary="List Jobs", + tags=["Jobs"], + openapi_extra=ALLOW_AGENT, + ) + async def list_jobs( + status: Annotated[ + BackgroundJobStatus | None, Query(description="Filter by job status.") + ] = None, + type: Annotated[str | None, Query(description="Filter by job type.")] = None, + project_id: Annotated[ + str | None, Query(description="Filter by project id.") + ] = None, + since: Annotated[ + datetime | None, + Query(description="Only jobs created at or after this ISO-8601 time."), + ] = None, + limit: Annotated[ + int | None, Query(description="Maximum number of jobs to return.") + ] = None, + ) -> list[JobRecord]: + return job_registry.list_jobs( + status=status, + type_name=type, + project_id=project_id, + since=since, + limit=limit, + ) + + @app.post( + "/api/jobs/{type}", + summary="Create Job", + tags=["Jobs"], + status_code=201, + response_model=CreateJobResponse | JobRecord, + openapi_extra=ALLOW_AGENT, + ) + async def create_job( + type: Annotated[str, Path(description="The registered job type to run.")], + request: CreateJobRequest, + wait: Annotated[ + bool, + Query( + description="When true, block until the job reaches a terminal " + "state and return the full JobRecord instead of CreateJobResponse." + ), + ] = False, + timeout: Annotated[ + float | None, + Query( + ge=0, + description="Seconds to wait when wait=true (504 on timeout). 
" + "Omit to wait indefinitely.", + ), + ] = None, + ) -> CreateJobResponse | JobRecord: + try: + worker = job_registry.worker_for(type) + except JobOperationError: + raise HTTPException(status_code=404, detail=f"Unknown job type: {type}") + + try: + validated = worker.params_model.model_validate(request.params) + except ValidationError as exc: + raise HTTPException(status_code=422, detail=exc.errors()) + + job = await job_registry.create( + type_name=type, + params=validated, + project_id=request.project_id or _project_id_from_params(validated), + metadata=request.metadata, + ) + if not wait: + return CreateJobResponse(job_id=job.id, status=job.status) + try: + return await job_registry.wait(job.id, timeout=timeout) + except asyncio.TimeoutError: + raise HTTPException( + status_code=504, detail="Job did not complete within the timeout." + ) + + @app.get( + "/api/jobs/{id}", + summary="Get Job", + tags=["Jobs"], + openapi_extra=ALLOW_AGENT, + ) + async def get_job( + id: Annotated[str, Path(description="The job id.")], + ) -> JobRecord: + job = await job_registry.get(id) + if job is None: + raise HTTPException(status_code=404, detail=f"Job not found: {id}") + return job + + @app.get( + "/api/jobs/{id}/result", + summary="Get Job Result", + tags=["Jobs"], + openapi_extra=ALLOW_AGENT, + ) + async def get_job_result( + id: Annotated[str, Path(description="The job id.")], + ) -> dict[str, Any]: + job = await job_registry.get(id) + if job is None: + raise HTTPException(status_code=404, detail=f"Job not found: {id}") + if not job.status.is_terminal or job.result is None: + raise HTTPException( + status_code=404, detail="No result available for this job." 
+ ) + return job.result + + @app.get( + "/api/jobs/{id}/wait", + summary="Wait For Job", + tags=["Jobs"], + openapi_extra=ALLOW_AGENT, + ) + async def wait_for_job( + id: Annotated[str, Path(description="The job id.")], + timeout: Annotated[ + float | None, + Query( + ge=0, + description="Seconds to wait before giving up (504 on timeout). " + "Omit to wait indefinitely.", + ), + ] = None, + ) -> JobRecord: + """Block until the job reaches a terminal state, then return its record. + + A pure observer, like the SSE stream: if the client disconnects, uvicorn + cancels this handler coroutine, which cancels the wait() await and tears + down only the awaiter — the job's supervising task keeps running.""" + try: + return await job_registry.wait(id, timeout=timeout) + except JobNotFoundError: + raise HTTPException(status_code=404, detail=f"Job not found: {id}") + except asyncio.TimeoutError: + raise HTTPException( + status_code=504, detail="Job did not complete within the timeout." + ) + + @app.get( + "/api/jobs/{id}/errors", + summary="Get Job Errors", + tags=["Jobs"], + openapi_extra=ALLOW_AGENT, + ) + async def get_job_errors( + id: Annotated[str, Path(description="The job id.")], + run_id: Annotated[ + str | None, + Query(description="Read the error log for a specific past run id."), + ] = None, + ) -> list[dict[str, Any]]: + # Always 200, never errors (functional_spec §5). A plain non-reconciling + # lookup of the current run_id — we don't recompute state for a + # best-effort diagnostic read. 
+ resolved_run_id = run_id or job_registry.run_id_for(id) + if resolved_run_id is None: + return [] + return error_log.read_errors(resolved_run_id) + + @app.post( + "/api/jobs/{id}/pause", + summary="Pause Job", + tags=["Jobs"], + status_code=202, + openapi_extra=_JOB_MUTATION_APPROVAL, + ) + async def pause_job( + id: Annotated[str, Path(description="The job id.")], + ) -> Response: + await _run_lifecycle(job_registry.pause, id) + return Response(status_code=202) + + @app.post( + "/api/jobs/{id}/resume", + summary="Resume Job", + tags=["Jobs"], + status_code=202, + openapi_extra=_JOB_MUTATION_APPROVAL, + ) + async def resume_job( + id: Annotated[str, Path(description="The job id.")], + ) -> Response: + await _run_lifecycle(job_registry.resume, id) + return Response(status_code=202) + + @app.post( + "/api/jobs/{id}/cancel", + summary="Cancel Job", + tags=["Jobs"], + status_code=202, + openapi_extra=_JOB_MUTATION_APPROVAL, + ) + async def cancel_job( + id: Annotated[str, Path(description="The job id.")], + ) -> Response: + await _run_lifecycle(job_registry.cancel, id) + return Response(status_code=202) + + @app.delete( + "/api/jobs/{id}", + summary="Delete Job", + tags=["Jobs"], + status_code=204, + openapi_extra=_JOB_MUTATION_APPROVAL, + ) + async def delete_job( + id: Annotated[str, Path(description="The job id.")], + ) -> Response: + await _run_lifecycle(job_registry.delete, id) + return Response(status_code=204) + + +async def _run_lifecycle(operation, job_id: str) -> Any: + """Invoke a registry lifecycle op, mapping its exceptions to HTTP status. + + JobNotFoundError -> 404, JobOperationError (invalid transition / unsupported + pause / delete in-flight) -> 409. 
ERROR_LOG_DIR_NAME = "kiln_jobs"


def error_log_dir() -> Path:
    """Return the directory that holds per-run error logs.

    The directory lives under the system temp dir and is not created here;
    append_error() creates it lazily on first write.
    """
    return Path(tempfile.gettempdir()) / ERROR_LOG_DIR_NAME


def error_log_path(run_id: str) -> Path:
    """Return the error-log file path for ``run_id``.

    The file is ``<error_log_dir()>/<run_id>.json`` (its content is JSON
    Lines, one object per line, despite the ``.json`` suffix).

    ``run_id`` may originate from a request query parameter, so it is treated
    as untrusted: a value containing path separators, ``..`` segments or an
    absolute path would otherwise escape the log directory and let a caller
    read, append to, or delete arbitrary ``*.json`` files.

    Raises:
        ValueError: if the resulting path is not a direct child of
            error_log_dir().
    """
    directory = error_log_dir()
    file_name = f"{run_id}.json"
    candidate = directory / file_name
    # pathlib keeps ".." segments and lets an absolute right-hand side replace
    # the left-hand side, so both checks are needed: the parent must be the log
    # directory itself, and the final component must be exactly what we built.
    if candidate.parent != directory or candidate.name != file_name:
        raise ValueError(f"Invalid run id for error log: {run_id!r}")
    return candidate


def append_error(run_id: str, entry: dict[str, Any]) -> None:
    """Append a single error entry to this run's log (JSON Lines). Best-effort.

    Creates the directory lazily. Any IO/serialization failure is swallowed —
    the error log is a diagnostic convenience, never a guarantee. An unsafe
    ``run_id`` (see error_log_path) is likewise ignored: nothing is written.
    """
    try:
        path = error_log_path(run_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        line = json.dumps(entry, ensure_ascii=False)
        with path.open("a", encoding="utf-8") as f:
            f.write(line + "\n")
    except Exception:
        # Deliberately broad: this is a best-effort diagnostic write and must
        # never fail the job that is reporting the error.
        pass


def read_errors(run_id: str) -> list[dict[str, Any]]:
    """Read the error log for a run as a list of objects. Best-effort.

    A missing or unreadable file returns []. Individual unparsable lines, and
    lines that are valid JSON but not objects, are skipped rather than failing
    the whole read. An unsafe ``run_id`` (see error_log_path) returns [].
    Never raises.
    """
    entries: list[dict[str, Any]] = []
    try:
        with error_log_path(run_id).open("r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    parsed = json.loads(line)
                except (ValueError, TypeError):
                    continue
                if isinstance(parsed, dict):
                    entries.append(parsed)
    except Exception:
        # Missing file, rejected run_id, permission or decode error: return
        # whatever was parsed before the failure (possibly nothing).
        return entries
    return entries


def delete_errors(run_id: str) -> None:
    """Best-effort remove the error log file for a run. Swallows all errors.

    An unsafe ``run_id`` (see error_log_path) is ignored: nothing is deleted.
    """
    try:
        error_log_path(run_id).unlink(missing_ok=True)
    except Exception:
        # Deliberately broad: cleanup must never raise into the caller.
        pass
so a dev-server hot reload isn't blocked by open SSE + connections), distinct from a normal ``JobEvent``.""" + + +_CLOSE = _CloseSentinel() + + +class _Subscriber: + def __init__( + self, + job_id: str | None, + type_name: str | None, + project_id: str | None, + ) -> None: + self.queue: asyncio.Queue[JobEvent | _CloseSentinel] = asyncio.Queue() + self.job_id = job_id + self.type_name = type_name + self.project_id = project_id + + def matches( + self, + record_id: str | None, + record_type: str | None, + record_project_id: str | None, + ) -> bool: + if self.job_id is not None and self.job_id != record_id: + return False + if self.type_name is not None and self.type_name != record_type: + return False + if self.project_id is not None and self.project_id != record_project_id: + return False + return True + + +SnapshotProvider = Callable[[], list[JobRecord]] + + +class JobEventBus: + """In-process async pub/sub bus feeding the SSE endpoint (Phase 2). + + Subscribers receive an initial `snapshot` event, then per-job `job` events + and `deleted` tombstones, filtered by job_id / type / project_id. + """ + + def __init__(self, snapshot_provider: SnapshotProvider | None = None) -> None: + self._subscribers: set[_Subscriber] = set() + self._snapshot_provider = snapshot_provider + self._closed = False + + def set_snapshot_provider(self, provider: SnapshotProvider) -> None: + self._snapshot_provider = provider + + def _filtered_snapshot(self, subscriber: _Subscriber) -> list[JobRecord]: + if self._snapshot_provider is None: + return [] + return [ + record + for record in self._snapshot_provider() + if subscriber.matches(record.id, record.type, record.project_id) + ] + + async def subscribe( + self, + job_id: str | None = None, + type_name: str | None = None, + project_id: str | None = None, + timeout: float | None = None, + ) -> AsyncGenerator[JobEvent, None]: + """Yield the initial snapshot then per-job events. 
+ + When ``timeout`` is set, a ``ping`` event is yielded after that many + seconds without a real event. The timeout MUST live here, inside the + generator: cancelling ``subscribe().__anext__()`` from the outside (e.g. + ``asyncio.wait_for``) throws CancelledError into the suspended generator, + runs its ``finally``, and finalizes it — so the very next ``__anext__`` + would raise StopAsyncIteration and kill the stream after one ping. + + The generator ends (returns) when ``shutdown()`` has been called: either + immediately if the bus is already closed, or as soon as the close + sentinel reaches the head of the queue. + """ + if self._closed: + return + subscriber = _Subscriber(job_id, type_name, project_id) + self._subscribers.add(subscriber) + try: + snapshot = self._filtered_snapshot(subscriber) + yield JobEvent( + event="snapshot", + data={"jobs": [r.model_dump(mode="json") for r in snapshot]}, + ) + while True: + if timeout is None: + item = await subscriber.queue.get() + else: + try: + item = await asyncio.wait_for( + subscriber.queue.get(), timeout=timeout + ) + except asyncio.TimeoutError: + yield JobEvent(event="ping", data={}) + continue + if isinstance(item, _CloseSentinel): + return + yield item + finally: + self._subscribers.discard(subscriber) + + def shutdown(self) -> None: + """End every open subscription and reject new ones. + + Pushes a close sentinel onto each subscriber's queue so its + ``subscribe()`` generator returns promptly. Used on server shutdown so a + long-lived SSE connection (the jobs stream the UI holds open) doesn't + keep the worker alive — e.g. blocking a dev-server hot reload. A pure + observer teardown: it never touches any job's supervising task. 
class BackgroundJobStatus(str, Enum):
    """Lifecycle states of a background job.

    Mixes in ``str``, so every member is also a string equal to its value.
    """

    # States in which the job has not finished.
    PENDING = "pending"
    RUNNING = "running"
    PAUSED = "paused"
    # Finished states; exactly these three are collected in TERMINAL_STATUSES.
    SUCCEEDED = "succeeded"
    FAILED = "failed"
    CANCELLED = "cancelled"

    @property
    def is_terminal(self) -> bool:
        """Return True when this status is a member of TERMINAL_STATUSES."""
        # TERMINAL_STATUSES is bound after the class body; the name is only
        # resolved when the property is read, so the forward reference is safe.
        finished_states = TERMINAL_STATUSES
        return self in finished_states


# The statuses from which a job never transitions again.
TERMINAL_STATUSES = frozenset(
    (
        BackgroundJobStatus.SUCCEEDED,
        BackgroundJobStatus.FAILED,
        BackgroundJobStatus.CANCELLED,
    )
)
+ """ + + total: int | None = None + success: int = 0 + error: int = 0 + message: str | None = None + updated_at: datetime = Field(default_factory=_utc_now) + + +class JobDerivedState(BaseModel): + """A worker's view of the operation's true state, read from source-of-truth entities.""" + + total: int | None = None + success: int = 0 + error: int = 0 + is_complete: bool = False + message: str | None = None + + +class JobError(BaseModel): + """Small failure summary stamped on the record. Detail lives in the error log.""" + + error: str | None = None + detail: dict[str, Any] | None = None + + +class JobRecord(BaseModel): + """Ephemeral, in-memory bookkeeping for a single job. Never persisted to disk.""" + + id: str + type: str + status: BackgroundJobStatus + run_id: str | None = None + progress: JobProgress = Field(default_factory=JobProgress) + # Typed, per-worker progress detail (validated against the worker's + # `progress_model`). The generic `progress` above is the universal counter; + # this carries the rich per-kind shape a worker needs the UI to render + # (e.g. RAG's four-phase breakdown). Kept as a dict on the wire so the core + # stays worker-agnostic; the frontend casts it to the worker's model. 
class JobContext:
    """Handle the registry gives a worker for the duration of one run().

    Carries the ids of the job and of the current run, and wraps the callbacks
    the registry injected: two progress sinks (in-memory snapshot + event) and
    one per-item error sink (error log).
    """

    def __init__(
        self,
        job_id: str,
        run_id: str,
        report_progress: ReportProgress,
        report_progress_detail: ReportProgressDetail,
        report_error: ReportError,
    ) -> None:
        self.job_id, self.run_id = job_id, run_id
        # Registry-injected sinks; workers reach them only through the
        # report_* coroutines below.
        self._report_error = report_error
        self._report_progress_detail = report_progress_detail
        self._report_progress = report_progress

    async def report_progress(
        self,
        success: int,
        error: int = 0,
        total: int | None = None,
        message: str | None = None,
    ) -> None:
        """Push new generic counters to the registry's progress snapshot.

        The values are bundled into a JobProgressUpdate and handed to the
        injected callback. This is a UI-smoothing signal only — authoritative
        progress comes from compute_state(). Cheap to call often.
        """
        update = JobProgressUpdate(
            total=total,
            message=message,
            success=success,
            error=error,
        )
        await self._report_progress(update)

    async def report_progress_detail(self, detail: BaseModel) -> None:
        """Stamp the job's typed `progress_detail` with a worker-specific model.

        Meant for rich per-kind progress that the generic counter cannot carry
        (e.g. RAG's per-phase breakdown). `detail` must be an instance of the
        worker's declared `progress_model`; the registry validates and
        serializes it. Like report_progress, this is a UI-smoothing signal
        only and is cheap to call often.
        """
        sink = self._report_progress_detail
        await sink(detail)

    async def report_error(self, error_message: str, **extra: Any) -> None:
        """Append one structured entry to this run's error log.

        Intended for non-fatal per-item errors that do not stop the run. Any
        keyword arguments are forwarded as the entry's extra fields.
        Best-effort: a failed write is swallowed, never propagated. The
        progress error count is not touched — report that via report_progress.
        """
        sink = self._report_error
        await sink(error_message, extra)
class JobNotFoundError(Exception):
    """Raised when a job id is not present in the registry's in-memory index."""
def _resolve_max_concurrent(explicit: int | None) -> int:
    """Resolve the registry's concurrent-job limit.

    Precedence:
      1. `explicit`, when not None — returned unchanged (not validated here).
      2. A positive integer in the environment variable named by
         MAX_CONCURRENT_ENV_VAR.
      3. DEFAULT_MAX_CONCURRENT.

    An environment value that is set but unusable (not an integer, or zero /
    negative) still falls back to the default, but is logged so a typo in the
    override does not go unnoticed.
    """
    if explicit is not None:
        return explicit

    raw = os.environ.get(MAX_CONCURRENT_ENV_VAR)
    if not raw:
        # Unset or empty: nothing to report, just use the default.
        return DEFAULT_MAX_CONCURRENT

    try:
        value = int(raw)
    except ValueError:
        value = None
    if value is not None and value > 0:
        return value

    logger.warning(
        "Ignoring invalid %s=%r (expected a positive integer); using default %d",
        MAX_CONCURRENT_ENV_VAR,
        raw,
        DEFAULT_MAX_CONCURRENT,
    )
    return DEFAULT_MAX_CONCURRENT
def worker_for(self, type_name: str) -> JobWorker:
    """Look up the worker instance registered under `type_name`.

    Raises:
        JobOperationError: no worker has been registered for that type.
    """
    registered = self._workers.get(type_name)
    if registered is not None:
        return registered
    raise JobOperationError(f"Unknown job type: {type_name}")
def list_jobs(
    self,
    status: BackgroundJobStatus | None = None,
    type_name: str | None = None,
    project_id: str | None = None,
    since: datetime | None = None,
    limit: int | None = None,
) -> list[JobRecord]:
    """Return the job records matching every supplied filter, newest first.

    Args:
        status: keep only jobs in this status.
        type_name: keep only jobs of this worker type.
        project_id: keep only jobs scoped to this project.
        since: keep only jobs created at or after this instant.
        limit: maximum number of records to return. None means no cap; zero
            or a negative value yields an empty list.

    A filter left as None is not applied. This is a plain read of the
    in-memory index; no reconciliation is performed.
    """

    def matches(record: JobRecord) -> bool:
        return (
            (status is None or record.status == status)
            and (type_name is None or record.type == type_name)
            and (project_id is None or record.project_id == project_id)
            and (since is None or record.created_at >= since)
        )

    # sorted() is stable, so records with equal timestamps keep index order.
    records = sorted(
        (record for record in self._jobs.values() if matches(record)),
        key=lambda record: record.created_at,
        reverse=True,
    )
    if limit is not None:
        # Clamp: a negative limit would otherwise slice from the end
        # (records[:-n]) and silently return a wrong subset.
        records = records[: max(limit, 0)]
    return records
async def _supervise(self, job_id: str, worker: JobWorker, run_id: str) -> None:
    """Run one job to a terminal (or paused) state and release its slot.

    This is the body of the registry-owned task created by _launch(). It
    reconciles the job against the worker's compute_state(), skips run() if
    that already shows the work complete, otherwise awaits worker.run() and
    records the outcome on the JobRecord.

    Everything after the task starts is covered by the try/finally, including
    re-validating the stored params and building the JobContext. A failure in
    either is therefore recorded as a failed job and the concurrency slot is
    released, instead of leaving the job stuck in `running` with a leaked slot.
    """
    try:
        job = self._jobs.get(job_id)
        if job is None:
            return
        try:
            params = worker.params_model.model_validate(job.params)
            ctx = self._build_context(job_id, run_id, worker)
            await self._reconcile(job, emit_on_change=True)
            if job.status == BackgroundJobStatus.SUCCEEDED:
                # Source of truth says the work is already done; nothing to run.
                return
            result = await worker.run(params, ctx)
            # The cancellation transition is unconditional (functional_spec
            # §2): a worker that catches CancelledError for cleanup and then
            # returns normally — even one that calls task.uncancel() so it is
            # never re-raised — must still land in paused/cancelled, not
            # succeeded. The registry enforces this off its own delivery
            # record rather than trusting the worker to re-raise. A worker
            # that finished naturally before any cancel landed has no
            # delivery recorded, so its result stands.
            if job_id in self._cancel_delivered:
                self._finish_cancelled_or_paused(job)
            else:
                self._finish_succeeded(job, result)
        except asyncio.CancelledError:
            self._finish_cancelled_or_paused(job)
            raise
        except Exception as exc:
            self._finish_failed(job, run_id, exc)
    finally:
        # Idempotent: only whoever pops the task entry decrements the count.
        self._release_slot(job_id)
def _finish_failed(self, job: JobRecord, run_id: str, exc: Exception) -> None:
    """Mark `job` failed because of `exc` and log the fatal error for the run.

    The record gets a short summary (the exception text, or its class name
    when the text is empty); the full traceback goes to the run's error log
    as a `fatal` entry. The updated record is emitted last.
    """
    summary = str(exc) or exc.__class__.__name__
    formatted_trace = "".join(
        traceback.format_exception(type(exc), exc, exc.__traceback__)
    )

    job.status = BackgroundJobStatus.FAILED
    job.error = JobError(error=summary)
    job.ended_at = _utc_now()
    self._touch(job)

    fatal_entry = {
        "error_message": summary,
        "traceback": formatted_trace,
        "fatal": True,
    }
    error_log.append_error(run_id, fatal_entry)
    self._emit(job)
async def resume(self, job_id: str) -> JobRecord:
    """Re-queue a paused job, or finish it if its work is already complete.

    The worker's compute_state() is consulted first. If it reports the
    operation complete, the job goes straight to succeeded without another
    run(); otherwise the job returns to pending and is dispatched as soon as
    a concurrency slot is free.

    Raises:
        JobNotFoundError: the job id is unknown.
        JobOperationError: the job is not paused — either on entry, or because
            another lifecycle call changed it while compute_state() was
            awaited.
    """
    job = self._require(job_id)
    if job.status != BackgroundJobStatus.PAUSED:
        raise JobOperationError(
            f"Cannot resume a job in status '{job.status.value}'"
        )
    worker = self.worker_for(job.type)
    params = worker.params_model.model_validate(job.params)
    derived = await worker.compute_state(params)
    # The await above yields to the event loop, so a concurrent cancel() or a
    # second resume() may have moved the job on. Continuing regardless would
    # resurrect a cancelled job or launch a second task for a running one.
    if job.status != BackgroundJobStatus.PAUSED:
        raise JobOperationError(
            f"Cannot resume a job in status '{job.status.value}'"
        )
    self._apply_derived(job, derived)
    if derived is not None and derived.is_complete:
        job.status = BackgroundJobStatus.SUCCEEDED
        job.ended_at = _utc_now()
        self._touch(job)
        self._emit(job)
        return job
    job.status = BackgroundJobStatus.PENDING
    # A pause may have stamped ended_at; a re-queued job has not ended, and a
    # stale value would predate the next run's started_at.
    job.ended_at = None
    self._touch(job)
    self._pending_ids.append(job_id)
    self._emit(job)
    self._dispatch_pending()
    return job
async def delete(self, job_id: str) -> None:
    """Forget a finished job and everything the registry holds for it.

    Drops the record from the index, any leftover queue entry, the shared
    completion event, and the last run's error log, then publishes a deleted
    event carrying the job's id, type and project.

    Raises:
        JobNotFoundError: the job id is unknown.
        JobOperationError: the job has not reached a terminal status.
    """
    record = self._require(job_id)
    current = record.status
    if not current.is_terminal:
        raise JobOperationError(f"Cannot delete a job in status '{current.value}'")

    self._jobs.pop(job_id, None)
    self._remove_pending(job_id)
    self._completion_events.pop(job_id, None)

    last_run_id = record.run_id
    if last_run_id is not None:
        error_log.delete_errors(last_run_id)

    self.events.publish_deleted(job_id, record.type, record.project_id)
def _release_slot(self, job_id: str) -> None:
    """Give back the concurrency slot held by `job_id`, at most once.

    Ownership of the release is decided by who removes the job's entry from
    `_tasks`: if the entry is already gone this call does nothing. Otherwise
    the job's cancel/pause bookkeeping is cleared, the running count drops by
    one, and the pending queue gets a chance to fill the freed slot.
    """
    owned_task = self._tasks.pop(job_id, None)
    if owned_task is not None:
        for marker_set in (
            self._cancel_intent,
            self._pause_intent,
            self._cancel_delivered,
        ):
            marker_set.discard(job_id)
        self._running_count -= 1
        self._dispatch_pending()
async def wait(self, job_id: str, timeout: float | None = None) -> JobRecord:
    """Block until the job is terminal, then hand back its record.

    This only observes: cancelling the await (caller goes away, client
    disconnects) tears down the awaiter alone, while the supervising task —
    owned by the registry — keeps running. Every awaiter of a job shares one
    Event, so any number may wait at once. With timeout=None the wait is
    unbounded; otherwise asyncio.wait_for raises asyncio.TimeoutError on
    expiry and that propagates to the caller.

    Raises:
        JobNotFoundError: the job id is unknown.
    """
    record = self._require(job_id)
    # Register the event before looking at the status. No await sits between
    # the two steps, so _emit (which only sets an event that already exists)
    # cannot slip a terminal transition in between.
    finished = self._completion_events.get(job_id)
    if finished is None:
        finished = self._completion_events[job_id] = asyncio.Event()
    if not record.status.is_terminal:
        await asyncio.wait_for(finished.wait(), timeout)
    return record
@pytest.fixture
def registry(monkeypatch):
    """Swap an isolated JobRegistry into the API module and register the test workers."""
    fresh = JobRegistry(max_concurrent=10)
    monkeypatch.setattr(jobs_api, "job_registry", fresh)
    # Same registration order as listing the calls one by one.
    for worker_cls in (
        NoopJobWorker,
        ProjectScopedWorker,
        ReconcileCompleteWorker,
        NonPausableWorker,
    ):
        fresh.register_type(worker_cls)
    return fresh
async def _wait_for_status(
    registry: JobRegistry,
    job_id: str,
    target: BackgroundJobStatus | set[BackgroundJobStatus],
    timeout: float = 3.0,
) -> None:
    """Poll the registry until `job_id` reaches a wanted status, or fail.

    Args:
        registry: registry whose in-memory index is polled directly.
        job_id: id of the job to watch.
        target: a single status, or a set of acceptable statuses.
        timeout: seconds to keep polling before giving up.

    Raises:
        AssertionError: the job was not in a wanted status by the deadline;
            the message reports the status it had, or "missing".
    """
    targets = target if isinstance(target, (set, frozenset)) else {target}
    # Inside a coroutine the running loop is the right clock source.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout
    while loop.time() < deadline:
        job = registry._jobs.get(job_id)
        if job is not None and job.status in targets:
            return
        await asyncio.sleep(0.01)
    # One last look: the status may have flipped during the final sleep.
    job = registry._jobs.get(job_id)
    if job is not None and job.status in targets:
        return
    actual = job.status if job else "missing"
    raise AssertionError(f"Job {job_id} did not reach {targets}; was {actual}")
def test_create_stores_metadata_and_project_id(client, registry): + resp = await client.post( + "/api/jobs/project_scoped", + json={"params": {"project_id": "p_abc"}, "metadata": {"source": "test"}}, + ) + assert resp.status_code == 201 + job_id = resp.json()["job_id"] + record = registry._jobs[job_id] + assert record.project_id == "p_abc" + assert record.metadata == {"source": "test"} + await registry.cancel(job_id) + + +@pytest.mark.asyncio +async def test_create_noop_has_null_project_id(client, registry): + job_id = await _create_noop(client) + assert registry._jobs[job_id].project_id is None + await registry.cancel(job_id) + + +@pytest.mark.asyncio +async def test_create_explicit_project_id_scopes_typeless_job(client, registry): + # A job whose params carry no project_id (noop) still gets scoped when the + # request body sets project_id explicitly — this is what the project-filtered + # jobs panel / SSE stream rely on to show such jobs. + resp = await client.post( + "/api/jobs/noop", + json={ + "params": {"steps": 50, "sleep_per_step_seconds": 0.05}, + "project_id": "p_explicit", + }, + ) + assert resp.status_code == 201 + job_id = resp.json()["job_id"] + assert registry._jobs[job_id].project_id == "p_explicit" + rows = (await client.get("/api/jobs", params={"project_id": "p_explicit"})).json() + assert any(r["id"] == job_id for r in rows) + await registry.cancel(job_id) + + +# -- list -------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_list_empty(client): + resp = await client.get("/api/jobs") + assert resp.status_code == 200 + assert resp.json() == [] + + +@pytest.mark.asyncio +async def test_list_returns_jobs_sorted_desc(client, registry): + first = await _create_noop(client) + second = await _create_noop(client) + resp = await client.get("/api/jobs") + assert resp.status_code == 200 + ids = [r["id"] for r in resp.json()] + assert ids[0] == second + assert ids[1] == first + await 
registry.cancel(first) + await registry.cancel(second) + + +@pytest.mark.asyncio +async def test_list_filter_by_type(client, registry): + await _create_noop(client) + await client.post("/api/jobs/project_scoped", json={"params": {"project_id": "p1"}}) + resp = await client.get("/api/jobs", params={"type": "project_scoped"}) + assert resp.status_code == 200 + rows = resp.json() + assert len(rows) == 1 + assert rows[0]["type"] == "project_scoped" + + +@pytest.mark.asyncio +async def test_list_filter_by_status(client, registry): + job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01) + await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED) + resp = await client.get("/api/jobs", params={"status": "succeeded"}) + assert [r["id"] for r in resp.json()] == [job_id] + resp = await client.get("/api/jobs", params={"status": "running"}) + assert resp.json() == [] + + +@pytest.mark.asyncio +async def test_list_filter_by_project_id(client): + await client.post( + "/api/jobs/project_scoped", json={"params": {"project_id": "p_one"}} + ) + await client.post( + "/api/jobs/project_scoped", json={"params": {"project_id": "p_two"}} + ) + resp = await client.get("/api/jobs", params={"project_id": "p_one"}) + rows = resp.json() + assert len(rows) == 1 + assert rows[0]["project_id"] == "p_one" + + +@pytest.mark.asyncio +async def test_list_limit(client): + for _ in range(3): + await _create_noop(client) + resp = await client.get("/api/jobs", params={"limit": 2}) + assert len(resp.json()) == 2 + + +@pytest.mark.asyncio +async def test_list_since_excludes_older(client, registry): + old_id = await _create_noop(client) + newer_id = await _create_noop(client) + cutoff = registry._jobs[newer_id].created_at.isoformat() + resp = await client.get("/api/jobs", params={"since": cutoff}) + ids = [r["id"] for r in resp.json()] + assert newer_id in ids + assert old_id not in ids + + +# -- get --------------------------------------------------------------------- + + 
@pytest.mark.asyncio
async def test_get_returns_record(client, registry):
    """GET on an in-flight noop job returns its record with progress."""
    created_id = await _create_noop(client)
    response = await client.get(f"/api/jobs/{created_id}")
    assert response.status_code == 200
    record = response.json()
    assert record["id"] == created_id
    assert record["type"] == "noop"
    assert "progress" in record
    await registry.cancel(created_id)


@pytest.mark.asyncio
async def test_get_unknown_404(client):
    """GET on an id that was never created is a 404."""
    response = await client.get("/api/jobs/j_missing")
    assert response.status_code == 404


@pytest.mark.asyncio
async def test_get_reconciles_to_succeeded(client, registry):
    """Flipping the worker's `done` flag mid-run makes the next GET report
    the job as succeeded with the completed progress count."""
    worker_cls = ReconcileCompleteWorker
    worker_cls.done = False
    created = await client.post("/api/jobs/reconcile_complete", json={"params": {}})
    reconcile_id = created.json()["job_id"]
    await _wait_for_status(registry, reconcile_id, BackgroundJobStatus.RUNNING)
    worker_cls.done = True
    fetched = await client.get(f"/api/jobs/{reconcile_id}")
    assert fetched.status_code == 200
    payload = fetched.json()
    assert payload["status"] == "succeeded"
    assert payload["progress"]["success"] == 3


# -- result ------------------------------------------------------------------


@pytest.mark.asyncio
async def test_result_200_when_terminal(client, registry):
    """A succeeded noop job exposes its result payload."""
    finished_id = await _create_noop(client, steps=3, sleep_per_step_seconds=0.01)
    await _wait_for_status(registry, finished_id, BackgroundJobStatus.SUCCEEDED)
    response = await client.get(f"/api/jobs/{finished_id}/result")
    assert response.status_code == 200
    assert response.json() == {"completed_steps": 3}


@pytest.mark.asyncio
async def test_result_404_when_not_terminal(client, registry):
    """The result endpoint is a 404 while the job is still running."""
    running_id = await _create_noop(client)
    await _wait_for_status(registry, running_id, BackgroundJobStatus.RUNNING)
    response = await client.get(f"/api/jobs/{running_id}/result")
    assert response.status_code == 404
    await registry.cancel(running_id)


@pytest.mark.asyncio
async def test_result_404_unknown(client):
    """The result endpoint is a 404 for an id that was never created."""
    response = await client.get("/api/jobs/j_missing/result")
    assert response.status_code == 404


# -- errors
# ------------------------------------------------------------------


@pytest.mark.asyncio
async def test_errors_returns_array(client, registry):
    """Per-step errors recorded during a run are served by the errors endpoint."""
    resp = await client.post(
        "/api/jobs/noop",
        json={
            "params": {
                "steps": 4,
                "sleep_per_step_seconds": 0.01,
                "error_at_steps": [1, 3],
            }
        },
    )
    job_id = resp.json()["job_id"]
    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
    resp = await client.get(f"/api/jobs/{job_id}/errors")
    assert resp.status_code == 200
    messages = [e["error_message"] for e in resp.json()]
    assert "intentional error at step 1" in messages
    assert "intentional error at step 3" in messages


@pytest.mark.asyncio
async def test_errors_empty_when_none(client, registry):
    """A clean run yields an empty errors array."""
    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
    resp = await client.get(f"/api/jobs/{job_id}/errors")
    assert resp.status_code == 200
    assert resp.json() == []


@pytest.mark.asyncio
async def test_errors_unknown_job_returns_empty_200(client):
    """Unlike other endpoints, errors for an unknown job is 200 with []."""
    resp = await client.get("/api/jobs/j_missing/errors")
    assert resp.status_code == 200
    assert resp.json() == []


@pytest.mark.asyncio
async def test_errors_specific_run_id(client):
    """An explicit `run_id` query reads that run's log even if the job id is unknown."""
    run_id = str(uuid.uuid4())
    error_log.append_error(run_id, {"error_message": "from a past run"})
    resp = await client.get("/api/jobs/j_missing/errors", params={"run_id": run_id})
    assert resp.status_code == 200
    assert resp.json() == [{"error_message": "from a past run"}]


# -- pause / resume / cancel -------------------------------------------------


@pytest.mark.asyncio
async def test_pause_then_resume(client, registry):
    """Pause returns 202 and lands in PAUSED; resume returns 202 and re-queues."""
    job_id = await _create_noop(client, steps=50, sleep_per_step_seconds=0.03)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)

    resp = await client.post(f"/api/jobs/{job_id}/pause")
    assert resp.status_code == 202
    assert registry._jobs[job_id].status == BackgroundJobStatus.PAUSED

    resp = await client.post(f"/api/jobs/{job_id}/resume")
    assert resp.status_code == 202
    assert registry._jobs[job_id].status in (
        BackgroundJobStatus.PENDING,
        BackgroundJobStatus.RUNNING,
    )

    await registry.cancel(job_id)


@pytest.mark.asyncio
async def test_pause_409_when_not_running(client, registry):
    """Pausing an already-succeeded job is a 409."""
    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
    resp = await client.post(f"/api/jobs/{job_id}/pause")
    assert resp.status_code == 409


@pytest.mark.asyncio
async def test_pause_409_when_unsupported(client, registry):
    """Pausing a job whose worker has `supports_pause = False` is a 409."""
    resp = await client.post("/api/jobs/nonpausable", json={"params": {}})
    job_id = resp.json()["job_id"]
    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
    resp = await client.post(f"/api/jobs/{job_id}/pause")
    assert resp.status_code == 409
    await registry.cancel(job_id)


@pytest.mark.asyncio
async def test_pause_unknown_404(client):
    """Pausing an unknown job id is a 404."""
    resp = await client.post("/api/jobs/j_missing/pause")
    assert resp.status_code == 404


@pytest.mark.asyncio
async def test_resume_409_when_not_paused(client, registry):
    """Resuming a job that is running (not paused) is a 409."""
    job_id = await _create_noop(client)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
    resp = await client.post(f"/api/jobs/{job_id}/resume")
    assert resp.status_code == 409
    await registry.cancel(job_id)


@pytest.mark.asyncio
async def test_cancel_202(client, registry):
    """Cancelling a running job returns 202 and the record is CANCELLED."""
    job_id = await _create_noop(client)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
    resp = await client.post(f"/api/jobs/{job_id}/cancel")
    assert resp.status_code == 202
    assert registry._jobs[job_id].status == BackgroundJobStatus.CANCELLED


@pytest.mark.asyncio
async def test_cancel_409_when_terminal(client, registry):
    """Cancelling an already-succeeded job is a 409."""
    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
    resp = await client.post(f"/api/jobs/{job_id}/cancel")
    assert resp.status_code == 409


@pytest.mark.asyncio
async def test_cancel_unknown_404(client):
    """Cancelling an unknown job id is a 404."""
    resp = await client.post("/api/jobs/j_missing/cancel")
    assert resp.status_code == 404


# -- delete ------------------------------------------------------------------


@pytest.mark.asyncio
async def test_delete_204_when_terminal(client, registry):
    """Deleting a terminal job returns 204 and removes it from registry and list."""
    job_id = await _create_noop(client, steps=2, sleep_per_step_seconds=0.01)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.SUCCEEDED)
    resp = await client.delete(f"/api/jobs/{job_id}")
    assert resp.status_code == 204
    assert job_id not in registry._jobs
    assert (await client.get("/api/jobs")).json() == []


@pytest.mark.asyncio
async def test_delete_409_when_in_flight(client, registry):
    """Deleting a running job is a 409."""
    job_id = await _create_noop(client)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
    resp = await client.delete(f"/api/jobs/{job_id}")
    assert resp.status_code == 409
    await registry.cancel(job_id)


@pytest.mark.asyncio
async def test_delete_unknown_404(client):
    """Deleting an unknown job id is a 404."""
    resp = await client.delete("/api/jobs/j_missing")
    assert resp.status_code == 404


# -- wait --------------------------------------------------------------------


@pytest.mark.asyncio
async def test_wait_endpoint_200_terminal_record(client):
    """The wait endpoint returns the terminal record including its result."""
    resp = await client.post(
        "/api/jobs/noop", json={"params": {"steps": 3, "sleep_per_step_seconds": 0.02}}
    )
    job_id = resp.json()["job_id"]
    got = await client.get(f"/api/jobs/{job_id}/wait", timeout=10.0)
    assert got.status_code == 200, got.text
    body = got.json()
    assert body["id"] == job_id
    assert body["status"] == "succeeded"
    assert body["result"] == {"completed_steps": 3}


@pytest.mark.asyncio
async def test_wait_endpoint_404_unknown(client):
    """Waiting on an unknown job id is a 404."""
    resp = await client.get("/api/jobs/j_missing/wait")
    assert resp.status_code == 404


@pytest.mark.asyncio
async def test_wait_endpoint_504_on_timeout(client, registry):
    """A wait whose `timeout` elapses before the job finishes is a 504."""
    job_id = await _create_noop(client, steps=50, sleep_per_step_seconds=0.05)
    await _wait_for_status(registry, job_id, BackgroundJobStatus.RUNNING)
    resp = await client.get(f"/api/jobs/{job_id}/wait", params={"timeout": 0.01})
    assert resp.status_code == 504
    await registry.cancel(job_id)


@pytest.mark.asyncio
async def test_create_wait_true_returns_terminal_record(client):
    """`wait=true` on create returns 201 with the terminal record, not the stub."""
    resp = await client.post(
        "/api/jobs/noop",
        params={"wait": "true"},
        json={"params": {"steps": 3, "sleep_per_step_seconds": 0.02}},
        timeout=10.0,
    )
    assert resp.status_code == 201, resp.text
    body = resp.json()
    assert body["id"].startswith("j_")
    assert body["status"] == "succeeded"
    assert body["result"] == {"completed_steps": 3}


@pytest.mark.asyncio
async def test_create_wait_false_returns_create_response(client, registry):
    """`wait=false` on create returns the plain create response (no result)."""
    resp = await client.post(
        "/api/jobs/noop",
        params={"wait": "false"},
        json={"params": {"steps": 50, "sleep_per_step_seconds": 0.05}},
    )
    assert resp.status_code == 201
    body = resp.json()
    assert body["job_id"].startswith("j_")
    assert body["status"] in ("pending", "running")
    assert "result" not in body
    await registry.cancel(body["job_id"])


@pytest.mark.asyncio
async def test_create_wait_true_timeout_504(client, registry):
    """`wait=true` with a tiny timeout returns 504 but leaves the job running."""
    resp = await client.post(
        "/api/jobs/noop",
        params={"wait": "true", "timeout": 0.01},
        json={"params": {"steps": 50, "sleep_per_step_seconds": 0.05}},
    )
    assert resp.status_code == 504
    # The job was still created and keeps running despite the awaiter timing out.
    running = [r for r in registry.list_jobs() if not r.status.is_terminal]
    assert len(running) == 1
    await registry.cancel(running[0].id)


# -- wiring ------------------------------------------------------------------


def test_connect_jobs_api_registers_noop_idempotently(monkeypatch):
    """connect_jobs_api can be called twice and leaves `noop` registered."""
    reg = JobRegistry(max_concurrent=2)
    monkeypatch.setattr(jobs_api, "job_registry", reg)
    app = FastAPI()
    connect_jobs_api(app)
    connect_jobs_api(app)  # second call must not raise
    assert "noop" in reg._workers


# -- SSE ---------------------------------------------------------------------


def test_format_sse_wire_format():
    """_format_sse renders `event:` / `data:` lines ending in a blank line."""
    from app.desktop.studio_server.jobs.events import JobEvent

    event = JobEvent(event="job", data={"id": "j_abc", "status": "running"})
    wire = jobs_api._format_sse(event)
    assert wire == 'event: job\ndata: {"id": "j_abc", "status": "running"}\n\n'


@pytest.mark.asyncio
async def test_event_stream_forwards_snapshot_then_job(registry):
    # Unit-level test of the generator (independent of any HTTP transport): a
    # subscriber gets the initial snapshot, and a job created afterward produces
    # a `job` event. Proves pure-observer forwarding of the Phase 1 bus.
    stream = jobs_api._event_stream(job_id=None, type_name=None, project_id=None)
    job = None
    try:
        first = await asyncio.wait_for(stream.__anext__(), timeout=3.0)
        assert first.startswith("event: snapshot\n")

        job = await registry.create(
            "noop", {"steps": 40, "sleep_per_step_seconds": 0.05}
        )
        # Drain until we see a job event for our job.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + 3.0
        saw_job = False
        while loop.time() < deadline:
            chunk = await asyncio.wait_for(stream.__anext__(), timeout=3.0)
            if chunk.startswith("event: job\n") and job.id in chunk:
                saw_job = True
                break
        assert saw_job
    finally:
        # Clean up on every path (failed assert, timeout) so neither the
        # subscription nor the long-running job leaks into later tests.
        await stream.aclose()
        if job is not None:
            await _safe_cancel(registry, job.id)


def _parse_sse_block(block: str) -> tuple[str | None, dict | None]:
    """Parse one SSE block into `(event_name, data)`.

    Lines starting with `event:` set the name; lines starting with `data:` are
    JSON-decoded. Either element is None when the block has no such line (for
    example a `: ping` comment block).
    """
    event_name: str | None = None
    data: dict | None = None
    for line in block.splitlines():
        if line.startswith("event:"):
            event_name = line[len("event:") :].strip()
        elif line.startswith("data:"):
            data = json.loads(line[len("data:") :].strip())
    return event_name, data


# The SSE endpoint is now a correctly *infinite* stream (it pings forever until
# the client disconnects or the bus shuts down). httpx's ASGITransport runs the
# app to completion and buffers the whole body before returning a response, and
# its `receive()` only yields http.disconnect once the response is complete — so
# it cannot exercise an open-ended stream incrementally or simulate a mid-stream
# disconnect. We therefore drive `_event_stream` / `subscribe` directly for the
# streaming-content behavior, and keep one HTTP-level test that ends the stream
# via `events.shutdown()` so ASGITransport can return the buffered response.
+ + +async def _read_stream_until(stream, target: str, timeout: float = 3.0) -> dict: + """Pull SSE blocks straight from the `_event_stream` async generator until + one matches `target`; return its parsed data.""" + deadline = asyncio.get_event_loop().time() + timeout + while asyncio.get_event_loop().time() < deadline: + chunk = await asyncio.wait_for(stream.__anext__(), timeout=timeout) + event_name, data = _parse_sse_block(chunk) + if event_name == target and data is not None: + return data + raise AssertionError(f"did not see event '{target}' within {timeout}s") + + +def _parse_sse_body(body: str) -> list[tuple[str | None, dict | None]]: + return [_parse_sse_block(b) for b in body.split("\n\n") if b.strip()] + + +@pytest.mark.asyncio +async def test_sse_endpoint_returns_event_stream_and_ends_on_shutdown(app, registry): + # Full HTTP path: correct status + content-type and an initial snapshot. + # The stream is infinite, and ASGITransport buffers the whole body, so we + # end it with events.shutdown() (the same hook the server uses on reload) + # to let the buffered response come back. + transport = httpx.ASGITransport(app=app) + async with httpx.AsyncClient( + transport=transport, base_url="http://test" + ) as http_client: + get = asyncio.ensure_future(http_client.get("/api/jobs/events")) + # Wait until the endpoint's subscription is registered, then shut the + # bus so the (otherwise infinite) stream returns. 
+ for _ in range(300): + if registry.events._subscribers: + break + await asyncio.sleep(0.01) + else: + raise AssertionError("SSE subscription never registered") + registry.events.shutdown() + + response = await asyncio.wait_for(get, timeout=3.0) + assert response.status_code == 200 + assert response.headers["content-type"].startswith("text/event-stream") + blocks = _parse_sse_body(response.text) + assert ("snapshot", {"jobs": []}) in blocks + + +@pytest.mark.asyncio +async def test_event_stream_emits_keepalive_ping(registry, monkeypatch): + # The keepalive is the regression we fixed: a timeout must yield a `: ping` + # comment WITHOUT finalizing the generator, so MANY pings arrive over time. + monkeypatch.setattr(jobs_api, "KEEPALIVE_SECONDS", 0.05) + stream = jobs_api._event_stream(job_id=None, type_name=None, project_id=None) + try: + first = await asyncio.wait_for(stream.__anext__(), timeout=3.0) + assert first.startswith("event: snapshot\n") + # Two consecutive pings prove the stream survives repeated timeouts. + for _ in range(2): + chunk = await asyncio.wait_for(stream.__anext__(), timeout=3.0) + assert chunk == ": ping\n\n" + finally: + await stream.aclose() + + +@pytest.mark.asyncio +async def test_event_stream_filters_by_job_id(registry): + # Both jobs run; only `target`'s events reach a job_id-filtered stream. + other = await registry.create("noop", {"steps": 40, "sleep_per_step_seconds": 0.05}) + target = await registry.create( + "noop", {"steps": 40, "sleep_per_step_seconds": 0.05} + ) + stream = jobs_api._event_stream(job_id=target.id, type_name=None, project_id=None) + try: + snapshot = await _read_stream_until(stream, "snapshot") + snapshot_ids = {j["id"] for j in snapshot["jobs"]} + assert target.id in snapshot_ids + assert other.id not in snapshot_ids + + # Every live event that arrives is for the target, never `other`. 
+ data = await _read_stream_until(stream, "job") + assert data["id"] == target.id + finally: + await stream.aclose() + await _safe_cancel(registry, other.id) + await _safe_cancel(registry, target.id) + + +@pytest.mark.asyncio +async def test_event_stream_disconnect_leaves_job_running(registry): + """The decoupling guarantee: dropping the SSE stream mid-run must NOT stop + the job. Only explicit cancel/pause stops a job. Closing the generator is + exactly what CancellableStreamingResponse does on a real client disconnect.""" + job = await registry.create("noop", {"steps": 6, "sleep_per_step_seconds": 0.05}) + + stream = jobs_api._event_stream(job_id=None, type_name=None, project_id=None) + await _read_stream_until(stream, "snapshot") + # Observe at least one live job event so we know the run is underway. + await _read_stream_until(stream, "job") + # Simulate the client disconnecting mid-stream. + await stream.aclose() + + assert registry._jobs[job.id].status in ( + BackgroundJobStatus.RUNNING, + BackgroundJobStatus.SUCCEEDED, + ) + await _wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + assert registry._jobs[job.id].result == {"completed_steps": 6} diff --git a/app/desktop/studio_server/jobs/test_error_log.py b/app/desktop/studio_server/jobs/test_error_log.py new file mode 100644 index 000000000..d4291c9de --- /dev/null +++ b/app/desktop/studio_server/jobs/test_error_log.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import uuid + +import pytest + +from app.desktop.studio_server.jobs import error_log + + +@pytest.fixture +def run_id(tmp_path, monkeypatch): + monkeypatch.setattr( + "app.desktop.studio_server.jobs.error_log.tempfile.gettempdir", + lambda: str(tmp_path), + ) + return str(uuid.uuid4()) + + +def test_append_and_read_round_trip(run_id): + error_log.append_error(run_id, {"error_message": "first", "step": 1}) + error_log.append_error(run_id, {"error_message": "second", "item_id": "x"}) + + entries = 
error_log.read_errors(run_id) + assert entries == [ + {"error_message": "first", "step": 1}, + {"error_message": "second", "item_id": "x"}, + ] + + +def test_read_missing_file_returns_empty(run_id): + assert error_log.read_errors(run_id) == [] + + +def test_read_skips_unparsable_lines(run_id): + error_log.append_error(run_id, {"error_message": "good"}) + with error_log.error_log_path(run_id).open("a", encoding="utf-8") as f: + f.write("not json at all\n") + f.write("\n") + error_log.append_error(run_id, {"error_message": "also good"}) + + entries = error_log.read_errors(run_id) + assert entries == [ + {"error_message": "good"}, + {"error_message": "also good"}, + ] + + +def test_delete_removes_file(run_id): + error_log.append_error(run_id, {"error_message": "x"}) + assert error_log.error_log_path(run_id).exists() + + error_log.delete_errors(run_id) + assert not error_log.error_log_path(run_id).exists() + assert error_log.read_errors(run_id) == [] + + +def test_delete_missing_file_is_noop(run_id): + error_log.delete_errors(run_id) + assert error_log.read_errors(run_id) == [] + + +def test_append_never_raises_on_bad_dir(monkeypatch, run_id): + def boom(*args, **kwargs): + raise OSError("disk full") + + monkeypatch.setattr("app.desktop.studio_server.jobs.error_log.Path.mkdir", boom) + error_log.append_error(run_id, {"error_message": "swallowed"}) diff --git a/app/desktop/studio_server/jobs/test_events.py b/app/desktop/studio_server/jobs/test_events.py new file mode 100644 index 000000000..95eb19b2c --- /dev/null +++ b/app/desktop/studio_server/jobs/test_events.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import asyncio + +import pytest +from app.desktop.studio_server.jobs.events import JobEvent, JobEventBus +from app.desktop.studio_server.jobs.models import BackgroundJobStatus, JobRecord + + +def _record( + job_id: str = "j_aaaaaaaaaaaa", + type_name: str = "noop", + project_id: str | None = None, + status: BackgroundJobStatus = 
BackgroundJobStatus.RUNNING, +) -> JobRecord: + return JobRecord( + id=job_id, + type=type_name, + status=status, + project_id=project_id, + ) + + +async def _next_event(gen, timeout: float = 1.0) -> JobEvent: + return await asyncio.wait_for(gen.__anext__(), timeout=timeout) + + +@pytest.mark.asyncio +async def test_snapshot_then_job_event(): + existing = _record("j_existing0001") + bus = JobEventBus(snapshot_provider=lambda: [existing]) + + gen = bus.subscribe() + snapshot = await _next_event(gen) + assert snapshot.event == "snapshot" + assert [j["id"] for j in snapshot.data["jobs"]] == ["j_existing0001"] + + new = _record("j_new000000001") + bus.publish_job(new) + job_event = await _next_event(gen) + assert job_event.event == "job" + assert job_event.data["id"] == "j_new000000001" + + await gen.aclose() + + +@pytest.mark.asyncio +async def test_deleted_event(): + bus = JobEventBus(snapshot_provider=lambda: []) + gen = bus.subscribe() + await _next_event(gen) # snapshot + + bus.publish_deleted("j_gone00000001") + event = await _next_event(gen) + assert event.event == "deleted" + assert event.data == {"id": "j_gone00000001"} + + await gen.aclose() + + +@pytest.mark.asyncio +async def test_filter_by_project_id(): + matching = _record("j_match0000001", project_id="p_keep") + other = _record("j_other0000001", project_id="p_drop") + bus = JobEventBus(snapshot_provider=lambda: [matching, other]) + + gen = bus.subscribe(project_id="p_keep") + snapshot = await _next_event(gen) + assert [j["id"] for j in snapshot.data["jobs"]] == ["j_match0000001"] + + bus.publish_job(other) + bus.publish_job(matching) + event = await _next_event(gen) + assert event.data["id"] == "j_match0000001" + + await gen.aclose() + + +@pytest.mark.asyncio +async def test_filter_by_type_and_job_id(): + bus = JobEventBus(snapshot_provider=lambda: []) + gen = bus.subscribe(type_name="eval", job_id="j_target000001") + await _next_event(gen) # snapshot + + bus.publish_job(_record("j_other0000001", 
type_name="noop")) + bus.publish_job(_record("j_target000001", type_name="eval")) + event = await _next_event(gen) + assert event.data["id"] == "j_target000001" + + await gen.aclose() + + +@pytest.mark.asyncio +async def test_keepalive_ping_does_not_finalize_generator(): + # Regression: the timeout must yield `ping` events from inside the generator + # and keep it alive, not finalize it after the first one. + bus = JobEventBus(snapshot_provider=lambda: []) + gen = bus.subscribe(timeout=0.02) + assert (await _next_event(gen)).event == "snapshot" + assert (await _next_event(gen)).event == "ping" + assert (await _next_event(gen)).event == "ping" + + # A real event still flows after pings. + bus.publish_job(_record("j_after000001")) + event = await _next_event(gen) + assert event.event == "job" + assert event.data["id"] == "j_after000001" + + await gen.aclose() + + +@pytest.mark.asyncio +async def test_shutdown_ends_open_stream_and_rejects_new_ones(): + bus = JobEventBus(snapshot_provider=lambda: []) + gen = bus.subscribe() + assert (await _next_event(gen)).event == "snapshot" + + # shutdown() pushes a close sentinel so the open generator returns. + bus.shutdown() + with pytest.raises(StopAsyncIteration): + await _next_event(gen) + + # A subscription opened after shutdown ends immediately (no snapshot). + gen2 = bus.subscribe() + with pytest.raises(StopAsyncIteration): + await _next_event(gen2) + + +@pytest.mark.asyncio +async def test_shutdown_unblocks_subscriber_waiting_without_timeout(): + # With no keepalive timeout the subscriber blocks on queue.get(); shutdown() + # must still wake it so a hot reload isn't held open. 
+ bus = JobEventBus(snapshot_provider=lambda: []) + gen = bus.subscribe() # timeout=None + assert (await _next_event(gen)).event == "snapshot" + + async def _drain(): + with pytest.raises(StopAsyncIteration): + await gen.__anext__() + + waiter = asyncio.ensure_future(_drain()) + await asyncio.sleep(0) # let the waiter block on queue.get() + bus.shutdown() + await asyncio.wait_for(waiter, timeout=1.0) diff --git a/app/desktop/studio_server/jobs/test_registry.py b/app/desktop/studio_server/jobs/test_registry.py new file mode 100644 index 000000000..71a46a5bd --- /dev/null +++ b/app/desktop/studio_server/jobs/test_registry.py @@ -0,0 +1,874 @@ +from __future__ import annotations + +import asyncio +import uuid + +import pytest +from pydantic import BaseModel + +from app.desktop.studio_server.jobs import error_log +from app.desktop.studio_server.jobs.models import ( + JobDerivedState, + BackgroundJobStatus, + JobWorker, +) +from app.desktop.studio_server.jobs.registry import ( + JobNotFoundError, + JobOperationError, + JobRegistry, + _new_job_id, +) +from app.desktop.studio_server.jobs.workers.noop import NoopJobWorker + + +@pytest.fixture(autouse=True) +def temp_error_log_dir(tmp_path, monkeypatch): + monkeypatch.setattr( + "app.desktop.studio_server.jobs.error_log.tempfile.gettempdir", + lambda: str(tmp_path), + ) + + +@pytest.fixture +def registry(): + reg = JobRegistry(max_concurrent=10) + reg.register_type(NoopJobWorker) + return reg + + +async def wait_for_status( + registry: JobRegistry, + job_id: str, + target: BackgroundJobStatus | set[BackgroundJobStatus], + timeout: float = 3.0, +) -> None: + targets = {target} if isinstance(target, BackgroundJobStatus) else target + deadline = asyncio.get_event_loop().time() + timeout + while asyncio.get_event_loop().time() < deadline: + job = registry._jobs.get(job_id) + if job is not None and job.status in targets: + return + await asyncio.sleep(0.01) + job = registry._jobs.get(job_id) + actual = job.status if job else 
"missing" + raise AssertionError(f"Job {job_id} did not reach {targets}; was {actual}") + + +# -- supporting test workers ------------------------------------------------ + + +class _EmptyParams(BaseModel): + pass + + +class _EmptyResult(BaseModel): + pass + + +class NonPausableWorker(JobWorker[_EmptyParams, _EmptyResult]): + type_name = "nonpausable" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = False + + async def run(self, params, ctx): + await asyncio.sleep(5) + return _EmptyResult() + + +class AlreadyCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]): + type_name = "already_complete" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = True + run_called = False + + async def compute_state(self, params): + return JobDerivedState(total=5, success=5, error=0, is_complete=True) + + async def run(self, params, ctx): + type(self).run_called = True + return _EmptyResult() + + +class PartialProgressWorker(JobWorker[_EmptyParams, _EmptyResult]): + """First reports the full set (total + message), then a count-only update. + The later partial update must preserve the earlier total/message, not null + them. + """ + + type_name = "partial_progress" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = False + + async def run(self, params, ctx): + await ctx.report_progress(success=1, total=50, message="starting") + await ctx.report_progress(success=5) + return _EmptyResult() + + +class RaceCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]): + """run() blocks on a test-controlled gate, then returns normally without + ever observing a cancellation. The test opens the gate (so run() returns and + the supervising task drives the job to its terminal succeeded state) and only + then issues pause/cancel — reproducing the completion-vs-cancel race where + the job finished naturally during the cancel await. 
+ """ + + type_name = "race_complete" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = True + gate: asyncio.Event + + async def run(self, params, ctx): + await type(self).gate.wait() + return _EmptyResult() + + +class SwallowCancelWorker(JobWorker[_EmptyParams, _EmptyResult]): + """Catches CancelledError, fully clears the cancellation (uncancel) so it is + not re-raised, and returns normally — the worst-case "swallows CancelledError + and returns silently" worker. The cancellation transition is unconditional, + so the registry itself must land the job in paused/cancelled rather than + trusting the worker to re-raise. + + `started` is set once run() is actually suspended at its await point, so a + test can guarantee the cancellation is delivered into the worker body (not + before it runs) before issuing pause/cancel. + """ + + type_name = "swallow_cancel" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = True + started: asyncio.Event + gate: asyncio.Event + + async def run(self, params, ctx): + type(self).started.set() + try: + await type(self).gate.wait() + except asyncio.CancelledError: + task = asyncio.current_task() + # task.uncancel() was added in Python 3.11; on 3.10 simply + # swallowing the CancelledError exercises the same worst-case + # "swallows cancel and returns normally" path. + if task is not None and hasattr(task, "uncancel"): + task.uncancel() + return _EmptyResult() + + +class TotalThenNoneWorker(JobWorker[_EmptyParams, _EmptyResult]): + """run() reports a known total via report_progress, then compute_state at + pause returns total=None alongside success/error counts. The reconcile must + preserve the prior total rather than wiping the denominator to None. 
+ """ + + type_name = "total_then_none" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = True + started: asyncio.Event + gate: asyncio.Event + + async def compute_state(self, params): + return JobDerivedState(total=None, success=2, error=1, is_complete=False) + + async def run(self, params, ctx): + await ctx.report_progress(success=0, total=10, message="starting") + type(self).started.set() + try: + await type(self).gate.wait() + except asyncio.CancelledError: + task = asyncio.current_task() + # task.uncancel() was added in Python 3.11; on 3.10 simply + # swallowing the CancelledError exercises the same worst-case + # "swallows cancel and returns normally" path. + if task is not None and hasattr(task, "uncancel"): + task.uncancel() + return _EmptyResult() + + +class ReconcileCompleteWorker(JobWorker[_EmptyParams, _EmptyResult]): + """compute_state reports complete only once the test flips `done`, so a + get() issued while the job is still running (run() is a long sleep) + reconciles it straight to succeeded mid-flight. 
+ """ + + type_name = "reconcile_complete" + params_model = _EmptyParams + result_model = _EmptyResult + supports_pause = True + done = False + + async def compute_state(self, params): + complete = type(self).done + return JobDerivedState( + total=3, success=3 if complete else 1, error=0, is_complete=complete + ) + + async def run(self, params, ctx): + await asyncio.sleep(5) + return _EmptyResult() + + +# -- job id ------------------------------------------------------------------ + + +def test_job_id_format(): + job_id = _new_job_id() + assert job_id.startswith("j_") + suffix = job_id[2:] + assert len(suffix) == 12 + assert all(c in "abcdefghijklmnopqrstuvwxyz234567" for c in suffix) + + +# -- lifecycle --------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_full_lifecycle_succeeds(registry): + job = await registry.create("noop", {"steps": 3, "sleep_per_step_seconds": 0.01}) + assert job.status in (BackgroundJobStatus.PENDING, BackgroundJobStatus.RUNNING) + assert job.supports_pause is True + + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + final = registry._jobs[job.id] + assert final.result == {"completed_steps": 3} + assert final.started_at is not None + assert final.ended_at is not None + assert final.run_id is not None + assert final.progress.success == 3 + + +@pytest.mark.asyncio +async def test_failure_path_captures_error_log(registry): + job = await registry.create( + "noop", + {"steps": 5, "sleep_per_step_seconds": 0.01, "fail_at_step": 2}, + ) + await wait_for_status(registry, job.id, BackgroundJobStatus.FAILED) + + final = registry._jobs[job.id] + assert final.error is not None + assert final.error.error is not None + assert "intentional fail at step 2" in final.error.error + + entries = error_log.read_errors(final.run_id) + fatal = [e for e in entries if e.get("fatal")] + assert len(fatal) == 1 + assert "intentional fail at step 2" in fatal[0]["error_message"] + + 
+@pytest.mark.asyncio +async def test_non_fatal_errors_logged_and_counted(registry): + job = await registry.create( + "noop", + { + "steps": 4, + "sleep_per_step_seconds": 0.01, + "error_at_steps": [1, 3], + }, + ) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + + final = registry._jobs[job.id] + assert final.progress.error == 2 + assert final.progress.success == 2 + + entries = error_log.read_errors(final.run_id) + messages = [e["error_message"] for e in entries] + assert "intentional error at step 1" in messages + assert "intentional error at step 3" in messages + steps = sorted(e["step"] for e in entries if "step" in e) + assert steps == [1, 3] + + +@pytest.mark.asyncio +async def test_error_log_missing_returns_empty(): + assert error_log.read_errors(str(uuid.uuid4())) == [] + + +# -- cancel ------------------------------------------------------------------ + + +@pytest.mark.asyncio +async def test_cancel_pending_job_never_starts(): + reg = JobRegistry(max_concurrent=1) + reg.register_type(NoopJobWorker) + running = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + await wait_for_status(reg, running.id, BackgroundJobStatus.RUNNING) + pending = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + assert reg._jobs[pending.id].status == BackgroundJobStatus.PENDING + + await reg.cancel(pending.id) + assert reg._jobs[pending.id].status == BackgroundJobStatus.CANCELLED + assert pending.id not in reg._tasks + + await reg.cancel(running.id) + + +@pytest.mark.asyncio +async def test_cancel_from_running(registry): + job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + await registry.cancel(job.id) + assert registry._jobs[job.id].status == BackgroundJobStatus.CANCELLED + + +@pytest.mark.asyncio +async def test_cancel_immediately_after_create_reclaims_slot(): + # Cancelling right after create can race 
the supervising task before its + # coroutine body runs; the registry must still reclaim the concurrency slot. + reg = JobRegistry(max_concurrent=2) + reg.register_type(NoopJobWorker) + ids = [] + for _ in range(6): + job = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.02}) + ids.append(job.id) + for job_id in ids: + await reg.cancel(job_id) + await asyncio.sleep(0.05) + + assert all(reg._jobs[i].status == BackgroundJobStatus.CANCELLED for i in ids) + assert reg._running_count == 0 + assert reg._tasks == {} + assert reg._pending_ids == [] + + +@pytest.mark.asyncio +async def test_cancel_terminal_raises(registry): + job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01}) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + with pytest.raises(JobOperationError): + await registry.cancel(job.id) + + +# -- pause / resume ---------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_pause_then_resume_succeeds(registry): + job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.03}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + first_run_id = registry._jobs[job.id].run_id + + await registry.pause(job.id) + assert registry._jobs[job.id].status == BackgroundJobStatus.PAUSED + + # Make resume finish quickly by checking it re-runs with a fresh run_id. 
+ await registry.resume(job.id) + assert registry._jobs[job.id].status in ( + BackgroundJobStatus.PENDING, + BackgroundJobStatus.RUNNING, + ) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + second_run_id = registry._jobs[job.id].run_id + assert second_run_id is not None + assert second_run_id != first_run_id + + await registry.cancel(job.id) + + +@pytest.mark.asyncio +async def test_resume_to_succeeded_when_complete(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(NoopJobWorker) + reg.register_type(AlreadyCompleteWorker) + AlreadyCompleteWorker.run_called = False + + # Start a noop that we pause so we have a paused job to resume against a + # complete worker. Simpler: create the complete worker job, it succeeds + # immediately via reconcile at launch. + job = await reg.create("already_complete", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.SUCCEEDED) + assert AlreadyCompleteWorker.run_called is False + assert reg._jobs[job.id].progress.success == 5 + + +@pytest.mark.asyncio +async def test_pause_rejected_when_not_supported(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(NonPausableWorker) + job = await reg.create("nonpausable", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + with pytest.raises(JobOperationError): + await reg.pause(job.id) + await reg.cancel(job.id) + + +@pytest.mark.asyncio +async def test_pause_rejected_when_not_running(registry): + job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01}) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + with pytest.raises(JobOperationError): + await registry.pause(job.id) + + +@pytest.mark.asyncio +async def test_resume_rejected_when_not_paused(registry): + job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + with pytest.raises(JobOperationError): + await 
registry.resume(job.id) + await registry.cancel(job.id) + + +async def _drive_completion_race(operation: str) -> JobRegistry: + # Reproduce the completion-vs-cancel race deterministically: the worker's + # run() is gated; we open the gate at the exact moment the lifecycle op + # begins its cancel await, so the supervising task finishes naturally + # (job -> succeeded, task done) before/while task.cancel() lands. The job + # was running at the op's entry check, so it gets past the guard, but the + # terminal succeeded state must survive. + reg = JobRegistry(max_concurrent=2) + reg.register_type(RaceCompleteWorker) + RaceCompleteWorker.gate = asyncio.Event() + job = await reg.create("race_complete", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + + original_cancel_task = reg._cancel_task + + async def open_gate_then_cancel(job_id: str) -> None: + # Let run() return and the supervising task drive to terminal first. + RaceCompleteWorker.gate.set() + task = reg._tasks.get(job_id) + if task is not None: + try: + await task + except asyncio.CancelledError: + pass + await original_cancel_task(job_id) + + reg._cancel_task = open_gate_then_cancel # type: ignore[method-assign] + + if operation == "pause": + await reg.pause(job.id) + else: + await reg.cancel(job.id) + return reg + + +@pytest.mark.asyncio +async def test_pause_loses_race_to_natural_completion_keeps_succeeded(): + # Regression: if run() completes naturally during pause()'s cancel-await, + # the job is already terminal (succeeded) and pause() must not clobber it + # back to paused (which would drop the result and allow a resume re-run). + reg = await _drive_completion_race("pause") + job_id = next(iter(reg._jobs)) + assert reg._jobs[job_id].status == BackgroundJobStatus.SUCCEEDED + assert reg._jobs[job_id].result is not None + + +@pytest.mark.asyncio +async def test_cancel_loses_race_to_natural_completion_keeps_succeeded(): + # The cancel() path already guards on is_terminal; lock it in. 
+ reg = await _drive_completion_race("cancel") + job_id = next(iter(reg._jobs)) + assert reg._jobs[job_id].status == BackgroundJobStatus.SUCCEEDED + assert reg._jobs[job_id].result is not None + + +@pytest.mark.asyncio +async def test_pause_enforced_when_worker_swallows_cancel(): + # A worker that catches CancelledError (and uncancels it) then returns + # normally must still be paused, not succeeded — the cancellation transition + # is unconditional and enforced by the registry, not the worker. + reg = JobRegistry(max_concurrent=2) + reg.register_type(SwallowCancelWorker) + SwallowCancelWorker.started = asyncio.Event() + SwallowCancelWorker.gate = asyncio.Event() + job = await reg.create("swallow_cancel", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + await asyncio.wait_for(SwallowCancelWorker.started.wait(), timeout=3.0) + + result = await reg.pause(job.id) + assert result.status == BackgroundJobStatus.PAUSED + assert reg._jobs[job.id].result is None + + +@pytest.mark.asyncio +async def test_cancel_enforced_when_worker_swallows_cancel(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(SwallowCancelWorker) + SwallowCancelWorker.started = asyncio.Event() + SwallowCancelWorker.gate = asyncio.Event() + job = await reg.create("swallow_cancel", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + await asyncio.wait_for(SwallowCancelWorker.started.wait(), timeout=3.0) + + result = await reg.cancel(job.id) + assert result.status == BackgroundJobStatus.CANCELLED + assert reg._jobs[job.id].result is None + + +@pytest.mark.asyncio +async def test_cancel_from_paused(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(NoopJobWorker) + job = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.03}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + await reg.pause(job.id) + assert reg._jobs[job.id].status == BackgroundJobStatus.PAUSED + + result = await reg.cancel(job.id) + assert 
result.status == BackgroundJobStatus.CANCELLED + assert reg._jobs[job.id].status == BackgroundJobStatus.CANCELLED + assert reg._jobs[job.id].ended_at is not None + + +# -- delete ------------------------------------------------------------------ + + +@pytest.mark.asyncio +async def test_delete_terminal_emits_deleted(registry): + job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01}) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + + events = [] + gen = registry.events.subscribe() + await asyncio.wait_for(gen.__anext__(), timeout=1.0) # snapshot + + async def collect(): + async for event in gen: + events.append(event) + + collector = asyncio.create_task(collect()) + await registry.delete(job.id) + await asyncio.sleep(0.05) + collector.cancel() + try: + await collector + except asyncio.CancelledError: + pass + + assert job.id not in registry._jobs + assert any(e.event == "deleted" and e.data["id"] == job.id for e in events) + + +@pytest.mark.asyncio +async def test_delete_running_raises(registry): + job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + with pytest.raises(JobOperationError): + await registry.delete(job.id) + await registry.cancel(job.id) + + +@pytest.mark.asyncio +async def test_delete_pending_raises(): + reg = JobRegistry(max_concurrent=1) + reg.register_type(NoopJobWorker) + running = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + await wait_for_status(reg, running.id, BackgroundJobStatus.RUNNING) + pending = await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + assert reg._jobs[pending.id].status == BackgroundJobStatus.PENDING + with pytest.raises(JobOperationError): + await reg.delete(pending.id) + await reg.cancel(running.id) + await reg.cancel(pending.id) + + +# -- reconciliation ---------------------------------------------------------- + + 
+@pytest.mark.asyncio +async def test_compute_state_none_keeps_snapshot(registry): + # Noop's compute_state returns None, so the believed snapshot from + # report_progress is preserved and never flipped to complete early. + job = await registry.create("noop", {"steps": 4, "sleep_per_step_seconds": 0.02}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + # get() triggers reconcile; with None it must not change progress/status. + got = await registry.get(job.id) + assert got is not None + assert got.status in (BackgroundJobStatus.RUNNING, BackgroundJobStatus.SUCCEEDED) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + assert registry._jobs[job.id].progress.success == 4 + + +@pytest.mark.asyncio +async def test_report_progress_preserves_total_and_message_when_omitted(): + # A count-only report_progress call must not wipe a total/message set by an + # earlier call. + reg = JobRegistry(max_concurrent=2) + reg.register_type(PartialProgressWorker) + job = await reg.create("partial_progress", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.SUCCEEDED) + + final = reg._jobs[job.id] + assert final.progress.success == 5 + assert final.progress.total == 50 + assert final.progress.message == "starting" + + +@pytest.mark.asyncio +async def test_apply_derived_preserves_total_when_compute_state_returns_none(): + # A compute_state that returns total=None (unknown denominator) alongside + # success/error counts must not wipe a total set earlier via report_progress. + # total=None means "unknown, keep what we had", mirroring message handling. 
+ reg = JobRegistry(max_concurrent=2) + reg.register_type(TotalThenNoneWorker) + TotalThenNoneWorker.started = asyncio.Event() + TotalThenNoneWorker.gate = asyncio.Event() + job = await reg.create("total_then_none", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + await asyncio.wait_for(TotalThenNoneWorker.started.wait(), timeout=3.0) + assert reg._jobs[job.id].progress.total == 10 + + # pause() runs compute_state (total=None, success=2, error=1) through + # _apply_derived; the prior total of 10 must survive. + result = await reg.pause(job.id) + assert result.status == BackgroundJobStatus.PAUSED + assert result.progress.total == 10 + assert result.progress.success == 2 + assert result.progress.error == 1 + + +@pytest.mark.asyncio +async def test_get_reconciles_running_job_to_succeeded_mid_flight(): + # A long-running job whose source-of-truth state flips to complete should be + # reconciled straight to succeeded by get() (the running/get() reconcile + # path), not only at launch time. + reg = JobRegistry(max_concurrent=2) + reg.register_type(ReconcileCompleteWorker) + ReconcileCompleteWorker.done = False + job = await reg.create("reconcile_complete", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + # Still running here (run() is a 5s sleep); now flip the source of truth. 
+ assert reg._jobs[job.id].status == BackgroundJobStatus.RUNNING + ReconcileCompleteWorker.done = True + + got = await reg.get(job.id) + assert got is not None + assert got.status == BackgroundJobStatus.SUCCEEDED + assert got.progress.success == 3 + assert got.ended_at is not None + + +# -- concurrency ------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_semaphore_caps_concurrency_fifo(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(NoopJobWorker) + + jobs = [] + for _ in range(4): + jobs.append( + await reg.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + ) + + await asyncio.sleep(0.05) + statuses = [reg._jobs[j.id].status for j in jobs] + running = [s for s in statuses if s == BackgroundJobStatus.RUNNING] + pending = [s for s in statuses if s == BackgroundJobStatus.PENDING] + assert len(running) == 2 + assert len(pending) == 2 + # FIFO: the first two created are the running ones. + assert statuses[0] == BackgroundJobStatus.RUNNING + assert statuses[1] == BackgroundJobStatus.RUNNING + assert statuses[2] == BackgroundJobStatus.PENDING + assert statuses[3] == BackgroundJobStatus.PENDING + + # Cancel the running ones; pending should be promoted. 
+ await reg.cancel(jobs[0].id) + await reg.cancel(jobs[1].id) + await wait_for_status(reg, jobs[2].id, BackgroundJobStatus.RUNNING) + await wait_for_status(reg, jobs[3].id, BackgroundJobStatus.RUNNING) + + await reg.cancel(jobs[2].id) + await reg.cancel(jobs[3].id) + + +# -- events ------------------------------------------------------------------ + + +@pytest.mark.asyncio +async def test_registry_emits_snapshot_and_job_events(registry): + gen = registry.events.subscribe() + snapshot = await asyncio.wait_for(gen.__anext__(), timeout=1.0) + assert snapshot.event == "snapshot" + assert snapshot.data["jobs"] == [] + + events = [] + + async def collect(): + async for event in gen: + events.append(event) + + collector = asyncio.create_task(collect()) + job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01}) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + await asyncio.sleep(0.02) + collector.cancel() + try: + await collector + except asyncio.CancelledError: + pass + + job_events = [e for e in events if e.event == "job"] + assert len(job_events) >= 2 + assert any(e.data["status"] == "running" for e in job_events) + assert any(e.data["status"] == "succeeded" for e in job_events) + + +# -- wait -------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_wait_returns_immediately_for_terminal_job(registry): + job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01}) + await wait_for_status(registry, job.id, BackgroundJobStatus.SUCCEEDED) + awaited = await asyncio.wait_for(registry.wait(job.id), timeout=1.0) + assert awaited.id == job.id + assert awaited.status == BackgroundJobStatus.SUCCEEDED + assert awaited.result == {"completed_steps": 2} + + +@pytest.mark.asyncio +async def test_wait_blocks_then_returns_terminal_record(registry): + job = await registry.create("noop", {"steps": 4, "sleep_per_step_seconds": 0.03}) + await wait_for_status(registry, 
job.id, BackgroundJobStatus.RUNNING) + awaited = await asyncio.wait_for(registry.wait(job.id), timeout=3.0) + assert awaited.status == BackgroundJobStatus.SUCCEEDED + assert awaited.result == {"completed_steps": 4} + + +@pytest.mark.asyncio +async def test_wait_unknown_raises(registry): + with pytest.raises(JobNotFoundError): + await registry.wait("j_doesnotexist") + + +@pytest.mark.asyncio +async def test_wait_times_out(registry): + job = await registry.create("noop", {"steps": 50, "sleep_per_step_seconds": 0.05}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + with pytest.raises(asyncio.TimeoutError): + await registry.wait(job.id, timeout=0.01) + await registry.cancel(job.id) + + +@pytest.mark.asyncio +async def test_wait_cancellation_leaves_job_running(registry): + # The load-bearing decoupling invariant: abandoning a wait() must NOT stop + # the job. A second concurrent waiter still resolves to the terminal record. + job = await registry.create("noop", {"steps": 6, "sleep_per_step_seconds": 0.05}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + + abandoned = asyncio.create_task(registry.wait(job.id)) + survivor = asyncio.create_task(registry.wait(job.id)) + # Let both awaiters reach their await point, then abandon the first. + await asyncio.sleep(0.02) + abandoned.cancel() + with pytest.raises(asyncio.CancelledError): + await abandoned + + # The job keeps running and the surviving waiter resolves to its terminal + # record — the supervising task was untouched by the cancelled awaiter. 
+ result = await asyncio.wait_for(survivor, timeout=3.0) + assert result.status == BackgroundJobStatus.SUCCEEDED + assert result.result == {"completed_steps": 6} + + +@pytest.mark.asyncio +async def test_wait_multiple_waiters_both_resolve(registry): + job = await registry.create("noop", {"steps": 4, "sleep_per_step_seconds": 0.03}) + await wait_for_status(registry, job.id, BackgroundJobStatus.RUNNING) + first = asyncio.create_task(registry.wait(job.id)) + second = asyncio.create_task(registry.wait(job.id)) + one, two = await asyncio.wait_for(asyncio.gather(first, second), timeout=3.0) + assert one.status == BackgroundJobStatus.SUCCEEDED + assert two.status == BackgroundJobStatus.SUCCEEDED + assert one.result == two.result == {"completed_steps": 4} + + +@pytest.mark.asyncio +async def test_delete_removes_completion_event(registry): + job = await registry.create("noop", {"steps": 2, "sleep_per_step_seconds": 0.01}) + # wait() lazily creates the completion event; it survives to the terminal set. 
+ awaited = await asyncio.wait_for(registry.wait(job.id), timeout=3.0) + assert awaited.status == BackgroundJobStatus.SUCCEEDED + assert job.id in registry._completion_events + + await registry.delete(job.id) + assert job.id not in registry._completion_events + + +# -- not found --------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_get_unknown_returns_none(registry): + assert await registry.get("j_doesnotexist") is None + + +@pytest.mark.asyncio +async def test_lifecycle_op_unknown_raises(registry): + with pytest.raises(JobNotFoundError): + await registry.cancel("j_doesnotexist") + + +# -- typed progress detail --------------------------------------------------- + + +class DetailModel(BaseModel): + phase: str + done: int + + +class DetailWorker(JobWorker[_EmptyParams, _EmptyResult]): + type_name = "detail" + params_model = _EmptyParams + result_model = _EmptyResult + progress_model = DetailModel + gate: asyncio.Event + + async def run(self, params, ctx): + await ctx.report_progress_detail(DetailModel(phase="extract", done=3)) + await type(self).gate.wait() + return _EmptyResult() + + +@pytest.mark.asyncio +async def test_report_progress_detail_stamps_typed_payload(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(DetailWorker) + DetailWorker.gate = asyncio.Event() + job = await reg.create("detail", {}) + await wait_for_status(reg, job.id, BackgroundJobStatus.RUNNING) + # Give the worker a tick to report the detail. 
+ for _ in range(50): + if reg._jobs[job.id].progress_detail is not None: + break + await asyncio.sleep(0.01) + assert reg._jobs[job.id].progress_detail == {"phase": "extract", "done": 3} + DetailWorker.gate.set() + + +class WrongModel(BaseModel): + other: str + + +class BadDetailWorker(JobWorker[_EmptyParams, _EmptyResult]): + type_name = "bad_detail" + params_model = _EmptyParams + result_model = _EmptyResult + progress_model = DetailModel + + async def run(self, params, ctx): + await ctx.report_progress_detail(WrongModel(other="x")) + return _EmptyResult() + + +@pytest.mark.asyncio +async def test_report_progress_detail_rejects_wrong_model(): + reg = JobRegistry(max_concurrent=2) + reg.register_type(BadDetailWorker) + job = await reg.create("bad_detail", {}) + # The type guard raises inside run(), routing the job to FAILED. + await wait_for_status(reg, job.id, BackgroundJobStatus.FAILED) + assert reg._jobs[job.id].error is not None diff --git a/app/desktop/studio_server/jobs/workers/__init__.py b/app/desktop/studio_server/jobs/workers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/app/desktop/studio_server/jobs/workers/noop.py b/app/desktop/studio_server/jobs/workers/noop.py new file mode 100644 index 000000000..23cc8d04a --- /dev/null +++ b/app/desktop/studio_server/jobs/workers/noop.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import asyncio + +from pydantic import BaseModel + +from ..models import JobContext, JobDerivedState, JobWorker + + +class NoopJobParams(BaseModel): + steps: int = 10 + sleep_per_step_seconds: float = 0.5 + fail_at_step: int | None = None + error_at_steps: list[int] = [] + + +class NoopJobResult(BaseModel): + completed_steps: int + + +class NoopJobWorker(JobWorker[NoopJobParams, NoopJobResult]): + type_name = "noop" + params_model = NoopJobParams + result_model = NoopJobResult + supports_pause = True + + async def compute_state(self, params: NoopJobParams) -> JobDerivedState | None: + return None + 
+ async def run(self, params: NoopJobParams, ctx: JobContext) -> NoopJobResult: + success = error = 0 + for i in range(params.steps): + await asyncio.sleep(params.sleep_per_step_seconds) + if params.fail_at_step == i: + raise RuntimeError(f"intentional fail at step {i}") + if i in params.error_at_steps: + error += 1 + await ctx.report_error(f"intentional error at step {i}", step=i) + else: + success += 1 + await ctx.report_progress( + success=success, + error=error, + total=params.steps, + message=f"step {i + 1}/{params.steps}", + ) + return NoopJobResult(completed_steps=success + error) diff --git a/app/web_ui/.env.example b/app/web_ui/.env.example index c966be48d..5bd044fe1 100644 --- a/app/web_ui/.env.example +++ b/app/web_ui/.env.example @@ -12,5 +12,9 @@ # Useful for debugging agent behavior during development. # PUBLIC_SHOW_TOOL_CALL_DETAILS=true +# Background Jobs UI — set to "true" to show the Jobs entry in the sidebar and +# enable the jobs dialog. When unset or any other value, the feature is hidden. +# PUBLIC_ENABLE_JOBS=true + # Sentry — set the DSN to enable client-side error reporting. Unset = no-op. # VITE_KILN_SENTRY_DSN=https://...@o.../... diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 2ae200867..d8b538d6e 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -3122,6 +3122,188 @@ export interface paths { patch?: never; trace?: never; }; + "/api/jobs/events": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Stream Job Events + * @description Server-sent events for jobs. Emits an initial `snapshot`, then per-job + * `job` and `deleted` events. A pure observer: disconnecting never stops a job. 
+ */ + get: operations["stream_job_events_api_jobs_events_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** List Jobs */ + get: operations["list_jobs_api_jobs_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{type}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Create Job */ + post: operations["create_job_api_jobs__type__post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Job */ + get: operations["get_job_api_jobs__id__get"]; + put?: never; + post?: never; + /** Delete Job */ + delete: operations["delete_job_api_jobs__id__delete"]; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}/result": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Job Result */ + get: operations["get_job_result_api_jobs__id__result_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}/wait": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Wait For Job + * @description Block until the job reaches a terminal state, then return its record. + * + * A pure observer, like the SSE stream: if the client disconnects, uvicorn + * cancels this handler coroutine, which cancels the wait() await and tears + * down only the awaiter — the job's supervising task keeps running. 
+ */ + get: operations["wait_for_job_api_jobs__id__wait_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}/errors": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Job Errors */ + get: operations["get_job_errors_api_jobs__id__errors_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}/pause": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Pause Job */ + post: operations["pause_job_api_jobs__id__pause_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}/resume": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Resume Job */ + post: operations["resume_job_api_jobs__id__resume_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/jobs/{id}/cancel": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Cancel Job */ + post: operations["cancel_job_api_jobs__id__cancel_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = Record; export interface components { @@ -3589,6 +3771,11 @@ export interface components { */ provider_type: "builtin" | "custom"; }; + /** + * BackgroundJobStatus + * @enum {string} + */ + BackgroundJobStatus: "pending" | "running" | "paused" | "succeeded" | "failed" | "cancelled"; /** * BasePrompt * @description A prompt for a task. This is the basic data storage format which can be used throughout a project. 
@@ -4381,6 +4568,44 @@ export interface components { data_strategy: components["schemas"]["ChatStrategy"]; run_config_properties?: components["schemas"]["KilnAgentRunConfigProperties"] | null; }; + /** + * CreateJobRequest + * @description Request body for creating a job. Params are validated per job type. + */ + CreateJobRequest: { + /** + * Params + * @description Type-specific job parameters, validated against the type's params model. + */ + params?: { + [key: string]: unknown; + }; + /** + * Project Id + * @description Project to scope this job to (for filtering/visibility). Falls back to the params' project_id when omitted. + */ + project_id?: string | null; + /** + * Metadata + * @description Free-form pass-through attribution, stored verbatim. + */ + metadata?: { + [key: string]: unknown; + } | null; + }; + /** + * CreateJobResponse + * @description Response returned when a job is created. + */ + CreateJobResponse: { + /** + * Job Id + * @description The id of the newly created job. + */ + job_id: string; + /** @description The job's status immediately after creation. */ + status: components["schemas"]["BackgroundJobStatus"]; + }; /** CreateKilnCopilotApiKeyRequest */ CreateKilnCopilotApiKeyRequest: { /** @@ -6792,6 +7017,98 @@ export interface components { */ template: string; }; + /** + * JobError + * @description Small failure summary stamped on the record. Detail lives in the error log. + */ + JobError: { + /** Error */ + error?: string | null; + /** Detail */ + detail?: { + [key: string]: unknown; + } | null; + }; + /** + * JobProgress + * @description Count-based progress for a job. + * + * Processed = success + error; remaining = total - success - error. The error + * field is a count only — the actual messages live in the per-run error log. 
+ */ + JobProgress: { + /** Total */ + total?: number | null; + /** + * Success + * @default 0 + */ + success: number; + /** + * Error + * @default 0 + */ + error: number; + /** Message */ + message?: string | null; + /** + * Updated At + * Format: date-time + */ + updated_at?: string; + }; + /** + * JobRecord + * @description Ephemeral, in-memory bookkeeping for a single job. Never persisted to disk. + */ + JobRecord: { + /** Id */ + id: string; + /** Type */ + type: string; + status: components["schemas"]["BackgroundJobStatus"]; + /** Run Id */ + run_id?: string | null; + progress?: components["schemas"]["JobProgress"]; + /** Progress Detail */ + progress_detail?: { + [key: string]: unknown; + } | null; + /** Params */ + params?: { + [key: string]: unknown; + }; + /** Result */ + result?: { + [key: string]: unknown; + } | null; + error?: components["schemas"]["JobError"] | null; + /** Metadata */ + metadata?: { + [key: string]: unknown; + }; + /** Project Id */ + project_id?: string | null; + /** + * Supports Pause + * @default false + */ + supports_pause: boolean; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** + * Updated At + * Format: date-time + */ + updated_at?: string; + /** Started At */ + started_at?: string | null; + /** Ended At */ + ended_at?: string | null; + }; /** * JobStatus * @enum {string} @@ -17724,4 +18041,385 @@ export interface operations { }; }; }; + stream_job_events_api_jobs_events_get: { + parameters: { + query?: { + /** @description Only stream events for this job id. */ + job_id?: string | null; + /** @description Only stream events for this job type. */ + type?: string | null; + /** @description Only stream events for this project id. 
*/ + project_id?: string | null; + }; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + list_jobs_api_jobs_get: { + parameters: { + query?: { + /** @description Filter by job status. */ + status?: components["schemas"]["BackgroundJobStatus"] | null; + /** @description Filter by job type. */ + type?: string | null; + /** @description Filter by project id. */ + project_id?: string | null; + /** @description Only jobs created at or after this ISO-8601 time. */ + since?: string | null; + /** @description Maximum number of jobs to return. */ + limit?: number | null; + }; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["JobRecord"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + create_job_api_jobs__type__post: { + parameters: { + query?: { + /** @description When true, block until the job reaches a terminal state and return the full JobRecord instead of CreateJobResponse. */ + wait?: boolean; + /** @description Seconds to wait when wait=true (504 on timeout). Omit to wait indefinitely. */ + timeout?: number | null; + }; + header?: never; + path: { + /** @description The registered job type to run. 
*/ + type: string; + }; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["CreateJobRequest"]; + }; + }; + responses: { + /** @description Successful Response */ + 201: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["CreateJobResponse"] | components["schemas"]["JobRecord"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_job_api_jobs__id__get: { + parameters: { + query?: never; + header?: never; + path: { + /** @description The job id. */ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["JobRecord"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + delete_job_api_jobs__id__delete: { + parameters: { + query?: never; + header?: never; + path: { + /** @description The job id. */ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 204: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_job_result_api_jobs__id__result_get: { + parameters: { + query?: never; + header?: never; + path: { + /** @description The job id. 
*/ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": { + [key: string]: unknown; + }; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + wait_for_job_api_jobs__id__wait_get: { + parameters: { + query?: { + /** @description Seconds to wait before giving up (504 on timeout). Omit to wait indefinitely. */ + timeout?: number | null; + }; + header?: never; + path: { + /** @description The job id. */ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["JobRecord"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_job_errors_api_jobs__id__errors_get: { + parameters: { + query?: { + /** @description Read the error log for a specific past run id. */ + run_id?: string | null; + }; + header?: never; + path: { + /** @description The job id. */ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": { + [key: string]: unknown; + }[]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + pause_job_api_jobs__id__pause_post: { + parameters: { + query?: never; + header?: never; + path: { + /** @description The job id. 
*/ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 202: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + resume_job_api_jobs__id__resume_post: { + parameters: { + query?: never; + header?: never; + path: { + /** @description The job id. */ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 202: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + cancel_job_api_jobs__id__cancel_post: { + parameters: { + query?: never; + header?: never; + path: { + /** @description The job id. 
*/ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 202: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; } diff --git a/app/web_ui/src/lib/components/SidebarJobsIndicator.svelte b/app/web_ui/src/lib/components/SidebarJobsIndicator.svelte new file mode 100644 index 000000000..78bb13d83 --- /dev/null +++ b/app/web_ui/src/lib/components/SidebarJobsIndicator.svelte @@ -0,0 +1,60 @@ + + +{#if indicator.kind !== "hidden"} + {#if variant === "rail"} + + {#if indicator.kind === "spinner"} + + {/if} + {label} + + {:else} + + {#if indicator.kind === "spinner"} + + {/if} + {label} + + {/if} +{/if} diff --git a/app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts b/app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts new file mode 100644 index 000000000..352ffd3ee --- /dev/null +++ b/app/web_ui/src/lib/components/SidebarJobsIndicator.test.ts @@ -0,0 +1,59 @@ +// @vitest-environment jsdom +import { describe, it, expect, afterEach, vi } from "vitest" +import { render, cleanup } from "@testing-library/svelte" +import { readable } from "svelte/store" + +// The real jobs_store opens an EventSource on subscribe; mock it with plain +// stores so the indicator can render in isolation. The component takes +// active_count / total_count overrides for the actual assertions below. 
+vi.mock("$lib/stores/jobs_store", () => ({ + active_jobs_count: readable(0), + jobs: readable([]), +})) + +const SidebarJobsIndicator = (await import("./SidebarJobsIndicator.svelte")) + .default + +describe("SidebarJobsIndicator", () => { + afterEach(() => { + cleanup() + }) + + it("shows a spinner and active count when jobs are active", () => { + const { container, getByText } = render(SidebarJobsIndicator, { + props: { active_count: 3, total_count: 5 }, + }) + expect(getByText("3")).not.toBeNull() + expect(container.querySelector(".loading-spinner")).not.toBeNull() + }) + + it("shows a static muted count without a spinner when none are active", () => { + const { container, getByText } = render(SidebarJobsIndicator, { + props: { active_count: 0, total_count: 4 }, + }) + expect(getByText("4")).not.toBeNull() + expect(container.querySelector(".loading-spinner")).toBeNull() + }) + + it("renders nothing when there are no jobs", () => { + const { container } = render(SidebarJobsIndicator, { + props: { active_count: 0, total_count: 0 }, + }) + expect(container.textContent?.trim()).toBe("") + }) + + it("caps the displayed count at 99+", () => { + const { getByText } = render(SidebarJobsIndicator, { + props: { active_count: 150, total_count: 150 }, + }) + expect(getByText("99+")).not.toBeNull() + }) + + it("uses the rail variant styling when requested", () => { + const { container } = render(SidebarJobsIndicator, { + props: { active_count: 2, total_count: 2, variant: "rail" }, + }) + const span = container.querySelector("span") + expect(span?.className).toContain("absolute") + }) +}) diff --git a/app/web_ui/src/lib/components/jobs_dialog.component.test.ts b/app/web_ui/src/lib/components/jobs_dialog.component.test.ts new file mode 100644 index 000000000..73f346a6b --- /dev/null +++ b/app/web_ui/src/lib/components/jobs_dialog.component.test.ts @@ -0,0 +1,118 @@ +// @vitest-environment jsdom +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest" +import { 
render, cleanup } from "@testing-library/svelte" +import { tick } from "svelte" +import { writable } from "svelte/store" +import type { JobRecord } from "$lib/stores/jobs_api" +import { jobs_dialog } from "$lib/stores/jobs_dialog" + +// The dialog hosts JobsTable, which subscribes to the job stream. Mock the +// stores/api so the table renders an inert empty state. +const jobs = writable([]) +const synced = writable(true) +const connection = writable<"idle" | "connecting" | "open" | "errored">("open") + +vi.mock("$lib/stores/jobs_store", () => ({ + jobs, + synced, + connection, +})) + +vi.mock("$lib/stores/jobs_api", () => ({ + pause_job: vi.fn().mockResolvedValue(undefined), + resume_job: vi.fn().mockResolvedValue(undefined), + cancel_job: vi.fn().mockResolvedValue(undefined), + delete_job: vi.fn().mockResolvedValue(undefined), + get_job_errors: vi.fn().mockResolvedValue([]), + get_job_result: vi.fn().mockResolvedValue({}), +})) + +const JobsDialog = (await import("./jobs_dialog.svelte")).default + +// jsdom doesn't implement HTMLDialogElement.showModal/close; emulate them so +// the `open` property reflects the real show()/close() calls the component makes. +beforeEach(() => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ;(HTMLDialogElement.prototype as any).showModal = function ( + this: HTMLDialogElement, + ) { + this.setAttribute("open", "") + } + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ;(HTMLDialogElement.prototype as any).close = function ( + this: HTMLDialogElement, + ) { + this.removeAttribute("open") + } + jobs.set([]) + synced.set(true) + connection.set("open") +}) + +afterEach(() => { + cleanup() +}) + +function jobsDialogEl(container: HTMLElement): HTMLDialogElement { + // The first dialog in the tree is the Jobs dialog (the errors/result + // sub-dialogs live inside JobsTable and follow it). 
+ const el = container.querySelector("dialog") + expect(el).not.toBeNull() + return el as HTMLDialogElement +} + +describe("JobsDialog open signal", () => { + it("stays closed on mount even if the signal has already advanced", async () => { + // Advance the module-level signal before the component mounts. + jobs_dialog.open() + jobs_dialog.open() + + const { container } = render(JobsDialog) + await tick() + + expect(jobsDialogEl(container).open).toBe(false) + }) + + it("opens when jobs_dialog.open() is called", async () => { + const { container } = render(JobsDialog) + await tick() + expect(jobsDialogEl(container).open).toBe(false) + + jobs_dialog.open() + await tick() + + expect(jobsDialogEl(container).open).toBe(true) + }) + + it("re-opens after being closed", async () => { + const { container } = render(JobsDialog) + await tick() + + jobs_dialog.open() + await tick() + expect(jobsDialogEl(container).open).toBe(true) + + // Close it the way the user would (the dialog's own close()). + jobsDialogEl(container).close() + expect(jobsDialogEl(container).open).toBe(false) + + jobs_dialog.open() + await tick() + expect(jobsDialogEl(container).open).toBe(true) + }) + + it("does not reopen on an unrelated reactive update", async () => { + const { container } = render(JobsDialog) + await tick() + expect(jobsDialogEl(container).open).toBe(false) + + // Mutate unrelated reactive inputs the dialog/table read; none of these + // touch the open signal, so the dialog must remain closed. 
+ jobs.set([]) + synced.set(true) + connection.set("open") + await tick() + + expect(jobsDialogEl(container).open).toBe(false) + }) +}) diff --git a/app/web_ui/src/lib/components/jobs_dialog.svelte b/app/web_ui/src/lib/components/jobs_dialog.svelte new file mode 100644 index 000000000..71f6f3040 --- /dev/null +++ b/app/web_ui/src/lib/components/jobs_dialog.svelte @@ -0,0 +1,30 @@ + + + + + diff --git a/app/web_ui/src/lib/components/jobs_table.svelte b/app/web_ui/src/lib/components/jobs_table.svelte new file mode 100644 index 000000000..bc801a350 --- /dev/null +++ b/app/web_ui/src/lib/components/jobs_table.svelte @@ -0,0 +1,346 @@ + + +{#if action_error} + +{/if} + +{#if !$synced && $connection === "errored"} +
+
+ +
+

Can't connect to the job stream

+

+ We lost the connection to the background job updates and are retrying + automatically. Jobs keep running in the background — this page will + refresh once the connection is restored. +

+
+{:else if !$synced} +
+
+
+{:else if $jobs.length === 0} +
+ + + +
+{:else} +
+ +
+
+ + + + + + + + + + {#each $jobs as job (job.id)} + + + + + + {/each} + +
DetailsStatus
+
+ {job_type_display(job.type)} + {job.id} + {formatDate(job.created_at)} +
+
+
+ + {job_status_display_label(job)} + +
+ {#if job.status === "running"} + {progress_percent(job.progress)}% Complete + {/if} + {#if job.progress?.total} + {progress_label(job.progress)} + {/if} +
+ {#if progress_percent(job.progress) < 100} + + {#if failure_error(job)?.error} + {failure_error(job)?.error} + {:else if job.progress?.message} + {job.progress.message} + {/if} + {/if} +
+
+
+ +
+
+
+{/if} + + + {#if errors_summary?.error} + + {/if} + {#if errors_loading} +
+
+
+ {:else if errors_load_error} +
+ {errors_load_error.getMessage() || "Could not load errors."} +
+ {:else if error_entries.length === 0} +

+ No error messages recorded for this job. +

+ {:else} +
    + {#each error_entries as entry, index (index)} +
  • + {entry.error_message || JSON.stringify(entry)} +
  • + {/each} +
+ {/if} +
+ + + {#if result_loading} +
+
+
+ {:else if result_load_error} +
+ {result_load_error.getMessage() || "Could not load result."} +
+ {:else if result_data} +
{JSON.stringify(
+        result_data,
+        null,
+        2,
+      )}
+ {:else} +

No result available.

+ {/if} +
diff --git a/app/web_ui/src/lib/components/jobs_table.test.ts b/app/web_ui/src/lib/components/jobs_table.test.ts new file mode 100644 index 000000000..5cb27901d --- /dev/null +++ b/app/web_ui/src/lib/components/jobs_table.test.ts @@ -0,0 +1,146 @@ +// @vitest-environment jsdom +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest" +import { render, fireEvent, waitFor, cleanup } from "@testing-library/svelte" +import { writable } from "svelte/store" +import type { JobRecord } from "$lib/stores/jobs_api" + +// Live job list the table renders from. Replaced per-test. +const jobs = writable([]) +const synced = writable(true) +const connection = writable<"idle" | "connecting" | "open" | "errored">("open") + +vi.mock("$lib/stores/jobs_store", () => ({ + jobs, + synced, + connection, +})) + +const api = { + pause_job: vi.fn().mockResolvedValue(undefined), + resume_job: vi.fn().mockResolvedValue(undefined), + cancel_job: vi.fn().mockResolvedValue(undefined), + delete_job: vi.fn().mockResolvedValue(undefined), + get_job_errors: vi.fn().mockResolvedValue([]), + get_job_result: vi.fn().mockResolvedValue({}), +} +vi.mock("$lib/stores/jobs_api", () => api) + +const JobsTable = (await import("./jobs_table.svelte")).default + +function makeJob(overrides: Partial = {}): JobRecord { + return { + id: "j_1", + type: "noop", + status: "running", + supports_pause: false, + created_at: "2024-01-01T00:00:00Z", + ...overrides, + } +} + +describe("JobsTable", () => { + beforeEach(() => { + vi.clearAllMocks() + synced.set(true) + connection.set("open") + jobs.set([]) + }) + + afterEach(() => { + cleanup() + }) + + it("Clear completed deletes exactly the terminal jobs", async () => { + jobs.set([ + makeJob({ id: "running", status: "running" }), + makeJob({ id: "succeeded", status: "succeeded" }), + makeJob({ id: "pending", status: "pending" }), + makeJob({ id: "failed", status: "failed" }), + makeJob({ id: "cancelled", status: "cancelled" }), + ]) + const { getByText } = 
render(JobsTable) + + await fireEvent.click(getByText("Clear completed")) + + await waitFor(() => { + expect(api.delete_job).toHaveBeenCalledTimes(3) + }) + const deleted = api.delete_job.mock.calls.map((c) => c[0]).sort() + expect(deleted).toEqual(["cancelled", "failed", "succeeded"]) + // It must not touch the active jobs. + expect(deleted).not.toContain("running") + expect(deleted).not.toContain("pending") + }) + + it("Clear completed surfaces an error when a delete fails", async () => { + jobs.set([makeJob({ id: "failed", status: "failed" })]) + api.delete_job.mockRejectedValueOnce(new Error("boom")) + const { getByText, getByRole } = render(JobsTable) + + await fireEvent.click(getByText("Clear completed")) + + await waitFor(() => { + expect(getByRole("alert").textContent).toContain("boom") + }) + }) + + it("offers a Clear action (not a Delete label) for terminal rows", async () => { + jobs.set([makeJob({ id: "succeeded", status: "succeeded" })]) + const { getByLabelText, getByText, queryByText } = render(JobsTable) + await fireEvent.click(getByLabelText("More options")) + expect(getByText("Clear")).not.toBeNull() + expect(queryByText("Delete")).toBeNull() + }) + + it("gates row actions on status: running with pause shows Pause + Cancel", async () => { + jobs.set([ + makeJob({ id: "running", status: "running", supports_pause: true }), + ]) + const { getByLabelText, getByText, queryByText } = render(JobsTable) + await fireEvent.click(getByLabelText("More options")) + expect(getByText("Pause")).not.toBeNull() + expect(getByText("Cancel")).not.toBeNull() + expect(queryByText("Clear")).toBeNull() + }) + + it("gates row actions on status: paused shows Resume + Cancel", async () => { + jobs.set([makeJob({ id: "paused", status: "paused" })]) + const { getByLabelText, getByText } = render(JobsTable) + await fireEvent.click(getByLabelText("More options")) + expect(getByText("Resume")).not.toBeNull() + expect(getByText("Cancel")).not.toBeNull() + }) + + it("gates row 
actions on status: pending shows only Cancel", async () => { + jobs.set([makeJob({ id: "pending", status: "pending" })]) + const { getByLabelText, getByText, queryByText } = render(JobsTable) + await fireEvent.click(getByLabelText("More options")) + expect(getByText("Cancel")).not.toBeNull() + expect(queryByText("Pause")).toBeNull() + expect(queryByText("Resume")).toBeNull() + expect(queryByText("Clear")).toBeNull() + }) + + it("shows the loading spinner before the first sync", () => { + synced.set(false) + connection.set("connecting") + const { container, queryByText } = render(JobsTable) + expect(container.querySelector(".loading.loading-spinner")).not.toBeNull() + // Neither the table nor the empty state should render while syncing. + expect(queryByText("No jobs yet")).toBeNull() + expect(container.querySelector("table")).toBeNull() + }) + + it("shows the empty state when there are no jobs", () => { + jobs.set([]) + const { getByText } = render(JobsTable) + expect(getByText("No jobs yet")).not.toBeNull() + }) + + it("shows the connection-error state when errored before first sync", () => { + synced.set(false) + connection.set("errored") + const { getByText } = render(JobsTable) + expect(getByText("Can't connect to the job stream")).not.toBeNull() + }) +}) diff --git a/app/web_ui/src/lib/stores/job_status.test.ts b/app/web_ui/src/lib/stores/job_status.test.ts new file mode 100644 index 000000000..8b8a5d0dc --- /dev/null +++ b/app/web_ui/src/lib/stores/job_status.test.ts @@ -0,0 +1,218 @@ +import { describe, it, expect } from "vitest" +import { + available_actions, + completed_jobs, + is_active, + is_terminal, + job_completed_with_errors, + job_status_badge_class, + job_status_display, + job_status_display_badge_class, + job_status_display_label, + jobs_indicator, + progress_label, + progress_percent, +} from "./job_status" +import type { BackgroundJobStatus, JobRecord } from "./jobs_api" + +function makeJob(overrides: Partial = {}): JobRecord { + return { + id: 
"j_1", + type: "noop", + status: "running", + supports_pause: false, + ...overrides, + } +} + +describe("is_active / is_terminal", () => { + it("treats pending, running, paused as active", () => { + expect(is_active("pending")).toBe(true) + expect(is_active("running")).toBe(true) + expect(is_active("paused")).toBe(true) + }) + + it("treats terminal statuses as not active", () => { + expect(is_active("succeeded")).toBe(false) + expect(is_active("failed")).toBe(false) + expect(is_active("cancelled")).toBe(false) + }) + + it("identifies terminal statuses", () => { + expect(is_terminal("succeeded")).toBe(true) + expect(is_terminal("failed")).toBe(true) + expect(is_terminal("cancelled")).toBe(true) + expect(is_terminal("running")).toBe(false) + }) +}) + +describe("available_actions", () => { + it("running without pause support: cancel only", () => { + expect(available_actions(makeJob({ status: "running" }))).toEqual([ + "cancel", + ]) + }) + + it("running with pause support: pause then cancel", () => { + expect( + available_actions(makeJob({ status: "running", supports_pause: true })), + ).toEqual(["pause", "cancel"]) + }) + + it("paused: resume and cancel", () => { + expect( + available_actions(makeJob({ status: "paused", supports_pause: true })), + ).toEqual(["resume", "cancel"]) + }) + + it("pending: cancel only", () => { + expect(available_actions(makeJob({ status: "pending" }))).toEqual([ + "cancel", + ]) + }) + + it("terminal states: delete only", () => { + for (const status of [ + "succeeded", + "failed", + "cancelled", + ] as BackgroundJobStatus[]) { + expect(available_actions(makeJob({ status }))).toEqual(["delete"]) + } + }) +}) + +describe("job_status_display / job_status_badge_class", () => { + const cases: [BackgroundJobStatus, string, string][] = [ + ["pending", "Pending", "badge-outline"], + ["running", "Running", "badge-outline badge-success"], + ["paused", "Paused", "badge-outline badge-warning"], + ["succeeded", "Succeeded", "badge-outline 
badge-primary"], + ["failed", "Failed", "badge-outline badge-error"], + ["cancelled", "Cancelled", "badge-outline"], + ] + it.each(cases)("maps %s", (status, label, badge) => { + expect(job_status_display(status)).toBe(label) + expect(job_status_badge_class(status)).toBe(badge) + }) +}) + +describe("job_completed_with_errors / display helpers", () => { + it("is true only when succeeded with a positive error count", () => { + expect( + job_completed_with_errors( + makeJob({ status: "succeeded", progress: { success: 8, error: 2 } }), + ), + ).toBe(true) + }) + + it("is false when succeeded without errors", () => { + expect( + job_completed_with_errors( + makeJob({ status: "succeeded", progress: { success: 10, error: 0 } }), + ), + ).toBe(false) + }) + + it("is false for non-succeeded statuses even with errors", () => { + expect( + job_completed_with_errors( + makeJob({ status: "running", progress: { success: 1, error: 3 } }), + ), + ).toBe(false) + expect( + job_completed_with_errors( + makeJob({ status: "failed", progress: { success: 1, error: 3 } }), + ), + ).toBe(false) + }) + + it("derives label and badge for completed-with-errors", () => { + const job = makeJob({ + status: "succeeded", + progress: { success: 8, error: 2 }, + }) + expect(job_status_display_label(job)).toBe("Completed with errors") + expect(job_status_display_badge_class(job)).toBe( + "badge-outline badge-error", + ) + }) + + it("falls back to plain status display when there are no errors", () => { + const job = makeJob({ + status: "succeeded", + progress: { success: 10, error: 0 }, + }) + expect(job_status_display_label(job)).toBe("Succeeded") + expect(job_status_display_badge_class(job)).toBe( + "badge-outline badge-primary", + ) + }) +}) + +describe("progress_label", () => { + it("shows count only when total is null", () => { + expect(progress_label({ success: 3, error: 0 })).toBe("3") + }) + + it("shows success / total", () => { + expect(progress_label({ success: 3, error: 0, total: 10 
})).toBe("3 / 10") + }) + + it("appends errored count when present", () => { + expect(progress_label({ success: 3, error: 2, total: 10 })).toBe( + "3 / 10 (2 errored)", + ) + }) + + it("handles undefined progress", () => { + expect(progress_label(undefined)).toBe("0") + }) +}) + +describe("progress_percent", () => { + it("returns 0 when total is null or zero", () => { + expect(progress_percent({ success: 1, error: 0 })).toBe(0) + expect(progress_percent({ success: 1, error: 0, total: 0 })).toBe(0) + }) + + it("computes processed / total as a percent", () => { + expect(progress_percent({ success: 2, error: 1, total: 10 })).toBe(30) + }) + + it("returns 100 when complete", () => { + expect(progress_percent({ success: 8, error: 2, total: 10 })).toBe(100) + }) +}) + +describe("completed_jobs", () => { + it("returns exactly the terminal jobs", () => { + const jobs = [ + makeJob({ id: "a", status: "running" }), + makeJob({ id: "b", status: "succeeded" }), + makeJob({ id: "c", status: "pending" }), + makeJob({ id: "d", status: "failed" }), + makeJob({ id: "e", status: "paused" }), + makeJob({ id: "f", status: "cancelled" }), + ] + expect(completed_jobs(jobs).map((j) => j.id)).toEqual(["b", "d", "f"]) + }) + + it("returns an empty array when nothing is terminal", () => { + expect(completed_jobs([makeJob({ status: "running" })])).toEqual([]) + }) +}) + +describe("jobs_indicator", () => { + it("shows a spinner with the active count when any job is active", () => { + expect(jobs_indicator(2, 5)).toEqual({ kind: "spinner", count: 2 }) + }) + + it("shows a static total count when none active but jobs remain", () => { + expect(jobs_indicator(0, 3)).toEqual({ kind: "static", count: 3 }) + }) + + it("is hidden when there are no jobs at all", () => { + expect(jobs_indicator(0, 0)).toEqual({ kind: "hidden" }) + }) +}) diff --git a/app/web_ui/src/lib/stores/job_status.ts b/app/web_ui/src/lib/stores/job_status.ts new file mode 100644 index 000000000..6e58dfc92 --- /dev/null +++ 
b/app/web_ui/src/lib/stores/job_status.ts
@@ -0,0 +1,179 @@
+import type { BackgroundJobStatus, JobProgress, JobRecord } from "./jobs_api"
+
+// Statuses of jobs that have not reached a terminal state yet.
+export const ACTIVE_STATUSES: readonly BackgroundJobStatus[] = [
+  "pending",
+  "running",
+  "paused",
+]
+
+// Statuses a job ends in; the counterpart of ACTIVE_STATUSES.
+export const TERMINAL_STATUSES: readonly BackgroundJobStatus[] = [
+  "succeeded",
+  "failed",
+  "cancelled",
+]
+
+// True when `status` is one of ACTIVE_STATUSES.
+export function is_active(status: BackgroundJobStatus): boolean {
+  return ACTIVE_STATUSES.includes(status)
+}
+
+// True when `status` is one of TERMINAL_STATUSES.
+export function is_terminal(status: BackgroundJobStatus): boolean {
+  return TERMINAL_STATUSES.includes(status)
+}
+
+// Human-readable label for a raw backend status.
+export function job_status_display(status: BackgroundJobStatus): string {
+  switch (status) {
+    case "pending":
+      return "Pending"
+    case "running":
+      return "Running"
+    case "paused":
+      return "Paused"
+    case "succeeded":
+      return "Succeeded"
+    case "failed":
+      return "Failed"
+    case "cancelled":
+      return "Cancelled"
+    default: {
+      // Compile-time exhaustiveness check: an unhandled status fails to type-check here.
+      const exhaustive: never = status
+      return exhaustive
+    }
+  }
+}
+
+// Badge CSS classes for a raw backend status.
+export function job_status_badge_class(status: BackgroundJobStatus): string {
+  switch (status) {
+    case "running":
+      return "badge-outline badge-success"
+    case "succeeded":
+      return "badge-outline badge-primary"
+    case "failed":
+      return "badge-outline badge-error"
+    case "paused":
+      return "badge-outline badge-warning"
+    case "pending":
+      return "badge-outline"
+    case "cancelled":
+      return "badge-outline"
+    default: {
+      // Compile-time exhaustiveness check: an unhandled status fails to type-check here.
+      const exhaustive: never = status
+      return exhaustive
+    }
+  }
+}
+
+// A job that finished successfully but logged one or more non-fatal per-item
+// errors. Like RAG's `completed_with_errors`, this is a frontend-derived display
+// state only — the backend status stays `succeeded` and the error detail lives
+// in the per-run error log. No worker/backend change is needed.
+export function job_completed_with_errors(job: JobRecord): boolean {
+  return job.status === "succeeded" && (job.progress?.error ?? 0) > 0
+}
+
+// Display label for a job: "Completed with errors" when
+// job_completed_with_errors(job) holds, otherwise the plain status label.
+export function job_status_display_label(job: JobRecord): string {
+  if (job_completed_with_errors(job)) {
+    return "Completed with errors"
+  }
+  return job_status_display(job.status)
+}
+
+// Badge classes for a job: the error badge when job_completed_with_errors(job)
+// holds, otherwise the plain status badge.
+export function job_status_display_badge_class(job: JobRecord): string {
+  if (job_completed_with_errors(job)) {
+    return "badge-outline badge-error"
+  }
+  return job_status_badge_class(job.status)
+}
+
+export type JobAction = "pause" | "resume" | "cancel" | "delete"
+
+// The set of lifecycle actions valid for a job given its status and whether
+// its worker supports pause. Mirrors the state machine (functional_spec §3) and
+// the delete policy (architecture open item #7: delete only on terminal state).
+export function available_actions(job: JobRecord): JobAction[] {
+  switch (job.status) {
+    case "running": {
+      const actions: JobAction[] = ["cancel"]
+      if (job.supports_pause) {
+        actions.unshift("pause")
+      }
+      return actions
+    }
+    case "paused":
+      return ["resume", "cancel"]
+    case "pending":
+      return ["cancel"]
+    case "succeeded":
+    case "failed":
+    case "cancelled":
+      return ["delete"]
+    default: {
+      const exhaustive: never = job.status
+      return exhaustive
+    }
+  }
+}
+
+// Compact progress text such as "3", "3 / 10" or "3 / 10 (2 errored)": the
+// success count, the total when one is known, and the error count when positive.
+export function progress_label(progress: JobProgress | undefined): string {
+  const success = progress?.success ?? 0
+  const total = progress?.total
+  const base = total == null ? `${success}` : `${success} / ${total}`
+  const error = progress?.error ?? 0
+  return error > 0 ? `${base} (${error} errored)` : base
+}
+
+// Whole-number percentage of processed items (success + error) out of total,
+// clamped to 0..100; 0 when the total is missing or not positive.
+//
+// Rounds down so 100 is only reported once every item has been processed
+// (e.g. 199 of 200 is 99, not 100). Multiplying before dividing keeps exact
+// integer ratios exact, so 29 of 100 is 29 rather than a float artifact of 28.
+export function progress_percent(progress: JobProgress | undefined): number {
+  const total = progress?.total
+  if (!total || total <= 0) {
+    return 0
+  }
+  const processed = (progress?.success ?? 0) + (progress?.error ?? 0)
+  return Math.max(0, Math.min(100, Math.floor((processed * 100) / total)))
+}
+
+// The jobs that "Clear completed" removes: every job in a terminal state.
+export function completed_jobs(jobs: JobRecord[]): JobRecord[] {
+  return jobs.filter((job) => is_terminal(job.status))
+}
+
+// What the sidebar Jobs indicator should render, derived purely from the live
+// counts so it can be unit-tested without mounting the component:
+// - "spinner": at least one active job; show a subtle spinner + active count.
+// - "static": no active jobs but some still exist; show a muted total count.
+// - "hidden": no jobs at all; show no indicator.
+export type JobsIndicator =
+  | { kind: "spinner"; count: number }
+  | { kind: "static"; count: number }
+  | { kind: "hidden" }
+
+export function jobs_indicator(
+  active_count: number,
+  total_count: number,
+): JobsIndicator {
+  if (active_count > 0) {
+    return { kind: "spinner", count: active_count }
+  }
+  if (total_count > 0) {
+    return { kind: "static", count: total_count }
+  }
+  return { kind: "hidden" }
+}
diff --git a/app/web_ui/src/lib/stores/jobs_api.test.ts b/app/web_ui/src/lib/stores/jobs_api.test.ts new file mode 100644 index 000000000..84770438c --- /dev/null +++ b/app/web_ui/src/lib/stores/jobs_api.test.ts @@ -0,0 +1,150 @@ +import { describe, it, expect, vi, beforeEach } from "vitest" +import { client } from "$lib/api_client" +import { + cancel_job, + create_job, + delete_job, + get_job, + get_job_errors, + get_job_result, + list_jobs, + pause_job, + resume_job, +} from "./jobs_api" + +vi.mock("$lib/api_client", () => ({ + client: { + GET: vi.fn(), + POST: vi.fn(), + DELETE: vi.fn(), + }, + base_url: "http://localhost:8757", +})) + +const mockGET = client.GET as unknown as ReturnType +const mockPOST = client.POST as unknown as ReturnType +const mockDELETE = client.DELETE as unknown as ReturnType + +describe("jobs_api", () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + it("list_jobs calls GET /api/jobs with the query and returns data", async () => { + mockGET.mockResolvedValue({ data: [{ id: "j_1" }], error: undefined }) + const result = await list_jobs({ 
project_id: "p_1", status: "running" }) + expect(mockGET).toHaveBeenCalledWith("/api/jobs", { + params: { query: { project_id: "p_1", status: "running" } }, + }) + expect(result).toEqual([{ id: "j_1" }]) + }) + + it("list_jobs throws when the client returns an error", async () => { + mockGET.mockResolvedValue({ data: undefined, error: { detail: "boom" } }) + await expect(list_jobs()).rejects.toEqual({ detail: "boom" }) + }) + + it("get_job calls GET /api/jobs/{id}", async () => { + mockGET.mockResolvedValue({ data: { id: "j_2" }, error: undefined }) + const result = await get_job("j_2") + expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}", { + params: { path: { id: "j_2" } }, + }) + expect(result).toEqual({ id: "j_2" }) + }) + + it("create_job calls POST /api/jobs/{type} with params and metadata", async () => { + mockPOST.mockResolvedValue({ + data: { job_id: "j_3", status: "pending" }, + error: undefined, + }) + const result = await create_job("eval", { eval_id: "e_1" }, { src: "ui" }) + expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{type}", { + params: { path: { type: "eval" } }, + body: { + params: { eval_id: "e_1" }, + metadata: { src: "ui" }, + project_id: null, + }, + }) + expect(result).toEqual({ job_id: "j_3", status: "pending" }) + }) + + it("create_job passes an explicit project_id in the body", async () => { + mockPOST.mockResolvedValue({ + data: { job_id: "j_3b", status: "pending" }, + error: undefined, + }) + await create_job("noop", { steps: 5 }, null, "p_current") + expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{type}", { + params: { path: { type: "noop" } }, + body: { params: { steps: 5 }, metadata: null, project_id: "p_current" }, + }) + }) + + it("get_job_result calls GET /api/jobs/{id}/result", async () => { + mockGET.mockResolvedValue({ data: { total: 5 }, error: undefined }) + const result = await get_job_result("j_4") + expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}/result", { + params: { path: { id: "j_4" } }, + }) + 
expect(result).toEqual({ total: 5 }) + }) + + it("get_job_errors calls GET /api/jobs/{id}/errors with optional run_id", async () => { + mockGET.mockResolvedValue({ + data: [{ error_message: "oops" }], + error: undefined, + }) + const result = await get_job_errors("j_5", "run_xyz") + expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}/errors", { + params: { path: { id: "j_5" }, query: { run_id: "run_xyz" } }, + }) + expect(result).toEqual([{ error_message: "oops" }]) + }) + + it("get_job_errors omits run_id query when not provided", async () => { + mockGET.mockResolvedValue({ data: [], error: undefined }) + await get_job_errors("j_6") + expect(mockGET).toHaveBeenCalledWith("/api/jobs/{id}/errors", { + params: { path: { id: "j_6" }, query: {} }, + }) + }) + + it("pause_job calls POST /api/jobs/{id}/pause", async () => { + mockPOST.mockResolvedValue({ data: undefined, error: undefined }) + await pause_job("j_7") + expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{id}/pause", { + params: { path: { id: "j_7" } }, + }) + }) + + it("resume_job calls POST /api/jobs/{id}/resume", async () => { + mockPOST.mockResolvedValue({ data: undefined, error: undefined }) + await resume_job("j_8") + expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{id}/resume", { + params: { path: { id: "j_8" } }, + }) + }) + + it("cancel_job calls POST /api/jobs/{id}/cancel", async () => { + mockPOST.mockResolvedValue({ data: undefined, error: undefined }) + await cancel_job("j_9") + expect(mockPOST).toHaveBeenCalledWith("/api/jobs/{id}/cancel", { + params: { path: { id: "j_9" } }, + }) + }) + + it("delete_job calls DELETE /api/jobs/{id}", async () => { + mockDELETE.mockResolvedValue({ data: undefined, error: undefined }) + await delete_job("j_10") + expect(mockDELETE).toHaveBeenCalledWith("/api/jobs/{id}", { + params: { path: { id: "j_10" } }, + }) + }) + + it("lifecycle calls throw on client error", async () => { + mockPOST.mockResolvedValue({ data: undefined, error: { detail: "409" } }) + await 
expect(cancel_job("j_11")).rejects.toEqual({ detail: "409" }) + }) +}) diff --git a/app/web_ui/src/lib/stores/jobs_api.ts b/app/web_ui/src/lib/stores/jobs_api.ts new file mode 100644 index 000000000..d0070c53a --- /dev/null +++ b/app/web_ui/src/lib/stores/jobs_api.ts @@ -0,0 +1,121 @@ +import { client } from "$lib/api_client" +import type { components } from "$lib/api_schema" + +export type JobRecord = components["schemas"]["JobRecord"] +export type JobProgress = components["schemas"]["JobProgress"] +export type JobError = components["schemas"]["JobError"] +export type BackgroundJobStatus = components["schemas"]["BackgroundJobStatus"] + +export type JobErrorEntry = { + error_message?: string +} & Record + +export type ListJobsQuery = { + status?: BackgroundJobStatus + type?: string + project_id?: string + since?: string + limit?: number +} + +export async function list_jobs( + query: ListJobsQuery = {}, +): Promise { + const { data, error } = await client.GET("/api/jobs", { + params: { query }, + }) + if (error) { + throw error + } + return data +} + +export async function get_job(id: string): Promise { + const { data, error } = await client.GET("/api/jobs/{id}", { + params: { path: { id } }, + }) + if (error) { + throw error + } + return data +} + +export async function create_job( + type: string, + params: Record = {}, + metadata: Record | null = null, + project_id: string | null = null, +): Promise< + | components["schemas"]["CreateJobResponse"] + | components["schemas"]["JobRecord"] +> { + const { data, error } = await client.POST("/api/jobs/{type}", { + params: { path: { type } }, + body: { params, metadata, project_id }, + }) + if (error) { + throw error + } + return data +} + +export async function get_job_result( + id: string, +): Promise> { + const { data, error } = await client.GET("/api/jobs/{id}/result", { + params: { path: { id } }, + }) + if (error) { + throw error + } + return data +} + +export async function get_job_errors( + id: string, + run_id?: 
string, +): Promise { + const { data, error } = await client.GET("/api/jobs/{id}/errors", { + params: { path: { id }, query: run_id ? { run_id } : {} }, + }) + if (error) { + throw error + } + return data as JobErrorEntry[] +} + +export async function pause_job(id: string): Promise { + const { error } = await client.POST("/api/jobs/{id}/pause", { + params: { path: { id } }, + }) + if (error) { + throw error + } +} + +export async function resume_job(id: string): Promise { + const { error } = await client.POST("/api/jobs/{id}/resume", { + params: { path: { id } }, + }) + if (error) { + throw error + } +} + +export async function cancel_job(id: string): Promise { + const { error } = await client.POST("/api/jobs/{id}/cancel", { + params: { path: { id } }, + }) + if (error) { + throw error + } +} + +export async function delete_job(id: string): Promise { + const { error } = await client.DELETE("/api/jobs/{id}", { + params: { path: { id } }, + }) + if (error) { + throw error + } +} diff --git a/app/web_ui/src/lib/stores/jobs_dialog.test.ts b/app/web_ui/src/lib/stores/jobs_dialog.test.ts new file mode 100644 index 000000000..f36289de4 --- /dev/null +++ b/app/web_ui/src/lib/stores/jobs_dialog.test.ts @@ -0,0 +1,14 @@ +import { describe, it, expect } from "vitest" +import { get } from "svelte/store" +import { jobs_dialog } from "./jobs_dialog" + +describe("jobs_dialog", () => { + it("bumps the open signal each time open() is called", () => { + const before = get(jobs_dialog.open_signal) + jobs_dialog.open() + const afterOne = get(jobs_dialog.open_signal) + expect(afterOne).toBe(before + 1) + jobs_dialog.open() + expect(get(jobs_dialog.open_signal)).toBe(before + 2) + }) +}) diff --git a/app/web_ui/src/lib/stores/jobs_dialog.ts b/app/web_ui/src/lib/stores/jobs_dialog.ts new file mode 100644 index 000000000..43ea3b13d --- /dev/null +++ b/app/web_ui/src/lib/stores/jobs_dialog.ts @@ -0,0 +1,22 @@ +import { writable } from "svelte/store" + +// Cross-component channel for 
opening the global jobs dialog. The dialog itself +// is mounted once in (app)/+layout.svelte and subscribes here; any component +// (e.g. the sidebar Jobs widget) can trigger it via `jobs_dialog.open()`. +function createJobsDialog() { + // Bumped on each open() call. The layout-mounted dialog watches this counter + // and shows itself whenever it changes, so repeated opens always re-show even + // if the value of a boolean flag wouldn't have changed. + const open_signal = writable(0) + + function open() { + open_signal.update((n) => n + 1) + } + + return { + open_signal: { subscribe: open_signal.subscribe }, + open, + } +} + +export const jobs_dialog = createJobsDialog() diff --git a/app/web_ui/src/lib/stores/jobs_store.test.ts b/app/web_ui/src/lib/stores/jobs_store.test.ts new file mode 100644 index 000000000..2eb1d5def --- /dev/null +++ b/app/web_ui/src/lib/stores/jobs_store.test.ts @@ -0,0 +1,305 @@ +// @vitest-environment jsdom +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest" +import { get, writable } from "svelte/store" +import type { JobRecord } from "./jobs_api" + +// ui_state drives the project filter. Provide a real writable so we can flip +// the current project mid-test. +const ui_state = writable<{ current_project_id: string | null }>({ + current_project_id: null, +}) + +vi.mock("$lib/api_client", () => ({ + base_url: "http://localhost:8757", + client: {}, +})) + +vi.mock("$lib/stores", () => ({ + ui_state, +})) + +// Spy on every mutation entry point. The store is a pure observer: it must +// never call any of these. We assert that explicitly on teardown below. +const mutationSpies = { + pause_job: vi.fn(), + resume_job: vi.fn(), + cancel_job: vi.fn(), + delete_job: vi.fn(), + create_job: vi.fn(), +} +vi.mock("./jobs_api", () => mutationSpies) + +// A controllable fake EventSource installed on globalThis. 
Records construction +// URLs and close() calls so tests can assert the pure-observer / reconnect +// behavior without a real network connection. +type Listener = (event: MessageEvent) => void + +class FakeEventSource { + static instances: FakeEventSource[] = [] + url: string + closed = false + onerror: ((this: EventSource, ev: Event) => void) | null = null + private listeners: Record = {} + + constructor(url: string) { + this.url = url + FakeEventSource.instances.push(this) + } + + addEventListener(type: string, listener: Listener) { + ;(this.listeners[type] ||= []).push(listener) + } + + close() { + this.closed = true + } + + emit(type: string, data: unknown) { + const event = { data: JSON.stringify(data) } as MessageEvent + for (const listener of this.listeners[type] || []) { + listener(event) + } + } + + fail() { + this.onerror?.call(this as unknown as EventSource, new Event("error")) + } + + static latest(): FakeEventSource { + return FakeEventSource.instances[FakeEventSource.instances.length - 1] + } + + static reset() { + FakeEventSource.instances = [] + } +} + +function makeJob(overrides: Partial = {}): JobRecord { + return { + id: "j_1", + type: "noop", + status: "running", + supports_pause: true, + created_at: "2026-05-28T12:00:00Z", + ...overrides, + } +} + +// Import the module fresh per test so the ref-counted connection and the +// module-level ui_state subscription start clean. 
+async function loadStore() { + vi.resetModules() + ui_state.set({ current_project_id: null }) + FakeEventSource.reset() + return await import("./jobs_store") +} + +describe("jobs_store", () => { + beforeEach(() => { + vi.useFakeTimers() + // @ts-expect-error install fake on global + globalThis.EventSource = FakeEventSource + for (const spy of Object.values(mutationSpies)) { + spy.mockClear() + } + }) + + afterEach(() => { + vi.useRealTimers() + vi.restoreAllMocks() + }) + + it("snapshot replaces the whole map", async () => { + const { jobs } = await loadStore() + const unsub = jobs.subscribe(() => {}) + const source = FakeEventSource.latest() + + source.emit("snapshot", { + jobs: [makeJob({ id: "j_1" }), makeJob({ id: "j_2" })], + }) + expect( + get(jobs) + .map((j) => j.id) + .sort(), + ).toEqual(["j_1", "j_2"]) + + // A second snapshot fully replaces the prior contents. + source.emit("snapshot", { jobs: [makeJob({ id: "j_3" })] }) + expect(get(jobs).map((j) => j.id)).toEqual(["j_3"]) + unsub() + }) + + it("job event inserts a new job", async () => { + const { jobs } = await loadStore() + const unsub = jobs.subscribe(() => {}) + const source = FakeEventSource.latest() + source.emit("snapshot", { jobs: [] }) + source.emit("job", makeJob({ id: "j_new" })) + expect(get(jobs).map((j) => j.id)).toEqual(["j_new"]) + unsub() + }) + + it("job event upserts status + progress for an existing job", async () => { + const { jobs } = await loadStore() + const unsub = jobs.subscribe(() => {}) + const source = FakeEventSource.latest() + source.emit("snapshot", { + jobs: [ + makeJob({ + id: "j_1", + status: "running", + progress: { success: 1, error: 0, total: 10 }, + }), + ], + }) + source.emit( + "job", + makeJob({ + id: "j_1", + status: "succeeded", + progress: { success: 10, error: 0, total: 10 }, + }), + ) + const job = get(jobs)[0] + expect(job.status).toBe("succeeded") + expect(job.progress?.success).toBe(10) + unsub() + }) + + it("deleted event removes a job; unknown id 
is a no-op", async () => { + const { jobs } = await loadStore() + const unsub = jobs.subscribe(() => {}) + const source = FakeEventSource.latest() + source.emit("snapshot", { + jobs: [makeJob({ id: "j_1" }), makeJob({ id: "j_2" })], + }) + source.emit("deleted", { id: "j_1" }) + expect(get(jobs).map((j) => j.id)).toEqual(["j_2"]) + source.emit("deleted", { id: "does_not_exist" }) + expect(get(jobs).map((j) => j.id)).toEqual(["j_2"]) + unsub() + }) + + it("reconnects on error and re-syncs from the fresh snapshot", async () => { + const { jobs } = await loadStore() + const unsub = jobs.subscribe(() => {}) + const first = FakeEventSource.latest() + first.emit("snapshot", { jobs: [makeJob({ id: "stale" })] }) + expect(get(jobs).map((j) => j.id)).toEqual(["stale"]) + + first.fail() + expect(first.closed).toBe(true) + + // After the backoff a new EventSource is constructed. + vi.advanceTimersByTime(2000) + expect(FakeEventSource.instances.length).toBe(2) + const second = FakeEventSource.latest() + expect(second).not.toBe(first) + + second.emit("snapshot", { jobs: [makeJob({ id: "fresh" })] }) + expect(get(jobs).map((j) => j.id)).toEqual(["fresh"]) + unsub() + }) + + it("active_jobs_count counts only pending/running/paused", async () => { + const { jobs, active_jobs_count } = await loadStore() + const unsubJobs = jobs.subscribe(() => {}) + const unsub = active_jobs_count.subscribe(() => {}) + const source = FakeEventSource.latest() + source.emit("snapshot", { + jobs: [ + makeJob({ id: "a", status: "pending" }), + makeJob({ id: "b", status: "running" }), + makeJob({ id: "c", status: "paused" }), + makeJob({ id: "d", status: "succeeded" }), + makeJob({ id: "e", status: "failed" }), + ], + }) + expect(get(active_jobs_count)).toBe(3) + unsub() + unsubJobs() + }) + + it("closes the EventSource when the last subscriber unsubscribes (pure observer)", async () => { + const { jobs } = await loadStore() + const unsub1 = jobs.subscribe(() => {}) + const unsub2 = jobs.subscribe(() => 
{}) + const source = FakeEventSource.latest() + // Only one EventSource is opened regardless of subscriber count. + expect(FakeEventSource.instances.length).toBe(1) + + unsub1() + expect(source.closed).toBe(false) + unsub2() + expect(source.closed).toBe(true) + }) + + it("opens with the project filter and re-opens when the project changes", async () => { + const { jobs } = await loadStore() + ui_state.set({ current_project_id: "p_1" }) + const unsub = jobs.subscribe(() => {}) + const first = FakeEventSource.latest() + expect(first.url).toContain("project_id=p_1") + + ui_state.set({ current_project_id: "p_2" }) + expect(first.closed).toBe(true) + const second = FakeEventSource.latest() + expect(second).not.toBe(first) + expect(second.url).toContain("project_id=p_2") + unsub() + }) + + it("ignores ui_state changes that don't touch current_project_id", async () => { + const { jobs } = await loadStore() + ui_state.set({ current_project_id: "p_1" }) + const unsub = jobs.subscribe(() => {}) + const first = FakeEventSource.latest() + expect(FakeEventSource.instances.length).toBe(1) + + // An unrelated ui_state update with the same project id must not re-open. 
+ ui_state.set({ current_project_id: "p_1", other: "x" } as { + current_project_id: string | null + }) + expect(FakeEventSource.instances.length).toBe(1) + expect(first.closed).toBe(false) + unsub() + }) + + it("reports an errored connection when the stream fails before syncing", async () => { + const { jobs, connection } = await loadStore() + const unsub = jobs.subscribe(() => {}) + expect(get(connection)).toBe("connecting") + + FakeEventSource.latest().fail() + expect(get(connection)).toBe("errored") + unsub() + }) + + it("connection becomes open once a snapshot arrives", async () => { + const { jobs, connection } = await loadStore() + const unsub = jobs.subscribe(() => {}) + FakeEventSource.latest().emit("snapshot", { jobs: [] }) + expect(get(connection)).toBe("open") + unsub() + }) + + it("never calls a mutation endpoint (pure observer) across its full lifecycle", async () => { + const { jobs } = await loadStore() + const unsub = jobs.subscribe(() => {}) + const source = FakeEventSource.latest() + + // Drive every observable path: snapshot, job upsert, deletion, an error + + // reconnect, a project switch, and finally teardown. 
+ source.emit("snapshot", { jobs: [makeJob({ id: "j_1" })] }) + source.emit("job", makeJob({ id: "j_1", status: "succeeded" })) + source.emit("deleted", { id: "j_1" }) + source.fail() + vi.advanceTimersByTime(2000) + ui_state.set({ current_project_id: "p_switch" }) + unsub() + + for (const spy of Object.values(mutationSpies)) { + expect(spy).not.toHaveBeenCalled() + } + }) +}) diff --git a/app/web_ui/src/lib/stores/jobs_store.ts b/app/web_ui/src/lib/stores/jobs_store.ts new file mode 100644 index 000000000..1718f95bf --- /dev/null +++ b/app/web_ui/src/lib/stores/jobs_store.ts @@ -0,0 +1,244 @@ +import { derived, get, writable, type Readable } from "svelte/store" +import { base_url } from "$lib/api_client" +import { ui_state } from "$lib/stores" +import type { JobRecord } from "./jobs_api" +import { is_active } from "./job_status" + +const RECONNECT_DELAY_MS = 2000 + +type JobsMap = Map + +// Connection state surfaced to the UI so the panel can distinguish "still +// connecting" from "can't connect". Stays a pure observer: this only reports +// the EventSource lifecycle, it never triggers a job mutation. +export type JobsConnection = "idle" | "connecting" | "open" | "errored" + +function createJobsStore() { + const jobs_map = writable(new Map()) + + // True once the first `snapshot` event for the current connection has been + // processed. Lets the panel show a loading state until the stream syncs. + const synced = writable(false) + + // Lifecycle of the underlying EventSource. The panel pairs this with `synced` + // to show a "can't connect / retrying" affordance instead of spinning forever + // when the stream errors before its first snapshot. 
+ const connection = writable("idle") + + let event_source: EventSource | null = null + let reconnect_timer: ReturnType | null = null + let subscriber_count = 0 + let current_project_id: string | null = null + + function build_url(): string { + const url = new URL(`${base_url}/api/jobs/events`) + if (current_project_id) { + url.searchParams.set("project_id", current_project_id) + } + return url.toString() + } + + function upsert(record: JobRecord) { + jobs_map.update((map) => { + const next = new Map(map) + next.set(record.id, record) + return next + }) + } + + function remove(id: string) { + jobs_map.update((map) => { + if (!map.has(id)) { + return map + } + const next = new Map(map) + next.delete(id) + return next + }) + } + + function replace_all(records: JobRecord[]) { + const next: JobsMap = new Map() + for (const record of records) { + next.set(record.id, record) + } + jobs_map.set(next) + } + + function handle_snapshot(event: MessageEvent) { + try { + const parsed = JSON.parse(event.data) as { jobs?: JobRecord[] } + replace_all(parsed.jobs ?? []) + synced.set(true) + connection.set("open") + } catch { + // Ignore malformed payloads; the next snapshot will re-sync. + } + } + + function handle_job(event: MessageEvent) { + try { + const record = JSON.parse(event.data) as JobRecord + upsert(record) + } catch { + // Ignore malformed payloads. + } + } + + function handle_deleted(event: MessageEvent) { + try { + const parsed = JSON.parse(event.data) as { id?: string } + if (parsed.id) { + remove(parsed.id) + } + } catch { + // Ignore malformed payloads. 
+ } + } + + function clear_reconnect() { + if (reconnect_timer !== null) { + clearTimeout(reconnect_timer) + reconnect_timer = null + } + } + + function schedule_reconnect() { + if (reconnect_timer !== null || subscriber_count === 0) { + return + } + reconnect_timer = setTimeout(() => { + reconnect_timer = null + if (subscriber_count > 0) { + connect() + } + }, RECONNECT_DELAY_MS) + } + + function close_source() { + if (event_source) { + event_source.close() + event_source = null + } + } + + function connect() { + // Pure observer: opening or closing this stream never affects a job. A + // dropped connection is recovered by reconnecting; the fresh `snapshot` + // re-syncs the map (no Last-Event-ID needed). + const EventSourceCtor = globalThis.EventSource + if (!EventSourceCtor) { + return + } + close_source() + clear_reconnect() + synced.set(false) + connection.set("connecting") + + const source = new EventSourceCtor(build_url()) + event_source = source + + source.addEventListener("snapshot", handle_snapshot as EventListener) + source.addEventListener("job", handle_job as EventListener) + source.addEventListener("deleted", handle_deleted as EventListener) + source.onerror = () => { + // Only reconnect if this is still the active source (avoids racing a + // teardown or a project switch). + if (event_source !== source) { + return + } + close_source() + connection.set("errored") + schedule_reconnect() + } + } + + function disconnect() { + close_source() + clear_reconnect() + synced.set(false) + connection.set("idle") + } + + // Re-open the stream against a new project filter. Called by the ui_state + // subscription below and exposed for tests. + function set_project(project_id: string | null) { + if (project_id === current_project_id) { + return + } + current_project_id = project_id + if (subscriber_count > 0) { + connect() + } + } + + // Track the active project from UI state so the badge/panel stay scoped to + // the project the user is viewing. 
`ui_state` fires on any field change, so + // we react only when `current_project_id` actually differs from what we last + // saw — keeping rapid project switches correct (the old source is closed by + // `connect()` before the new one opens, so there's no leak). + current_project_id = get(ui_state).current_project_id ?? null + let last_seen_project_id = current_project_id + ui_state.subscribe((state) => { + const next = state.current_project_id ?? null + if (next === last_seen_project_id) { + return + } + last_seen_project_id = next + set_project(next) + }) + + const subscribe: Readable["subscribe"] = (run, invalidate) => { + if (subscriber_count === 0) { + connect() + } + subscriber_count += 1 + const unsubscribe = jobs_map.subscribe(run, invalidate) + return () => { + unsubscribe() + subscriber_count -= 1 + if (subscriber_count <= 0) { + subscriber_count = 0 + disconnect() + } + } + } + + return { + subscribe, + synced: { subscribe: synced.subscribe } as Readable, + connection: { + subscribe: connection.subscribe, + } as Readable, + set_project, + // Exposed for tests / explicit teardown; not part of normal usage. + _disconnect: disconnect, + } +} + +export const jobs_store = createJobsStore() + +export const synced: Readable = jobs_store.synced + +export const connection: Readable = jobs_store.connection + +export const jobs: Readable = derived(jobs_store, ($map) => + Array.from($map.values()).sort( + (a, b) => + new Date(b.created_at ?? 0).getTime() - + new Date(a.created_at ?? 
0).getTime(), + ), +) + +export const active_jobs_count: Readable = derived( + jobs_store, + ($map) => { + let count = 0 + for (const job of $map.values()) { + if (is_active(job.status)) { + count += 1 + } + } + return count + }, +) diff --git a/app/web_ui/src/lib/ui/icons/jobs_icon.svelte b/app/web_ui/src/lib/ui/icons/jobs_icon.svelte new file mode 100644 index 000000000..8e6f82c3f --- /dev/null +++ b/app/web_ui/src/lib/ui/icons/jobs_icon.svelte @@ -0,0 +1,29 @@ + diff --git a/app/web_ui/src/routes/(app)/+layout.svelte b/app/web_ui/src/routes/(app)/+layout.svelte index ac367455b..26e510c06 100644 --- a/app/web_ui/src/routes/(app)/+layout.svelte +++ b/app/web_ui/src/routes/(app)/+layout.svelte @@ -18,6 +18,10 @@ import ToolsIcon from "$lib/ui/icons/tools_icon.svelte" import ChatBar from "./chat_bar.svelte" import ChatIcon from "$lib/ui/icons/chat_icon.svelte" + import JobsIcon from "$lib/ui/icons/jobs_icon.svelte" + import SidebarJobsIndicator from "$lib/components/SidebarJobsIndicator.svelte" + import JobsDialog from "$lib/components/jobs_dialog.svelte" + import { jobs_dialog } from "$lib/stores/jobs_dialog" import { Section } from "$lib/ui/section" import Dialog from "$lib/ui/dialog.svelte" import SidebarRail from "./sidebar_rail.svelte" @@ -25,6 +29,11 @@ import { chatBarExpanded } from "$lib/stores/chat_ui_state" import { derived } from "svelte/store" import DatabaseIcon from "$lib/ui/icons/database_icon.svelte" + import { env } from "$env/dynamic/public" + + // Feature flag: the background Jobs UI (sidebar entry + dialog) only renders + // when PUBLIC_ENABLE_JOBS is explicitly "true". See .env.example. + const jobs_enabled = env.PUBLIC_ENABLE_JOBS === "true" // Rail-eligibility predicate: lg breakpoint, narrow viewport (< 1550px), // and chat bar expanded. See functional_spec.md "Trigger". @@ -161,7 +170,11 @@ > {#if showRail} - taskDialog?.show()} /> + taskDialog?.show()} + /> {:else}