diff --git a/CLAUDE.md b/CLAUDE.md index 652f49b..c0b53ce 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -153,8 +153,41 @@ def verb( result = client.method(project, ...) formatter.render(result) formatter.render_success(f"Target '{target}' verbed.") + surface_warnings(ctx, formatter, result) # mutating ops: flag degraded-but-successful state ``` +### Error reporting (the diagnosis layer) + +Errors must give **clarity and a next step**: say what's wrong, point neutrally at +where to look, and suggest the fix. The machinery lives in `api/errors.py`: + +- **`Fault`** (StrEnum): `USER_INPUT`, `USER_APP`, `USER_CONFIG`, `AUTH`, `PLATFORM`, + `NETWORK`, `UNKNOWN`. Drives a neutral source label (`FAULT_SOURCE`), color + (`FAULT_COLOR`), and CI/CD exit code (`FAULT_EXIT_CODE`: 1 = your fault, 2 = + platform/transient, 3 = unknown/unattributable). +- **`Diagnosis`** (dataclass): `fault`, `headline`, `summary`, `details`, `next_steps`, + `status_code`. `to_dict()` is the json contract (CI branches on `fault`). +- **`diagnose_http_error`** parses status codes and FastAPI `422` validation arrays; + **`diagnose_task_failure`** digs into `processing.component_failures`, `error_type`, + and `ErrorCategory`; **`degraded_diagnoses`** catches "succeeded but unhealthy". + +Rules for new code: +- The client raises `ZadApiError` / `TaskFailedError` / `TaskTimeoutError` with a + `.diagnosis` attached (build it at the raise site via the `diagnose_*` helpers or + `_http_error`). `handle_api_errors` renders it and exits with `diagnosis.exit_code`. +- Render failures with `formatter.render_diagnosis(d)`, degraded-success with + `formatter.render_warnings(diags)`. Diagnostics go to **stderr**; json error objects + go to stdout. Never hardcode an error string where a `Diagnosis` belongs. +- After any mutating op, call `surface_warnings(ctx, formatter, result)` so warnings / + unhealthy components are surfaced (and `--strict` can fail CI). +- **Honesty:** when the API gives no category, the fault is `UNKNOWN` and we point at + the logs (exit code 3); don't guess whose fault it is. + +**Spec coupling:** `CATEGORY_FAULT` / `CATEGORY_HINT` are keyed by `ErrorCategory`, and +`tests/test_spec_conformance.py` asserts the enum matches `api/upstream-openapi.json` and +that every category is mapped. When the api-sync workflow surfaces a new `ErrorCategory`, +add it to `models.ErrorCategory` **and** both maps; the conformance test tells you. + ### Client method conventions - One public method per API endpoint on `ZadClient` diff --git a/README.md b/README.md index cb21426..1d57e41 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ zad backup create production | API URL | `--api-url` | `ZAD_API_URL` | `api_url` | production URL | | Output | `-o` | `ZAD_OUTPUT_FORMAT` | - | `table` | | No wait | `--no-wait` | - | - | wait | +| Strict | `--strict` | - | - | off | Precedence: **flags > env vars / `.env` > config file > defaults** @@ -71,6 +72,35 @@ Every command supports `--output` / `-o`: `table` (default), `json`, `yaml`. zad metrics overview --output json | jq '.cpu_usage' ``` +## Errors & exit codes + +Errors tell you **what's wrong and what to do next**, with a neutral label for where +to look (your request, your application, your configuration, your credentials, or the +ZAD platform) instead of a bare HTTP code. A failed image pull points you straight at +the image and registry (`Source: your application (cluster runtime)`) with the fix. + +Each error carries a structured diagnosis. In `--output json` it's a single object +on stdout you can branch on in CI/CD: + +```bash +zad deployment create app -c web=img:tag -o json > out.json || jq -r .fault out.json +# UserInput | UserApp | UserConfig | Auth | Platform | Network | Unknown +``` + +Exit codes: + +| Code | Meaning | +|------|---------| +| `0` | success | +| `1` | your fault, fix it (bad input, app/config failure, auth) | +| `2` | platform/network, transient and safe to retry | +| `3` | unknown, the API gave no signal to attribute the failure (check the logs) | + +`--strict` makes a command that *succeeds but reports warnings* (e.g. the deploy +applied but a component is crash-looping) exit non-zero, so a pipeline fails the +build instead of going green on an unhealthy app. Diagnostics go to **stderr**; +data (and the json error object) go to **stdout**, so pipes stay clean. + ## Commands ``` diff --git a/src/zad_cli/api/client.py b/src/zad_cli/api/client.py index d3bcfa2..b021df5 100644 --- a/src/zad_cli/api/client.py +++ b/src/zad_cli/api/client.py @@ -10,33 +10,41 @@ import httpx from pydantic import ValidationError +from zad_cli.api.errors import Diagnosis, Fault, diagnose_http_error, diagnose_task_failure from zad_cli.api.models import DeploymentDetail, DeploymentListResponse, TaskStatus class ZadApiError(Exception): - """Raised when the ZAD API returns an error.""" + """Raised when the ZAD API returns an error. - def __init__(self, status_code: int, message: str, details: dict | None = None): + Carries a :class:`~zad_cli.api.errors.Diagnosis` so the CLI can render an + honest, source-labelled message instead of a bare ``HTTP ``. + """ + + def __init__(self, status_code: int, message: str, details: dict | None = None, diagnosis: Diagnosis | None = None): self.status_code = status_code self.message = message self.details = details or {} + self.diagnosis = diagnosis super().__init__(f"HTTP {status_code}: {message}") class TaskTimeoutError(Exception): """Raised when task polling exceeds the timeout.""" - def __init__(self, message: str, task_id: str | None = None): + def __init__(self, message: str, task_id: str | None = None, diagnosis: Diagnosis | None = None): self.task_id = task_id + self.diagnosis = diagnosis super().__init__(message) class TaskFailedError(Exception): """Raised when a polled task reports failure.""" - def __init__(self, message: str, details: dict | None = None): + def __init__(self, message: str, details: dict | None = None, diagnosis: Diagnosis | None = None): self.message = message self.details = details or {} + self.diagnosis = diagnosis super().__init__(message) @@ -48,7 +56,20 @@ def _parse_v2_response(model_cls: type, payload: Any) -> dict: try: return model_cls.model_validate(payload).model_dump(mode="json") except ValidationError as e: - raise ZadApiError(502, f"Unexpected API response shape for {model_cls.__name__}: {e}") from e + raise ZadApiError( + 502, + f"Unexpected API response shape for {model_cls.__name__}: {e}", + diagnosis=Diagnosis( + fault=Fault.PLATFORM, + headline="ZAD returned a response this CLI couldn't read; likely a CLI/API version mismatch.", + summary=f"Schema {model_cls.__name__} failed to validate.", + next_steps=[ + "Retry shortly (exit code 2 = transient).", + "If it persists, the CLI may be out of date; update it or report the mismatch.", + ], + status_code=502, + ), + ) from e class ZadClient: @@ -113,22 +134,17 @@ def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response: time.sleep(delay) delay *= 2 continue - raise ZadApiError(0, f"Connection failed: {e}") from e + raise ZadApiError(0, f"Connection failed: {e}", diagnosis=diagnose_http_error(0, str(e))) from e if response.status_code in _RETRYABLE_CODES and attempt < self.max_retries: print(f"HTTP {response.status_code}, retrying in {delay}s...", file=sys.stderr) time.sleep(delay) delay *= 2 - last_error = ZadApiError(response.status_code, response.text) + last_error = self._http_error(response) continue if response.status_code >= 400: - try: - body = response.json() - message = body.get("message", body.get("detail", response.text)) - except Exception: - message = response.text - raise ZadApiError(response.status_code, message) + raise self._http_error(response) if self.verbose: print(f"<-- {response.status_code} ({response.elapsed.total_seconds():.2f}s)", file=sys.stderr) @@ -137,6 +153,21 @@ def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response: raise last_error or ZadApiError(0, "Request failed") + @staticmethod + def _http_error(response: httpx.Response) -> ZadApiError: + """Build a diagnosed ZadApiError from a >=400 response.""" + try: + body: Any = response.json() + except Exception: + body = response.text + if isinstance(body, dict): + message = body.get("message") or body.get("detail") or response.text + else: + message = response.text or str(body) + if not isinstance(message, str): + message = str(message) + return ZadApiError(response.status_code, message, diagnosis=diagnose_http_error(response.status_code, body)) + def _async_request(self, method: str, path: str, **kwargs: Any) -> dict: """Make a v2 async request. Polls for result unless self.wait is False.""" response = self._request(method, path, **kwargs) @@ -183,7 +214,7 @@ def _poll_task(self, poll_url: str) -> dict: continue if response.status_code >= 400: - raise ZadApiError(response.status_code, data.get("detail", data.get("message", str(data)))) + raise self._http_error(response) status = TaskStatus(**data) if isinstance(data, dict) else TaskStatus(status="unknown") task_id = task_id or data.get("task_id") @@ -195,13 +226,32 @@ def _poll_task(self, poll_url: str) -> dict: if status.status == "completed": return status.result or data if status.status == "failed": - raise TaskFailedError(status.error_message or "Task failed", details=status.result) + raise TaskFailedError( + status.error_message or "Task failed", + details=status.result, + diagnosis=diagnose_task_failure(status.error_message, status.result), + ) if status.status == "cancelled": - raise TaskFailedError("Task was cancelled") + raise TaskFailedError( + "Task was cancelled", + diagnosis=Diagnosis( + fault=Fault.UNKNOWN, + headline="The task was cancelled before it finished.", + next_steps=["Re-run the command, or check `zad task list` for details."], + ), + ) time.sleep(self.task_poll_interval) - raise TaskTimeoutError(f"Task did not complete within {self.task_timeout}s", task_id=task_id) + raise TaskTimeoutError( + f"Task did not complete within {self.task_timeout}s", + task_id=task_id, + diagnosis=Diagnosis( + fault=Fault.UNKNOWN, + headline=f"Timed out after {self.task_timeout}s waiting for the task; it may still be running.", + next_steps=["This is a wait limit, not a failure. Check `zad task status `."], + ), + ) # --- V2 project/deployment operations (async, poll for result) --- diff --git a/src/zad_cli/api/errors.py b/src/zad_cli/api/errors.py new file mode 100644 index 0000000..959b293 --- /dev/null +++ b/src/zad_cli/api/errors.py @@ -0,0 +1,390 @@ +"""Clear, actionable diagnosis of API and task failures. + +The goal is simple: tell the user *what went wrong and what to do next*. The +upstream API already carries the signal for that (``ErrorCategory`` on cluster +errors, ``ComponentFailureInfo`` with log tails on failed deployment tasks, +``HTTPValidationError`` on bad input, ``error_type`` on task results), but a bare +``HTTP 500`` / ``Task failed`` string throws it away. + +This module turns those raw signals into a :class:`Diagnosis`: a plain-language +headline, a neutral source label so you know where to look ("Source: your +application"), the concrete message, the backend's own explanation, and a next +step. The fault vocabulary is kept in lockstep with the OpenAPI spec by +``tests/test_spec_conformance.py`` (strict coupling: drift fails CI) while runtime +parsing degrades gracefully on unknown values (loose coupling). + +We never claim more certainty than the data supports: when the API gives no +category, the fault is ``UNKNOWN`` and we point at the logs rather than guessing. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import StrEnum + +from pydantic import ValidationError + +from zad_cli.api.models import ErrorCategory, ProcessingStatus + + +class Fault(StrEnum): + """Who/what a failure belongs to. Drives the source label, color and exit code.""" + + USER_INPUT = "UserInput" # the request you sent is wrong + USER_APP = "UserApp" # your container/workload failed at runtime + USER_CONFIG = "UserConfig" # your git config/manifests couldn't be applied + AUTH = "Auth" # your API key / permissions + PLATFORM = "Platform" # ZAD itself errored + NETWORK = "Network" # couldn't reach ZAD + UNKNOWN = "Unknown" # not enough signal to attribute honestly + + +# Neutral, source-labelled phrasing (no blame, just where the fault lives). +FAULT_SOURCE: dict[Fault, str] = { + Fault.USER_INPUT: "your request", + Fault.USER_APP: "your application (cluster runtime)", + Fault.USER_CONFIG: "your configuration / git", + Fault.AUTH: "your credentials / permissions", + Fault.PLATFORM: "ZAD platform", + Fault.NETWORK: "network / connectivity", + Fault.UNKNOWN: "unknown (see logs)", +} + +# Rich color: user-fixable = yellow, escalate/investigate = red, auth = magenta. +FAULT_COLOR: dict[Fault, str] = { + Fault.USER_INPUT: "yellow", + Fault.USER_APP: "yellow", + Fault.USER_CONFIG: "yellow", + Fault.AUTH: "magenta", + Fault.PLATFORM: "red", + Fault.NETWORK: "red", + Fault.UNKNOWN: "red", +} + +# CI/CD exit codes: 1 = your fault (fix it), 2 = platform/transient (safe to retry), +# 3 = unattributable. UNKNOWN gets its own code rather than claiming "your fault" +# (1) or "safe to retry" (2) when the API gave us no signal to attribute the failure. +FAULT_EXIT_CODE: dict[Fault, int] = { + Fault.USER_INPUT: 1, + Fault.USER_APP: 1, + Fault.USER_CONFIG: 1, + Fault.AUTH: 1, + Fault.PLATFORM: 2, + Fault.NETWORK: 2, + Fault.UNKNOWN: 3, +} + +# Which fault each cluster ErrorCategory implies. Keyed by ErrorCategory so the +# spec-conformance test can assert every upstream category is mapped here. +CATEGORY_FAULT: dict[ErrorCategory, Fault] = { + ErrorCategory.IMAGE_PULL: Fault.USER_APP, + ErrorCategory.CRASH_LOOP: Fault.USER_APP, + ErrorCategory.OUT_OF_MEMORY: Fault.USER_APP, + ErrorCategory.HEALTH_CHECK: Fault.USER_APP, + ErrorCategory.SYNC_FAILED: Fault.USER_CONFIG, + ErrorCategory.COMPARISON_ERROR: Fault.USER_CONFIG, + ErrorCategory.UNKNOWN: Fault.UNKNOWN, +} + +# Fallback next-step hint, used ONLY when the backend gave no explanation of its +# own. We always prefer the server's words over these. +CATEGORY_HINT: dict[ErrorCategory, str] = { + ErrorCategory.IMAGE_PULL: "Check the image tag exists and the registry is reachable / credentials are set.", + ErrorCategory.CRASH_LOOP: "The container starts then exits. Check `zad logs` for the crash reason.", + ErrorCategory.OUT_OF_MEMORY: "The container exceeded its memory limit. Reduce usage or raise the limit.", + ErrorCategory.HEALTH_CHECK: "The app started but its readiness/liveness probe never passed. Check the probe.", + ErrorCategory.SYNC_FAILED: "ZAD could not sync your config from git. Check the repo, branch, and manifests.", + ErrorCategory.COMPARISON_ERROR: "ZAD could not compare desired vs live state. Retry `zad deployment refresh`.", + ErrorCategory.UNKNOWN: "", +} + +# Unambiguous Kubernetes reason tokens we can map without guessing. Used only as +# a last resort when the API gives a raw message but no structured category. +_K8S_TOKEN_CATEGORY: dict[str, ErrorCategory] = { + "imagepullbackoff": ErrorCategory.IMAGE_PULL, + "errimagepull": ErrorCategory.IMAGE_PULL, + "invalidimagename": ErrorCategory.IMAGE_PULL, + "crashloopbackoff": ErrorCategory.CRASH_LOOP, + "oomkilled": ErrorCategory.OUT_OF_MEMORY, +} + +# HTTP status -> fault for the cases that aren't a simple 4xx/5xx split. +_HTTP_FAULT: dict[int, Fault] = { + 400: Fault.USER_INPUT, + 401: Fault.AUTH, + 403: Fault.AUTH, + 404: Fault.USER_INPUT, + 409: Fault.USER_INPUT, + 422: Fault.USER_INPUT, +} + + +@dataclass +class Diagnosis: + """A structured, source-labelled explanation of a failure. + + ``details`` are concrete lines (validation errors, component failures, log + tails); ``next_steps`` are actionable suggestions. ``summary`` is the raw + upstream message when we have one. + """ + + fault: Fault + headline: str + summary: str | None = None + details: list[str] = field(default_factory=list) + next_steps: list[str] = field(default_factory=list) + status_code: int | None = None + + @property + def source(self) -> str: + return FAULT_SOURCE[self.fault] + + @property + def color(self) -> str: + return FAULT_COLOR[self.fault] + + @property + def exit_code(self) -> int: + return FAULT_EXIT_CODE[self.fault] + + def to_dict(self) -> dict: + """Flat, machine-readable shape for `--output json` (CI/CD branch key: `fault`).""" + return { + "fault": self.fault.value, + "source": self.source, + "headline": self.headline, + "summary": self.summary, + "details": self.details, + "next_steps": self.next_steps, + "status_code": self.status_code, + } + + +def category_of(value: str | None) -> ErrorCategory: + """Coerce an arbitrary string to a known ErrorCategory (case-insensitive), else UNKNOWN. + + Loose coupling: an upstream category we don't know yet maps to UNKNOWN rather + than raising. + """ + if isinstance(value, str): + for cat in ErrorCategory: + if cat.value.lower() == value.lower(): + return cat + return ErrorCategory.UNKNOWN + + +def _scan_category(text: str | None) -> ErrorCategory: + """Best-effort category from raw text. + + Matches the backend's own ``ErrorCategory`` vocabulary (spec-derived) plus a + few unambiguous Kubernetes reason tokens. Returns UNKNOWN rather than guessing + when nothing matches. + """ + if not text: + return ErrorCategory.UNKNOWN + low = text.lower().replace(" ", "") + for token, cat in _K8S_TOKEN_CATEGORY.items(): + if token in low: + return cat + for cat in ErrorCategory: + if cat is not ErrorCategory.UNKNOWN and cat.value.lower() in low: + return cat + return ErrorCategory.UNKNOWN + + +def _parse_processing(raw: object) -> ProcessingStatus | None: + if not isinstance(raw, dict): + return None + try: + return ProcessingStatus.model_validate(raw) + except ValidationError: + return None + + +def _format_validation(detail: object) -> list[str]: + """Turn a FastAPI HTTPValidationError ``detail`` array into readable field lines.""" + if not isinstance(detail, list): + return [] + lines: list[str] = [] + for item in detail: + if not isinstance(item, dict): + lines.append(str(item)) + continue + loc = [str(p) for p in item.get("loc", [])] + if loc and loc[0] in {"body", "query", "path", "header", "cookie"}: + loc = loc[1:] + field_path = ".".join(loc) or "(request)" + lines.append(f"{field_path}: {item.get('msg', 'invalid value')}") + return lines + + +def diagnose_http_error(status_code: int, body: object) -> Diagnosis: + """Diagnose a failed HTTP response. + + ``status_code == 0`` means the request never reached ZAD (connection error). + ``body`` may be a parsed dict, a raw string, or None. + """ + if status_code == 0: + return Diagnosis( + fault=Fault.NETWORK, + headline="Could not reach the ZAD API.", + summary=str(body) if body else None, + next_steps=[ + "Check your network/VPN and that --api-url is correct.", + "If ZAD should be reachable, retry shortly (exit code 2 = transient).", + ], + status_code=0, + ) + + fault = _HTTP_FAULT.get(status_code) + if fault is None: + fault = ( + Fault.USER_INPUT if 400 <= status_code < 500 else Fault.PLATFORM if status_code >= 500 else Fault.UNKNOWN + ) + + body_dict = body if isinstance(body, dict) else None + details: list[str] = [] + summary: str | None = None + + if status_code == 422 and body_dict is not None: + details = _format_validation(body_dict.get("detail")) + if body_dict is not None and not details: + raw = body_dict.get("message") or body_dict.get("detail") + summary = raw if isinstance(raw, str) else None + elif isinstance(body, str) and body.strip(): + summary = body.strip() + + headline, next_steps = _http_headline(status_code, fault) + return Diagnosis( + fault=fault, + headline=headline, + summary=summary, + details=details, + next_steps=next_steps, + status_code=status_code, + ) + + +def _http_headline(status_code: int, fault: Fault) -> tuple[str, list[str]]: + if status_code in (401, 403): + verb = "Authentication failed" if status_code == 401 else "Permission denied" + return ( + f"{verb} (HTTP {status_code}).", + ["Set a valid ZAD_API_KEY (or --api-key) with access to this project."], + ) + if status_code == 404: + return ( + "Not found (HTTP 404): the resource you referenced doesn't exist.", + ["Check the name/spelling and that it exists (e.g. `zad deployment list`)."], + ) + if status_code == 409: + return ( + "Conflict (HTTP 409): the resource is in a state that blocks this action.", + ["Check its current state, then retry once it settles."], + ) + if status_code == 422: + return ( + "Invalid request (HTTP 422): the values you sent didn't pass validation.", + ["Fix the field(s) listed above and retry."], + ) + if fault is Fault.PLATFORM: + return ( + f"ZAD platform error (HTTP {status_code}), usually transient.", + ["Retry shortly (exit code 2 = transient). If it persists, report it with the time of the call."], + ) + return (f"Request rejected (HTTP {status_code}).", []) + + +def diagnose_task_failure(error_message: str | None, result: object) -> Diagnosis: + """Diagnose a failed async task from its ``error_message`` and ``result`` payload.""" + result_dict = result if isinstance(result, dict) else {} + processing = _parse_processing(result_dict.get("processing")) + failures = (processing.component_failures if processing else None) or [] + + details: list[str] = [] + next_steps: list[str] = [] + + if failures: + cats: list[ErrorCategory] = [] + for fail in failures: + cat = category_of(fail.failure_type) + cats.append(cat) + label = f"{fail.component} ({fail.failure_type}): {fail.message}" + details.append(label) + for line in (fail.logs or [])[:5]: + details.append(f" {line}") + hint = CATEGORY_HINT.get(cat) + if hint and hint not in next_steps: + next_steps.append(hint) + known = [c for c in cats if c is not ErrorCategory.UNKNOWN] + # A component concretely failed at runtime → it's the app, even if the + # exact category is unrecognised. + fault = CATEGORY_FAULT[known[0]] if known else Fault.USER_APP + else: + # No structured failures: fall back to a category scan of the raw text. + text = " ".join( + t for t in [error_message, processing.error if processing else None, result_dict.get("error")] if t + ) + cat = _scan_category(text) + fault = CATEGORY_FAULT[cat] if cat is not ErrorCategory.UNKNOWN else Fault.UNKNOWN + hint = CATEGORY_HINT.get(cat) + if hint: + next_steps.append(hint) + + summary = ( + error_message or (processing.error if processing else None) or (processing.message if processing else None) + ) + + if fault is Fault.USER_APP: + headline = "Your application didn't start on the cluster (the deploy reached the cluster; the workload failed)." + next_steps.append("Inspect `zad logs -d ` and `zad deployment describe `.") + elif fault is Fault.USER_CONFIG: + headline = "Your configuration couldn't be applied." + next_steps.append("Fix your git repo/manifests, then `zad deployment refresh`.") + else: + headline = "The operation failed. Check the details below for the cause." + next_steps.append("Run `zad task status ` and `zad logs` for the full output.") + + return Diagnosis(fault=fault, headline=headline, summary=summary, details=details, next_steps=next_steps) + + +def degraded_diagnoses(result: object) -> list[Diagnosis]: + """Inspect a *successful* task result for degraded state worth surfacing. + + Returns an empty list for a genuinely clean result. Catches the + "looks like it worked but your app is actually unhealthy" case: component + failures, ``warnings``, or a ``partial``/``degraded`` status on an otherwise + 200/completed response. + """ + result_dict = result if isinstance(result, dict) else {} + out: list[Diagnosis] = [] + + processing = _parse_processing(result_dict.get("processing")) + if processing and processing.component_failures: + diag = diagnose_task_failure(None, result_dict) + diag.headline = "The operation succeeded, but some components are unhealthy." + out.append(diag) + + warnings = result_dict.get("warnings") or [] + if warnings: + out.append( + Diagnosis( + fault=Fault.USER_CONFIG, + headline="The operation succeeded with warnings.", + details=[str(w) for w in warnings], + next_steps=["Review the warnings above; they usually point at your configuration."], + ) + ) + + status = str(result_dict.get("status", "")).lower() + if status in {"partial", "degraded"} and not out: + out.append( + Diagnosis( + fault=Fault.UNKNOWN, + headline=f"The operation finished with status '{result_dict.get('status')}'.", + summary=result_dict.get("message") or None, + next_steps=["Run `zad deployment describe ` to see the current state."], + ) + ) + + return out diff --git a/src/zad_cli/api/models.py b/src/zad_cli/api/models.py index 6e7283f..e3a6eb3 100644 --- a/src/zad_cli/api/models.py +++ b/src/zad_cli/api/models.py @@ -149,14 +149,65 @@ class TaskResponse(BaseModel): status: str = "pending" +class SubtaskStatus(BaseModel): + """Status of a single subtask within a task's progress. + + Mirrors the upstream ``SubtaskStatus`` schema. Extra fields are ignored so + additive upstream changes don't break parsing (loose coupling). + """ + + id: str + name: str + status: str + error: str | None = None + parent_id: str | None = None + + +class ComponentFailureInfo(BaseModel): + """Per-component failure detail for deployment health issues. + + Mirrors the upstream ``ComponentFailureInfo`` schema. This is the richest + diagnostic the API surfaces on a failed deployment task: which component + failed, the failure type, a message, and the tail of its container logs. + """ + + component: str + deployment: str = "" + failure_type: str + message: str + logs: list[str] | None = None + + +class ProcessingStatus(BaseModel): + """Processing-step status inside a task result. + + Mirrors the upstream task-models ``ProcessingStatus`` schema (the richer of + the two same-named schemas, the one that carries ``component_failures``). + """ + + status: str + message: str | None = None + error: str | None = None + result: object | None = None + component_failures: list[ComponentFailureInfo] | None = None + + class TaskStatus(BaseModel): - """Status of a polled task.""" + """Status of a polled task. + + ``result`` stays an untyped dict because its shape varies per task type + (UpsertDeploymentResult, RefreshDeploymentResult, ...); the diagnosis layer + parses the bits it understands defensively. ``subtasks``/``task_type`` are + optional so older API responses still validate. + """ status: str + task_type: str | None = None current_step: str | None = None progress_percent: int | None = None result: dict | None = None error_message: str | None = None + subtasks: list[SubtaskStatus] | None = None class DeploymentStatus(StrEnum): diff --git a/src/zad_cli/cli.py b/src/zad_cli/cli.py index 13f8a4c..9c4565b 100644 --- a/src/zad_cli/cli.py +++ b/src/zad_cli/cli.py @@ -28,7 +28,7 @@ class _GlobalOptionsGroup(TyperGroup): """Hoist global options to before the subcommand so they work in any position.""" _OPTS_WITH_VALUE = frozenset({"--output", "-o", "--api-key", "--api-url", "--project", "-p"}) - _FLAGS = frozenset({"--no-wait", "--verbose", "-v", "--version", "-V"}) + _FLAGS = frozenset({"--no-wait", "--verbose", "-v", "--version", "-V", "--strict"}) def parse_args(self, ctx, args): # noqa: ANN001 global_args: list[str] = [] @@ -96,6 +96,9 @@ def main_callback( project_id: str = typer.Option(None, "--project", "-p", envvar="ZAD_PROJECT_ID", help="Project ID"), no_wait: bool = typer.Option(False, "--no-wait", help="Don't wait for async operations, return task ID"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose request logging"), + strict: bool = typer.Option( + False, "--strict", help="Exit non-zero when an operation succeeds but reports warnings (for CI/CD)" + ), version: bool = typer.Option( False, "--version", "-V", help="Show version and exit", callback=_version_callback, is_eager=True ), @@ -111,6 +114,7 @@ def main_callback( ctx.obj["settings"] = settings ctx.obj["formatter"] = OutputFormatter(fmt=settings.output_format) ctx.obj["no_wait"] = no_wait + ctx.obj["strict"] = strict @app.command(deprecated=True) diff --git a/src/zad_cli/commands/clone.py b/src/zad_cli/commands/clone.py index 9d57cf4..0c4cc14 100644 --- a/src/zad_cli/commands/clone.py +++ b/src/zad_cli/commands/clone.py @@ -5,7 +5,7 @@ import typer from zad_cli.api.models import CloneBucketRequest, CloneDatabaseRequest -from zad_cli.helpers import get_helpers, handle_api_errors, require_project +from zad_cli.helpers import get_helpers, handle_api_errors, require_project, surface_warnings app = typer.Typer( help="Clone data from external sources.\n\nRequires ZAD_API_KEY and ZAD_PROJECT_ID (or --api-key and -p).", @@ -60,6 +60,7 @@ def database( result = client.clone_database(project, deployment, request.to_api_payload()) formatter.render(result) formatter.render_success("Database clone started.") + surface_warnings(ctx, formatter, result) @app.command() @@ -109,6 +110,7 @@ def bucket( result = client.clone_bucket(project, deployment, request.to_api_payload()) formatter.render(result) formatter.render_success("Bucket clone started.") + surface_warnings(ctx, formatter, result) @app.command() diff --git a/src/zad_cli/commands/deployment.py b/src/zad_cli/commands/deployment.py index a74f4ae..81d0491 100644 --- a/src/zad_cli/commands/deployment.py +++ b/src/zad_cli/commands/deployment.py @@ -12,8 +12,10 @@ confirm_action, get_helpers, handle_api_errors, + issues_cell, render_dry_run, require_project, + surface_warnings, ) app = typer.Typer( @@ -42,17 +44,21 @@ def list_deployments(ctx: typer.Context) -> None: rows = [] for dep in deployments: + status = dep.get("status", "Active") rows.append( { "deployment": dep["deployment"], "components": str(len(dep["components"])), - "status": dep.get("status", "Active"), + "status": f"[{_status_color(status)}]{status}[/{_status_color(status)}]", + "issues": issues_cell(dep.get("errors")), "namespace": dep["namespace"], } ) formatter.render( - rows, columns=["deployment", "components", "status", "namespace"], title=f"Deployments in {project}" + rows, + columns=["deployment", "components", "status", "issues", "namespace"], + title=f"Deployments in {project}", ) @@ -207,6 +213,7 @@ def create( result = client.upsert_deployment(project, request.to_api_payload()) formatter.render(result) formatter.render_success(f"Deployment '{deployment_name}' created/updated in project '{project}'.") + surface_warnings(ctx, formatter, result) @app.command("update-image") @@ -245,6 +252,7 @@ def update_image( result = client.update_image(project, deployment, component, image, **kwargs) formatter.render(result) formatter.render_success(f"Image updated: {component} -> {image}") + surface_warnings(ctx, formatter, result) @app.command() @@ -271,6 +279,7 @@ def refresh( result = client.refresh_deployment(project, deployment, force_clone=force_clone) formatter.render(result) formatter.render_success(f"Deployment '{deployment}' refreshed.") + surface_warnings(ctx, formatter, result) @app.command() diff --git a/src/zad_cli/commands/project.py b/src/zad_cli/commands/project.py index 3b1998c..e824622 100644 --- a/src/zad_cli/commands/project.py +++ b/src/zad_cli/commands/project.py @@ -4,7 +4,15 @@ import typer -from zad_cli.helpers import confirm_action, get_helpers, handle_api_errors, render_dry_run, require_project +from zad_cli.helpers import ( + confirm_action, + get_helpers, + handle_api_errors, + issues_cell, + render_dry_run, + require_project, + surface_warnings, +) app = typer.Typer( help="Manage projects.\n\nMost commands require ZAD_API_KEY and ZAD_PROJECT_ID (or --api-key and -p).", @@ -71,6 +79,7 @@ def status(ctx: typer.Context) -> None: table = Table(title="Deployments", show_header=True) table.add_column("Deployment", style="bold cyan") table.add_column("Components") + table.add_column("Issues") table.add_column("URL") for dep in result["deployments"]: @@ -79,7 +88,7 @@ def status(ctx: typer.Context) -> None: if dep.get("urls"): first_url = next(iter(dep["urls"].values()), "") url = first_url - table.add_row(dep["deployment"], components, url) + table.add_row(dep["deployment"], components, issues_cell(dep.get("errors")), url) console.print(table) @@ -102,6 +111,7 @@ def refresh( result = client.refresh_project(project, force_clone=force_clone) formatter.render(result) formatter.render_success(f"Project '{project}' refreshed.") + surface_warnings(ctx, formatter, result) @app.command() diff --git a/src/zad_cli/helpers.py b/src/zad_cli/helpers.py index af678f8..21def10 100644 --- a/src/zad_cli/helpers.py +++ b/src/zad_cli/helpers.py @@ -68,10 +68,15 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: except (ZadApiError, TaskFailedError, TaskTimeoutError) as e: ctx = kwargs.get("ctx") or next((a for a in args if isinstance(a, typer.Context)), None) formatter = ctx.obj["formatter"] if ctx else None - details = getattr(e, "details", None) - status_code = getattr(e, "status_code", None) - if formatter: - formatter.render_error(str(e), details=details, status_code=status_code) + diagnosis = getattr(e, "diagnosis", None) + exit_code = 1 + if formatter and diagnosis is not None: + formatter.render_diagnosis(diagnosis) + exit_code = diagnosis.exit_code + elif formatter: + formatter.render_error( + str(e), details=getattr(e, "details", None), status_code=getattr(e, "status_code", None) + ) else: print(f"Error: {e}", file=sys.stderr) # On timeout, show task ID so the user can follow up @@ -79,11 +84,47 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: if task_id: print(f"Task is still running. Check status with: zad task status {task_id}", file=sys.stderr) print(f"Or wait for completion with: zad task wait {task_id}", file=sys.stderr) - raise typer.Exit(1) from e + raise typer.Exit(exit_code) from e return wrapper +def surface_warnings(ctx: typer.Context, formatter: OutputFormatter, result: Any) -> None: + """After a successful mutating op, surface any degraded state (unhealthy components, + warnings, partial status). Under global --strict, exit non-zero so CI/CD fails the build. + """ + from zad_cli.api.errors import degraded_diagnoses + + diagnoses = degraded_diagnoses(result) + if not diagnoses: + return + formatter.render_warnings(diagnoses) + if ctx.obj.get("strict"): + # Honor the per-fault exit code contract: a retryable (PLATFORM/NETWORK) + # warning exits 2, not 1. Pick the most severe across all diagnoses. + raise typer.Exit(max(d.exit_code for d in diagnoses)) + + +def issues_cell(errors: list[dict] | None) -> str: + """Rich-markup summary of a deployment's cluster errors for list/status tables. + + Empty string when clean; otherwise a colored `` `` cell so + degraded deployments are never silently green. + """ + if not errors: + return "" + from collections import Counter + + from zad_cli.api.errors import CATEGORY_FAULT, FAULT_COLOR, category_of + + counts = Counter(str(e.get("category", "Unknown")) for e in errors) + top, _ = counts.most_common(1)[0] + total = len(errors) + label = f"{total} {top}" if total > 1 else top + color = FAULT_COLOR[CATEGORY_FAULT[category_of(top)]] + return f"[{color}]{label}[/{color}]" + + def render_dry_run(formatter: OutputFormatter, method: str, endpoint: str, payload: dict | None = None) -> None: """Show what would be sent without making the API call.""" info: dict = {"dry_run": True, "method": method, "endpoint": endpoint} diff --git a/src/zad_cli/output/formatter.py b/src/zad_cli/output/formatter.py index 73dfdfc..64664e8 100644 --- a/src/zad_cli/output/formatter.py +++ b/src/zad_cli/output/formatter.py @@ -4,15 +4,33 @@ import json import sys +from typing import TYPE_CHECKING import yaml from rich.console import Console +from rich.markup import escape from rich.table import Table +if TYPE_CHECKING: + from zad_cli.api.errors import Diagnosis + # Console for stderr (status messages), stdout is for data output err_console = Console(stderr=True) +def _supports_unicode() -> bool: + """True if stderr can render unicode glyphs. CI logs with ascii encoding get ascii fallbacks.""" + enc = (getattr(sys.stderr, "encoding", None) or "").lower() + return "utf" in enc + + +def _glyphs() -> tuple[str, str, str]: + """Return (error, warning, arrow) glyphs, ascii-safe when unicode isn't available.""" + if _supports_unicode(): + return "✗", "⚠", "→" # ✗ ⚠ → + return "x", "!", "->" + + class OutputFormatter: """Render data in table, json, or yaml format.""" @@ -71,6 +89,52 @@ def render_error(self, message: str, details: dict | None = None, status_code: i for k, v in details.items(): print(f" {k}: {v}", file=sys.stderr) + def render_diagnosis(self, diagnosis: Diagnosis) -> None: + """Render a failure diagnosis: source-labelled, with details and next steps. + + In json mode the structured diagnosis goes to stdout (the command failed, so + there is no other stdout payload; automation branches on the ``fault`` key). + In table mode it renders to stderr. + """ + if self.fmt == "json": + print(json.dumps(diagnosis.to_dict(), indent=2, default=str)) + return + cross, _, _ = _glyphs() + self._diagnosis_block(diagnosis, glyph=cross, header_color=diagnosis.color) + + def render_warnings(self, diagnoses: list[Diagnosis]) -> None: + """Render degraded-but-successful warnings to stderr (never blocks stdout data). + + In json mode the result payload is already on stdout, so warnings go to stderr + as a single JSON object to keep stdout a valid single document. + """ + if not diagnoses: + return + if self.fmt == "json": + payload = {"warnings": [d.to_dict() for d in diagnoses]} + print(json.dumps(payload, indent=2, default=str), file=sys.stderr) + return + _, warn, _ = _glyphs() + for diagnosis in diagnoses: + self._diagnosis_block(diagnosis, glyph=warn, header_color="yellow") + + def _diagnosis_block(self, diagnosis: Diagnosis, *, glyph: str, header_color: str) -> None: + """Shared stderr layout for diagnoses and warnings.""" + _, _, arrow = _glyphs() + err_console.print(f"[{header_color}]{glyph} {escape(diagnosis.headline)}[/{header_color}]") + err_console.print(f" [dim]Source:[/dim] {escape(diagnosis.source)}") + if diagnosis.summary: + err_console.print(f"\n {escape(diagnosis.summary)}") + if diagnosis.details: + err_console.print() + for line in diagnosis.details: + err_console.print(f" {escape(line)}") + if diagnosis.next_steps: + err_console.print() + for step in diagnosis.next_steps: + err_console.print(f" [cyan]{arrow}[/cyan] {escape(step)}") + err_console.print() + def _table( self, data: list[dict] | dict, diff --git a/tests/commands/test_deployment.py b/tests/commands/test_deployment.py index a31c224..2cefd2f 100644 --- a/tests/commands/test_deployment.py +++ b/tests/commands/test_deployment.py @@ -115,3 +115,85 @@ def test_describe_renders_degraded_deployment_with_errors(monkeypatch: pytest.Mo assert "ImagePull" in result.output assert "Back-off pulling image" in result.output assert "Container image cannot be pulled." in result.output + + +def _stub_client(monkeypatch: pytest.MonkeyPatch, **methods: Any) -> None: + """Install a stub ZadClient exposing the given methods, plus auth env.""" + + class _StubClient: + def __init__(self, *_args, **_kwargs): + self.wait = True + self.verbose = False + + def close(self) -> None: + pass + + for name, fn in methods.items(): + setattr(_StubClient, name, staticmethod(fn)) + + monkeypatch.setenv("ZAD_API_KEY", "test-key") + monkeypatch.setenv("ZAD_PROJECT_ID", "my-project") + monkeypatch.setenv("ZAD_API_URL", "https://api.example.com") + import zad_cli.api.client as client_module + + monkeypatch.setattr(client_module, "ZadClient", _StubClient) + monkeypatch.setattr("zad_cli.helpers.ZadClient", _StubClient, raising=False) + + +def test_list_shows_issues_column(monkeypatch: pytest.MonkeyPatch) -> None: + def _list(_project: str) -> list[dict[str, Any]]: + return [ + { + "deployment": "staging", + "project": "my-project", + "namespace": "ns-staging", + "components": ["web"], + "status": "Degraded", + "urls": {}, + "sync_revision": None, + "last_synced_at": None, + "errors": [{"category": "ImagePull", "resource": "Pod/web", "message": "back-off"}], + } + ] + + _stub_client(monkeypatch, list_deployments=_list) + result = CliRunner().invoke(app, ["deployment", "list"]) + assert result.exit_code == 0, result.output + assert "Issues" in result.output + assert "ImagePull" in result.output + + +def _degraded_result() -> dict[str, Any]: + return { + "status": "success", + "processing": { + "status": "completed", + "component_failures": [{"component": "web", "failure_type": "CrashLoop", "message": "exited 1"}], + }, + } + + +def test_create_surfaces_warnings_but_exits_zero(monkeypatch: pytest.MonkeyPatch) -> None: + _stub_client(monkeypatch, upsert_deployment=lambda _p, _payload: _degraded_result()) + result = CliRunner().invoke(app, ["deployment", "create", "staging", "--component", "web", "--image", "x:1", "-y"]) + assert result.exit_code == 0, result.output + assert "unhealthy" in result.output.lower() + + +def test_create_strict_exits_nonzero_on_warnings(monkeypatch: pytest.MonkeyPatch) -> None: + _stub_client(monkeypatch, upsert_deployment=lambda _p, _payload: _degraded_result()) + result = CliRunner().invoke( + app, ["--strict", "deployment", "create", "staging", "--component", "web", "--image", "x:1", "-y"] + ) + assert result.exit_code == 1, result.output + assert "unhealthy" in result.output.lower() + + +def test_create_strict_exit_code_follows_fault_contract(monkeypatch: pytest.MonkeyPatch) -> None: + """--strict honors the per-fault exit code: a 'degraded' status is UNKNOWN (exit 3), + not a hardcoded 1.""" + _stub_client(monkeypatch, upsert_deployment=lambda _p, _payload: {"status": "degraded", "message": "half up"}) + result = CliRunner().invoke( + app, ["--strict", "deployment", "create", "staging", "--component", "web", "--image", "x:1", "-y"] + ) + assert result.exit_code == 3, result.output diff --git a/tests/test_client.py b/tests/test_client.py index 671379c..58cc47a 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -113,6 +113,71 @@ def test_v2_async_poll_timeout(client): assert exc_info.value.task_id == "abc" +@respx.mock +def test_http_error_carries_auth_diagnosis(client): + respx.get("https://api.example.com/metrics/health").mock(return_value=httpx.Response(401, text="Unauthorized")) + with pytest.raises(ZadApiError) as exc_info: + client.health() + diag = exc_info.value.diagnosis + assert diag is not None + assert diag.fault.value == "Auth" + assert diag.exit_code == 1 + + +@respx.mock +def test_http_422_diagnosis_has_field_paths(client): + body = {"detail": [{"loc": ["body", "deploymentName"], "msg": "field required", "type": "missing"}]} + respx.post("https://api.example.com/v2/projects/my-project/:upsert-deployment").mock( + return_value=httpx.Response(422, json=body) + ) + with pytest.raises(ZadApiError) as exc_info: + client.upsert_deployment("my-project", {}) + diag = exc_info.value.diagnosis + assert diag.fault.value == "UserInput" + assert "deploymentName: field required" in diag.details + + +@respx.mock +def test_500_diagnosis_is_platform_and_retryable(client): + respx.get("https://api.example.com/metrics/health").mock(return_value=httpx.Response(503, text="down")) + with pytest.raises(ZadApiError) as exc_info: + client.health() + diag = exc_info.value.diagnosis + assert diag.fault.value == "Platform" + assert diag.exit_code == 2 # CI/CD: safe to retry + + +@respx.mock +def test_task_failure_carries_app_diagnosis(client): + respx.post("https://api.example.com/v2/projects/my-project/:upsert-deployment").mock( + return_value=httpx.Response(202, json={"task_id": "abc", "status": "accepted"}) + ) + respx.get("https://api.example.com/tasks/abc").mock( + return_value=httpx.Response( + 200, + json={ + "status": "failed", + "error_message": "deployment failed", + "result": { + "status": "failed", + "processing": { + "status": "failed", + "component_failures": [ + {"component": "web", "failure_type": "ImagePull", "message": "back-off pulling"} + ], + }, + }, + }, + ) + ) + with pytest.raises(TaskFailedError) as exc_info: + client.upsert_deployment("my-project", {"deploymentName": "test", "components": []}) + diag = exc_info.value.diagnosis + assert diag is not None + assert diag.fault.value == "UserApp" + assert any("web (ImagePull)" in line for line in diag.details) + + def test_build_poll_url_relative(client): assert client._build_poll_url("/tasks/abc").endswith("/tasks/abc") assert client._build_poll_url("/tasks/abc").startswith("https://") diff --git a/tests/test_errors.py b/tests/test_errors.py new file mode 100644 index 0000000..3673d62 --- /dev/null +++ b/tests/test_errors.py @@ -0,0 +1,118 @@ +"""Unit tests for the diagnosis layer (api/errors.py).""" + +from zad_cli.api.errors import ( + Fault, + degraded_diagnoses, + diagnose_http_error, + diagnose_task_failure, +) + + +def test_http_422_extracts_field_paths() -> None: + body = { + "detail": [ + {"loc": ["body", "components", 0, "image"], "msg": "field required", "type": "missing"}, + {"loc": ["body", "deployment_name"], "msg": "string too short", "type": "value_error"}, + ] + } + d = diagnose_http_error(422, body) + assert d.fault is Fault.USER_INPUT + assert d.exit_code == 1 + assert "components.0.image: field required" in d.details + assert "deployment_name: string too short" in d.details + # The 'body' prefix is stripped for readability. + assert not any(line.startswith("body.") for line in d.details) + + +def test_http_401_is_auth() -> None: + d = diagnose_http_error(401, {"detail": "invalid api key"}) + assert d.fault is Fault.AUTH + assert d.source == "your credentials / permissions" + assert d.exit_code == 1 + + +def test_http_404_is_user_input() -> None: + assert diagnose_http_error(404, {"detail": "not found"}).fault is Fault.USER_INPUT + + +def test_http_500_is_platform_and_retryable_exit_code() -> None: + d = diagnose_http_error(500, "boom") + assert d.fault is Fault.PLATFORM + assert d.exit_code == 2 + assert "platform" in d.headline.lower() + + +def test_connection_failure_is_network() -> None: + d = diagnose_http_error(0, "connection refused") + assert d.fault is Fault.NETWORK + assert d.exit_code == 2 + + +def test_task_failure_component_imagepull_is_user_app() -> None: + result = { + "status": "failed", + "processing": { + "status": "failed", + "component_failures": [ + { + "component": "web", + "failure_type": "ImagePull", + "message": "Back-off pulling image ghcr.io/org/web:bad", + "logs": ["Error: manifest unknown"], + } + ], + }, + } + d = diagnose_task_failure("deployment failed", result) + assert d.fault is Fault.USER_APP + assert "your application" in d.source + assert any("web (ImagePull)" in line for line in d.details) + assert any("manifest unknown" in line for line in d.details) + + +def test_task_failure_syncfailed_text_is_user_config() -> None: + # No structured failures, but the message carries the backend's category vocabulary. + d = diagnose_task_failure("git clone failed (SyncFailed)", {}) + assert d.fault is Fault.USER_CONFIG + + +def test_task_failure_unknown_stays_unknown() -> None: + d = diagnose_task_failure("something odd happened", {}) + assert d.fault is Fault.UNKNOWN + # UNKNOWN gets its own exit code: not "your fault" (1), not "safe to retry" (2). + assert d.exit_code == 3 + assert "logs" in " ".join(d.next_steps).lower() + + +def test_degraded_diagnoses_flags_warnings() -> None: + diags = degraded_diagnoses({"status": "success", "warnings": ["deprecated field 'foo'"]}) + assert len(diags) == 1 + assert diags[0].fault is Fault.USER_CONFIG + assert "deprecated field 'foo'" in diags[0].details + + +def test_degraded_diagnoses_flags_unhealthy_components() -> None: + result = { + "status": "success", + "processing": { + "status": "completed", + "component_failures": [{"component": "web", "failure_type": "CrashLoop", "message": "exited 1"}], + }, + } + diags = degraded_diagnoses(result) + assert len(diags) == 1 + assert diags[0].fault is Fault.USER_APP + + +def test_degraded_diagnoses_clean_result_is_empty() -> None: + assert degraded_diagnoses({"status": "success"}) == [] + assert degraded_diagnoses(None) == [] + + +def test_to_dict_is_machine_readable() -> None: + d = diagnose_http_error(500, "boom") + payload = d.to_dict() + assert payload["fault"] == "Platform" + assert payload["source"] == "ZAD platform" + assert payload["status_code"] == 500 + assert set(payload) == {"fault", "source", "headline", "summary", "details", "next_steps", "status_code"} diff --git a/tests/test_output.py b/tests/test_output.py index d5db7b6..364f858 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -2,6 +2,7 @@ import json +from zad_cli.api.errors import Diagnosis, Fault from zad_cli.output.formatter import OutputFormatter @@ -67,3 +68,51 @@ def test_render_error_table(capsys): fmt.render_error("something broke") err = capsys.readouterr().err assert "something broke" in err + + +def _sample_diagnosis() -> Diagnosis: + return Diagnosis( + fault=Fault.USER_APP, + headline="Your application failed to run on the cluster.", + summary="deployment failed", + details=["web (ImagePull): back-off pulling image"], + next_steps=["Inspect `zad logs`."], + status_code=None, + ) + + +def test_render_diagnosis_json_goes_to_stdout(capsys): + fmt = OutputFormatter(fmt="json") + fmt.render_diagnosis(_sample_diagnosis()) + out = capsys.readouterr().out # failure has no other stdout payload -> diagnosis to stdout + data = json.loads(out) + assert data["fault"] == "UserApp" + assert data["source"] == "your application (cluster runtime)" + assert data["details"] == ["web (ImagePull): back-off pulling image"] + + +def test_render_diagnosis_table_goes_to_stderr(capsys): + fmt = OutputFormatter(fmt="table") + fmt.render_diagnosis(_sample_diagnosis()) + captured = capsys.readouterr() + assert captured.out == "" # stdout stays clean for pipelines + assert "Your application failed to run" in captured.err + assert "Source: your application" in captured.err + assert "Inspect" in captured.err + + +def test_render_warnings_json_goes_to_stderr(capsys): + # The result payload is already on stdout, so warnings must not corrupt it. + fmt = OutputFormatter(fmt="json") + fmt.render_warnings([_sample_diagnosis()]) + captured = capsys.readouterr() + assert captured.out == "" + data = json.loads(captured.err) + assert data["warnings"][0]["fault"] == "UserApp" + + +def test_render_warnings_empty_is_noop(capsys): + OutputFormatter(fmt="table").render_warnings([]) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" diff --git a/tests/test_spec_conformance.py b/tests/test_spec_conformance.py new file mode 100644 index 0000000..4545a1b --- /dev/null +++ b/tests/test_spec_conformance.py @@ -0,0 +1,68 @@ +"""Spec-conformance tests: keep the CLI's error vocabulary in lockstep with the API. + +These assert the CLI's hand-written enums/mappings match the vendored OpenAPI spec +(api/upstream-openapi.json). When the daily api-sync workflow refreshes the spec and +the upstream adds/changes an ErrorCategory, these tests fail loudly with a clear +instruction, turning a silent drift into an actionable PR. This is the *strict* half +of the coupling; runtime parsing (category_of / _coerce_unknown_*) is the *loose* half. + +The vendored spec is part of the repo, so a missing file is a hard failure, not a +skip: skipping would let the whole conformance check pass silently if the file is ever +moved or renamed. +""" + +import json +from pathlib import Path + +import pytest + +from zad_cli.api.errors import CATEGORY_FAULT, CATEGORY_HINT +from zad_cli.api.models import ErrorCategory + +_SPEC_PATH = Path(__file__).resolve().parent.parent / "api" / "upstream-openapi.json" + + +def _spec_schemas() -> dict: + if not _SPEC_PATH.exists(): + pytest.fail( + f"vendored spec not found at {_SPEC_PATH}. It is part of the repo and these " + "conformance tests depend on it; if it moved, update _SPEC_PATH instead of skipping." + ) + return json.loads(_SPEC_PATH.read_text())["components"]["schemas"] + + +def test_error_category_enum_matches_spec() -> None: + """Our ErrorCategory must equal the spec's enum, value-for-value.""" + spec_values = set(_spec_schemas()["ErrorCategory"]["enum"]) + cli_values = {c.value for c in ErrorCategory} + assert cli_values == spec_values, ( + "ErrorCategory drifted from the API spec. Update zad_cli.api.models.ErrorCategory " + f"and the CATEGORY_FAULT/CATEGORY_HINT maps. spec-only={spec_values - cli_values}, " + f"cli-only={cli_values - spec_values}" + ) + + +def test_every_category_has_a_fault_and_hint() -> None: + """A new category must not silently fall through the diagnosis layer.""" + for cat in ErrorCategory: + assert cat in CATEGORY_FAULT, f"{cat} missing from CATEGORY_FAULT" + assert cat in CATEGORY_HINT, f"{cat} missing from CATEGORY_HINT" + + +@pytest.mark.parametrize( + "schema_name,model_path", + [ + ("ComponentFailureInfo", "zad_cli.api.models:ComponentFailureInfo"), + ("SubtaskStatus", "zad_cli.api.models:SubtaskStatus"), + ], +) +def test_models_cover_spec_required_fields(schema_name: str, model_path: str) -> None: + """Our pydantic models must declare every field the spec marks required.""" + import importlib + + module_name, cls_name = model_path.split(":") + model = getattr(importlib.import_module(module_name), cls_name) + spec_required = set(_spec_schemas()[schema_name].get("required", [])) + model_fields = set(model.model_fields) + missing = spec_required - model_fields + assert not missing, f"{cls_name} is missing spec-required fields: {missing}"