diff --git a/CLAUDE.md b/CLAUDE.md
index 652f49b..c0b53ce 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -153,8 +153,41 @@ def verb(
     result = client.method(project, ...)
     formatter.render(result)
     formatter.render_success(f"Target '{target}' verbed.")
+    surface_warnings(ctx, formatter, result)  # mutating ops: flag degraded-but-successful state
 ```
 
+### Error reporting (the diagnosis layer)
+
+Errors must give **clarity and a next step**: say what's wrong, point neutrally at
+where to look, and suggest the fix. The machinery lives in `api/errors.py`:
+
+- **`Fault`** (StrEnum): `USER_INPUT`, `USER_APP`, `USER_CONFIG`, `AUTH`, `PLATFORM`,
+  `NETWORK`, `UNKNOWN`. Drives a neutral source label (`FAULT_SOURCE`), color
+  (`FAULT_COLOR`), and CI/CD exit code (`FAULT_EXIT_CODE`: 1 = your fault, 2 =
+  platform/transient, 3 = unknown/unattributable).
+- **`Diagnosis`** (dataclass): `fault`, `headline`, `summary`, `details`, `next_steps`,
+  `status_code`. `to_dict()` is the json contract (CI branches on `fault`).
+- **`diagnose_http_error`** parses status codes and FastAPI `422` validation arrays;
+  **`diagnose_task_failure`** digs into `processing.component_failures`, `error_type`,
+  and `ErrorCategory`; **`degraded_diagnoses`** catches "succeeded but unhealthy".
+
+Rules for new code:
+- The client raises `ZadApiError` / `TaskFailedError` / `TaskTimeoutError` with a
+  `.diagnosis` attached (build it at the raise site via the `diagnose_*` helpers or
+  `_http_error`). `handle_api_errors` renders it and exits with `diagnosis.exit_code`.
+- Render failures with `formatter.render_diagnosis(d)`, degraded-success with
+  `formatter.render_warnings(diags)`. Diagnostics go to **stderr**; json error objects
+  go to stdout. Never hardcode an error string where a `Diagnosis` belongs.
+- After any mutating op, call `surface_warnings(ctx, formatter, result)` so warnings /
+  unhealthy components are surfaced (and `--strict` can fail CI).
+- **Honesty:** when the API gives no category, the fault is `UNKNOWN` and we point at
+  the logs (exit code 3); don't guess whose fault it is.
+
+**Spec coupling:** `CATEGORY_FAULT` / `CATEGORY_HINT` are keyed by `ErrorCategory`, and
+`tests/test_spec_conformance.py` asserts the enum matches `api/upstream-openapi.json` and
+that every category is mapped. When the api-sync workflow surfaces a new `ErrorCategory`,
+add it to `models.ErrorCategory` **and** both maps; the conformance test tells you.
+
 ### Client method conventions
 
 - One public method per API endpoint on `ZadClient`
diff --git a/README.md b/README.md
index cb21426..1d57e41 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ zad backup create production
 | API URL | `--api-url` | `ZAD_API_URL` | `api_url` | production URL |
 | Output | `-o` | `ZAD_OUTPUT_FORMAT` | - | `table` |
 | No wait | `--no-wait` | - | - | wait |
+| Strict | `--strict` | - | - | off |
 
 Precedence: **flags > env vars / `.env` > config file > defaults**
 
@@ -71,6 +72,35 @@ Every command supports `--output` / `-o`: `table` (default), `json`, `yaml`.
 zad metrics overview --output json | jq '.cpu_usage'
 ```
 
+## Errors & exit codes
+
+Errors tell you **what's wrong and what to do next**, with a neutral label for where
+to look (your request, your application, your configuration, your credentials, or the
+ZAD platform) instead of a bare HTTP code. A failed image pull points you straight at
+the image and registry (`Source: your application (cluster runtime)`) with the fix.
+
+Each error carries a structured diagnosis. In `--output json` it's a single object
+on stdout you can branch on in CI/CD:
+
+```bash
+zad deployment create app -c web=img:tag -o json > out.json || jq -r .fault out.json
+# UserInput | UserApp | UserConfig | Auth | Platform | Network | Unknown
+```
+
+Exit codes:
+
+| Code | Meaning |
+|------|---------|
+| `0` | success |
+| `1` | your fault, fix it (bad input, app/config failure, auth) |
+| `2` | platform/network, transient and safe to retry |
+| `3` | unknown, the API gave no signal to attribute the failure (check the logs) |
+
+`--strict` makes a command that *succeeds but reports warnings* (e.g. the deploy
+applied but a component is crash-looping) exit non-zero, so a pipeline fails the
+build instead of going green on an unhealthy app. Diagnostics go to **stderr**;
+data (and the json error object) go to **stdout**, so pipes stay clean.
+
 ## Commands
 
 ```
diff --git a/src/zad_cli/api/client.py b/src/zad_cli/api/client.py
index d3bcfa2..b021df5 100644
--- a/src/zad_cli/api/client.py
+++ b/src/zad_cli/api/client.py
@@ -10,33 +10,41 @@
 import httpx
 from pydantic import ValidationError
 
+from zad_cli.api.errors import Diagnosis, Fault, diagnose_http_error, diagnose_task_failure
 from zad_cli.api.models import DeploymentDetail, DeploymentListResponse, TaskStatus
 
 
 class ZadApiError(Exception):
-    """Raised when the ZAD API returns an error."""
+    """Raised when the ZAD API returns an error.
 
-    def __init__(self, status_code: int, message: str, details: dict | None = None):
+    Carries a :class:`~zad_cli.api.errors.Diagnosis` so the CLI can render an
+    honest, source-labelled message instead of a bare ``HTTP <code>``.
+    """
+
+    def __init__(self, status_code: int, message: str, details: dict | None = None, diagnosis: Diagnosis | None = None):
         self.status_code = status_code
         self.message = message
         self.details = details or {}
+        self.diagnosis = diagnosis
         super().__init__(f"HTTP {status_code}: {message}")
 
 
 class TaskTimeoutError(Exception):
     """Raised when task polling exceeds the timeout."""
 
-    def __init__(self, message: str, task_id: str | None = None):
+    def __init__(self, message: str, task_id: str | None = None, diagnosis: Diagnosis | None = None):
         self.task_id = task_id
+        self.diagnosis = diagnosis
         super().__init__(message)
 
 
 class TaskFailedError(Exception):
     """Raised when a polled task reports failure."""
 
-    def __init__(self, message: str, details: dict | None = None):
+    def __init__(self, message: str, details: dict | None = None, diagnosis: Diagnosis | None = None):
         self.message = message
         self.details = details or {}
+        self.diagnosis = diagnosis
         super().__init__(message)
 
 
@@ -48,7 +56,20 @@ def _parse_v2_response(model_cls: type, payload: Any) -> dict:
     try:
         return model_cls.model_validate(payload).model_dump(mode="json")
     except ValidationError as e:
-        raise ZadApiError(502, f"Unexpected API response shape for {model_cls.__name__}: {e}") from e
+        raise ZadApiError(
+            502,
+            f"Unexpected API response shape for {model_cls.__name__}: {e}",
+            diagnosis=Diagnosis(
+                fault=Fault.PLATFORM,
+                headline="ZAD returned a response this CLI couldn't read; likely a CLI/API version mismatch.",
+                summary=f"Schema {model_cls.__name__} failed to validate.",
+                next_steps=[
+                    "Retry shortly (exit code 2 = transient).",
+                    "If it persists, the CLI may be out of date; update it or report the mismatch.",
+                ],
+                status_code=502,
+            ),
+        ) from e
 
 
 class ZadClient:
@@ -113,22 +134,17 @@ def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
                     time.sleep(delay)
                     delay *= 2
                     continue
-                raise ZadApiError(0, f"Connection failed: {e}") from e
+                raise ZadApiError(0, f"Connection failed: {e}", diagnosis=diagnose_http_error(0, str(e))) from e
 
             if response.status_code in _RETRYABLE_CODES and attempt < self.max_retries:
                 print(f"HTTP {response.status_code}, retrying in {delay}s...", file=sys.stderr)
                 time.sleep(delay)
                 delay *= 2
-                last_error = ZadApiError(response.status_code, response.text)
+                last_error = self._http_error(response)
                 continue
 
             if response.status_code >= 400:
-                try:
-                    body = response.json()
-                    message = body.get("message", body.get("detail", response.text))
-                except Exception:
-                    message = response.text
-                raise ZadApiError(response.status_code, message)
+                raise self._http_error(response)
 
             if self.verbose:
                 print(f"<-- {response.status_code} ({response.elapsed.total_seconds():.2f}s)", file=sys.stderr)
@@ -137,6 +153,21 @@ def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
 
         raise last_error or ZadApiError(0, "Request failed")
 
+    @staticmethod
+    def _http_error(response: httpx.Response) -> ZadApiError:
+        """Build a diagnosed ZadApiError from a >=400 response."""
+        try:
+            body: Any = response.json()
+        except Exception:
+            body = response.text
+        if isinstance(body, dict):
+            message = body.get("message") or body.get("detail") or response.text
+        else:
+            message = response.text or str(body)
+        if not isinstance(message, str):
+            message = str(message)
+        return ZadApiError(response.status_code, message, diagnosis=diagnose_http_error(response.status_code, body))
+
     def _async_request(self, method: str, path: str, **kwargs: Any) -> dict:
         """Make a v2 async request. Polls for result unless self.wait is False."""
         response = self._request(method, path, **kwargs)
@@ -183,7 +214,7 @@ def _poll_task(self, poll_url: str) -> dict:
                     continue
 
                 if response.status_code >= 400:
-                    raise ZadApiError(response.status_code, data.get("detail", data.get("message", str(data))))
+                    raise self._http_error(response)
 
                 status = TaskStatus(**data) if isinstance(data, dict) else TaskStatus(status="unknown")
                 task_id = task_id or data.get("task_id")
@@ -195,13 +226,32 @@ def _poll_task(self, poll_url: str) -> dict:
                 if status.status == "completed":
                     return status.result or data
                 if status.status == "failed":
-                    raise TaskFailedError(status.error_message or "Task failed", details=status.result)
+                    raise TaskFailedError(
+                        status.error_message or "Task failed",
+                        details=status.result,
+                        diagnosis=diagnose_task_failure(status.error_message, status.result),
+                    )
                 if status.status == "cancelled":
-                    raise TaskFailedError("Task was cancelled")
+                    raise TaskFailedError(
+                        "Task was cancelled",
+                        diagnosis=Diagnosis(
+                            fault=Fault.UNKNOWN,
+                            headline="The task was cancelled before it finished.",
+                            next_steps=["Re-run the command, or check `zad task list` for details."],
+                        ),
+                    )
 
                 time.sleep(self.task_poll_interval)
 
-        raise TaskTimeoutError(f"Task did not complete within {self.task_timeout}s", task_id=task_id)
+        raise TaskTimeoutError(
+            f"Task did not complete within {self.task_timeout}s",
+            task_id=task_id,
+            diagnosis=Diagnosis(
+                fault=Fault.UNKNOWN,
+                headline=f"Timed out after {self.task_timeout}s waiting for the task; it may still be running.",
+                next_steps=["This is a wait limit, not a failure. Check `zad task status <id>`."],
+            ),
+        )
 
     # --- V2 project/deployment operations (async, poll for result) ---
 
diff --git a/src/zad_cli/api/errors.py b/src/zad_cli/api/errors.py
new file mode 100644
index 0000000..959b293
--- /dev/null
+++ b/src/zad_cli/api/errors.py
@@ -0,0 +1,390 @@
+"""Clear, actionable diagnosis of API and task failures.
+
+The goal is simple: tell the user *what went wrong and what to do next*. The
+upstream API already carries the signal for that (``ErrorCategory`` on cluster
+errors, ``ComponentFailureInfo`` with log tails on failed deployment tasks,
+``HTTPValidationError`` on bad input, ``error_type`` on task results), but a bare
+``HTTP 500`` / ``Task failed`` string throws it away.
+
+This module turns those raw signals into a :class:`Diagnosis`: a plain-language
+headline, a neutral source label so you know where to look ("Source: your
+application"), the concrete message, the backend's own explanation, and a next
+step. The fault vocabulary is kept in lockstep with the OpenAPI spec by
+``tests/test_spec_conformance.py`` (strict coupling: drift fails CI) while runtime
+parsing degrades gracefully on unknown values (loose coupling).
+
+We never claim more certainty than the data supports: when the API gives no
+category, the fault is ``UNKNOWN`` and we point at the logs rather than guessing.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import StrEnum
+
+from pydantic import ValidationError
+
+from zad_cli.api.models import ErrorCategory, ProcessingStatus
+
+
+class Fault(StrEnum):
+    """Who/what a failure belongs to. Drives the source label, color and exit code."""
+
+    USER_INPUT = "UserInput"  # the request you sent is wrong
+    USER_APP = "UserApp"  # your container/workload failed at runtime
+    USER_CONFIG = "UserConfig"  # your git config/manifests couldn't be applied
+    AUTH = "Auth"  # your API key / permissions
+    PLATFORM = "Platform"  # ZAD itself errored
+    NETWORK = "Network"  # couldn't reach ZAD
+    UNKNOWN = "Unknown"  # not enough signal to attribute honestly
+
+
+# Neutral, source-labelled phrasing (no blame, just where the fault lives).
+FAULT_SOURCE: dict[Fault, str] = {
+    Fault.USER_INPUT: "your request",
+    Fault.USER_APP: "your application (cluster runtime)",
+    Fault.USER_CONFIG: "your configuration / git",
+    Fault.AUTH: "your credentials / permissions",
+    Fault.PLATFORM: "ZAD platform",
+    Fault.NETWORK: "network / connectivity",
+    Fault.UNKNOWN: "unknown (see logs)",
+}
+
+# Rich color: user-fixable = yellow, escalate/investigate = red, auth = magenta.
+FAULT_COLOR: dict[Fault, str] = {
+    Fault.USER_INPUT: "yellow",
+    Fault.USER_APP: "yellow",
+    Fault.USER_CONFIG: "yellow",
+    Fault.AUTH: "magenta",
+    Fault.PLATFORM: "red",
+    Fault.NETWORK: "red",
+    Fault.UNKNOWN: "red",
+}
+
+# CI/CD exit codes: 1 = your fault (fix it), 2 = platform/transient (safe to retry),
+# 3 = unattributable. UNKNOWN gets its own code rather than claiming "your fault"
+# (1) or "safe to retry" (2) when the API gave us no signal to attribute the failure.
+FAULT_EXIT_CODE: dict[Fault, int] = {
+    Fault.USER_INPUT: 1,
+    Fault.USER_APP: 1,
+    Fault.USER_CONFIG: 1,
+    Fault.AUTH: 1,
+    Fault.PLATFORM: 2,
+    Fault.NETWORK: 2,
+    Fault.UNKNOWN: 3,
+}
+
+# Which fault each cluster ErrorCategory implies. Keyed by ErrorCategory so the
+# spec-conformance test can assert every upstream category is mapped here.
+CATEGORY_FAULT: dict[ErrorCategory, Fault] = {
+    ErrorCategory.IMAGE_PULL: Fault.USER_APP,
+    ErrorCategory.CRASH_LOOP: Fault.USER_APP,
+    ErrorCategory.OUT_OF_MEMORY: Fault.USER_APP,
+    ErrorCategory.HEALTH_CHECK: Fault.USER_APP,
+    ErrorCategory.SYNC_FAILED: Fault.USER_CONFIG,
+    ErrorCategory.COMPARISON_ERROR: Fault.USER_CONFIG,
+    ErrorCategory.UNKNOWN: Fault.UNKNOWN,
+}
+
+# Fallback next-step hint, used ONLY when the backend gave no explanation of its
+# own. We always prefer the server's words over these.
+CATEGORY_HINT: dict[ErrorCategory, str] = {
+    ErrorCategory.IMAGE_PULL: "Check the image tag exists and the registry is reachable / credentials are set.",
+    ErrorCategory.CRASH_LOOP: "The container starts then exits. Check `zad logs` for the crash reason.",
+    ErrorCategory.OUT_OF_MEMORY: "The container exceeded its memory limit. Reduce usage or raise the limit.",
+    ErrorCategory.HEALTH_CHECK: "The app started but its readiness/liveness probe never passed. Check the probe.",
+    ErrorCategory.SYNC_FAILED: "ZAD could not sync your config from git. Check the repo, branch, and manifests.",
+    ErrorCategory.COMPARISON_ERROR: "ZAD could not compare desired vs live state. Retry `zad deployment refresh`.",
+    ErrorCategory.UNKNOWN: "",
+}
+
+# Unambiguous Kubernetes reason tokens we can map without guessing. Used only as
+# a last resort when the API gives a raw message but no structured category.
+_K8S_TOKEN_CATEGORY: dict[str, ErrorCategory] = {
+    "imagepullbackoff": ErrorCategory.IMAGE_PULL,
+    "errimagepull": ErrorCategory.IMAGE_PULL,
+    "invalidimagename": ErrorCategory.IMAGE_PULL,
+    "crashloopbackoff": ErrorCategory.CRASH_LOOP,
+    "oomkilled": ErrorCategory.OUT_OF_MEMORY,
+}
+
+# HTTP status -> fault for the cases that aren't a simple 4xx/5xx split.
+_HTTP_FAULT: dict[int, Fault] = {
+    400: Fault.USER_INPUT,
+    401: Fault.AUTH,
+    403: Fault.AUTH,
+    404: Fault.USER_INPUT,
+    409: Fault.USER_INPUT,
+    422: Fault.USER_INPUT,
+}
+
+
+@dataclass
+class Diagnosis:
+    """A structured, source-labelled explanation of a failure.
+
+    ``details`` are concrete lines (validation errors, component failures, log
+    tails); ``next_steps`` are actionable suggestions. ``summary`` is the raw
+    upstream message when we have one.
+    """
+
+    fault: Fault
+    headline: str
+    summary: str | None = None
+    details: list[str] = field(default_factory=list)
+    next_steps: list[str] = field(default_factory=list)
+    status_code: int | None = None
+
+    @property
+    def source(self) -> str:
+        return FAULT_SOURCE[self.fault]
+
+    @property
+    def color(self) -> str:
+        return FAULT_COLOR[self.fault]
+
+    @property
+    def exit_code(self) -> int:
+        return FAULT_EXIT_CODE[self.fault]
+
+    def to_dict(self) -> dict:
+        """Flat, machine-readable shape for `--output json` (CI/CD branch key: `fault`)."""
+        return {
+            "fault": self.fault.value,
+            "source": self.source,
+            "headline": self.headline,
+            "summary": self.summary,
+            "details": self.details,
+            "next_steps": self.next_steps,
+            "status_code": self.status_code,
+        }
+
+
+def category_of(value: str | None) -> ErrorCategory:
+    """Coerce an arbitrary string to a known ErrorCategory (case-insensitive), else UNKNOWN.
+
+    Loose coupling: an upstream category we don't know yet maps to UNKNOWN rather
+    than raising.
+    """
+    if isinstance(value, str):
+        for cat in ErrorCategory:
+            if cat.value.lower() == value.lower():
+                return cat
+    return ErrorCategory.UNKNOWN
+
+
+def _scan_category(text: str | None) -> ErrorCategory:
+    """Best-effort category from raw text.
+
+    Matches the backend's own ``ErrorCategory`` vocabulary (spec-derived) plus a
+    few unambiguous Kubernetes reason tokens. Returns UNKNOWN rather than guessing
+    when nothing matches.
+    """
+    if not text:
+        return ErrorCategory.UNKNOWN
+    low = text.lower().replace(" ", "")
+    for token, cat in _K8S_TOKEN_CATEGORY.items():
+        if token in low:
+            return cat
+    for cat in ErrorCategory:
+        if cat is not ErrorCategory.UNKNOWN and cat.value.lower() in low:
+            return cat
+    return ErrorCategory.UNKNOWN
+
+
+def _parse_processing(raw: object) -> ProcessingStatus | None:
+    if not isinstance(raw, dict):
+        return None
+    try:
+        return ProcessingStatus.model_validate(raw)
+    except ValidationError:
+        return None
+
+
+def _format_validation(detail: object) -> list[str]:
+    """Turn a FastAPI HTTPValidationError ``detail`` array into readable field lines."""
+    if not isinstance(detail, list):
+        return []
+    lines: list[str] = []
+    for item in detail:
+        if not isinstance(item, dict):
+            lines.append(str(item))
+            continue
+        loc = [str(p) for p in item.get("loc", [])]
+        if loc and loc[0] in {"body", "query", "path", "header", "cookie"}:
+            loc = loc[1:]
+        field_path = ".".join(loc) or "(request)"
+        lines.append(f"{field_path}: {item.get('msg', 'invalid value')}")
+    return lines
+
+
+def diagnose_http_error(status_code: int, body: object) -> Diagnosis:
+    """Diagnose a failed HTTP response.
+
+    ``status_code == 0`` means the request never reached ZAD (connection error).
+    ``body`` may be a parsed dict, a raw string, or None.
+    """
+    if status_code == 0:
+        return Diagnosis(
+            fault=Fault.NETWORK,
+            headline="Could not reach the ZAD API.",
+            summary=str(body) if body else None,
+            next_steps=[
+                "Check your network/VPN and that --api-url is correct.",
+                "If ZAD should be reachable, retry shortly (exit code 2 = transient).",
+            ],
+            status_code=0,
+        )
+
+    fault = _HTTP_FAULT.get(status_code)
+    if fault is None:
+        fault = (
+            Fault.USER_INPUT if 400 <= status_code < 500 else Fault.PLATFORM if status_code >= 500 else Fault.UNKNOWN
+        )
+
+    body_dict = body if isinstance(body, dict) else None
+    details: list[str] = []
+    summary: str | None = None
+
+    if status_code == 422 and body_dict is not None:
+        details = _format_validation(body_dict.get("detail"))
+    if body_dict is not None and not details:
+        raw = body_dict.get("message") or body_dict.get("detail")
+        summary = raw if isinstance(raw, str) else None
+    elif isinstance(body, str) and body.strip():
+        summary = body.strip()
+
+    headline, next_steps = _http_headline(status_code, fault)
+    return Diagnosis(
+        fault=fault,
+        headline=headline,
+        summary=summary,
+        details=details,
+        next_steps=next_steps,
+        status_code=status_code,
+    )
+
+
+def _http_headline(status_code: int, fault: Fault) -> tuple[str, list[str]]:
+    if status_code in (401, 403):
+        verb = "Authentication failed" if status_code == 401 else "Permission denied"
+        return (
+            f"{verb} (HTTP {status_code}).",
+            ["Set a valid ZAD_API_KEY (or --api-key) with access to this project."],
+        )
+    if status_code == 404:
+        return (
+            "Not found (HTTP 404): the resource you referenced doesn't exist.",
+            ["Check the name/spelling and that it exists (e.g. `zad deployment list`)."],
+        )
+    if status_code == 409:
+        return (
+            "Conflict (HTTP 409): the resource is in a state that blocks this action.",
+            ["Check its current state, then retry once it settles."],
+        )
+    if status_code == 422:
+        return (
+            "Invalid request (HTTP 422): the values you sent didn't pass validation.",
+            ["Fix the field(s) listed above and retry."],
+        )
+    if fault is Fault.PLATFORM:
+        return (
+            f"ZAD platform error (HTTP {status_code}), usually transient.",
+            ["Retry shortly (exit code 2 = transient). If it persists, report it with the time of the call."],
+        )
+    return (f"Request rejected (HTTP {status_code}).", [])
+
+
+def diagnose_task_failure(error_message: str | None, result: object) -> Diagnosis:
+    """Diagnose a failed async task from its ``error_message`` and ``result`` payload."""
+    result_dict = result if isinstance(result, dict) else {}
+    processing = _parse_processing(result_dict.get("processing"))
+    failures = (processing.component_failures if processing else None) or []
+
+    details: list[str] = []
+    next_steps: list[str] = []
+
+    if failures:
+        cats: list[ErrorCategory] = []
+        for fail in failures:
+            cat = category_of(fail.failure_type)
+            cats.append(cat)
+            label = f"{fail.component} ({fail.failure_type}): {fail.message}"
+            details.append(label)
+            for line in (fail.logs or [])[:5]:
+                details.append(f"    {line}")
+            hint = CATEGORY_HINT.get(cat)
+            if hint and hint not in next_steps:
+                next_steps.append(hint)
+        known = [c for c in cats if c is not ErrorCategory.UNKNOWN]
+        # A component concretely failed at runtime → it's the app, even if the
+        # exact category is unrecognised.
+        fault = CATEGORY_FAULT[known[0]] if known else Fault.USER_APP
+    else:
+        # No structured failures: fall back to a category scan of the raw text.
+        text = " ".join(
+            t for t in [error_message, processing.error if processing else None, result_dict.get("error")] if t
+        )
+        cat = _scan_category(text)
+        fault = CATEGORY_FAULT[cat] if cat is not ErrorCategory.UNKNOWN else Fault.UNKNOWN
+        hint = CATEGORY_HINT.get(cat)
+        if hint:
+            next_steps.append(hint)
+
+    summary = (
+        error_message or (processing.error if processing else None) or (processing.message if processing else None)
+    )
+
+    if fault is Fault.USER_APP:
+        headline = "Your application didn't start on the cluster (the deploy reached the cluster; the workload failed)."
+        next_steps.append("Inspect `zad logs -d <deployment>` and `zad deployment describe <deployment>`.")
+    elif fault is Fault.USER_CONFIG:
+        headline = "Your configuration couldn't be applied."
+        next_steps.append("Fix your git repo/manifests, then `zad deployment refresh`.")
+    else:
+        headline = "The operation failed. Check the details below for the cause."
+        next_steps.append("Run `zad task status <id>` and `zad logs` for the full output.")
+
+    return Diagnosis(fault=fault, headline=headline, summary=summary, details=details, next_steps=next_steps)
+
+
+def degraded_diagnoses(result: object) -> list[Diagnosis]:
+    """Inspect a *successful* task result for degraded state worth surfacing.
+
+    Returns an empty list for a genuinely clean result. Catches the
+    "looks like it worked but your app is actually unhealthy" case: component
+    failures, ``warnings``, or a ``partial``/``degraded`` status on an otherwise
+    200/completed response.
+    """
+    result_dict = result if isinstance(result, dict) else {}
+    out: list[Diagnosis] = []
+
+    processing = _parse_processing(result_dict.get("processing"))
+    if processing and processing.component_failures:
+        diag = diagnose_task_failure(None, result_dict)
+        diag.headline = "The operation succeeded, but some components are unhealthy."
+        out.append(diag)
+
+    warnings = result_dict.get("warnings") or []
+    if warnings:
+        out.append(
+            Diagnosis(
+                fault=Fault.USER_CONFIG,
+                headline="The operation succeeded with warnings.",
+                details=[str(w) for w in warnings],
+                next_steps=["Review the warnings above; they usually point at your configuration."],
+            )
+        )
+
+    status = str(result_dict.get("status", "")).lower()
+    if status in {"partial", "degraded"} and not out:
+        out.append(
+            Diagnosis(
+                fault=Fault.UNKNOWN,
+                headline=f"The operation finished with status '{result_dict.get('status')}'.",
+                summary=result_dict.get("message") or None,
+                next_steps=["Run `zad deployment describe <name>` to see the current state."],
+            )
+        )
+
+    return out
diff --git a/src/zad_cli/api/models.py b/src/zad_cli/api/models.py
index 6e7283f..e3a6eb3 100644
--- a/src/zad_cli/api/models.py
+++ b/src/zad_cli/api/models.py
@@ -149,14 +149,65 @@ class TaskResponse(BaseModel):
     status: str = "pending"
 
 
+class SubtaskStatus(BaseModel):
+    """Status of a single subtask within a task's progress.
+
+    Mirrors the upstream ``SubtaskStatus`` schema. Extra fields are ignored so
+    additive upstream changes don't break parsing (loose coupling).
+    """
+
+    id: str
+    name: str
+    status: str
+    error: str | None = None
+    parent_id: str | None = None
+
+
+class ComponentFailureInfo(BaseModel):
+    """Per-component failure detail for deployment health issues.
+
+    Mirrors the upstream ``ComponentFailureInfo`` schema. This is the richest
+    diagnostic the API surfaces on a failed deployment task: which component
+    failed, the failure type, a message, and the tail of its container logs.
+    """
+
+    component: str
+    deployment: str = ""
+    failure_type: str
+    message: str
+    logs: list[str] | None = None
+
+
+class ProcessingStatus(BaseModel):
+    """Processing-step status inside a task result.
+
+    Mirrors the upstream task-models ``ProcessingStatus`` schema (the richer of
+    the two same-named schemas, the one that carries ``component_failures``).
+    """
+
+    status: str
+    message: str | None = None
+    error: str | None = None
+    result: object | None = None
+    component_failures: list[ComponentFailureInfo] | None = None
+
+
 class TaskStatus(BaseModel):
-    """Status of a polled task."""
+    """Status of a polled task.
+
+    ``result`` stays an untyped dict because its shape varies per task type
+    (UpsertDeploymentResult, RefreshDeploymentResult, ...); the diagnosis layer
+    parses the bits it understands defensively. ``subtasks``/``task_type`` are
+    optional so older API responses still validate.
+    """
 
     status: str
+    task_type: str | None = None
     current_step: str | None = None
     progress_percent: int | None = None
     result: dict | None = None
     error_message: str | None = None
+    subtasks: list[SubtaskStatus] | None = None
 
 
 class DeploymentStatus(StrEnum):
diff --git a/src/zad_cli/cli.py b/src/zad_cli/cli.py
index 13f8a4c..9c4565b 100644
--- a/src/zad_cli/cli.py
+++ b/src/zad_cli/cli.py
@@ -28,7 +28,7 @@ class _GlobalOptionsGroup(TyperGroup):
     """Hoist global options to before the subcommand so they work in any position."""
 
     _OPTS_WITH_VALUE = frozenset({"--output", "-o", "--api-key", "--api-url", "--project", "-p"})
-    _FLAGS = frozenset({"--no-wait", "--verbose", "-v", "--version", "-V"})
+    _FLAGS = frozenset({"--no-wait", "--verbose", "-v", "--version", "-V", "--strict"})
 
     def parse_args(self, ctx, args):  # noqa: ANN001
         global_args: list[str] = []
@@ -96,6 +96,9 @@ def main_callback(
     project_id: str = typer.Option(None, "--project", "-p", envvar="ZAD_PROJECT_ID", help="Project ID"),
     no_wait: bool = typer.Option(False, "--no-wait", help="Don't wait for async operations, return task ID"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose request logging"),
+    strict: bool = typer.Option(
+        False, "--strict", help="Exit non-zero when an operation succeeds but reports warnings (for CI/CD)"
+    ),
     version: bool = typer.Option(
         False, "--version", "-V", help="Show version and exit", callback=_version_callback, is_eager=True
     ),
@@ -111,6 +114,7 @@ def main_callback(
     ctx.obj["settings"] = settings
     ctx.obj["formatter"] = OutputFormatter(fmt=settings.output_format)
     ctx.obj["no_wait"] = no_wait
+    ctx.obj["strict"] = strict
 
 
 @app.command(deprecated=True)
diff --git a/src/zad_cli/commands/clone.py b/src/zad_cli/commands/clone.py
index 9d57cf4..0c4cc14 100644
--- a/src/zad_cli/commands/clone.py
+++ b/src/zad_cli/commands/clone.py
@@ -5,7 +5,7 @@
 import typer
 
 from zad_cli.api.models import CloneBucketRequest, CloneDatabaseRequest
-from zad_cli.helpers import get_helpers, handle_api_errors, require_project
+from zad_cli.helpers import get_helpers, handle_api_errors, require_project, surface_warnings
 
 app = typer.Typer(
     help="Clone data from external sources.\n\nRequires ZAD_API_KEY and ZAD_PROJECT_ID (or --api-key and -p).",
@@ -60,6 +60,7 @@ def database(
     result = client.clone_database(project, deployment, request.to_api_payload())
     formatter.render(result)
     formatter.render_success("Database clone started.")
+    surface_warnings(ctx, formatter, result)
 
 
 @app.command()
@@ -109,6 +110,7 @@ def bucket(
     result = client.clone_bucket(project, deployment, request.to_api_payload())
     formatter.render(result)
     formatter.render_success("Bucket clone started.")
+    surface_warnings(ctx, formatter, result)
 
 
 @app.command()
diff --git a/src/zad_cli/commands/deployment.py b/src/zad_cli/commands/deployment.py
index a74f4ae..81d0491 100644
--- a/src/zad_cli/commands/deployment.py
+++ b/src/zad_cli/commands/deployment.py
@@ -12,8 +12,10 @@
     confirm_action,
     get_helpers,
     handle_api_errors,
+    issues_cell,
     render_dry_run,
     require_project,
+    surface_warnings,
 )
 
 app = typer.Typer(
@@ -42,17 +44,21 @@ def list_deployments(ctx: typer.Context) -> None:
 
     rows = []
     for dep in deployments:
+        status = dep.get("status", "Active")
         rows.append(
             {
                 "deployment": dep["deployment"],
                 "components": str(len(dep["components"])),
-                "status": dep.get("status", "Active"),
+                "status": f"[{_status_color(status)}]{status}[/{_status_color(status)}]",
+                "issues": issues_cell(dep.get("errors")),
                 "namespace": dep["namespace"],
             }
         )
 
     formatter.render(
-        rows, columns=["deployment", "components", "status", "namespace"], title=f"Deployments in {project}"
+        rows,
+        columns=["deployment", "components", "status", "issues", "namespace"],
+        title=f"Deployments in {project}",
     )
 
 
@@ -207,6 +213,7 @@ def create(
     result = client.upsert_deployment(project, request.to_api_payload())
     formatter.render(result)
     formatter.render_success(f"Deployment '{deployment_name}' created/updated in project '{project}'.")
+    surface_warnings(ctx, formatter, result)
 
 
 @app.command("update-image")
@@ -245,6 +252,7 @@ def update_image(
     result = client.update_image(project, deployment, component, image, **kwargs)
     formatter.render(result)
     formatter.render_success(f"Image updated: {component} -> {image}")
+    surface_warnings(ctx, formatter, result)
 
 
 @app.command()
@@ -271,6 +279,7 @@ def refresh(
     result = client.refresh_deployment(project, deployment, force_clone=force_clone)
     formatter.render(result)
     formatter.render_success(f"Deployment '{deployment}' refreshed.")
+    surface_warnings(ctx, formatter, result)
 
 
 @app.command()
diff --git a/src/zad_cli/commands/project.py b/src/zad_cli/commands/project.py
index 3b1998c..e824622 100644
--- a/src/zad_cli/commands/project.py
+++ b/src/zad_cli/commands/project.py
@@ -4,7 +4,15 @@
 
 import typer
 
-from zad_cli.helpers import confirm_action, get_helpers, handle_api_errors, render_dry_run, require_project
+from zad_cli.helpers import (
+    confirm_action,
+    get_helpers,
+    handle_api_errors,
+    issues_cell,
+    render_dry_run,
+    require_project,
+    surface_warnings,
+)
 
 app = typer.Typer(
     help="Manage projects.\n\nMost commands require ZAD_API_KEY and ZAD_PROJECT_ID (or --api-key and -p).",
@@ -71,6 +79,7 @@ def status(ctx: typer.Context) -> None:
     table = Table(title="Deployments", show_header=True)
     table.add_column("Deployment", style="bold cyan")
     table.add_column("Components")
+    table.add_column("Issues")
     table.add_column("URL")
 
     for dep in result["deployments"]:
@@ -79,7 +88,7 @@ def status(ctx: typer.Context) -> None:
         if dep.get("urls"):
             first_url = next(iter(dep["urls"].values()), "")
             url = first_url
-        table.add_row(dep["deployment"], components, url)
+        table.add_row(dep["deployment"], components, issues_cell(dep.get("errors")), url)
 
     console.print(table)
 
@@ -102,6 +111,7 @@ def refresh(
     result = client.refresh_project(project, force_clone=force_clone)
     formatter.render(result)
     formatter.render_success(f"Project '{project}' refreshed.")
+    surface_warnings(ctx, formatter, result)
 
 
 @app.command()
diff --git a/src/zad_cli/helpers.py b/src/zad_cli/helpers.py
index af678f8..21def10 100644
--- a/src/zad_cli/helpers.py
+++ b/src/zad_cli/helpers.py
@@ -68,10 +68,15 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
         except (ZadApiError, TaskFailedError, TaskTimeoutError) as e:
             ctx = kwargs.get("ctx") or next((a for a in args if isinstance(a, typer.Context)), None)
             formatter = ctx.obj["formatter"] if ctx else None
-            details = getattr(e, "details", None)
-            status_code = getattr(e, "status_code", None)
-            if formatter:
-                formatter.render_error(str(e), details=details, status_code=status_code)
+            diagnosis = getattr(e, "diagnosis", None)
+            exit_code = 1
+            if formatter and diagnosis is not None:
+                formatter.render_diagnosis(diagnosis)
+                exit_code = diagnosis.exit_code
+            elif formatter:
+                formatter.render_error(
+                    str(e), details=getattr(e, "details", None), status_code=getattr(e, "status_code", None)
+                )
             else:
                 print(f"Error: {e}", file=sys.stderr)
             # On timeout, show task ID so the user can follow up
@@ -79,11 +84,47 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
             if task_id:
                 print(f"Task is still running. Check status with: zad task status {task_id}", file=sys.stderr)
                 print(f"Or wait for completion with: zad task wait {task_id}", file=sys.stderr)
-            raise typer.Exit(1) from e
+            raise typer.Exit(exit_code) from e
 
     return wrapper
 
 
+def surface_warnings(ctx: typer.Context, formatter: OutputFormatter, result: Any) -> None:
+    """After a successful mutating op, surface any degraded state (unhealthy components,
+    warnings, partial status). Under global --strict, exit non-zero so CI/CD fails the build.
+    """
+    from zad_cli.api.errors import degraded_diagnoses
+
+    diagnoses = degraded_diagnoses(result)
+    if not diagnoses:
+        return
+    formatter.render_warnings(diagnoses)
+    if ctx.obj.get("strict"):
+        # Honor the per-fault exit code contract: a retryable (PLATFORM/NETWORK)
+        # warning exits 2, not 1. Pick the most severe across all diagnoses.
+        raise typer.Exit(max(d.exit_code for d in diagnoses))
+
+
+def issues_cell(errors: list[dict] | None) -> str:
+    """Rich-markup summary of a deployment's cluster errors for list/status tables.
+
+    Empty string when clean; otherwise a colored ``<count> <dominant-category>`` cell so
+    degraded deployments are never silently green.
+    """
+    if not errors:
+        return ""
+    from collections import Counter
+
+    from zad_cli.api.errors import CATEGORY_FAULT, FAULT_COLOR, category_of
+
+    counts = Counter(str(e.get("category", "Unknown")) for e in errors)
+    top, _ = counts.most_common(1)[0]
+    total = len(errors)
+    label = f"{total} {top}" if total > 1 else top
+    color = FAULT_COLOR[CATEGORY_FAULT[category_of(top)]]
+    return f"[{color}]{label}[/{color}]"
+
+
 def render_dry_run(formatter: OutputFormatter, method: str, endpoint: str, payload: dict | None = None) -> None:
     """Show what would be sent without making the API call."""
     info: dict = {"dry_run": True, "method": method, "endpoint": endpoint}
diff --git a/src/zad_cli/output/formatter.py b/src/zad_cli/output/formatter.py
index 73dfdfc..64664e8 100644
--- a/src/zad_cli/output/formatter.py
+++ b/src/zad_cli/output/formatter.py
@@ -4,15 +4,33 @@
 
 import json
 import sys
+from typing import TYPE_CHECKING
 
 import yaml
 from rich.console import Console
+from rich.markup import escape
 from rich.table import Table
 
+if TYPE_CHECKING:
+    from zad_cli.api.errors import Diagnosis
+
 # Console for stderr (status messages), stdout is for data output
 err_console = Console(stderr=True)
 
 
+def _supports_unicode() -> bool:
+    """True if stderr can render unicode glyphs. CI logs with ascii encoding get ascii fallbacks."""
+    enc = (getattr(sys.stderr, "encoding", None) or "").lower()
+    return "utf" in enc
+
+
+def _glyphs() -> tuple[str, str, str]:
+    """Return (error, warning, arrow) glyphs, ascii-safe when unicode isn't available."""
+    if _supports_unicode():
+        return "✗", "⚠", "→"  # ✗  ⚠  →
+    return "x", "!", "->"
+
+
 class OutputFormatter:
     """Render data in table, json, or yaml format."""
 
@@ -71,6 +89,52 @@ def render_error(self, message: str, details: dict | None = None, status_code: i
                 for k, v in details.items():
                     print(f"  {k}: {v}", file=sys.stderr)
 
+    def render_diagnosis(self, diagnosis: Diagnosis) -> None:
+        """Render a failure diagnosis: source-labelled, with details and next steps.
+
+        In json mode the structured diagnosis goes to stdout (the command failed, so
+        there is no other stdout payload; automation branches on the ``fault`` key).
+        In table mode it renders to stderr.
+        """
+        if self.fmt == "json":
+            print(json.dumps(diagnosis.to_dict(), indent=2, default=str))
+            return
+        cross, _, _ = _glyphs()
+        self._diagnosis_block(diagnosis, glyph=cross, header_color=diagnosis.color)
+
+    def render_warnings(self, diagnoses: list[Diagnosis]) -> None:
+        """Render degraded-but-successful warnings to stderr (never blocks stdout data).
+
+        In json mode the result payload is already on stdout, so warnings go to stderr
+        as a single JSON object to keep stdout a valid single document.
+        """
+        if not diagnoses:
+            return
+        if self.fmt == "json":
+            payload = {"warnings": [d.to_dict() for d in diagnoses]}
+            print(json.dumps(payload, indent=2, default=str), file=sys.stderr)
+            return
+        _, warn, _ = _glyphs()
+        for diagnosis in diagnoses:
+            self._diagnosis_block(diagnosis, glyph=warn, header_color="yellow")
+
+    def _diagnosis_block(self, diagnosis: Diagnosis, *, glyph: str, header_color: str) -> None:
+        """Shared stderr layout for diagnoses and warnings."""
+        _, _, arrow = _glyphs()
+        err_console.print(f"[{header_color}]{glyph} {escape(diagnosis.headline)}[/{header_color}]")
+        err_console.print(f"  [dim]Source:[/dim] {escape(diagnosis.source)}")
+        if diagnosis.summary:
+            err_console.print(f"\n  {escape(diagnosis.summary)}")
+        if diagnosis.details:
+            err_console.print()
+            for line in diagnosis.details:
+                err_console.print(f"  {escape(line)}")
+        if diagnosis.next_steps:
+            err_console.print()
+            for step in diagnosis.next_steps:
+                err_console.print(f"  [cyan]{arrow}[/cyan] {escape(step)}")
+        err_console.print()
+
     def _table(
         self,
         data: list[dict] | dict,
diff --git a/tests/commands/test_deployment.py b/tests/commands/test_deployment.py
index a31c224..2cefd2f 100644
--- a/tests/commands/test_deployment.py
+++ b/tests/commands/test_deployment.py
@@ -115,3 +115,85 @@ def test_describe_renders_degraded_deployment_with_errors(monkeypatch: pytest.Mo
     assert "ImagePull" in result.output
     assert "Back-off pulling image" in result.output
     assert "Container image cannot be pulled." in result.output
+
+
+def _stub_client(monkeypatch: pytest.MonkeyPatch, **methods: Any) -> None:
+    """Install a stub ZadClient exposing the given methods, plus auth env."""
+
+    class _StubClient:
+        def __init__(self, *_args, **_kwargs):
+            self.wait = True
+            self.verbose = False
+
+        def close(self) -> None:
+            pass
+
+    for name, fn in methods.items():
+        setattr(_StubClient, name, staticmethod(fn))
+
+    monkeypatch.setenv("ZAD_API_KEY", "test-key")
+    monkeypatch.setenv("ZAD_PROJECT_ID", "my-project")
+    monkeypatch.setenv("ZAD_API_URL", "https://api.example.com")
+    import zad_cli.api.client as client_module
+
+    monkeypatch.setattr(client_module, "ZadClient", _StubClient)
+    monkeypatch.setattr("zad_cli.helpers.ZadClient", _StubClient, raising=False)
+
+
+def test_list_shows_issues_column(monkeypatch: pytest.MonkeyPatch) -> None:
+    def _list(_project: str) -> list[dict[str, Any]]:
+        return [
+            {
+                "deployment": "staging",
+                "project": "my-project",
+                "namespace": "ns-staging",
+                "components": ["web"],
+                "status": "Degraded",
+                "urls": {},
+                "sync_revision": None,
+                "last_synced_at": None,
+                "errors": [{"category": "ImagePull", "resource": "Pod/web", "message": "back-off"}],
+            }
+        ]
+
+    _stub_client(monkeypatch, list_deployments=_list)
+    result = CliRunner().invoke(app, ["deployment", "list"])
+    assert result.exit_code == 0, result.output
+    assert "Issues" in result.output
+    assert "ImagePull" in result.output
+
+
+def _degraded_result() -> dict[str, Any]:
+    return {
+        "status": "success",
+        "processing": {
+            "status": "completed",
+            "component_failures": [{"component": "web", "failure_type": "CrashLoop", "message": "exited 1"}],
+        },
+    }
+
+
+def test_create_surfaces_warnings_but_exits_zero(monkeypatch: pytest.MonkeyPatch) -> None:
+    _stub_client(monkeypatch, upsert_deployment=lambda _p, _payload: _degraded_result())
+    result = CliRunner().invoke(app, ["deployment", "create", "staging", "--component", "web", "--image", "x:1", "-y"])
+    assert result.exit_code == 0, result.output
+    assert "unhealthy" in result.output.lower()
+
+
+def test_create_strict_exits_nonzero_on_warnings(monkeypatch: pytest.MonkeyPatch) -> None:
+    _stub_client(monkeypatch, upsert_deployment=lambda _p, _payload: _degraded_result())
+    result = CliRunner().invoke(
+        app, ["--strict", "deployment", "create", "staging", "--component", "web", "--image", "x:1", "-y"]
+    )
+    assert result.exit_code == 1, result.output
+    assert "unhealthy" in result.output.lower()
+
+
+def test_create_strict_exit_code_follows_fault_contract(monkeypatch: pytest.MonkeyPatch) -> None:
+    """--strict honors the per-fault exit code: a 'degraded' status is UNKNOWN (exit 3),
+    not a hardcoded 1."""
+    _stub_client(monkeypatch, upsert_deployment=lambda _p, _payload: {"status": "degraded", "message": "half up"})
+    result = CliRunner().invoke(
+        app, ["--strict", "deployment", "create", "staging", "--component", "web", "--image", "x:1", "-y"]
+    )
+    assert result.exit_code == 3, result.output
diff --git a/tests/test_client.py b/tests/test_client.py
index 671379c..58cc47a 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -113,6 +113,71 @@ def test_v2_async_poll_timeout(client):
     assert exc_info.value.task_id == "abc"
 
 
+@respx.mock
+def test_http_error_carries_auth_diagnosis(client):
+    respx.get("https://api.example.com/metrics/health").mock(return_value=httpx.Response(401, text="Unauthorized"))
+    with pytest.raises(ZadApiError) as exc_info:
+        client.health()
+    diag = exc_info.value.diagnosis
+    assert diag is not None
+    assert diag.fault.value == "Auth"
+    assert diag.exit_code == 1
+
+
+@respx.mock
+def test_http_422_diagnosis_has_field_paths(client):
+    body = {"detail": [{"loc": ["body", "deploymentName"], "msg": "field required", "type": "missing"}]}
+    respx.post("https://api.example.com/v2/projects/my-project/:upsert-deployment").mock(
+        return_value=httpx.Response(422, json=body)
+    )
+    with pytest.raises(ZadApiError) as exc_info:
+        client.upsert_deployment("my-project", {})
+    diag = exc_info.value.diagnosis
+    assert diag.fault.value == "UserInput"
+    assert "deploymentName: field required" in diag.details
+
+
+@respx.mock
+def test_500_diagnosis_is_platform_and_retryable(client):
+    respx.get("https://api.example.com/metrics/health").mock(return_value=httpx.Response(503, text="down"))
+    with pytest.raises(ZadApiError) as exc_info:
+        client.health()
+    diag = exc_info.value.diagnosis
+    assert diag.fault.value == "Platform"
+    assert diag.exit_code == 2  # CI/CD: safe to retry
+
+
+@respx.mock
+def test_task_failure_carries_app_diagnosis(client):
+    respx.post("https://api.example.com/v2/projects/my-project/:upsert-deployment").mock(
+        return_value=httpx.Response(202, json={"task_id": "abc", "status": "accepted"})
+    )
+    respx.get("https://api.example.com/tasks/abc").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "status": "failed",
+                "error_message": "deployment failed",
+                "result": {
+                    "status": "failed",
+                    "processing": {
+                        "status": "failed",
+                        "component_failures": [
+                            {"component": "web", "failure_type": "ImagePull", "message": "back-off pulling"}
+                        ],
+                    },
+                },
+            },
+        )
+    )
+    with pytest.raises(TaskFailedError) as exc_info:
+        client.upsert_deployment("my-project", {"deploymentName": "test", "components": []})
+    diag = exc_info.value.diagnosis
+    assert diag is not None
+    assert diag.fault.value == "UserApp"
+    assert any("web (ImagePull)" in line for line in diag.details)
+
+
 def test_build_poll_url_relative(client):
     assert client._build_poll_url("/tasks/abc").endswith("/tasks/abc")
     assert client._build_poll_url("/tasks/abc").startswith("https://")
diff --git a/tests/test_errors.py b/tests/test_errors.py
new file mode 100644
index 0000000..3673d62
--- /dev/null
+++ b/tests/test_errors.py
@@ -0,0 +1,118 @@
+"""Unit tests for the diagnosis layer (api/errors.py)."""
+
+from zad_cli.api.errors import (
+    Fault,
+    degraded_diagnoses,
+    diagnose_http_error,
+    diagnose_task_failure,
+)
+
+
+def test_http_422_extracts_field_paths() -> None:
+    body = {
+        "detail": [
+            {"loc": ["body", "components", 0, "image"], "msg": "field required", "type": "missing"},
+            {"loc": ["body", "deployment_name"], "msg": "string too short", "type": "value_error"},
+        ]
+    }
+    d = diagnose_http_error(422, body)
+    assert d.fault is Fault.USER_INPUT
+    assert d.exit_code == 1
+    assert "components.0.image: field required" in d.details
+    assert "deployment_name: string too short" in d.details
+    # The 'body' prefix is stripped for readability.
+    assert not any(line.startswith("body.") for line in d.details)
+
+
+def test_http_401_is_auth() -> None:
+    d = diagnose_http_error(401, {"detail": "invalid api key"})
+    assert d.fault is Fault.AUTH
+    assert d.source == "your credentials / permissions"
+    assert d.exit_code == 1
+
+
+def test_http_404_is_user_input() -> None:
+    assert diagnose_http_error(404, {"detail": "not found"}).fault is Fault.USER_INPUT
+
+
+def test_http_500_is_platform_and_retryable_exit_code() -> None:
+    d = diagnose_http_error(500, "boom")
+    assert d.fault is Fault.PLATFORM
+    assert d.exit_code == 2
+    assert "platform" in d.headline.lower()
+
+
+def test_connection_failure_is_network() -> None:
+    d = diagnose_http_error(0, "connection refused")
+    assert d.fault is Fault.NETWORK
+    assert d.exit_code == 2
+
+
+def test_task_failure_component_imagepull_is_user_app() -> None:
+    result = {
+        "status": "failed",
+        "processing": {
+            "status": "failed",
+            "component_failures": [
+                {
+                    "component": "web",
+                    "failure_type": "ImagePull",
+                    "message": "Back-off pulling image ghcr.io/org/web:bad",
+                    "logs": ["Error: manifest unknown"],
+                }
+            ],
+        },
+    }
+    d = diagnose_task_failure("deployment failed", result)
+    assert d.fault is Fault.USER_APP
+    assert "your application" in d.source
+    assert any("web (ImagePull)" in line for line in d.details)
+    assert any("manifest unknown" in line for line in d.details)
+
+
+def test_task_failure_syncfailed_text_is_user_config() -> None:
+    # No structured failures, but the message carries the backend's category vocabulary.
+    d = diagnose_task_failure("git clone failed (SyncFailed)", {})
+    assert d.fault is Fault.USER_CONFIG
+
+
+def test_task_failure_unknown_stays_unknown() -> None:
+    d = diagnose_task_failure("something odd happened", {})
+    assert d.fault is Fault.UNKNOWN
+    # UNKNOWN gets its own exit code: not "your fault" (1), not "safe to retry" (2).
+    assert d.exit_code == 3
+    assert "logs" in " ".join(d.next_steps).lower()
+
+
+def test_degraded_diagnoses_flags_warnings() -> None:
+    diags = degraded_diagnoses({"status": "success", "warnings": ["deprecated field 'foo'"]})
+    assert len(diags) == 1
+    assert diags[0].fault is Fault.USER_CONFIG
+    assert "deprecated field 'foo'" in diags[0].details
+
+
+def test_degraded_diagnoses_flags_unhealthy_components() -> None:
+    result = {
+        "status": "success",
+        "processing": {
+            "status": "completed",
+            "component_failures": [{"component": "web", "failure_type": "CrashLoop", "message": "exited 1"}],
+        },
+    }
+    diags = degraded_diagnoses(result)
+    assert len(diags) == 1
+    assert diags[0].fault is Fault.USER_APP
+
+
+def test_degraded_diagnoses_clean_result_is_empty() -> None:
+    assert degraded_diagnoses({"status": "success"}) == []
+    assert degraded_diagnoses(None) == []
+
+
+def test_to_dict_is_machine_readable() -> None:
+    d = diagnose_http_error(500, "boom")
+    payload = d.to_dict()
+    assert payload["fault"] == "Platform"
+    assert payload["source"] == "ZAD platform"
+    assert payload["status_code"] == 500
+    assert set(payload) == {"fault", "source", "headline", "summary", "details", "next_steps", "status_code"}
diff --git a/tests/test_output.py b/tests/test_output.py
index d5db7b6..364f858 100644
--- a/tests/test_output.py
+++ b/tests/test_output.py
@@ -2,6 +2,7 @@
 
 import json
 
+from zad_cli.api.errors import Diagnosis, Fault
 from zad_cli.output.formatter import OutputFormatter
 
 
@@ -67,3 +68,51 @@ def test_render_error_table(capsys):
     fmt.render_error("something broke")
     err = capsys.readouterr().err
     assert "something broke" in err
+
+
+def _sample_diagnosis() -> Diagnosis:
+    return Diagnosis(
+        fault=Fault.USER_APP,
+        headline="Your application failed to run on the cluster.",
+        summary="deployment failed",
+        details=["web (ImagePull): back-off pulling image"],
+        next_steps=["Inspect `zad logs`."],
+        status_code=None,
+    )
+
+
+def test_render_diagnosis_json_goes_to_stdout(capsys):
+    fmt = OutputFormatter(fmt="json")
+    fmt.render_diagnosis(_sample_diagnosis())
+    out = capsys.readouterr().out  # failure has no other stdout payload -> diagnosis to stdout
+    data = json.loads(out)
+    assert data["fault"] == "UserApp"
+    assert data["source"] == "your application (cluster runtime)"
+    assert data["details"] == ["web (ImagePull): back-off pulling image"]
+
+
+def test_render_diagnosis_table_goes_to_stderr(capsys):
+    fmt = OutputFormatter(fmt="table")
+    fmt.render_diagnosis(_sample_diagnosis())
+    captured = capsys.readouterr()
+    assert captured.out == ""  # stdout stays clean for pipelines
+    assert "Your application failed to run" in captured.err
+    assert "Source: your application" in captured.err
+    assert "Inspect" in captured.err
+
+
+def test_render_warnings_json_goes_to_stderr(capsys):
+    # The result payload is already on stdout, so warnings must not corrupt it.
+    fmt = OutputFormatter(fmt="json")
+    fmt.render_warnings([_sample_diagnosis()])
+    captured = capsys.readouterr()
+    assert captured.out == ""
+    data = json.loads(captured.err)
+    assert data["warnings"][0]["fault"] == "UserApp"
+
+
+def test_render_warnings_empty_is_noop(capsys):
+    OutputFormatter(fmt="table").render_warnings([])
+    captured = capsys.readouterr()
+    assert captured.out == ""
+    assert captured.err == ""
diff --git a/tests/test_spec_conformance.py b/tests/test_spec_conformance.py
new file mode 100644
index 0000000..4545a1b
--- /dev/null
+++ b/tests/test_spec_conformance.py
@@ -0,0 +1,68 @@
+"""Spec-conformance tests: keep the CLI's error vocabulary in lockstep with the API.
+
+These assert the CLI's hand-written enums/mappings match the vendored OpenAPI spec
+(api/upstream-openapi.json). When the daily api-sync workflow refreshes the spec and
+the upstream adds/changes an ErrorCategory, these tests fail loudly with a clear
+instruction, turning a silent drift into an actionable PR. This is the *strict* half
+of the coupling; runtime parsing (category_of / _coerce_unknown_*) is the *loose* half.
+
+The vendored spec is part of the repo, so a missing file is a hard failure, not a
+skip: skipping would let the whole conformance check pass silently if the file is ever
+moved or renamed.
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from zad_cli.api.errors import CATEGORY_FAULT, CATEGORY_HINT
+from zad_cli.api.models import ErrorCategory
+
+_SPEC_PATH = Path(__file__).resolve().parent.parent / "api" / "upstream-openapi.json"
+
+
+def _spec_schemas() -> dict:
+    if not _SPEC_PATH.exists():
+        pytest.fail(
+            f"vendored spec not found at {_SPEC_PATH}. It is part of the repo and these "
+            "conformance tests depend on it; if it moved, update _SPEC_PATH instead of skipping."
+        )
+    return json.loads(_SPEC_PATH.read_text())["components"]["schemas"]
+
+
+def test_error_category_enum_matches_spec() -> None:
+    """Our ErrorCategory must equal the spec's enum, value-for-value."""
+    spec_values = set(_spec_schemas()["ErrorCategory"]["enum"])
+    cli_values = {c.value for c in ErrorCategory}
+    assert cli_values == spec_values, (
+        "ErrorCategory drifted from the API spec. Update zad_cli.api.models.ErrorCategory "
+        f"and the CATEGORY_FAULT/CATEGORY_HINT maps. spec-only={spec_values - cli_values}, "
+        f"cli-only={cli_values - spec_values}"
+    )
+
+
+def test_every_category_has_a_fault_and_hint() -> None:
+    """A new category must not silently fall through the diagnosis layer."""
+    for cat in ErrorCategory:
+        assert cat in CATEGORY_FAULT, f"{cat} missing from CATEGORY_FAULT"
+        assert cat in CATEGORY_HINT, f"{cat} missing from CATEGORY_HINT"
+
+
+@pytest.mark.parametrize(
+    "schema_name,model_path",
+    [
+        ("ComponentFailureInfo", "zad_cli.api.models:ComponentFailureInfo"),
+        ("SubtaskStatus", "zad_cli.api.models:SubtaskStatus"),
+    ],
+)
+def test_models_cover_spec_required_fields(schema_name: str, model_path: str) -> None:
+    """Our pydantic models must declare every field the spec marks required."""
+    import importlib
+
+    module_name, cls_name = model_path.split(":")
+    model = getattr(importlib.import_module(module_name), cls_name)
+    spec_required = set(_spec_schemas()[schema_name].get("required", []))
+    model_fields = set(model.model_fields)
+    missing = spec_required - model_fields
+    assert not missing, f"{cls_name} is missing spec-required fields: {missing}"