|
1 | | -"""Honest, source-labelled diagnosis of API and task failures. |
2 | | -
|
3 | | -The upstream API already attributes failures accurately: ``ErrorCategory`` on |
4 | | -cluster errors, ``ComponentFailureInfo`` (with log tails) on failed deployment |
5 | | -tasks, ``HTTPValidationError`` on bad input, and ``error_type`` on task results. |
6 | | -The CLI used to collapse all of that into a bare ``HTTP 500`` / ``Task failed`` |
7 | | -string, which made every failure look like the platform was broken. |
8 | | -
|
9 | | -This module turns those raw signals into a :class:`Diagnosis`: a clear, |
10 | | -**source-labelled** headline ("Source: your application"), the concrete message, |
11 | | -the backend's own explanation, and a next step. The fault vocabulary is kept in |
12 | | -lockstep with the OpenAPI spec by ``tests/test_spec_conformance.py`` (strict |
13 | | -coupling: drift fails CI) while runtime parsing degrades gracefully on unknown |
14 | | -values (loose coupling). |
15 | | -
|
16 | | -Honesty rule: never claim more certainty than the data supports. When the API |
17 | | -gives no category, the fault is ``UNKNOWN`` and we point at the logs rather than |
18 | | -guessing whose fault it is. |
| 1 | +"""Clear, actionable diagnosis of API and task failures. |
| 2 | +
|
| 3 | +The goal is simple: tell the user *what went wrong and what to do next*. The |
| 4 | +upstream API already carries the signal for that — ``ErrorCategory`` on cluster |
| 5 | +errors, ``ComponentFailureInfo`` (with log tails) on failed deployment tasks, |
| 6 | +``HTTPValidationError`` on bad input, ``error_type`` on task results — but a bare |
| 7 | +``HTTP 500`` / ``Task failed`` string throws it away. |
| 8 | +
|
| 9 | +This module turns those raw signals into a :class:`Diagnosis`: a plain-language |
| 10 | +headline, a neutral source label so the user knows where to look ("Source: your |
| 11 | +application"), the concrete message, the backend's own explanation, and a next |
| 12 | +step. The fault vocabulary is kept in lockstep with the OpenAPI spec by |
| 13 | +``tests/test_spec_conformance.py`` (strict coupling: drift fails CI) while runtime |
| 14 | +parsing degrades gracefully on unknown values (loose coupling). |
| 15 | +
|
| 16 | +We never claim more certainty than the data supports: when the API gives no |
| 17 | +category, the fault is ``UNKNOWN`` and we point at the logs rather than guessing the source. |
19 | 18 | """ |
20 | 19 |
|
21 | 20 | from __future__ import annotations |
@@ -288,7 +287,7 @@ def _http_headline(status_code: int, fault: Fault) -> tuple[str, list[str]]: |
288 | 287 | ) |
289 | 288 | if fault is Fault.PLATFORM: |
290 | 289 | return ( |
291 | | - f"ZAD had an internal error (HTTP {status_code}) — this is the platform, not your request.", |
| 290 | + f"ZAD platform error (HTTP {status_code}) — usually transient.", |
292 | 291 | ["Retry shortly (exit code 2 = transient). If it persists, report it with the time of the call."], |
293 | 292 | ) |
294 | 293 | return (f"Request rejected (HTTP {status_code}).", []) |
@@ -335,13 +334,13 @@ def diagnose_task_failure(error_message: str | None, result: object) -> Diagnosi |
335 | 334 | ) |
336 | 335 |
|
337 | 336 | if fault is Fault.USER_APP: |
338 | | - headline = "Your application failed to run on the cluster — ZAD applied your config, the workload didn't start." |
| 337 | + headline = "Your application didn't start on the cluster (the deploy reached the cluster; the workload failed)." |
339 | 338 | next_steps.append("Inspect `zad logs -d <deployment>` and `zad deployment describe <deployment>`.") |
340 | 339 | elif fault is Fault.USER_CONFIG: |
341 | | - headline = "ZAD could not apply your configuration." |
| 340 | + headline = "Your configuration couldn't be applied." |
342 | 341 | next_steps.append("Fix your git repo/manifests, then `zad deployment refresh`.") |
343 | 342 | else: |
344 | | - headline = "The operation failed, and ZAD did not report a category." |
| 343 | + headline = "The operation failed. Check the details below for the cause." |
345 | 344 | next_steps.append("Run `zad task status <id>` and `zad logs` for the full output.") |
346 | 345 |
|
347 | 346 | return Diagnosis(fault=fault, headline=headline, summary=summary, details=details, next_steps=next_steps) |
|
0 commit comments