|
1 | 1 | # Copyright The Marin Authors |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | | -"""Verify every Datakit source's staged_path exists under gs://marin-us-central1. |
| 4 | +"""Verify every Datakit source's staged dump terminated SUCCESS. |
5 | 5 |
|
6 | 6 | Each :class:`marin.datakit.sources.DatakitSource` with a non-empty ``staged_path`` |
7 | | -must resolve to a GCS prefix with at least one object — otherwise the ferry's |
8 | | -verify-only download step will 404 at runtime. Enforced daily as a parallel |
9 | | -lane of the datakit-smoke workflow. |
| 7 | +must resolve to a GCS prefix under ``gs://marin-us-central1`` whose |
| 8 | +``.executor_status`` file (plain text or legacy JSON-lines) reports ``SUCCESS`` — |
| 9 | +otherwise the ferry's verify-only download step is pointing at a partial or |
| 10 | +missing dump. Enforced daily as a parallel lane of the datakit-smoke workflow. |
10 | 11 | """ |
11 | 12 |
|
12 | 13 | import logging |
13 | 14 | import sys |
14 | 15 | from concurrent.futures import ThreadPoolExecutor |
15 | 16 |
|
16 | 17 | from marin.datakit.sources import all_sources |
17 | | -from rigging.filesystem import url_to_fs |
| 18 | +from marin.execution.executor_step_status import STATUS_SUCCESS, StatusFile |
18 | 19 | from rigging.log_setup import configure_logging |
19 | 20 |
|
# Module-level logger; configure_logging() in main() installs the handlers.
logger = logging.getLogger(__name__)

# Root GCS bucket that every source's relative staged_path is resolved under.
BUCKET = "gs://marin-us-central1"
# Thread-pool width for the parallel status checks (network-bound, so threads suffice).
MAX_WORKERS = 16
# Identifier handed to StatusFile as worker_id — presumably recorded/used when
# reading each dump's .executor_status file; confirm against StatusFile's API.
WORKER_ID = "datakit-smoke-sources-check"
|
25 | 27 |
|
def _check(staged_path: str) -> tuple[str, str]:
    """Resolve *staged_path* under ``BUCKET`` and read its executor status.

    Returns an ``(output_path, status)`` pair; ``status`` is ``SUCCESS`` or a
    failure token, with ``"MISSING"`` substituted when no status is reported.
    """
    target = f"{BUCKET}/{staged_path}"
    reported = StatusFile(target, worker_id=WORKER_ID).status
    # An empty/None status means the .executor_status file told us nothing.
    if not reported:
        reported = "MISSING"
    return target, reported
33 | 33 |
|
34 | 34 |
|
def main() -> None:
    """Verify every unique staged path reports SUCCESS; exit non-zero otherwise.

    Fans the per-path checks out over a thread pool, logs each non-SUCCESS
    path as an error, and raises SystemExit summarizing the failures.
    """
    configure_logging()
    sources = all_sources()
    staged = {src.staged_path for src in sources.values() if src.staged_path}
    unique_paths = sorted(staged)
    logger.info("Verifying %d unique staged paths under %s", len(unique_paths), BUCKET)

    bad: list[tuple[str, str]] = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        for output_path, status in pool.map(_check, unique_paths):
            if status != STATUS_SUCCESS:
                logger.error("%s: %s", status, output_path)
                bad.append((output_path, status))
            else:
                logger.debug("OK: %s", output_path)

    if bad:
        raise SystemExit(f"{len(bad)}/{len(unique_paths)} staged paths not SUCCESS under {BUCKET}")
    logger.info("All %d staged paths report SUCCESS under %s", len(unique_paths), BUCKET)
54 | 53 |
|
55 | 54 |
|
56 | 55 | if __name__ == "__main__": |
|
0 commit comments