|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Validate kopiur backup coverage against a RENDERED manifest stream. |
| 3 | +
|
| 4 | +Replaces the retired pvc-plumber `validate-restore-contract.sh` and folds in the |
| 5 | +now-dead `backup-exempt-contract` job. kopiur has no `/audit` ledger, so this is |
| 6 | +the CI guard that catches the silent gaps that ledger used to surface. |
| 7 | +
|
| 8 | +Runs on the rendered kustomize stream (so Helm-rendered PVCs — gitea, tubesync — |
| 9 | +are covered, which a static grep of *.yaml cannot do). |
| 10 | +
|
| 11 | + python3 scripts/validate-kopiur-coverage.py /tmp/all-manifests.yaml |
| 12 | +
|
| 13 | +HARD FAILS (exit 1): |
| 14 | + [dsr] A backed-up PVC (target of a kopiur SnapshotPolicy) whose |
| 15 | + spec.dataSourceRef does NOT point at a kopiur Restore → it recreates |
| 16 | + EMPTY in DR. The single most dangerous silent gap. |
| 17 | + [nslabel] A namespace containing a SnapshotPolicy that lacks the |
| 18 | + `kopiur.home-operations.com/repo: cluster-kopia` label → the |
| 19 | + ClusterExternalSecret won't fan the repo creds in; the mover can't auth. |
| 20 | +
|
| 21 | +WARNINGS (printed, exit 0): |
| 22 | + [mover] A SnapshotPolicy/Restore with no spec.mover security context (neither |
| 23 | + securityContext nor inheritSecurityContextFrom) → likely PermissionDenied |
| 24 | + (the #1 kopiur gotcha — see docs/domains/storage/kopiur-mover-permissions.md). |
| 25 | + [gap] A longhorn PVC that is neither backed up nor backup-exempt → review. |
| 26 | + [exempt] A backup-exempt PVC missing the fully-qualified reason annotation |
| 27 | + (kept for grep-ability now that pvc-plumber no longer enforces it). |
| 28 | +""" |
| 29 | +import sys |
| 30 | + |
| 31 | +try: |
| 32 | + import yaml |
| 33 | +except ImportError: |
| 34 | + sys.stderr.write("pyyaml required: pip3 install pyyaml\n") |
| 35 | + sys.exit(2) |
| 36 | + |
# API group of the kopiur custom resources (SnapshotPolicy, Restore); also the
# apiGroup a PVC's spec.dataSourceRef must carry to count as a kopiur Restore.
KOPIUR_GROUP = "kopiur.home-operations.com"

# Label (and the value it must have) that every namespace containing a
# SnapshotPolicy is required to carry.
REPO_LABEL = "kopiur.home-operations.com/repo"
REPO_LABEL_VAL = "cluster-kopia"

# A PVC labelled `backup-exempt: "true"` is deliberately not backed up; the
# annotation below is expected to hold the reason.
EXEMPT_LABEL = "backup-exempt"
EXEMPT_REASON = "storage.vanillax.dev/backup-exempt-reason"

# Namespaces whose PVCs are skipped by the [gap]/[exempt] review pass.
SYSTEM_NS = {
    "1passwordconnect",
    "argocd",
    "cert-manager",
    "external-secrets",
    "gateway",
    "kopiur-system",
    "kube-node-lease",
    "kube-public",
    "kube-system",
    "longhorn-system",
    "monitoring",
    "volsync-system",
}
| 47 | + |
| 48 | + |
def meta(d, key):
    """Return ``d["metadata"][key]``, or None when either level is absent.

    A missing, null or empty ``metadata`` mapping is treated the same as one
    that simply lacks ``key``.
    """
    metadata = d.get("metadata")
    if not metadata:
        return None
    return metadata.get(key)
| 51 | + |
| 52 | + |
def labels_of(d):
    """Return the ``metadata.labels`` mapping of manifest ``d``.

    Falls back to an empty dict when ``metadata`` or ``labels`` is missing,
    null or empty, so callers can always call ``.get`` / iterate the result.
    """
    metadata = d.get("metadata") or {}
    found = metadata.get("labels")
    return found if found else {}
| 55 | + |
| 56 | + |
def anns_of(d):
    """Return the ``metadata.annotations`` mapping of manifest ``d``.

    Falls back to an empty dict when ``metadata`` or ``annotations`` is
    missing, null or empty.
    """
    metadata = d.get("metadata")
    if not metadata:
        return {}
    annotations = metadata.get("annotations")
    if not annotations:
        return {}
    return annotations
| 59 | + |
| 60 | + |
def has_mover_sc(d):
    """Tell whether ``d`` declares a security context for its mover.

    Returns True when ``spec.mover`` has a non-empty ``securityContext`` or a
    non-empty ``inheritSecurityContextFrom``; False otherwise, including when
    ``spec`` or ``spec.mover`` is missing or null.
    """
    spec = d.get("spec") or {}
    mover = spec.get("mover") or {}
    for field in ("securityContext", "inheritSecurityContextFrom"):
        if mover.get(field):
            return True
    return False
| 64 | + |
| 65 | + |
def main():
    """Validate kopiur backup coverage of a rendered manifest stream.

    Reads the multi-document YAML file named by ``sys.argv[1]``, prints a
    report (summary line, then every warning, then every failure) to stdout
    and returns the process exit status.

    Hard failures:
      * [dsr]     a PVC named by a SnapshotPolicy source was not rendered, or
                  its spec.dataSourceRef is not a kopiur Restore.
      * [nslabel] the Namespace of a SnapshotPolicy is rendered but lacks the
                  REPO_LABEL=REPO_LABEL_VAL label.

    Warnings:
      * [mover]   a SnapshotPolicy or Restore without a mover security context.
      * [nslabel] a SnapshotPolicy's Namespace object is not in the stream, so
                  the label cannot be checked.
      * [exempt]  a backup-exempt longhorn PVC without the reason annotation.
      * [gap]     a longhorn PVC that is neither backed up nor backup-exempt.

    Returns:
        0 when there are no hard failures, 1 when there is at least one,
        2 when the command line does not consist of exactly one argument.

    Raises:
        OSError: the manifest file cannot be opened.
        yaml.YAMLError: the stream is not parseable YAML.
    """
    if len(sys.argv) != 2:
        sys.stderr.write("usage: validate-kopiur-coverage.py <rendered-manifests.yaml>\n")
        return 2

    # kube-prometheus-stack CRDs contain a bare `=` enum value (AlertManager
    # matchType), which PyYAML maps to the special value-tag and otherwise fails
    # to construct. Treat it as a literal scalar so the rendered stream parses.
    yaml.SafeLoader.add_constructor(
        "tag:yaml.org,2002:value", lambda loader, node: loader.construct_scalar(node)
    )

    # Decode explicitly as UTF-8 instead of the locale's default encoding, which
    # may be ASCII on a CI runner and would then choke on non-ASCII manifests.
    with open(sys.argv[1], encoding="utf-8") as fh:
        # Keep only mapping documents that declare a kind; empty documents and
        # stray scalars in the stream are ignored.
        docs = [d for d in yaml.safe_load_all(fh) if isinstance(d, dict) and d.get("kind")]

    # Index the stream: PVCs by (namespace, name), Namespaces by name, and the
    # kopiur SnapshotPolicy / Restore objects as plain lists.
    pvcs, namespaces, policies, restores = {}, {}, [], []
    for d in docs:
        kind = d.get("kind")
        group = (d.get("apiVersion") or "").split("/")[0]
        if kind == "PersistentVolumeClaim":
            pvcs[(meta(d, "namespace"), meta(d, "name"))] = d
        elif kind == "Namespace":
            namespaces[meta(d, "name")] = d
        elif group == KOPIUR_GROUP and kind == "SnapshotPolicy":
            policies.append(d)
        elif group == KOPIUR_GROUP and kind == "Restore":
            restores.append(d)

    fails, warns = [], []
    backed_pvcs, backed_namespaces = set(), set()

    # Pass 1: every SnapshotPolicy source PVC must exist and restore from kopiur.
    for p in policies:
        pns, pname = meta(p, "namespace"), meta(p, "name")
        backed_namespaces.add(pns)
        if not has_mover_sc(p):
            warns.append(f"[mover] SnapshotPolicy {pns}/{pname}: no spec.mover security context (set the data-owner uid:gid)")
        for src in ((p.get("spec") or {}).get("sources") or []):
            pvcname = (src.get("pvc") or {}).get("name")
            if not pvcname:
                continue
            backed_pvcs.add((pns, pvcname))
            pvc = pvcs.get((pns, pvcname))
            if pvc is None:
                fails.append(f"[dsr] SnapshotPolicy {pns}/{pname} backs up PVC '{pvcname}' but no such PVC was rendered")
                continue
            dsr = (pvc.get("spec") or {}).get("dataSourceRef") or {}
            if dsr.get("apiGroup") != KOPIUR_GROUP or dsr.get("kind") != "Restore":
                fails.append(f"[dsr] PVC {pns}/{pvcname} is backed up but dataSourceRef is not a kopiur Restore → recreates EMPTY in DR (got: {dsr or 'none'})")

    # Pass 2: Restore objects only get the mover security-context check.
    for r in restores:
        if not has_mover_sc(r):
            warns.append(f"[mover] Restore {meta(r, 'namespace')}/{meta(r, 'name')}: no spec.mover security context")

    # Pass 3: namespaces holding a SnapshotPolicy need the repo label. A policy
    # rendered without metadata.namespace contributes None to the set, so sort
    # with the same None-safe key the PVC pass uses; a bare sorted() would raise
    # TypeError when None and str namespaces are mixed.
    for ns in sorted(backed_namespaces, key=lambda n: n or ""):
        nd = namespaces.get(ns)
        if nd is None:
            warns.append(f"[nslabel] namespace '{ns}' has kopiur stubs but no Namespace object rendered (can't verify repo label)")
        elif labels_of(nd).get(REPO_LABEL) != REPO_LABEL_VAL:
            fails.append(f"[nslabel] namespace '{ns}' is backed up but missing label {REPO_LABEL}={REPO_LABEL_VAL} → repo creds won't fan in")

    # Pass 4: review every remaining longhorn PVC outside the system namespaces.
    for (pns, pname), pvc in sorted(pvcs.items(), key=lambda kv: (kv[0][0] or "", kv[0][1] or "")):
        if pns in SYSTEM_NS:
            continue
        if (pvc.get("spec") or {}).get("storageClassName") != "longhorn":
            continue
        lbls = labels_of(pvc)
        if any(k.startswith("cnpg.io/") for k in lbls):  # CNPG = Barman, not kopiur
            continue
        if (pns, pname) in backed_pvcs:
            continue
        if lbls.get(EXEMPT_LABEL) == "true":
            if not anns_of(pvc).get(EXEMPT_REASON):
                warns.append(f"[exempt] PVC {pns}/{pname} is backup-exempt but missing {EXEMPT_REASON} annotation")
            continue
        warns.append(f"[gap] PVC {pns}/{pname} (longhorn) is neither backed up nor backup-exempt → review")

    print("== kopiur backup coverage ==")
    print(f" policies={len(policies)} restores={len(restores)} pvcs={len(pvcs)} backed-namespaces={len(backed_namespaces)}")
    for w in warns:
        print(f" WARN {w}")
    for f in fails:
        print(f" FAIL {f}")
    if fails:
        print(f"\n{len(fails)} hard failure(s): a backup would silently fail or a PVC would recreate empty in DR.")
        return 1
    print(f"\nOK — coverage intact ({len(warns)} warning(s), 0 failures).")
    return 0
| 153 | + |
| 154 | + |
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments