|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Panama -> contributions/postcodes/PA.json importer for issue #1039. |
| 3 | +
|
| 4 | +Source data |
| 5 | +----------- |
| 6 | +The community ``viquezr-dev/codigos_postales`` repository ships |
| 7 | +GeoJSON polygons for each Correos de Panamá estafeta (post-office |
| 8 | +delivery zone). Each feature carries: |
| 9 | +
|
| 10 | + { |
| 11 | + "VM_LEVEL": "B7241", # 5-char alphanumeric postal code |
| 12 | + "PROV_NOMB": "DARIÉN", # province (uppercase, with diacritics) |
| 13 | + "ESTAF_NAME": "Estafeta de Yaviza", |
| 14 | + "ESTAF_CODE": "41", # 2-digit office code |
| 15 | + "VM_LEVEL2D": "BA" # province + zone prefix |
| 16 | + } |
| 17 | +
|
| 18 | +Source URL: https://raw.githubusercontent.com/viquezr-dev/codigos_postales/master/estafetas.geojson |
| 19 | +
|
| 20 | +What this script does |
| 21 | +--------------------- |
| 22 | +1. Fetches the GeoJSON via urllib (curl is blocked). |
| 23 | +2. Skips 65 'POR DEFINIR' (undefined) features without an |
| 24 | + assigned ESTAF_CODE. |
| 25 | +3. Maps the 13 source province labels (uppercase Spanish) to CSC's |
| 26 | + iso2 codes via PROV_NOMB_TO_ISO2. |
| 27 | +4. Computes the centroid of each MultiPolygon for lat/lng. |
| 28 | +5. Writes contributions/postcodes/PA.json idempotently. |
| 29 | +
|
| 30 | +Regex fix |
| 31 | +--------- |
| 32 | +Before this PR, countries.json had PA regex `^\\d{5}$` (5-digit |
| 33 | +numeric). Panama's actual Correos de Panamá codes are 1-letter + |
| 34 | +4-digit alphanumeric (e.g. 'B7241', 'K4299'), so the old regex |
| 35 | +rejected 100% of legitimate codes. Updated to `^[A-Z]\\d{4}$` / |
| 36 | +format `@####`. |
| 37 | +
|
| 38 | +Centroid computation |
| 39 | +-------------------- |
| 40 | +The source ships MultiPolygon geometries, not centroids. The |
| 41 | +importer computes the unweighted mean of all polygon vertices — |
| 42 | +sufficient for representative coordinates. (Strictly correct |
| 43 | +geographic centroid would require area-weighting, but this |
| 44 | +2D-mean approximation is within ~1 km for postal-zone-sized |
| 45 | +polygons.) |
| 46 | +
|
| 47 | +Coverage gap |
| 48 | +------------ |
| 49 | +Source predates 2020 — Naso Tjër Di Comarca (CSC iso2 'NT', created |
| 50 | +2020) is not represented. Idempotent merge contract allows future |
| 51 | +sources covering NT to layer in. |
| 52 | +
|
| 53 | +License & attribution |
| 54 | +--------------------- |
| 55 | +- Source: viquezr-dev/codigos_postales (no formal license file) |
| 56 | +- Upstream: Correos de Panamá publicly published map data |
| 57 | +- Each row: ``source: "correos-panama-via-viquezr"`` |
| 58 | +
|
| 59 | +Tier 5 per #1039 license-tier policy. |
| 60 | +
|
| 61 | +Usage |
| 62 | +----- |
| 63 | + python3 bin/scripts/sync/import_panama_postcodes.py |
| 64 | +""" |
| 65 | + |
| 66 | +from __future__ import annotations |
| 67 | + |
| 68 | +import argparse |
| 69 | +import json |
| 70 | +import re |
| 71 | +import sys |
| 72 | +import urllib.request |
| 73 | +from pathlib import Path |
| 74 | +from typing import Dict, List, Tuple |
| 75 | + |
| 76 | + |
# Raw GitHub URL for the community-maintained GeoJSON of Correos de
# Panamá estafeta (post-office delivery zone) polygons.
SOURCE_URL = (
    "https://raw.githubusercontent.com/viquezr-dev/codigos_postales/"
    "master/estafetas.geojson"
)

# Source PROV_NOMB (uppercase Spanish) -> CSC iso2 in PA states.json.
# Convention: ordinary provinces use numeric-string iso2 codes ("1"-"10"),
# while indigenous comarcas use two-letter codes ("EM", "KY", "NB").
# NOTE(review): Naso Tjër Di Comarca ("NT", created 2020) has no entry
# because the source data predates it — see module docstring.
PROV_NOMB_TO_ISO2: Dict[str, str] = {
    "BOCAS DEL TORO": "1",
    "CHIRIQUÍ": "4",
    "COCLÉ": "2",
    "COLÓN": "3",
    "DARIÉN": "5",
    "EMBERA": "EM",  # Emberá-Wounaan Comarca
    "HERRERA": "6",
    # NOTE(review): underscore (not space) presumably mirrors the source's
    # PROV_NOMB spelling exactly — confirm against the GeoJSON.
    "KUNA_YALA": "KY",  # Guna Yala (older Spanish spelling)
    "LOS SANTOS": "7",
    "NGÄBE BUGLÉ": "NB",  # Ngöbe-Buglé Comarca
    "PANAMÁ": "8",
    "PANAMÁ OESTE": "10",
    "VERAGUAS": "9",
}
| 98 | + |
| 99 | + |
def fetch_json(url: str) -> dict:
    """Download *url* and return its body parsed as JSON.

    Sends an explicit User-Agent identifying this importer; uses a
    generous 120 s timeout since the source file is a large GeoJSON.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": "csc-database-postcode-importer"}
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        body = response.read()
    return json.loads(body)
| 106 | + |
| 107 | + |
def polygon_centroid(coords: list) -> Tuple[float, float]:
    """Return (lat, lon): the unweighted mean of all polygon vertices.

    Sufficient for representative postal-zone coordinates. Strict
    geographic centroid would require area-weighting.

    Returns (0.0, 0.0) when *coords* contains no coordinate pairs.
    """
    xs: List[float] = []  # longitudes
    ys: List[float] = []  # latitudes

    # Iterative depth-first walk of the nested coordinate lists; a leaf
    # is any list whose first element is numeric and that has >= 2 items.
    pending: List[object] = [coords]
    while pending:
        node = pending.pop()
        if not (isinstance(node, list) and node):
            continue
        if isinstance(node[0], (int, float)) and len(node) >= 2:
            xs.append(float(node[0]))
            ys.append(float(node[1]))
        else:
            # reversed() keeps the original left-to-right visit order.
            pending.extend(reversed(node))

    if not ys:
        return (0.0, 0.0)
    return (sum(ys) / len(ys), sum(xs) / len(xs))
| 130 | + |
| 131 | + |
def main() -> int:
    """Import Panama postcodes into contributions/postcodes/PA.json.

    Fetches (or loads) the estafeta GeoJSON, validates each code against
    the PA regex from countries.json, attaches state FKs where the
    province maps cleanly, and idempotently merges into PA.json.

    Returns 0 on success, 2 when PA is missing from countries.json.

    Fixes over the previous revision:
    - Records without a resolvable state FK were always EMITTED (there
      was no ``continue``), yet the counter and printout called them
      "Skipped" — the diagnostic label now matches the behavior.
    - Deduplication now happens before the state lookup and centroid
      computation, so duplicate features no longer inflate the
      no-state/unknown-province counters or waste centroid work.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local geojson (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Prefer a local file when provided; otherwise fetch from GitHub raw.
    data = (
        json.loads(Path(args.input).read_text(encoding="utf-8"))
        if args.input
        else fetch_json(SOURCE_URL)
    )
    features = data.get("features", [])
    print(f"Source features: {len(features):,}")

    # bin/scripts/sync/<this file> -> repository root is three levels up.
    project_root = Path(__file__).resolve().parents[3]
    countries = json.load(
        (project_root / "contributions/countries/countries.json").open(encoding="utf-8")
    )
    pa_country = next((c for c in countries if c.get("iso2") == "PA"), None)
    if pa_country is None:
        print("ERROR: PA not in countries.json", file=sys.stderr)
        return 2
    # Validate codes with the country's own regex; fall back to
    # match-anything if the field is absent or empty.
    regex = re.compile(pa_country.get("postal_code_regex") or ".*")

    states = json.load(
        (project_root / "contributions/states/states.json").open(encoding="utf-8")
    )
    pa_states = [s for s in states if s.get("country_id") == pa_country["id"]]
    state_by_iso2: Dict[str, dict] = {
        s["iso2"]: s for s in pa_states if s.get("iso2")
    }
    print(
        f"Country: Panama (id={pa_country['id']}); states indexed: {len(pa_states)}"
    )

    seen: set = set()  # (code, lowercased estafeta name) pairs already emitted
    records: List[dict] = []
    skipped_no_code = 0
    skipped_bad_regex = 0
    no_state_fk = 0  # emitted WITHOUT a state FK (kept, not skipped)
    matched_state = 0
    unknown_provs: Dict[str, int] = {}

    for feat in features:
        props = feat.get("properties", {})
        code = (props.get("VM_LEVEL") or "").strip()
        estaf_code = props.get("ESTAF_CODE")
        estaf_name = (props.get("ESTAF_NAME") or "").strip()
        # 'POR DEFINIR' zones have no assigned code/office yet.
        if not code or not estaf_code or estaf_name == "POR DEFINIR":
            skipped_no_code += 1
            continue
        if not regex.match(code):
            skipped_bad_regex += 1
            continue

        # Dedupe before any further work so duplicates neither skew the
        # state-FK counters nor trigger centroid computation.
        key = (code, estaf_name.lower())
        if key in seen:
            continue
        seen.add(key)

        prov_nomb = (props.get("PROV_NOMB") or "").strip()
        iso2 = PROV_NOMB_TO_ISO2.get(prov_nomb)
        state = state_by_iso2.get(iso2) if iso2 else None
        if state is None:
            # The record is still emitted below, just without state FKs.
            unknown_provs[prov_nomb] = unknown_provs.get(prov_nomb, 0) + 1
            no_state_fk += 1

        geom = feat.get("geometry", {}) or {}
        lat, lon = polygon_centroid(geom.get("coordinates", []))

        record: Dict[str, object] = {
            "code": code,
            "country_id": int(pa_country["id"]),
            "country_code": "PA",
        }
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = state.get("iso2")
            matched_state += 1
        if estaf_name:
            record["locality_name"] = estaf_name
        # (0, 0) is the centroid sentinel for "no vertices" — omit coords.
        if lat or lon:
            record["latitude"] = f"{lat:.6f}"
            record["longitude"] = f"{lon:.6f}"
        record["type"] = "full"
        record["source"] = "correos-panama-via-viquezr"
        records.append(record)

    print(f"Skipped (no code/POR DEFINIR): {skipped_no_code:,}")
    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"No state FK (kept without state): {no_state_fk:,}")
    print(f"Records emitted: {len(records):,}")
    pct = matched_state * 100 // max(1, len(records))
    print(f"  with state: {matched_state:,} ({pct}%)")
    if unknown_provs:
        print("Unknown PROV_NOMB (not in PROV_NOMB_TO_ISO2):")
        for p, n in sorted(unknown_provs.items(), key=lambda x: -x[1]):
            print(f"  {p!r}: {n}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/PA.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        # Idempotent merge: keep every existing row, append only new
        # (code, locality) pairs, then re-sort the whole file.
        with target.open(encoding="utf-8") as f:
            existing = json.load(f)
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    else:
        merged = sorted(records, key=lambda r: (r["code"], r.get("locality_name", "")))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
| 261 | + |
| 262 | + |
# Script entry point: propagate main()'s exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments