|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""United Kingdom -> contributions/postcodes/GB.json importer for issue #1039. |
| 3 | +
|
| 4 | +Source data |
| 5 | +----------- |
| 6 | +The community ``dwyl/uk-postcodes-latitude-longitude-complete-csv`` |
| 7 | +repository ships a 32 MB ZIP containing 1,738,243 UK postcodes with |
| 8 | +WGS-84 lat/lng centroids (October 2017 snapshot from Ordnance Survey |
| 9 | +Code-Point Open). |
| 10 | +
|
| 11 | + id,postcode,latitude,longitude |
| 12 | + 1,AB10 1XG,57.144165,-2.114848 |
| 13 | + 2,AB10 6RN,57.137880,-2.121487 |
| 14 | +
|
| 15 | +What this script ships |
| 16 | +---------------------- |
| 17 | +**Postcode-area level (1-2 letter prefix), 124 records.** |
| 18 | +
|
| 19 | +The full 6-7 character UK postcode list at 1.7M rows would generate |
| 20 | +a ~500 MB JSON, which exceeds the in-band cities/*.json size envelope |
| 21 | +(largest currently is PT.json at 38 MB). Per the maintainer's notes, |
| 22 | +records >200k need the gz-to-Releases pattern (#1374) — not yet |
| 23 | +deployed. |
| 24 | +
|
| 25 | +Postcode-area level is the UK equivalent of Canada's FSA: 124 1-2 |
letter prefixes (M, SW, EC, etc.) each covering thousands of full
| 27 | +postcodes. Each row carries the area-centroid lat/lng (mean of all |
| 28 | +underlying full postcode centroids) and the canonical city/region |
| 29 | +locality label. |
| 30 | +
|
| 31 | +What this script does |
| 32 | +--------------------- |
| 33 | +1. Fetches the ZIP via urllib (curl is blocked). |
| 34 | +2. Extracts ukpostcodes.csv in-memory. |
| 35 | +3. Aggregates 1.7M rows by 1-2 letter area prefix to compute mean |
| 36 | + centroid. |
| 37 | +4. Joins each area to its canonical city/region label via |
| 38 | + AREA_TO_LOCALITY (124-entry hand-curated map). |
| 39 | +5. Writes contributions/postcodes/GB.json idempotently. |
| 40 | +
|
| 41 | +Why country-only state FK |
| 42 | +------------------------- |
| 43 | +CSC has 221 GB states across nine types (unitary authority, |
| 44 | +metropolitan district, london borough, council area, two-tier county, |
| 45 | +district, country, province, city). Postcode areas often span |
| 46 | +multiple unitary authorities or counties, so a 1:1 area->state map |
| 47 | +would be misleading. Future PRs can layer in postcode-district-level |
| 48 | +FK (~3,000 districts), which is finer-grained than area but still |
| 49 | +manageable in size. |
| 50 | +
|
| 51 | +This matches the "country-only" pattern already used for SE (Sweden) |
| 52 | +and SI (Slovenia) where source data doesn't map cleanly to CSC's |
| 53 | +state hierarchy. |
| 54 | +
|
| 55 | +License & attribution |
| 56 | +--------------------- |
| 57 | +- Source: dwyl/uk-postcodes-latitude-longitude-complete-csv (no |
| 58 | + formal license file, October 2017 snapshot) |
| 59 | +- Upstream: Ordnance Survey Code-Point Open (OS OpenData / OGL3, |
| 60 | + Crown Copyright) |
| 61 | +- Each row: ``source: "ordnance-survey-via-dwyl"`` |
| 62 | +
|
| 63 | +Tier 5 per #1039 license-tier policy (free redistribution permitted, |
| 64 | +no formal licence) — flagged in PR. |
| 65 | +
|
| 66 | +Usage |
| 67 | +----- |
| 68 | + python3 bin/scripts/sync/import_uk_postcodes.py |
| 69 | +""" |
| 70 | + |
| 71 | +from __future__ import annotations |
| 72 | + |
| 73 | +import argparse |
| 74 | +import csv |
| 75 | +import io |
| 76 | +import json |
| 77 | +import re |
| 78 | +import sys |
| 79 | +import urllib.request |
| 80 | +import zipfile |
| 81 | +from pathlib import Path |
| 82 | +from typing import Dict, List |
| 83 | + |
| 84 | + |
# Raw ZIP of the dwyl UK-postcodes CSV snapshot (October 2017, ~32 MB).
# Fetched over raw.githubusercontent.com because the repo ships the data
# as a committed file rather than a release artifact.
SOURCE_URL = (
    "https://raw.githubusercontent.com/dwyl/"
    "uk-postcodes-latitude-longitude-complete-csv/master/ukpostcodes.csv.zip"
)

# The 124 UK postcode areas mapped to their canonical city/region label.
# Ref: Royal Mail Postcode Address File area definitions.
# Keys are the 1-2 letter area prefix; values feed the "locality_name"
# field of each emitted record (see main()).  Areas the source data
# contains but this map lacks are reported as "Unknown areas" at run time.
AREA_TO_LOCALITY: Dict[str, str] = {
    "AB": "Aberdeen",
    "AL": "St Albans",
    "B": "Birmingham",
    "BA": "Bath",
    "BB": "Blackburn",
    "BD": "Bradford",
    "BH": "Bournemouth",
    "BL": "Bolton",
    "BN": "Brighton",
    "BR": "Bromley",
    "BS": "Bristol",
    "BT": "Belfast",
    "CA": "Carlisle",
    "CB": "Cambridge",
    "CF": "Cardiff",
    "CH": "Chester",
    "CM": "Chelmsford",
    "CO": "Colchester",
    "CR": "Croydon",
    "CT": "Canterbury",
    "CV": "Coventry",
    "CW": "Crewe",
    "DA": "Dartford",
    "DD": "Dundee",
    "DE": "Derby",
    "DG": "Dumfries",
    "DH": "Durham",
    "DL": "Darlington",
    "DN": "Doncaster",
    "DT": "Dorchester",
    "DY": "Dudley",
    "E": "London East",
    "EC": "London East Central",
    "EH": "Edinburgh",
    "EN": "Enfield",
    "EX": "Exeter",
    "FK": "Falkirk",
    "FY": "Blackpool (Fylde)",
    "G": "Glasgow",
    "GL": "Gloucester",
    "GU": "Guildford",
    "GY": "Guernsey",
    "HA": "Harrow",
    "HD": "Huddersfield",
    "HG": "Harrogate",
    "HP": "Hemel Hempstead",
    "HR": "Hereford",
    "HS": "Outer Hebrides",
    "HU": "Hull",
    "HX": "Halifax",
    "IG": "Ilford",
    "IM": "Isle of Man",
    "IP": "Ipswich",
    "IV": "Inverness",
    "JE": "Jersey",
    "KA": "Kilmarnock",
    "KT": "Kingston upon Thames",
    "KW": "Kirkwall",
    "KY": "Kirkcaldy",
    "L": "Liverpool",
    "LA": "Lancaster",
    "LD": "Llandrindod Wells",
    "LE": "Leicester",
    "LL": "Llandudno",
    "LN": "Lincoln",
    "LS": "Leeds",
    "LU": "Luton",
    "M": "Manchester",
    "ME": "Medway",
    "MK": "Milton Keynes",
    "ML": "Motherwell",
    "N": "London North",
    "NE": "Newcastle upon Tyne",
    "NG": "Nottingham",
    "NN": "Northampton",
    "NP": "Newport",
    "NR": "Norwich",
    "NW": "London North West",
    "OL": "Oldham",
    "OX": "Oxford",
    "PA": "Paisley",
    "PE": "Peterborough",
    "PH": "Perth",
    "PL": "Plymouth",
    "PO": "Portsmouth",
    "PR": "Preston",
    "RG": "Reading",
    "RH": "Redhill",
    "RM": "Romford",
    "S": "Sheffield",
    "SA": "Swansea",
    "SE": "London South East",
    "SG": "Stevenage",
    "SK": "Stockport",
    "SL": "Slough",
    "SM": "Sutton",
    "SN": "Swindon",
    "SO": "Southampton",
    "SP": "Salisbury",
    "SR": "Sunderland",
    "SS": "Southend-on-Sea",
    "ST": "Stoke-on-Trent",
    "SW": "London South West",
    "SY": "Shrewsbury",
    "TA": "Taunton",
    "TD": "Tweeddale",
    "TF": "Telford",
    "TN": "Tonbridge",
    "TQ": "Torquay",
    "TR": "Truro",
    "TS": "Cleveland",
    "TW": "Twickenham",
    "UB": "Southall",
    "W": "London West",
    "WA": "Warrington",
    "WC": "London West Central",
    "WD": "Watford",
    "WF": "Wakefield",
    "WN": "Wigan",
    "WR": "Worcester",
    "WS": "Walsall",
    "WV": "Wolverhampton",
    "YO": "York",
    "ZE": "Shetland",
}

# Leading 1-2 letter prefix of a normalised (space-stripped, uppercased)
# postcode; greedy, so "AB10" yields "AB" and "B1" yields "B".
AREA_RE = re.compile(r"^([A-Z]{1,2})")
| 220 | + |
| 221 | + |
def fetch_zip(url: str) -> bytes:
    """Download *url* and return the raw response body as bytes.

    Sends a custom User-Agent (the script's docstring notes curl is
    blocked; a descriptive agent identifies the importer) and allows a
    generous 180 s timeout for the ~32 MB payload.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "csc-database-postcode-importer"},
    )
    with urllib.request.urlopen(request, timeout=180) as response:
        return response.read()
| 228 | + |
| 229 | + |
def _read_csv_from_zip(raw: bytes) -> str:
    """Extract the first real .csv member of the ZIP in-memory.

    Skips macOS ``__MACOSX`` metadata entries; decodes as UTF-8 with
    replacement so a stray byte cannot abort the whole import.
    """
    zf = zipfile.ZipFile(io.BytesIO(raw))
    csv_name = next(
        n for n in zf.namelist() if n.endswith(".csv") and not n.startswith("__")
    )
    with zf.open(csv_name) as f:
        return f.read().decode("utf-8", errors="replace")


def _aggregate_by_area(text: str):
    """Stream CSV rows and sum lat/lon per 1-2 letter postcode area.

    Returns ``(total_rows, area_data)`` where ``area_data`` maps area ->
    {"count", "lat_sum", "lon_sum"}.  Rows with an unparsable prefix or
    non-numeric coordinates are silently skipped (best-effort import).
    """
    reader = csv.DictReader(io.StringIO(text))
    area_data: Dict[str, Dict[str, float]] = {}
    total = 0
    for row in reader:
        total += 1
        pc = (row.get("postcode") or "").replace(" ", "").upper()
        m = AREA_RE.match(pc)
        if not m:
            continue
        area = m.group(1)
        try:
            lat = float(row["latitude"])
            lon = float(row["longitude"])
        except (ValueError, TypeError, KeyError):
            continue
        d = area_data.setdefault(
            area, {"count": 0, "lat_sum": 0.0, "lon_sum": 0.0}
        )
        d["count"] += 1
        d["lat_sum"] += lat
        d["lon_sum"] += lon
    return total, area_data


def _build_records(area_data, regex, country_id: int):
    """Turn per-area aggregates into output records.

    Returns ``(records, skipped_bad_regex, unknown_areas)``.  Areas
    failing the country's postal-code regex are counted and dropped;
    areas missing from AREA_TO_LOCALITY are emitted without a
    ``locality_name`` and reported for curation.
    """
    records: List[dict] = []
    skipped_bad_regex = 0
    unknown_areas: List[str] = []

    for area in sorted(area_data):
        d = area_data[area]
        if not regex.match(area):
            skipped_bad_regex += 1
            continue
        # Mean centroid of all full postcodes under this area.
        lat = d["lat_sum"] / d["count"]
        lon = d["lon_sum"] / d["count"]
        locality = AREA_TO_LOCALITY.get(area)
        if locality is None:
            unknown_areas.append(area)
            locality = ""

        record: Dict[str, object] = {
            "code": area,
            "country_id": country_id,
            "country_code": "GB",
        }
        if locality:
            record["locality_name"] = locality
        # Fixed 6-decimal strings match the repo's existing postcode files.
        record["latitude"] = f"{lat:.6f}"
        record["longitude"] = f"{lon:.6f}"
        record["type"] = "area"
        record["source"] = "ordnance-survey-via-dwyl"
        records.append(record)

    return records, skipped_bad_regex, unknown_areas


def _merge_existing(target: Path, records: List[dict]) -> List[dict]:
    """Idempotently merge *records* into any existing target JSON.

    Dedup key is (code, lowercased locality_name); output is sorted so
    reruns produce byte-identical files.
    """
    if not target.exists():
        return sorted(
            records, key=lambda r: (r["code"], r.get("locality_name", ""))
        )
    with target.open(encoding="utf-8") as f:
        existing = json.load(f)
    seen = {
        (r["code"], (r.get("locality_name") or "").lower()) for r in existing
    }
    merged = list(existing)
    for r in records:
        key = (r["code"], (r.get("locality_name") or "").lower())
        if key not in seen:
            merged.append(r)
            seen.add(key)
    merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    return merged


def main() -> int:
    """Fetch, aggregate and write the GB postcode-area file.

    Returns 0 on success (or --dry-run), 2 when GB is missing from
    countries.json.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local zip (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    raw = (
        Path(args.input).read_bytes()
        if args.input
        else fetch_zip(SOURCE_URL)
    )
    print(f"zip size: {len(raw):,} bytes")

    text = _read_csv_from_zip(raw)

    # bin/scripts/sync/<this file> -> three parents up is the repo root.
    project_root = Path(__file__).resolve().parents[3]
    countries_path = project_root / "contributions/countries/countries.json"
    # Context manager closes the handle deterministically (the previous
    # version leaked it to the garbage collector).
    with countries_path.open(encoding="utf-8") as f:
        countries = json.load(f)
    gb_country = next((c for c in countries if c.get("iso2") == "GB"), None)
    if gb_country is None:
        print("ERROR: GB not in countries.json", file=sys.stderr)
        return 2
    # NOTE(review): postal_code_regex presumably targets *full* postcodes
    # (letters + digits); a bare 1-2 letter area may not match it, which
    # would show up as a large "Skipped (regex fail)" count.  Behaviour
    # kept as-is -- confirm against the actual GB regex value.
    regex = re.compile(gb_country.get("postal_code_regex") or ".*")
    print(f"Country: United Kingdom (id={gb_country['id']})")

    total, area_data = _aggregate_by_area(text)
    print(f"Source rows: {total:,}; distinct postcode areas: {len(area_data)}")

    records, skipped_bad_regex, unknown_areas = _build_records(
        area_data, regex, int(gb_country["id"])
    )
    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Records emitted: {len(records):,}")
    if unknown_areas:
        print(f"Unknown areas (not in AREA_TO_LOCALITY): {unknown_areas}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/GB.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    merged = _merge_existing(target, records)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0
| 350 | + |
| 351 | + |
# Script entry point: exit with main()'s status code.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments