Skip to content

Commit f3e7674

Browse files
dr5hnclaude
andcommitted
feat(postcodes/PL): bulk-import 22,090 codes via Polish open data (#1039)
Adds Polish postcodes via the mberezinski/kody-pocztowe-geo mirror, which includes voivodeship + coordinates for every code. 1. bin/scripts/sync/import_poland_postcodes.py — pipeline reading the semicolon-delimited UTF-8-BOM CSV. Resolves voivodeship via Polish name -> CSC iso2 alias map (16 voivodships). 2. contributions/postcodes/PL.json — 22,090 records, 100% state_id + 100% coordinate resolution. Validation (zero errors) - All codes match countries.postal_code_regex (^\d{2}-\d{3}$) - All FKs resolve, all state_codes agree with state.iso2 License & attribution - Mirror: github.com/mberezinski/kody-pocztowe-geo - Each row: source: 'kody-pocztowe-geo' Refs: #1039 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 69601e3 commit f3e7674

2 files changed

Lines changed: 265251 additions & 0 deletions

File tree

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env python3
2+
"""Poland -> contributions/postcodes/PL.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community-maintained ``mberezinski/kody-pocztowe-geo`` repo
7+
redistributes Polish postcode data with coordinates and voivodeship.
8+
9+
https://github.com/mberezinski/kody-pocztowe-geo
10+
11+
CSV format (semicolon-delimited, UTF-8 BOM):
12+
Country;PostCode;Longitude;Latitude;Address;City;County;Voivodeship;CityBasedAproximation
13+
14+
~22k records.
15+
16+
What this script does
17+
---------------------
18+
1. Reads the CSV (UTF-8 with BOM, semicolon-delimited)
19+
2. Picks one canonical record per unique postcode (first city alphabetical)
20+
3. Resolves voivodeship via Polish-name -> CSC iso2 alias map
21+
4. Writes contributions/postcodes/PL.json
22+
23+
License & attribution
24+
---------------------
25+
- Mirror: github.com/mberezinski/kody-pocztowe-geo
26+
- Each row: source: "kody-pocztowe-geo"
27+
"""
28+
29+
from __future__ import annotations
30+
31+
import argparse
32+
import csv
33+
import json
34+
import sys
35+
from pathlib import Path
36+
from typing import Dict, List, Optional
37+
38+
# Polish "Województwo X" -> CSC state.iso2
39+
# Lookup is by exact key, so each key must equal the (stripped) text of the
# CSV "Voivodeship" column. Values are the two-digit state codes used as
# state.iso2; the trailing comment gives the usual English name.
VOIVODESHIP_TO_ISO2: Dict[str, str] = {
    "Województwo dolnośląskie": "02",         # Lower Silesian
    "Województwo kujawsko-pomorskie": "04",   # Kuyavian-Pomeranian
    "Województwo lubelskie": "06",            # Lublin
    "Województwo lubuskie": "08",             # Lubusz
    "Województwo łódzkie": "10",              # Łódź
    "Województwo małopolskie": "12",          # Lesser Poland
    "Województwo mazowieckie": "14",          # Masovian
    "Województwo opolskie": "16",             # Opole (original note: states.json names it "Upper Silesia")
    "Województwo podkarpackie": "18",         # Subcarpathian
    "Województwo podlaskie": "20",            # Podlaskie
    "Województwo pomorskie": "22",            # Pomeranian
    "Województwo śląskie": "24",              # Silesian
    "Województwo świętokrzyskie": "26",       # Świętokrzyskie (Holy Cross)
    "Województwo warmińsko-mazurskie": "28",  # Warmian-Masurian
    "Województwo wielkopolskie": "30",        # Greater Poland
    "Województwo zachodniopomorskie": "32",   # West Pomeranian
}
57+
58+
59+
def parse_coord(v: str, *, limit: float = 180.0) -> Optional[str]:
    """Normalise a raw coordinate field into a compact decimal string.

    Args:
        v: Raw field text. Empty input and the literal ``NULL`` (any case,
            surrounding whitespace ignored) are treated as "no value".
        limit: Largest absolute value accepted. Defaults to 180 (the
            longitude bound); pass ``limit=90`` to validate a latitude.

    Returns:
        The number rounded to at most 8 decimal places with trailing zeros
        (and a dangling decimal point) removed, e.g. ``"21.5"``; or ``None``
        when the field is blank, ``NULL``, not parseable as a float, NaN, or
        outside ``[-limit, limit]``.
    """
    if not v or v.strip().upper() == "NULL":
        return None
    try:
        f = float(v)
    except ValueError:
        return None
    # Written as `not (abs(f) <= limit)` instead of `abs(f) > limit`: every
    # comparison with NaN is False, so the latter would let "nan" through.
    if not abs(f) <= limit:
        return None
    text = f"{f:.8f}".rstrip("0").rstrip(".")
    # Negative zero (or a negative value that rounds to zero) would otherwise
    # come out as "-0".
    return "0" if text in ("", "-0") else text
69+
70+
71+
def _load_json(path: Path):
    """Parse the UTF-8 JSON file at *path*, closing the handle afterwards."""
    with path.open(encoding="utf-8") as f:
        return json.load(f)


def _merge_sort_key(record: dict):
    """Sort key (code, locality); a missing or null locality sorts as ""."""
    return (record["code"], record.get("locality_name") or "")


def main() -> int:
    """Build contributions/postcodes/PL.json from the source CSV.

    Command-line options:
        --input    Path to the semicolon-delimited CSV
                   (default ``/tmp/pl_kody.csv``).
        --dry-run  Print the statistics only; do not write PL.json.

    Returns:
        Process exit status: 0 on success (including a dry run), 2 when the
        input CSV is missing or Poland is not found in countries.json.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default="/tmp/pl_kody.csv")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    src = Path(args.input)
    if not src.exists():
        print(f"ERROR: input not found: {src}", file=sys.stderr)
        return 2

    # parents[3] is three directories above the one holding this script;
    # this assumes the script sits three levels below the repository root.
    project_root = Path(__file__).resolve().parents[3]
    countries = _load_json(project_root / "contributions/countries/countries.json")
    pl = next((c for c in countries if c.get("iso2") == "PL"), None)
    if pl is None:
        print("ERROR: PL not in countries.json", file=sys.stderr)
        return 2
    states = _load_json(project_root / "contributions/states/states.json")
    pl_states = [s for s in states if s.get("country_id") == pl["id"]]
    state_by_iso2: Dict[str, dict] = {
        (s.get("iso2") or "").upper(): s for s in pl_states if s.get("iso2")
    }
    print(f"Country: Poland (id={pl['id']}); states indexed: {len(state_by_iso2)}")

    # Group every CSV row under its postcode; rows without a code are
    # counted and dropped.
    by_code: Dict[str, List[dict]] = {}
    bad = 0
    with src.open(encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f, delimiter=";"):
            code = (row.get("PostCode") or "").strip()
            if not code:
                bad += 1
                continue
            by_code.setdefault(code, []).append(row)
    print(f"Skipped malformed: {bad:,}")
    print(f"Unique postcodes: {len(by_code):,}")

    records: List[dict] = []
    matched_state = 0
    matched_coord = 0
    unresolved: Dict[str, int] = {}  # voivodeship text -> records left without a state
    for code in sorted(by_code):
        # Canonical row: smallest upper-cased city name. min() returns the
        # first minimal row, i.e. the same row a stable sort would put first.
        chosen = min(by_code[code], key=lambda r: (r.get("City") or "").upper())
        record = {
            "code": code,
            "country_id": int(pl["id"]),
            "country_code": "PL",
        }
        voiv = (chosen.get("Voivodeship") or "").strip()
        iso2 = VOIVODESHIP_TO_ISO2.get(voiv)
        state = state_by_iso2.get(iso2) if iso2 else None
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = iso2
            matched_state += 1
        else:
            # Remember what failed to resolve so it can be reported instead
            # of silently producing state-less records.
            unresolved[voiv] = unresolved.get(voiv, 0) + 1
        city = (chosen.get("City") or "").strip()
        if city:
            record["locality_name"] = city
        record["type"] = "full"
        lat = parse_coord(chosen.get("Latitude") or "")
        lng = parse_coord(chosen.get("Longitude") or "")
        # parse_coord's default bound is +/-180, which suits longitude;
        # latitude additionally has to lie within +/-90 (this comparison is
        # also False for NaN).
        if lat is not None and lng is not None and abs(float(lat)) <= 90:
            record["latitude"] = lat
            record["longitude"] = lng
            matched_coord += 1
        record["source"] = "kody-pocztowe-geo"
        records.append(record)

    print(f"Records: {len(records):,}")
    print(f" with state: {matched_state:,} ({matched_state*100//max(1,len(records))}%)")
    print(f" with coords: {matched_coord:,} ({matched_coord*100//max(1,len(records))}%)")
    for name, count in sorted(unresolved.items()):
        print(f"WARNING: no state resolved for voivodeship {name!r}: {count:,} record(s)", file=sys.stderr)

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/PL.json"
    if target.exists():
        # Keep everything already in PL.json and append only records whose
        # (code, lower-cased locality) pair is not present yet.
        existing = _load_json(target)
        seen = {(r["code"], (r.get("locality_name") or "").lower()) for r in existing}
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in seen:
                merged.append(r)
                seen.add(key)
    else:
        merged = list(records)
    # The key maps a null locality to "" so an existing record carrying
    # "locality_name": null cannot trigger a None-vs-str TypeError on a tie.
    merged.sort(key=_merge_sort_key)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_mb = target.stat().st_size / (1024 * 1024)
    print(f"\n[OK] Wrote {target.relative_to(project_root)} ({len(merged):,} rows, {size_mb:.1f} MB)")
    return 0
166+
167+
168+
if __name__ == "__main__":
    # Hand main()'s integer status to the interpreter as the exit code.
    sys.exit(main())

0 commit comments

Comments
 (0)