Skip to content

Commit 5154beb

Browse files
dr5hn and claude authored
feat(postcodes/ES): bulk-import 8,124 codes via INE/Correos (#1039) (#1437)
Adds the importer + first run for Spain. Uses the walterleonardo mirror of INE/Correos's official codigos-postales-municipios mapping. 1. bin/scripts/sync/import_spain_postcodes.py — pipeline reading the 3-column header-less CSV (id, postal_code, municipality_name). Resolves state via the well-known Spanish postal-prefix to ISO2 convention (01->VI Álava, 28->M Madrid, 08->B Barcelona, ..., 51->CE Ceuta, 52->ML Melilla). states.json uses license-plate-style codes (A, B, M, GR, ...) so a hardcoded 52-entry prefix map is the cleanest resolution path. 2. contributions/postcodes/ES.json — 8,124 unique postcodes with 100% state_id resolution. State resolution - 50 provinces + 2 autonomous cities (Ceuta, Melilla) covered - Postal prefixes 01-52 are stable since 1980s; map is authoritative Validation (zero errors across 8,124 records) - All codes match countries.postal_code_regex (^(\\d{5})\$) - All FKs resolve, all state_codes agree with state.iso2 - No auto-managed fields present License & attribution - Upstream: INE/Correos (open data) - Mirror: github.com/walterleonardo/codigos_postales_espa-a - Each row: source: "ine" Refs: #1039 Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 21cb337 commit 5154beb

2 files changed

Lines changed: 81451 additions & 0 deletions

File tree

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
#!/usr/bin/env python3
2+
"""Spain CP -> contributions/postcodes/ES.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community-maintained ``walterleonardo/codigos_postales_espa-a`` archive
7+
is a redistribution of the official INE/Correos Spanish postcode-municipio
8+
mapping. The source file format is a header-less, comma-delimited CSV
9+
with three columns:
10+
11+
id, postal_code, municipality_name
12+
13+
8,124 unique postcodes (as of the first import) covering all 50 provinces plus Ceuta (51) and
14+
Melilla (52).
15+
16+
What this script does
17+
---------------------
18+
1. Reads the CSV (UTF-8, comma-delimited, no header)
19+
2. Picks ONE canonical municipality per unique postcode (first alphabetical)
20+
3. Resolves state by mapping the postcode's first 2 digits to province ISO
21+
codes via POSTAL_PREFIX_TO_ISO2 (the well-known Spanish convention,
22+
01=VI/Álava, 28=M/Madrid, 08=B/Barcelona, etc.)
23+
4. Writes contributions/postcodes/ES.json
24+
25+
Why the prefix map
26+
------------------
27+
Spain's states.json uses license-plate-style ISO codes (A, B, M, GR, ...)
28+
rather than numeric postal prefixes. Since the source CSV has no province
29+
column, mapping from the postcode prefix is the only way to resolve state
30+
and is universally documented.
31+
32+
License
33+
-------
34+
- Upstream: INE/Correos (open data, no formal redistribution licence text;
35+
used widely in commercial and open contexts)
36+
- Mirror: github.com/walterleonardo/codigos_postales_espa-a
37+
- Each row: source: "ine"
38+
39+
Usage
40+
-----
41+
python3 -c "import urllib.request; urllib.request.urlretrieve(
42+
'https://raw.githubusercontent.com/walterleonardo/codigos_postales_espa-a/master/codigos_postales_municipios.csv',
43+
'/tmp/es_postales.csv')"
44+
45+
python3 bin/scripts/sync/import_spain_postcodes.py
46+
"""
47+
48+
from __future__ import annotations
49+
50+
import argparse
51+
import csv
52+
import json
53+
import sys
54+
from pathlib import Path
55+
from typing import Dict, List, Optional
56+
57+
58+
# Lookup table: first two digits of a Spanish postcode -> province code as
# stored in states.json (the license-plate-style ISO2 values). Entries are
# listed in ascending prefix order, 01 through 52 (50 provinces plus the
# autonomous cities of Ceuta and Melilla).
POSTAL_PREFIX_TO_ISO2: Dict[str, str] = dict(
    (
        ("01", "VI"),  # Araba / Álava
        ("02", "AB"),  # Albacete
        ("03", "A"),   # Alicante
        ("04", "AL"),  # Almería
        ("05", "AV"),  # Ávila
        ("06", "BA"),  # Badajoz
        ("07", "PM"),  # Islas Baleares
        ("08", "B"),   # Barcelona
        ("09", "BU"),  # Burgos
        ("10", "CC"),  # Cáceres
        ("11", "CA"),  # Cádiz
        ("12", "CS"),  # Castellón
        ("13", "CR"),  # Ciudad Real
        ("14", "CO"),  # Córdoba
        ("15", "C"),   # A Coruña
        ("16", "CU"),  # Cuenca
        ("17", "GI"),  # Girona
        ("18", "GR"),  # Granada
        ("19", "GU"),  # Guadalajara
        ("20", "SS"),  # Gipuzkoa
        ("21", "H"),   # Huelva
        ("22", "HU"),  # Huesca
        ("23", "J"),   # Jaén
        ("24", "LE"),  # León
        ("25", "L"),   # Lleida
        ("26", "LO"),  # La Rioja (Logroño)
        ("27", "LU"),  # Lugo
        ("28", "M"),   # Madrid
        ("29", "MA"),  # Málaga
        ("30", "MU"),  # Murcia
        ("31", "NA"),  # Navarra
        ("32", "OR"),  # Ourense
        ("33", "O"),   # Asturias
        ("34", "P"),   # Palencia
        ("35", "GC"),  # Las Palmas
        ("36", "PO"),  # Pontevedra
        ("37", "SA"),  # Salamanca
        ("38", "TF"),  # Santa Cruz de Tenerife
        ("39", "S"),   # Cantabria
        ("40", "SG"),  # Segovia
        ("41", "SE"),  # Sevilla
        ("42", "SO"),  # Soria
        ("43", "T"),   # Tarragona
        ("44", "TE"),  # Teruel
        ("45", "TO"),  # Toledo
        ("46", "V"),   # Valencia
        ("47", "VA"),  # Valladolid
        ("48", "BI"),  # Bizkaia / Vizcaya
        ("49", "ZA"),  # Zamora
        ("50", "Z"),   # Zaragoza
        ("51", "CE"),  # Ceuta
        ("52", "ML"),  # Melilla
    )
)
114+
115+
116+
def _load_json(path: Path):
    """Parse the JSON document at *path*, closing the file handle afterwards."""
    with path.open(encoding="utf-8") as f:
        return json.load(f)


def _read_postcodes(src: Path) -> tuple[Dict[str, List[str]], int]:
    """Group municipality names by postcode from the header-less source CSV.

    Columns are ``id, postal_code, municipality_name``. Returns
    ``(by_postcode, bad)`` where ``by_postcode`` maps each zero-padded 5-digit
    code to every municipality name seen for it (in file order) and ``bad`` is
    the number of non-blank rows that were rejected.
    """
    by_postcode: Dict[str, List[str]] = {}
    bad = 0
    with src.open(encoding="utf-8", newline="") as f:
        for row in csv.reader(f):
            if not row:
                continue  # blank line, nothing to report
            if len(row) < 3:
                # Counted (not silently dropped) so that a wrong delimiter
                # shows up as a large "malformed" number instead of an
                # unexplained empty result.
                bad += 1
                continue
            raw = row[1].strip()
            # Validate BEFORE padding: zfill() would otherwise turn an empty
            # cell into the plausible-looking code "00000". isascii() keeps
            # out non-ASCII digits that str.isdigit() alone would accept.
            if not (raw.isascii() and raw.isdigit()) or len(raw) > 5:
                bad += 1
                continue
            commune = row[2].strip().strip('"')
            # zfill restores leading zeros lost upstream (e.g. 1001 -> 01001).
            by_postcode.setdefault(raw.zfill(5), []).append(commune)
    return by_postcode, bad


def _build_records(
    by_postcode: Dict[str, List[str]],
    country_id: int,
    state_by_iso2: Dict[str, dict],
) -> tuple[List[dict], int]:
    """Build one output record per postcode, in ascending code order.

    Returns ``(records, matched_state)`` where ``matched_state`` counts the
    records whose 2-digit prefix resolved to a state in ``state_by_iso2``.
    """
    records: List[dict] = []
    matched_state = 0
    for code in sorted(by_postcode):
        # Canonical municipality = first alphabetical (case-insensitive).
        # Blank names are excluded first so an empty cell cannot win the
        # ordering and suppress a real locality name.
        names = [name for name in by_postcode[code] if name]
        commune = min(names, key=str.upper) if names else ""
        record = {
            "code": code,
            "country_id": country_id,
            "country_code": "ES",
        }
        iso2 = POSTAL_PREFIX_TO_ISO2.get(code[:2])
        state = state_by_iso2.get(iso2) if iso2 else None
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = iso2
            matched_state += 1
        if commune:
            record["locality_name"] = commune
        record["type"] = "full"
        record["source"] = "ine"
        records.append(record)
    return records, matched_state


def _dedupe_key(record: dict) -> tuple:
    """Identity of a record for merging: (code, lower-cased locality name)."""
    return (record["code"], (record.get("locality_name") or "").lower())


def _sort_key(record: dict) -> tuple:
    """Output ordering: by code, then locality name.

    A missing or null ``locality_name`` is treated as "" so that records with
    an explicit ``null`` do not raise TypeError when compared with strings.
    """
    return (record["code"], record.get("locality_name") or "")


def _merge_records(existing: List[dict], records: List[dict]) -> List[dict]:
    """Append records not already present in *existing* and return them sorted.

    Existing rows are never modified or removed; a new record is added only
    when its (code, locality) identity is not already present.
    """
    seen = {_dedupe_key(r) for r in existing}
    merged = list(existing)
    for r in records:
        key = _dedupe_key(r)
        if key not in seen:
            merged.append(r)
            seen.add(key)
    return sorted(merged, key=_sort_key)


def main() -> int:
    """Import Spanish postcodes into contributions/postcodes/ES.json.

    Command-line options:
        --input    path of the source CSV (default: /tmp/es_postales.csv)
        --dry-run  parse and report statistics without writing the target

    Returns:
        0 on success (including dry runs); 2 when the input file is missing
        or Spain cannot be found in countries.json.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's section layout in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--input", default="/tmp/es_postales.csv")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    src = Path(args.input)
    if not src.is_file():
        print(f"ERROR: input not found: {src}", file=sys.stderr)
        return 2

    # The script lives in bin/scripts/sync/, so the repository root is three
    # directories above the containing folder.
    project_root = Path(__file__).resolve().parents[3]
    countries = _load_json(project_root / "contributions/countries/countries.json")
    es = next((c for c in countries if c.get("iso2") == "ES"), None)
    if es is None:
        print("ERROR: ES not in countries.json", file=sys.stderr)
        return 2
    states = _load_json(project_root / "contributions/states/states.json")
    state_by_iso2: Dict[str, dict] = {
        s["iso2"].upper(): s
        for s in states
        if s.get("country_id") == es["id"] and s.get("iso2")
    }

    print(f"Country: Spain (id={es['id']}); states indexed by iso2: {len(state_by_iso2)}")

    by_postcode, bad = _read_postcodes(src)
    print(f"Skipped malformed rows: {bad:,}")
    print(f"Unique postcodes: {len(by_postcode):,}")

    records, matched_state = _build_records(by_postcode, int(es["id"]), state_by_iso2)
    print(f"Records: {len(records):,}")
    print(f"  with state: {matched_state:,} ({matched_state*100//max(1,len(records))}%)")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/ES.json"
    existing = _load_json(target) if target.exists() else []
    merged = _merge_records(existing, records)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(f"\n[OK] Wrote {target.relative_to(project_root)} ({len(merged):,} rows, {size_kb:.0f} KB)")
    return 0
206+
207+
208+
# Script entry point: run the importer and use its return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)