Skip to content

Commit bfa5c60

Browse files
dr5hn and claude committed
feat(postcodes/IT): bulk-import 4,678 CAP codes via Istat (#1039)
Adds the importer + first run for Italy. Uses the matteocontrini/comuni-json mirror of Istat's official Italian commune list with postcodes (CAP). 1. bin/scripts/sync/import_italy_postcodes.py — pipeline reading the community-maintained UTF-8 JSON. Each commune has a cap[] array (large cities like Rome have 80+ CAPs); pipeline expands one row per (cap, commune) and picks first commune alphabetically as canonical per code. State resolution is direct sigla -> state.iso2 match (RM=Rome, MI=Milan, etc.) with one alias bridge: Aosta uses sigla 'AO' but states.json has it as the 'Aosta Valley' autonomous region with iso2 '23'. 2. contributions/postcodes/IT.json — 4,678 unique CAPs covering all 7,904 comuni with 100% state_id resolution. Multi-CAP cities - Rome: 82 CAPs - Venice: 56 - Messina: 48 - Genoa: 47 - Milan: 42 - Each CAP gets one record pointing to the canonical commune name; this matches the Tier-4 "one row per code" contract from #1398. Validation (zero errors across 4,678 records) - All codes match countries.postal_code_regex (^(\\d{5})\$) - All FKs resolve, all state_codes agree with state.iso2 - No auto-managed fields present License & attribution - Upstream: Istat (CC-BY 3.0) - Mirror: github.com/matteocontrini/comuni-json - Each row: source: "istat" Refs: #1039 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b62bcc5 commit bfa5c60

2 files changed

Lines changed: 46937 additions & 0 deletions

File tree

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/usr/bin/env python3
2+
"""Italy CAP -> contributions/postcodes/IT.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community-maintained ``matteocontrini/comuni-json`` archive is the
7+
canonical redistribution of Istat's official Italian commune list with
8+
postal codes (CAP). It carries Istat data under the original CC-BY 3.0
9+
attribution (Istat licensing).
10+
11+
https://github.com/matteocontrini/comuni-json
12+
13+
The JSON has 7,904 commune records, each with:
14+
- nome (commune name, mixed-case)
15+
- sigla (2-letter province ISO 3166-2:IT code, e.g. RM, MI, NA)
16+
- cap (array of postal codes — large cities have many)
17+
- regione, provincia, codiceCatastale, popolazione
18+
19+
About 4,678 unique CAPs across all comuni.
20+
21+
What this script does
22+
---------------------
23+
1. Reads comuni.json (UTF-8)
24+
2. Expands each commune's cap[] array, one record per (cap, commune)
25+
3. Picks ONE canonical commune per unique CAP (first alphabetical)
26+
— large cities like Rome/Milan have ~50-80 CAPs each, but each CAP
27+
points to one neighbourhood/zone within a single commune
28+
4. Resolves state_id by mapping commune.sigla to state.iso2 directly
29+
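(Italy's 2-letter province codes match state.iso2 directly, e.g. RM = Rome,
MI = Milan; the one exception is Aosta, whose sigla AO is bridged to the
"Aosta Valley" entry with iso2 "23")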
30+
5. Writes contributions/postcodes/IT.json
31+
32+
License & attribution
33+
---------------------
34+
- Upstream source: Istat (CC-BY 3.0)
35+
- Mirror: github.com/matteocontrini/comuni-json
36+
- Each row: source: "istat"
37+
38+
Usage
39+
-----
40+
python3 -c "import urllib.request; urllib.request.urlretrieve(
41+
'https://raw.githubusercontent.com/matteocontrini/comuni-json/master/comuni.json',
42+
'/tmp/it_comuni.json')"
43+
44+
python3 bin/scripts/sync/import_italy_postcodes.py
45+
"""
46+
47+
from __future__ import annotations
48+
49+
import argparse
50+
import json
51+
import sys
52+
from pathlib import Path
53+
from typing import Dict, List, Optional
54+
55+
56+
def _load_json(path: Path) -> List[dict]:
    """Parse the UTF-8 JSON file at *path*, closing the handle afterwards.

    Every file this script reads is consumed as a list of JSON objects.
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _index_states_by_iso2(states: List[dict], country_id) -> Dict[str, dict]:
    """Map upper-cased ``iso2`` -> state row for the states of *country_id*.

    Rows without an ``iso2`` are ignored.
    """
    by_iso2: Dict[str, dict] = {
        s["iso2"].upper(): s
        for s in states
        if s.get("country_id") == country_id and s.get("iso2")
    }
    # Aosta is sigla "AO" in the source, but states.json treats it as the
    # "Aosta Valley" autonomous region with iso2 "23" instead of a standard
    # province. Bridge it explicitly.
    if "23" in by_iso2 and "AO" not in by_iso2:
        by_iso2["AO"] = by_iso2["23"]
    return by_iso2


def _group_communes_by_cap(comuni: List[dict]) -> Dict[str, List[dict]]:
    """Expand every commune's CAP list into ``cap -> [{"nome", "sigla"}, ...]``.

    Only strings of exactly five ASCII digits are kept. A missing/null ``cap``
    is treated as empty, a bare string as a one-element list, and non-string
    entries are skipped instead of crashing the import.
    """
    by_cap: Dict[str, List[dict]] = {}
    for commune in comuni:
        nome = (commune.get("nome") or "").strip()
        sigla = (commune.get("sigla") or "").strip().upper()
        caps = commune.get("cap") or []
        if isinstance(caps, str):
            # Iterating a bare string would yield single characters.
            caps = [caps]
        for cap in caps:
            if not isinstance(cap, str):
                continue
            cap = cap.strip()
            # isascii() guards against Unicode digits that str.isdigit()
            # accepts (e.g. superscripts) but a postal code never contains.
            if len(cap) != 5 or not (cap.isascii() and cap.isdigit()):
                continue
            by_cap.setdefault(cap, []).append({"nome": nome, "sigla": sigla})
    return by_cap


def _build_records(
    by_cap: Dict[str, List[dict]],
    country_id: int,
    state_by_iso2: Dict[str, dict],
) -> List[dict]:
    """Build one postcode record per CAP, in ascending CAP order.

    The canonical commune for a CAP is the first one alphabetically
    (case-insensitive); ties keep source order. ``state_id``/``state_code``
    are only set when the commune's sigla resolves to a state.
    """
    records: List[dict] = []
    for cap in sorted(by_cap):
        # min() returns the first minimal element, i.e. the same row a
        # stable sort would put at index 0.
        chosen = min(by_cap[cap], key=lambda row: row["nome"].upper())
        record = {
            "code": cap,
            "country_id": country_id,
            "country_code": "IT",
        }
        state = state_by_iso2.get(chosen["sigla"])
        if state is not None:
            record["state_id"] = int(state["id"])
            # Use the state's canonical iso2 from states.json, not the raw
            # source sigla — they differ for Aosta (AO -> 23) and any future
            # alias bridges.
            record["state_code"] = state.get("iso2") or chosen["sigla"]
        if chosen["nome"]:
            record["locality_name"] = chosen["nome"]
        record["type"] = "full"
        record["source"] = "istat"
        records.append(record)
    return records


def _merge_records(existing: List[dict], records: List[dict]) -> List[dict]:
    """Return *existing* plus every new record, sorted by code then locality.

    Records are deduplicated on ``(code, lower-cased locality_name)``;
    rows already present in *existing* always win.
    """
    def dedup_key(row: dict) -> tuple:
        return (row["code"], (row.get("locality_name") or "").lower())

    seen = {dedup_key(row) for row in existing}
    merged = list(existing)
    for row in records:
        key = dedup_key(row)
        if key not in seen:
            merged.append(row)
            seen.add(key)
    # `or ""` keeps the sort total when a row carries locality_name: null.
    merged.sort(key=lambda row: (row["code"], row.get("locality_name") or ""))
    return merged


def main(argv: Optional[List[str]] = None) -> int:
    """Import Italian CAP codes into ``contributions/postcodes/IT.json``.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv`` as before.

    Returns:
        0 on success (including ``--dry-run``), 2 when the input file is
        missing or Italy is not present in ``countries.json``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's sections and line breaks in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--input", default="/tmp/it_comuni.json")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--project-root",
        default=None,
        help="repository root containing contributions/ "
        "(default: three levels above this script)",
    )
    args = parser.parse_args(argv)

    src = Path(args.input)
    if not src.exists():
        print(f"ERROR: input not found: {src}", file=sys.stderr)
        return 2

    if args.project_root is not None:
        project_root = Path(args.project_root).resolve()
    else:
        project_root = Path(__file__).resolve().parents[3]

    countries = _load_json(project_root / "contributions/countries/countries.json")
    it = next((c for c in countries if c.get("iso2") == "IT"), None)
    if it is None:
        print("ERROR: IT not in countries.json", file=sys.stderr)
        return 2
    states = _load_json(project_root / "contributions/states/states.json")
    state_by_iso2 = _index_states_by_iso2(states, it["id"])

    print(f"Country: Italy (id={it['id']}); states indexed by iso2: {len(state_by_iso2)}")

    comuni = _load_json(src)

    # Expand to one row per (cap, commune); group by cap; pick first commune alphabetically.
    by_cap = _group_communes_by_cap(comuni)

    print(f"Comuni: {len(comuni):,}")
    print(f"Unique CAPs: {len(by_cap):,}")

    records = _build_records(by_cap, int(it["id"]), state_by_iso2)
    matched_state = sum(1 for r in records if "state_id" in r)

    print(f"Records: {len(records):,}")
    print(f" with state_id: {matched_state:,} ({matched_state*100//max(1,len(records))}%)")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/IT.json"
    existing = _load_json(target) if target.exists() else []
    merged = _merge_records(existing, records)

    # First run for a country: the postcodes directory may not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(f"\n[OK] Wrote {target.relative_to(project_root)} ({len(merged):,} rows, {size_kb:.0f} KB)")
    return 0
152+
153+
154+
if __name__ == "__main__":
155+
raise SystemExit(main())

0 commit comments

Comments
 (0)