Skip to content

Commit c0c0f98

Browse files
dr5hn and claude authored
feat(postcodes/AU): bulk-import 3,175 codes via Australia Post (#1039) (#1434)
Adds the importer + first run for Australia. Australia Post publishes postcode data under CC-BY 4.0; this PR uses the community-maintained matthewproctor/australianpostcodes mirror as the source-of-truth feed (redistributable under the same CC-BY licence). 1. bin/scripts/sync/import_australia_post_postcodes.py — pipeline that groups CSV rows by postcode and picks one canonical record per code (first locality alphabetically). State resolution is direct iso2 match: CSV uses ACT/NSW/NT/QLD/SA/TAS/VIC/WA, identical to states.json. 2. contributions/postcodes/AU.json — 3,175 records covering all 8 Australian states/territories with 100% state_id and 100% coordinate resolution. Validation (zero errors) - All codes match countries.postal_code_regex (^\\d{4}\$) - All FKs resolve, all state_codes agree with state.iso2 - No auto-managed fields present License & attribution - Upstream: Australia Post (CC-BY 4.0) - Mirror: github.com/matthewproctor/australianpostcodes - Each row: source: "australia-post" Refs: #1039 Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0d778c2 commit c0c0f98

2 files changed

Lines changed: 38260 additions & 0 deletions

File tree

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
#!/usr/bin/env python3
2+
"""Australia Post -> contributions/postcodes/AU.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community-maintained Matthew Proctor archive is the standard
7+
redistributable mirror of Australia Post's open-data postcode list,
8+
published under CC-BY 4.0 attribution to Australia Post:
9+
10+
https://github.com/matthewproctor/australianpostcodes
11+
12+
The CSV has ~18,500 rows representing localities; ~3,175 unique postcodes
13+
spread across the 8 Australian states/territories.
14+
15+
What this script does
16+
---------------------
17+
1. Reads the CSV (UTF-8, header row)
18+
2. Picks ONE canonical record per unique postcode (first alphabetical
19+
locality name — gives a stable primary that future curated PRs can
20+
override per-row)
21+
3. Resolves country_id (AU) and state_id by direct iso2 match
22+
(CSV uses ACT/NSW/NT/QLD/SA/TAS/VIC/WA — same codes as states.json)
23+
4. Carries forward latitude/longitude when present
24+
5. Writes contributions/postcodes/AU.json
25+
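6. Idempotent merge with rows already in AU.json: existing rows are kept
   untouched, and an imported row is appended only when its
   (code, lower-cased locality name) pair is not already present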
26+
27+
License & attribution
28+
---------------------
29+
- Upstream source: Australia Post (CC-BY 4.0)
30+
- Mirror: github.com/matthewproctor/australianpostcodes
31+
- Each generated row records source: "australia-post"
32+
33+
Usage
34+
-----
35+
curl -L -o /tmp/au_postcodes.csv \\
36+
https://raw.githubusercontent.com/matthewproctor/australianpostcodes/master/australian_postcodes.csv
37+
38+
python3 bin/scripts/sync/import_australia_post_postcodes.py
39+
"""
40+
41+
from __future__ import annotations
42+
43+
import argparse
44+
import csv
45+
import json
46+
import sys
47+
from pathlib import Path
48+
from typing import Dict, List, Optional
49+
50+
51+
def parse_coord(v: str) -> Optional[str]:
    """Normalise a CSV coordinate cell to a compact decimal string.

    Args:
        v: Raw cell text; may be empty or whitespace-only.

    Returns:
        The value rounded to 8 decimal places with trailing zeros (and a
        dangling decimal point) removed, e.g. "151.20930000" -> "151.2093".
        None when the cell is blank, is not a number, is NaN, or has a
        magnitude above 180.
    """
    if not v or not v.strip():
        return None
    try:
        f = float(v)
    except ValueError:
        return None
    # Written as "not <=" so that NaN, for which every comparison is false,
    # is rejected together with out-of-range and infinite values.
    if not abs(f) <= 180:
        return None
    text = f"{f:.8f}".rstrip("0").rstrip(".")
    # Negative zero (or a tiny negative that rounds to it) would otherwise
    # be emitted as "-0".
    return "0" if text in ("", "-0") else text
61+
62+
63+
def _load_json(path: Path):
    """Parse the UTF-8 JSON file at *path*, closing the handle afterwards."""
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)


def _merge_key(record: dict) -> tuple:
    """Identity of a row for merging: (code, lower-cased locality name)."""
    return (record["code"], (record.get("locality_name") or "").lower())


def _sort_key(record: dict) -> tuple:
    """Output order: by code, then locality; a missing or null locality sorts as ""."""
    return (record["code"], record.get("locality_name") or "")


def main() -> int:
    """Build contributions/postcodes/AU.json from the Australia Post CSV mirror.

    Command-line options:
        --input    Path of the downloaded CSV (default: /tmp/au_postcodes.csv).
        --dry-run  Print the summary statistics and skip writing the file.

    Returns:
        0 on success (including a dry run); 2 when the input CSV is missing
        or countries.json has no entry with iso2 "AU".
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default="/tmp/au_postcodes.csv")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    csv_path = Path(args.input)
    if not csv_path.exists():
        print(f"ERROR: input not found: {csv_path}", file=sys.stderr)
        return 2

    # The script lives in bin/scripts/sync/, so the repository root is
    # three directories above this file.
    project_root = Path(__file__).resolve().parents[3]
    countries = _load_json(project_root / "contributions/countries/countries.json")
    au = next((c for c in countries if c.get("iso2") == "AU"), None)
    if au is None:
        print("ERROR: AU not in countries.json", file=sys.stderr)
        return 2
    states = _load_json(project_root / "contributions/states/states.json")
    au_states = [s for s in states if s.get("country_id") == au["id"]]
    state_by_iso2: Dict[str, dict] = {
        (s.get("iso2") or "").upper(): s for s in au_states if s.get("iso2")
    }

    print(f"Country: Australia (id={au['id']}); states: {len(au_states)}")

    # First pass: group the locality rows by their 4-digit postcode.
    by_postcode: Dict[str, List[dict]] = {}
    # newline="" is what the csv module requires so that quoted fields
    # containing line breaks are parsed correctly.
    with csv_path.open(encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            code = (row.get("postcode") or "").strip()
            if not code or not code.isdigit() or len(code) != 4:
                continue
            by_postcode.setdefault(code, []).append(row)

    # Second pass: one canonical record per postcode.
    records: List[dict] = []
    matched_state = 0
    matched_coord = 0
    for code in sorted(by_postcode):
        # The alphabetically first locality wins; min() returns the first of
        # any equal keys, the same row a stable sort would put at index 0.
        chosen = min(by_postcode[code], key=lambda r: (r.get("locality") or "").upper())
        record = {
            "code": code,
            "country_id": int(au["id"]),
            "country_code": "AU",
        }
        st_iso = (chosen.get("state") or "").strip().upper()
        state = state_by_iso2.get(st_iso)
        if state is not None:
            record["state_id"] = int(state["id"])
            record["state_code"] = st_iso
            matched_state += 1
        locality = (chosen.get("locality") or "").strip()
        if locality:
            record["locality_name"] = locality
        record["type"] = "full"
        lat = parse_coord(chosen.get("lat") or "")
        lng = parse_coord(chosen.get("long") or "")
        # parse_coord enforces only the +/-180 bound shared with longitude;
        # a latitude must additionally lie within +/-90.
        if lat is not None and abs(float(lat)) > 90:
            lat = None
        # Coordinates are stored only as a complete pair.
        if lat is not None and lng is not None:
            record["latitude"] = lat
            record["longitude"] = lng
            matched_coord += 1
        record["source"] = "australia-post"
        records.append(record)

    print(f"Records: {len(records):,}")
    print(f"  with state:  {matched_state:,} ({matched_state*100//max(1,len(records))}%)")
    print(f"  with coords: {matched_coord:,} ({matched_coord*100//max(1,len(records))}%)")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/AU.json"
    if target.exists():
        # Rows already in the file are kept untouched; an imported row is
        # appended only when its (code, locality) pair is not present yet,
        # which makes repeated runs idempotent.
        existing = _load_json(target)
        seen = {_merge_key(r) for r in existing}
        merged = list(existing)
        for r in records:
            key = _merge_key(r)
            if key not in seen:
                merged.append(r)
                seen.add(key)
        merged.sort(key=_sort_key)
    else:
        merged = sorted(records, key=_sort_key)

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")
    size_kb = target.stat().st_size / 1024
    print(f"\n[OK] Wrote {target.relative_to(project_root)} ({len(merged):,} rows, {size_kb:.0f} KB)")
    return 0
155+
156+
157+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)