Skip to content

Commit 7bac0d3

Browse files
dr5hn and claude committed
feat(postcodes/TN): import 4,868 La Poste Tunisienne codes (#1039)
Adds the full Tunisian 4-digit postal code dataset from the JenhaniChedli/TunisiaGeodataAPI mirror. Why --- Closes the TN gap on issue #1039. The previously-tracked hajer77/postCodeTunisia-api source was a stub (~9 records, 8/24 governorates). JenhaniChedli's mirror covers all 24 governorates with the full Gouvernorat / Délégation / Cité hierarchy. Coverage -------- - 4,868 codes / 100% state FK - All 24 CSC TN governorates covered State FK strategy ----------------- ASCII-fold + name match against states.json + 1 alias for 'Mannouba' -> CSC 'Manouba' (single-letter spelling drift). 'Beja'/'Gabes' (no diacritics in source) match CSC 'Béja'/'Gabès' via ASCII fold. License ------- Source: JenhaniChedli/TunisiaGeodataAPI (no formal LICENSE file). Upstream: La Poste Tunisienne publicly published codes. Tier 5 per #1039 license-tier policy. Each row: source: "la-poste-tunisienne-via-jenhani-chedli" Validation ---------- - python3 -m py_compile passes - 100% regex match (^\d{4}$) - 100% state_id valid + state.country_id == 224 + state_code agrees - No auto-managed fields (id, created_at, updated_at, flag) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 54de82e commit 7bac0d3

2 files changed

Lines changed: 48900 additions & 0 deletions

File tree

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
#!/usr/bin/env python3
2+
"""Tunisia -> contributions/postcodes/TN.json importer for issue #1039.
3+
4+
Source data
5+
-----------
6+
The community ``JenhaniChedli/TunisiaGeodataAPI`` repository ships
7+
postcodes.json — 4,868 La Poste Tunisienne 4-digit postcodes joined
8+
with the full Gouvernorat / Délégation / Cité (locality) hierarchy.
9+
10+
[{"Gov": "Ariana", "Deleg": "Sidi Thabet",
11+
"Cite": "Cite Dridi", "zip": "2032"}, ...]
12+
13+
Source URL: https://raw.githubusercontent.com/JenhaniChedli/TunisiaGeodataAPI/master/postcodes.json
14+
15+
What this script does
16+
---------------------
17+
1. Fetches the JSON via urllib (curl is blocked).
18+
2. Resolves state FK via ASCII-fold + name match against CSC's 24
19+
gouvernorat entries, with one alias for the source's
20+
'Mannouba' -> CSC 'Manouba' single-letter spelling drift.
21+
3. Emits one row per (zip, Cite, Deleg) tuple.
22+
4. Writes contributions/postcodes/TN.json idempotently.
23+
24+
Coverage upgrade
25+
----------------
26+
The previously-tracked ``hajer77/postCodeTunisia-api`` source ships
27+
only ~9 records covering 8 of 24 governorates. This source has
28+
**all 24 governorates** with 4,868 localities — confirmed via direct
29+
inspection. The research doc Tier B note for Tunisia
30+
(`Stub only (8/24)`) is now stale and superseded.
31+
32+
License & attribution
33+
---------------------
34+
- Source: JenhaniChedli/TunisiaGeodataAPI (no formal LICENSE file)
35+
- Upstream: La Poste Tunisienne publicly published codes
36+
- Each row: ``source: "la-poste-tunisienne-via-jenhani-chedli"``
37+
38+
Tier 5 per #1039 license-tier policy.
39+
40+
Usage
41+
-----
42+
python3 bin/scripts/sync/import_tunisia_postcodes.py
43+
"""
44+
45+
from __future__ import annotations
46+
47+
import argparse
48+
import json
49+
import re
50+
import sys
51+
import unicodedata
52+
import urllib.request
53+
from pathlib import Path
54+
from typing import Dict, List
55+
56+
57+
# Canonical upstream dataset: rows of {Gov, Deleg, Cite, zip} (see module
# docstring for the record shape).
SOURCE_URL = (
    "https://raw.githubusercontent.com/JenhaniChedli/TunisiaGeodataAPI/"
    "master/postcodes.json"
)

# Source -> CSC name. Only entries where direct ASCII-fold match fails.
GOV_ALIASES: Dict[str, str] = {
    "Mannouba": "Manouba",  # source spells with double 'n'
}
66+
67+
68+
def _ascii_fold(value: str) -> str:
69+
return (
70+
"".join(
71+
c
72+
for c in unicodedata.normalize("NFKD", value)
73+
if not unicodedata.combining(c)
74+
)
75+
.strip()
76+
.lower()
77+
)
78+
79+
80+
def fetch_json(url: str) -> List[dict]:
    """Download *url* and parse the response body as JSON.

    Sends a custom User-Agent identifying this importer to the host.
    """
    headers = {"User-Agent": "csc-database-postcode-importer"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    return json.loads(payload)
86+
87+
88+
def main() -> int:
    """Build contributions/postcodes/TN.json from the upstream dataset.

    Fetches (or reads via ``--input``) the source rows, validates each
    postcode against the country regex, resolves the governorate to a CSC
    state FK, de-duplicates, and writes/merges the target JSON file.

    Returns:
        Process exit code: 0 on success, 2 when the TN country row is
        missing from countries.json.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", default=None, help="local JSON (skip fetch)")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Prefer a local file when given (offline/re-run use); otherwise fetch.
    rows = (
        json.loads(Path(args.input).read_text(encoding="utf-8"))
        if args.input
        else fetch_json(SOURCE_URL)
    )
    print(f"Source rows: {len(rows):,}")

    # Script lives at bin/scripts/sync/<name>.py, so parents[3] is the repo root.
    project_root = Path(__file__).resolve().parents[3]
    countries = json.loads(
        (project_root / "contributions/countries/countries.json").read_text(
            encoding="utf-8"
        )
    )
    tn_country = next((c for c in countries if c.get("iso2") == "TN"), None)
    if tn_country is None:
        print("ERROR: TN not in countries.json", file=sys.stderr)
        return 2
    # Fall back to match-anything when the country has no regex configured.
    regex = re.compile(tn_country.get("postal_code_regex") or ".*")

    states = json.loads(
        (project_root / "contributions/states/states.json").read_text(
            encoding="utf-8"
        )
    )
    tn_states = [s for s in states if s.get("country_id") == tn_country["id"]]
    # Accent-insensitive lookup: fold each CSC state name once up front.
    state_by_fold: Dict[str, dict] = {
        _ascii_fold(s["name"]): s for s in tn_states if s.get("name")
    }
    print(
        f"Country: Tunisia (id={tn_country['id']}); states indexed: {len(tn_states)}"
    )

    seen: set = set()
    records: List[dict] = []
    skipped_no_code = 0
    skipped_bad_regex = 0
    skipped_no_state = 0
    matched_state = 0
    unknown_govs: Dict[str, int] = {}

    for row in rows:
        raw_code = (row.get("zip") or "").strip()
        if not raw_code:
            skipped_no_code += 1
            continue
        # Restore leading zeros a numeric upstream representation may drop.
        code = raw_code.zfill(4) if raw_code.isdigit() else raw_code
        if not regex.match(code):
            skipped_bad_regex += 1
            continue

        gov_raw = (row.get("Gov") or "").strip()
        gov_alias = GOV_ALIASES.get(gov_raw, gov_raw)
        state = state_by_fold.get(_ascii_fold(gov_alias))
        if state is None:
            unknown_govs[gov_raw] = unknown_govs.get(gov_raw, 0) + 1
            skipped_no_state += 1
            # BUGFIX: previously this fell through and still emitted the
            # row (without state fields) while counting it as skipped.
            # Actually skip it so the "Skipped (no state FK)" count is
            # truthful and every emitted record carries a valid state FK,
            # matching the 100%-FK guarantee in the module docstring.
            continue

        cite = (row.get("Cite") or "").strip()
        deleg = (row.get("Deleg") or "").strip()
        # "Cite, Delegation" unless one is missing or they duplicate each other.
        if cite and deleg and cite.lower() != deleg.lower():
            locality = f"{cite}, {deleg}"
        else:
            locality = cite or deleg

        # De-duplicate on (code, locality), case-insensitively on locality.
        key = (code, locality.lower())
        if key in seen:
            continue
        seen.add(key)

        record: Dict[str, object] = {
            "code": code,
            "country_id": int(tn_country["id"]),
            "country_code": "TN",
        }
        # state is guaranteed non-None here (unmatched rows were skipped).
        record["state_id"] = int(state["id"])
        record["state_code"] = state.get("iso2")
        matched_state += 1
        if locality:
            record["locality_name"] = locality
        record["type"] = "full"
        record["source"] = "la-poste-tunisienne-via-jenhani-chedli"
        records.append(record)

    print(f"Skipped (no code): {skipped_no_code:,}")
    print(f"Skipped (regex fail): {skipped_bad_regex:,}")
    print(f"Skipped (no state FK): {skipped_no_state:,}")
    print(f"Records emitted: {len(records):,}")
    # Integer percentage; max(1, ...) guards the empty-records division.
    pct = matched_state * 100 // max(1, len(records))
    print(f" with state: {matched_state:,} ({pct}%)")
    if unknown_govs:
        print("Unknown governorates (not in CSC + GOV_ALIASES):")
        for g, n in sorted(unknown_govs.items(), key=lambda x: -x[1]):
            print(f" {g!r}: {n}")

    if args.dry_run:
        return 0

    target = project_root / "contributions/postcodes/TN.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        # Idempotent merge: keep existing rows, append only unseen keys.
        existing = json.loads(target.read_text(encoding="utf-8"))
        existing_seen = {
            (r["code"], (r.get("locality_name") or "").lower()) for r in existing
        }
        merged = list(existing)
        for r in records:
            key = (r["code"], (r.get("locality_name") or "").lower())
            if key not in existing_seen:
                merged.append(r)
                existing_seen.add(key)
        merged.sort(key=lambda r: (r["code"], r.get("locality_name", "")))
    else:
        merged = sorted(records, key=lambda r: (r["code"], r.get("locality_name", "")))

    with target.open("w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
        f.write("\n")  # trailing newline for POSIX-friendly diffs
    size_kb = target.stat().st_size / 1024
    print(
        f"\n[OK] Wrote {target.relative_to(project_root)} "
        f"({len(merged):,} rows, {size_kb:.0f} KB)"
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)