Skip to content

Commit ae612c1

Browse files
dshkol and claude committed
refactor: Extract geo+vectors fetch and merge into dedicated functions
Extract the complex hybrid geo+vectors logic (~70 lines) from get_census() into two dedicated helper functions:

1. _fetch_census_with_geometry_and_vectors():
   - Handles the CensusMapper API quirk where geo.geojson doesn't return vector data properly
   - Makes separate calls to geo.geojson and data.csv endpoints
   - Orchestrates the merge of geometry and vector data

2. _merge_geo_and_csv_results():
   - Merges GeoDataFrame with CSV DataFrame on geographic identifier
   - Detects common merge keys (GeoUID, id, rgid) automatically
   - Falls back to index-based merge if no common key found
   - Handles duplicate key cleanup after merge

This improves:
- Readability: get_census() main flow is now much simpler
- Testability: merge logic can be unit tested in isolation
- Single responsibility: each function has one clear purpose
- Documentation: detailed docstrings explain the API quirk

Added 2 new unit tests:
- test_merge_on_geoid_key: verifies key-based merge
- test_merge_fallback_by_index: verifies index fallback

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 13ef466 commit ae612c1

2 files changed

Lines changed: 183 additions & 73 deletions

File tree

pycancensus/core.py

Lines changed: 121 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -171,80 +171,10 @@ def get_census(
171171
# Handle geo_format='geopandas' with vectors using hybrid approach
172172
if geo_format == "geopandas" and vectors:
173173
# The geo.geojson endpoint doesn't properly return vector data
174-
# So we need to fetch geometry and data separately, then merge
175-
176-
# 1. Fetch geometry data
177-
geo_request_data = request_data.copy()
178-
if "vectors" in geo_request_data:
179-
del geo_request_data["vectors"] # Remove vectors for geo request
180-
if resolution == "high":
181-
geo_request_data["resolution"] = "high"
182-
183-
geo_multipart_data = {}
184-
for key, value in geo_request_data.items():
185-
geo_multipart_data[key] = (None, value)
186-
187-
geo_response = get_session().post(
188-
f"{base_url}geo.geojson", files=geo_multipart_data
174+
# Use dedicated function to fetch and merge geo + vector data
175+
result = _fetch_census_with_geometry_and_vectors(
176+
base_url, request_data, resolution, vectors, labels
189177
)
190-
geo_data = geo_response.json()
191-
geo_result = _process_geojson_response(geo_data, None, labels) # No vectors
192-
193-
# 2. Fetch vector data using CSV endpoint
194-
csv_multipart_data = {}
195-
for key, value in request_data.items():
196-
csv_multipart_data[key] = (None, value)
197-
198-
csv_response = get_session().post(
199-
f"{base_url}data.csv", files=csv_multipart_data
200-
)
201-
csv_result = _process_csv_response(csv_response.text, vectors, labels)
202-
203-
# 3. Merge the results
204-
# Use a common identifier to merge - typically 'GeoUID' from CSV and 'id' from GeoJSON
205-
merge_key_csv = None
206-
merge_key_geo = None
207-
208-
# Find the appropriate merge keys
209-
for potential_key in ["GeoUID", "id", "rgid"]:
210-
if potential_key in csv_result.columns:
211-
merge_key_csv = potential_key
212-
break
213-
214-
for potential_key in ["id", "rgid", "GeoUID"]:
215-
if potential_key in geo_result.columns:
216-
merge_key_geo = potential_key
217-
break
218-
219-
if merge_key_csv and merge_key_geo:
220-
# Merge on the identifier
221-
# Keep all columns from geo_result, add vector columns from csv_result
222-
vector_columns = [
223-
col for col in csv_result.columns if col.startswith("v_")
224-
]
225-
merge_columns = [merge_key_csv] + vector_columns
226-
227-
result = geo_result.merge(
228-
csv_result[merge_columns],
229-
left_on=merge_key_geo,
230-
right_on=merge_key_csv,
231-
how="left",
232-
)
233-
234-
# Drop the duplicate merge key if it was added
235-
if merge_key_csv != merge_key_geo and merge_key_csv in result.columns:
236-
result = result.drop(columns=[merge_key_csv])
237-
238-
else:
239-
# Fallback: assume same order and merge by index
240-
vector_columns = [
241-
col for col in csv_result.columns if col.startswith("v_")
242-
]
243-
for col in vector_columns:
244-
if len(csv_result) == len(geo_result):
245-
geo_result[col] = csv_result[col].values
246-
result = geo_result
247-
248178
else:
249179
# Standard single-endpoint approach
250180
if geo_format == "geopandas":
@@ -308,6 +238,124 @@ def _generate_cache_key(dataset, regions, vectors, level, geo_format):
308238
return hashlib.md5(params_str.encode()).hexdigest()
309239

310240

241+
def _fetch_census_with_geometry_and_vectors(
    base_url: str,
    request_data: dict,
    resolution: str,
    vectors: List[str],
    labels: str,
) -> gpd.GeoDataFrame:
    """
    Retrieve geometry and vector values in two requests and combine them.

    The geo.geojson endpoint doesn't properly return vector data, so the
    geometry is requested from geo.geojson (with any "vectors" entry removed
    from the parameters) and the vector values are requested from data.csv
    (with the parameters unchanged). The two processed results are then
    handed to ``_merge_geo_and_csv_results``.

    Parameters
    ----------
    base_url : str
        The API base URL; the endpoint names are appended to it directly.
    request_data : dict
        The base request parameters. This dict is not modified.
    resolution : str
        Resolution of geographic data. Only the value 'high' has an effect
        here: it adds ``resolution="high"`` to the geometry request.
    vectors : list of str
        Vector codes, forwarded to the CSV response processor.
    labels : str
        Label format, forwarded to both response processors.

    Returns
    -------
    gpd.GeoDataFrame
        The merged geometry + vector result.
    """

    def _to_multipart(params):
        # Each value is sent as a multipart form field with no filename.
        return {name: (None, val) for name, val in params.items()}

    # --- Geometry request: same parameters minus "vectors" -----------------
    geometry_params = {
        name: val for name, val in request_data.items() if name != "vectors"
    }
    if resolution == "high":
        geometry_params["resolution"] = "high"

    # NOTE(review): neither response's HTTP status is inspected before the
    # payload is parsed - confirm whether error handling is expected here.
    geojson_payload = (
        get_session()
        .post(f"{base_url}geo.geojson", files=_to_multipart(geometry_params))
        .json()
    )
    # Vectors are passed as None because this request carries no vector data.
    geometry_frame = _process_geojson_response(geojson_payload, None, labels)

    # --- Vector request: full parameters against the CSV endpoint ----------
    csv_text = (
        get_session()
        .post(f"{base_url}data.csv", files=_to_multipart(request_data))
        .text
    )
    vector_frame = _process_csv_response(csv_text, vectors, labels)

    # --- Combine on the geographic identifier ------------------------------
    return _merge_geo_and_csv_results(geometry_frame, vector_frame)
294+
295+
296+
def _merge_geo_and_csv_results(
297+
geo_result: gpd.GeoDataFrame,
298+
csv_result: pd.DataFrame,
299+
) -> gpd.GeoDataFrame:
300+
"""
301+
Merge GeoDataFrame with CSV DataFrame on geographic identifier.
302+
303+
Finds a common identifier column (GeoUID, id, or rgid) and merges
304+
the vector columns from CSV onto the GeoDataFrame.
305+
306+
Parameters
307+
----------
308+
geo_result : gpd.GeoDataFrame
309+
GeoDataFrame with geometry data.
310+
csv_result : pd.DataFrame
311+
DataFrame with vector data.
312+
313+
Returns
314+
-------
315+
gpd.GeoDataFrame
316+
Merged GeoDataFrame with geometry and vector columns.
317+
"""
318+
# Find merge keys in each DataFrame
319+
merge_key_csv = None
320+
merge_key_geo = None
321+
322+
for potential_key in ["GeoUID", "id", "rgid"]:
323+
if potential_key in csv_result.columns:
324+
merge_key_csv = potential_key
325+
break
326+
327+
for potential_key in ["id", "rgid", "GeoUID"]:
328+
if potential_key in geo_result.columns:
329+
merge_key_geo = potential_key
330+
break
331+
332+
if merge_key_csv and merge_key_geo:
333+
# Merge on identifier - keep geo columns, add vector columns from CSV
334+
vector_columns = [col for col in csv_result.columns if col.startswith("v_")]
335+
merge_columns = [merge_key_csv] + vector_columns
336+
337+
result = geo_result.merge(
338+
csv_result[merge_columns],
339+
left_on=merge_key_geo,
340+
right_on=merge_key_csv,
341+
how="left",
342+
)
343+
344+
# Drop duplicate merge key if names differ
345+
if merge_key_csv != merge_key_geo and merge_key_csv in result.columns:
346+
result = result.drop(columns=[merge_key_csv])
347+
348+
else:
349+
# Fallback: assume same row order and merge by index
350+
vector_columns = [col for col in csv_result.columns if col.startswith("v_")]
351+
result = geo_result.copy()
352+
for col in vector_columns:
353+
if len(csv_result) == len(geo_result):
354+
result[col] = csv_result[col].values
355+
356+
return result
357+
358+
311359
def _extract_vector_metadata(df, vectors, labels):
312360
"""Extract vector metadata from column names and store as attribute."""
313361
if not vectors:

tests/test_basic.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,68 @@ def test_normalize_produces_equivalent_results(self):
359359
assert pd.isna(geo_result["pop"].iloc[1])
360360

361361

362+
class TestGeoVectorsMerge:
    """Unit tests for merging geometry results with CSV vector results."""

    def test_merge_on_geoid_key(self):
        """Vector columns are joined when geo has 'id' and CSV has 'GeoUID'."""
        from pycancensus.core import _merge_geo_and_csv_results

        # Geometry side: identifier lives in "id".
        geometry_frame = gpd.GeoDataFrame(
            {
                "id": ["001", "002", "003"],
                "name": ["Region A", "Region B", "Region C"],
                "geometry": [None, None, None],
            }
        )
        # CSV side: identifier lives in "GeoUID", plus two vector columns.
        vector_frame = pd.DataFrame(
            {
                "GeoUID": ["001", "002", "003"],
                "v_CA21_1": [100, 200, 300],
                "v_CA21_2": [50, 60, 70],
            }
        )

        merged = _merge_geo_and_csv_results(geometry_frame, vector_frame)

        # Both vector columns arrive, with values lined up by identifier.
        assert "v_CA21_1" in merged.columns
        assert "v_CA21_2" in merged.columns
        assert list(merged["v_CA21_1"]) == [100, 200, 300]

        # Columns that came from the geometry side are still there.
        assert "name" in merged.columns
        assert "geometry" in merged.columns

    def test_merge_fallback_by_index(self):
        """Vector columns are copied by position when no key column matches."""
        from pycancensus.core import _merge_geo_and_csv_results

        # Neither frame has one of the recognised identifier columns.
        geometry_frame = gpd.GeoDataFrame(
            {
                "custom_id": ["A", "B"],
                "geometry": [None, None],
            }
        )
        vector_frame = pd.DataFrame(
            {
                "other_id": ["X", "Y"],
                "v_CA21_1": [100, 200],
            }
        )

        merged = _merge_geo_and_csv_results(geometry_frame, vector_frame)

        # Row counts match, so the vector column is attached in row order.
        assert "v_CA21_1" in merged.columns
        assert list(merged["v_CA21_1"]) == [100, 200]
422+
423+
362424
class TestCache:
363425
"""Test caching functionality."""
364426

0 commit comments

Comments
 (0)