Skip to content

Commit 1a67981

Browse files
dshkol and claude committed
refactor: Extract shared DataFrame normalization logic
Consolidate duplicate code between _process_csv_response and _process_geojson_response into a shared _normalize_census_dataframe() function.

The new function handles:
- Census NA value conversion ('x', 'X', 'F', '...', '-', '')
- Numeric column dtype conversion (Population, Households, etc.)
- Categorical column dtype conversion (Type, Region Name)
- Vector metadata extraction
- Both CSV endpoint names (Population) and GeoJSON short names (pop)

This reduces code duplication by ~60 lines and ensures consistent data handling across both CSV and GeoJSON endpoints.

Added 3 new unit tests:
- test_normalize_census_dataframe_census_na_values
- test_normalize_census_dataframe_geojson_short_names
- test_normalize_produces_equivalent_results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent f18c0c8 commit 1a67981

2 files changed

Lines changed: 164 additions & 98 deletions

File tree

pycancensus/core.py

Lines changed: 73 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -356,58 +356,88 @@ def _extract_vector_metadata(df, vectors, labels):
356356
return df
357357

358358

359-
def _process_csv_response(csv_text, vectors, labels):
360-
"""Process CSV API response into a pandas DataFrame."""
361-
# Read all columns as strings initially (like R package)
362-
df = pd.read_csv(io.StringIO(csv_text), dtype=str, encoding="utf-8")
359+
def _normalize_census_dataframe(
360+
df: Union[pd.DataFrame, gpd.GeoDataFrame],
361+
vectors: Optional[List[str]],
362+
labels: str,
363+
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
364+
"""
365+
Normalize a census DataFrame or GeoDataFrame.
363366
364-
# Fix column names by removing trailing/leading spaces (critical fix for API compatibility)
365-
df.columns = df.columns.str.strip()
367+
Applies consistent data type conversions and metadata extraction:
368+
- Converts census NA values ('x', 'X', 'F', '...', '-', '') to pd.NA
369+
- Converts numeric columns (population, households, etc.) to numeric dtype
370+
- Converts categorical columns (Type, Region Name) to category dtype
371+
- Extracts and stores vector metadata
366372
367-
# Define census-specific NA values (matching R package)
368-
census_na_values = ["x", "X", "F", "...", "-", ""]
373+
Parameters
374+
----------
375+
df : pd.DataFrame or gpd.GeoDataFrame
376+
The data to normalize.
377+
vectors : list of str, optional
378+
Vector codes that were requested (for metadata extraction).
379+
labels : str
380+
Label format - 'detailed' or 'short'.
369381
370-
# Convert specific columns to numeric (matching R package exactly)
371-
numeric_columns = []
382+
Returns
383+
-------
384+
pd.DataFrame or gpd.GeoDataFrame
385+
The normalized data with proper dtypes.
386+
"""
387+
# Census-specific NA values (matching R package)
388+
census_na_values = ["x", "X", "F", "...", "-", ""]
372389

373390
# Standard census columns that should be numeric
374-
# Note: API may return column names with trailing spaces, so we need flexible matching
375-
standard_numeric = ["Population", "Households", "Dwellings", "Area (sq km)"]
391+
# Include both long names (CSV endpoint) and short names (GeoJSON endpoint)
392+
standard_numeric = [
393+
"Population",
394+
"Households",
395+
"Dwellings",
396+
"Area (sq km)",
397+
"pop", # GeoJSON short name
398+
"dw", # GeoJSON short name
399+
"hh", # GeoJSON short name
400+
"a", # GeoJSON short name
401+
]
376402

377-
# Create a mapping of actual column names to expected names for flexible matching
378-
column_mapping = {}
403+
# Find numeric columns to convert
404+
numeric_columns = []
379405
for expected_col in standard_numeric:
380-
# Check for exact match first
406+
# Check for exact match
381407
if expected_col in df.columns:
382408
numeric_columns.append(expected_col)
383409
continue
384-
385-
# Check for variations with trailing/leading spaces
410+
# Check for variations with trailing/leading spaces (API quirk)
386411
for actual_col in df.columns:
387412
if actual_col.strip() == expected_col:
388413
numeric_columns.append(actual_col)
389-
column_mapping[actual_col] = expected_col
390414
break
391415

392-
# Vector columns (v_* pattern) - handle both short and descriptive names
416+
# Vector columns (v_* pattern) are always numeric
393417
for col in df.columns:
394-
if col.startswith("v_CA") or col.startswith("v_"):
418+
if col.startswith("v_"):
395419
numeric_columns.append(col)
396420

397421
# Convert to numeric with census NA handling
398422
for col in numeric_columns:
399-
# Replace census NA values with NaN, then convert to numeric
400-
df[col] = df[col].replace(census_na_values, pd.NA)
401-
df[col] = pd.to_numeric(df[col], errors="coerce")
423+
if col in df.columns:
424+
df[col] = df[col].replace(census_na_values, pd.NA)
425+
df[col] = pd.to_numeric(df[col], errors="coerce")
426+
427+
# Standard categorical columns
428+
# Include both long names (CSV endpoint) and short names (GeoJSON endpoint)
429+
categorical_columns = [
430+
"Type",
431+
"Region Name",
432+
"name", # GeoJSON short name
433+
"t", # GeoJSON short name
434+
]
402435

403-
# Convert categorical columns to pandas categorical (matching R factors)
404-
categorical_columns = ["Type", "Region Name"]
405436
for expected_col in categorical_columns:
406-
# Check for exact match first
437+
# Check for exact match
407438
if expected_col in df.columns:
408439
df[expected_col] = df[expected_col].astype("category")
409440
continue
410-
411441
# Check for variations with trailing/leading spaces
412442
for actual_col in df.columns:
413443
if actual_col.strip() == expected_col:
@@ -420,17 +450,27 @@ def _process_csv_response(csv_text, vectors, labels):
420450
return df
421451

422452

453+
def _process_csv_response(csv_text, vectors, labels):
454+
"""Process CSV API response into a pandas DataFrame."""
455+
# Read all columns as strings initially (like R package)
456+
df = pd.read_csv(io.StringIO(csv_text), dtype=str, encoding="utf-8")
457+
458+
# Fix column names by removing trailing/leading spaces (critical fix for API compatibility)
459+
df.columns = df.columns.str.strip()
460+
461+
# Apply shared normalization
462+
return _normalize_census_dataframe(df, vectors, labels)
463+
464+
423465
def _process_json_response(data, vectors, labels):
424466
"""Process JSON API response into a pandas DataFrame."""
425467
if "data" not in data:
426468
raise ValueError("Invalid API response: missing 'data' field")
427469

428470
df = pd.DataFrame(data["data"])
429471

430-
# Extract vector metadata and handle labels
431-
df = _extract_vector_metadata(df, vectors, labels)
432-
433-
return df
472+
# Apply shared normalization
473+
return _normalize_census_dataframe(df, vectors, labels)
434474

435475

436476
def _process_geojson_response(data, vectors, labels):
@@ -440,70 +480,5 @@ def _process_geojson_response(data, vectors, labels):
440480

441481
gdf = gpd.GeoDataFrame.from_features(data["features"], crs="EPSG:4326")
442482

443-
# Apply the same numeric conversion logic as CSV processing
444-
# This was missing and causing all columns to remain as strings
445-
446-
# Define census-specific NA values (matching R package)
447-
census_na_values = ["x", "X", "F", "...", "-", ""]
448-
449-
# Convert specific columns to numeric (matching R package exactly)
450-
numeric_columns = []
451-
452-
# Standard census columns that should be numeric
453-
# Note: API may return column names with trailing spaces, so we need flexible matching
454-
standard_numeric = [
455-
"Population",
456-
"Households",
457-
"Dwellings",
458-
"Area (sq km)",
459-
"pop",
460-
"dw",
461-
"hh",
462-
"a",
463-
]
464-
465-
# Create a mapping of actual column names to expected names for flexible matching
466-
column_mapping = {}
467-
for expected_col in standard_numeric:
468-
# Check for exact match first
469-
if expected_col in gdf.columns:
470-
numeric_columns.append(expected_col)
471-
continue
472-
473-
# Check for variations with trailing/leading spaces
474-
for actual_col in gdf.columns:
475-
if actual_col.strip() == expected_col:
476-
numeric_columns.append(actual_col)
477-
column_mapping[actual_col] = expected_col
478-
break
479-
480-
# Vector columns (v_* pattern) - handle both short and descriptive names
481-
for col in gdf.columns:
482-
if col.startswith("v_CA") or col.startswith("v_"):
483-
numeric_columns.append(col)
484-
485-
# Convert to numeric with census NA handling
486-
for col in numeric_columns:
487-
if col in gdf.columns: # Additional safety check
488-
# Replace census NA values with NaN, then convert to numeric
489-
gdf[col] = gdf[col].replace(census_na_values, pd.NA)
490-
gdf[col] = pd.to_numeric(gdf[col], errors="coerce")
491-
492-
# Convert categorical columns to pandas categorical (matching R factors)
493-
categorical_columns = ["Type", "Region Name", "name", "t"]
494-
for expected_col in categorical_columns:
495-
# Check for exact match first
496-
if expected_col in gdf.columns:
497-
gdf[expected_col] = gdf[expected_col].astype("category")
498-
continue
499-
500-
# Check for variations with trailing/leading spaces
501-
for actual_col in gdf.columns:
502-
if actual_col.strip() == expected_col:
503-
gdf[actual_col] = gdf[actual_col].astype("category")
504-
break
505-
506-
# Extract vector metadata and handle labels
507-
gdf = _extract_vector_metadata(gdf, vectors, labels)
508-
509-
return gdf
483+
# Apply shared normalization
484+
return _normalize_census_dataframe(gdf, vectors, labels)

tests/test_basic.py

Lines changed: 91 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -267,6 +267,97 @@ def test_column_name_handling_with_spaces(self):
267267
assert result_df["Type"].dtype.name == "category"
268268
assert result_df["Region Name"].dtype.name == "category"
269269

270+
def test_normalize_census_dataframe_census_na_values(self):
271+
"""Test that census NA values are converted to pd.NA."""
272+
from pycancensus.core import _normalize_census_dataframe
273+
274+
# Create DataFrame with census NA values
275+
df = pd.DataFrame(
276+
{
277+
"Population": ["1000", "x", "2000", "F", "..."],
278+
"v_CA21_1": ["100", "-", "200", "", "300"],
279+
}
280+
)
281+
282+
result = _normalize_census_dataframe(df, ["v_CA21_1"], "detailed")
283+
284+
# Check that census NA values are now pd.NA
285+
assert pd.isna(result["Population"].iloc[1]) # 'x'
286+
assert pd.isna(result["Population"].iloc[3]) # 'F'
287+
assert pd.isna(result["Population"].iloc[4]) # '...'
288+
assert pd.isna(result["v_CA21_1"].iloc[1]) # '-'
289+
assert pd.isna(result["v_CA21_1"].iloc[3]) # ''
290+
291+
# Check that valid values are numeric
292+
assert result["Population"].iloc[0] == 1000
293+
assert result["v_CA21_1"].iloc[0] == 100
294+
295+
def test_normalize_census_dataframe_geojson_short_names(self):
296+
"""Test that GeoJSON short column names are handled."""
297+
from pycancensus.core import _normalize_census_dataframe
298+
299+
# Create DataFrame with GeoJSON-style short column names
300+
df = pd.DataFrame(
301+
{
302+
"pop": ["1000", "2000"],
303+
"dw": ["500", "600"],
304+
"hh": ["400", "500"],
305+
"a": ["10.5", "20.5"],
306+
"name": ["Region A", "Region B"],
307+
"t": ["CSD", "CMA"],
308+
}
309+
)
310+
311+
result = _normalize_census_dataframe(df, None, "detailed")
312+
313+
# Check numeric columns are converted
314+
assert pd.api.types.is_numeric_dtype(result["pop"])
315+
assert pd.api.types.is_numeric_dtype(result["dw"])
316+
assert pd.api.types.is_numeric_dtype(result["hh"])
317+
assert pd.api.types.is_numeric_dtype(result["a"])
318+
319+
# Check categorical columns are converted
320+
assert result["name"].dtype.name == "category"
321+
assert result["t"].dtype.name == "category"
322+
323+
def test_normalize_produces_equivalent_results(self):
324+
"""Test that CSV and GeoJSON processing produce equivalent normalization."""
325+
from pycancensus.core import _normalize_census_dataframe
326+
327+
# Same data structure, different column naming conventions
328+
csv_style = pd.DataFrame(
329+
{
330+
"Population": ["1000", "x"],
331+
"Type": ["CSD", "CMA"],
332+
"Region Name": ["A", "B"],
333+
"v_CA21_1": ["100", "200"],
334+
}
335+
)
336+
337+
geojson_style = pd.DataFrame(
338+
{
339+
"pop": ["1000", "x"],
340+
"t": ["CSD", "CMA"],
341+
"name": ["A", "B"],
342+
"v_CA21_1": ["100", "200"],
343+
}
344+
)
345+
346+
csv_result = _normalize_census_dataframe(
347+
csv_style.copy(), ["v_CA21_1"], "short"
348+
)
349+
geo_result = _normalize_census_dataframe(
350+
geojson_style.copy(), ["v_CA21_1"], "short"
351+
)
352+
353+
# Both should have numeric vector columns
354+
assert pd.api.types.is_numeric_dtype(csv_result["v_CA21_1"])
355+
assert pd.api.types.is_numeric_dtype(geo_result["v_CA21_1"])
356+
357+
# Both should handle NA values identically
358+
assert pd.isna(csv_result["Population"].iloc[1])
359+
assert pd.isna(geo_result["pop"].iloc[1])
360+
270361

271362
class TestCache:
272363
"""Test caching functionality."""

0 commit comments

Comments
 (0)