refactor: Extract shared DataFrame normalization logic

dshkol · claude · dshkol · commit 1a6798177585 · 2026-01-17T15:53:42.000-08:00
Consolidate duplicate code between _process_csv_response and
_process_geojson_response into a shared _normalize_census_dataframe()
function.

The new function handles:
- Census NA value conversion ('x', 'X', 'F', '...', '-', '')
- Numeric column dtype conversion (Population, Households, etc.)
- Categorical column dtype conversion (Type, Region Name)
- Vector metadata extraction
- Both CSV endpoint names (Population) and GeoJSON short names (pop)

This reduces code duplication by ~60 lines and ensures consistent
data handling across the CSV and GeoJSON endpoints. _process_json_response
now delegates to the same shared function as well.

Added 3 new unit tests:
- test_normalize_census_dataframe_census_na_values
- test_normalize_census_dataframe_geojson_short_names
- test_normalize_produces_equivalent_results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/pycancensus/core.py b/pycancensus/core.py
@@ -356,58 +356,88 @@ def _extract_vector_metadata(df, vectors, labels):
     return df
 
 
-def _process_csv_response(csv_text, vectors, labels):
-    """Process CSV API response into a pandas DataFrame."""
-    # Read all columns as strings initially (like R package)
-    df = pd.read_csv(io.StringIO(csv_text), dtype=str, encoding="utf-8")
+def _normalize_census_dataframe(
+    df: Union[pd.DataFrame, gpd.GeoDataFrame],
+    vectors: Optional[List[str]],
+    labels: str,
+) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+    """
+    Normalize a census DataFrame or GeoDataFrame.
 
-    # Fix column names by removing trailing/leading spaces (critical fix for API compatibility)
-    df.columns = df.columns.str.strip()
+    Applies consistent data type conversions and metadata extraction:
+    - Converts census NA values ('x', 'X', 'F', '...', '-', '') to pd.NA
+    - Converts numeric columns (population, households, etc.) to numeric dtype
+    - Converts categorical columns (Type, Region Name) to category dtype
+    - Extracts and stores vector metadata
 
-    # Define census-specific NA values (matching R package)
-    census_na_values = ["x", "X", "F", "...", "-", ""]
+    Parameters
+    ----------
+    df : pd.DataFrame or gpd.GeoDataFrame
+        The data to normalize.
+    vectors : list of str, optional
+        Vector codes that were requested (for metadata extraction).
+    labels : str
+        Label format - 'detailed' or 'short'.
 
-    # Convert specific columns to numeric (matching R package exactly)
-    numeric_columns = []
+    Returns
+    -------
+    pd.DataFrame or gpd.GeoDataFrame
+        The normalized data with proper dtypes.
+    """
+    # Census-specific NA values (matching R package)
+    census_na_values = ["x", "X", "F", "...", "-", ""]
 
     # Standard census columns that should be numeric
-    # Note: API may return column names with trailing spaces, so we need flexible matching
-    standard_numeric = ["Population", "Households", "Dwellings", "Area (sq km)"]
+    # Include both long names (CSV endpoint) and short names (GeoJSON endpoint)
+    standard_numeric = [
+        "Population",
+        "Households",
+        "Dwellings",
+        "Area (sq km)",
+        "pop",  # GeoJSON short name
+        "dw",  # GeoJSON short name
+        "hh",  # GeoJSON short name
+        "a",  # GeoJSON short name
+    ]
 
-    # Create a mapping of actual column names to expected names for flexible matching
-    column_mapping = {}
+    # Find numeric columns to convert
+    numeric_columns = []
     for expected_col in standard_numeric:
-        # Check for exact match first
+        # Check for exact match
         if expected_col in df.columns:
             numeric_columns.append(expected_col)
             continue
-
-        # Check for variations with trailing/leading spaces
+        # Check for variations with trailing/leading spaces (API quirk)
         for actual_col in df.columns:
             if actual_col.strip() == expected_col:
                 numeric_columns.append(actual_col)
-                column_mapping[actual_col] = expected_col
                 break
 
-    # Vector columns (v_* pattern) - handle both short and descriptive names
+    # Vector columns (v_* pattern) are always numeric
     for col in df.columns:
-        if col.startswith("v_CA") or col.startswith("v_"):
+        if col.startswith("v_"):
             numeric_columns.append(col)
 
     # Convert to numeric with census NA handling
     for col in numeric_columns:
-        # Replace census NA values with NaN, then convert to numeric
-        df[col] = df[col].replace(census_na_values, pd.NA)
-        df[col] = pd.to_numeric(df[col], errors="coerce")
+        if col in df.columns:
+            df[col] = df[col].replace(census_na_values, pd.NA)
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Standard categorical columns
+    # Include both long names (CSV endpoint) and short names (GeoJSON endpoint)
+    categorical_columns = [
+        "Type",
+        "Region Name",
+        "name",  # GeoJSON short name
+        "t",  # GeoJSON short name
+    ]
 
-    # Convert categorical columns to pandas categorical (matching R factors)
-    categorical_columns = ["Type", "Region Name"]
     for expected_col in categorical_columns:
-        # Check for exact match first
+        # Check for exact match
         if expected_col in df.columns:
             df[expected_col] = df[expected_col].astype("category")
             continue
-
         # Check for variations with trailing/leading spaces
         for actual_col in df.columns:
             if actual_col.strip() == expected_col:
@@ -420,17 +450,27 @@ def _process_csv_response(csv_text, vectors, labels):
     return df
 
 
+def _process_csv_response(csv_text, vectors, labels):
+    """Parse a CSV API response into a normalized pandas DataFrame."""
+    # Read all columns as strings initially (like R package); dtypes are set by the shared normalizer
+    df = pd.read_csv(io.StringIO(csv_text), dtype=str, encoding="utf-8")
+
+    # Strip leading/trailing whitespace from column names (critical fix for API compatibility)
+    df.columns = df.columns.str.strip()
+
+    # Delegate dtype conversion to the shared normalizer; vectors and labels are passed through as-is
+    return _normalize_census_dataframe(df, vectors, labels)
+
+
 def _process_json_response(data, vectors, labels):
     """Process JSON API response into a pandas DataFrame."""
     if "data" not in data:
         raise ValueError("Invalid API response: missing 'data' field")
 
     df = pd.DataFrame(data["data"])
 
-    # Extract vector metadata and handle labels
-    df = _extract_vector_metadata(df, vectors, labels)
-
-    return df
+    # Apply shared normalization
+    return _normalize_census_dataframe(df, vectors, labels)
 
 
 def _process_geojson_response(data, vectors, labels):
@@ -440,70 +480,5 @@ def _process_geojson_response(data, vectors, labels):
 
     gdf = gpd.GeoDataFrame.from_features(data["features"], crs="EPSG:4326")
 
-    # Apply the same numeric conversion logic as CSV processing
-    # This was missing and causing all columns to remain as strings
-
-    # Define census-specific NA values (matching R package)
-    census_na_values = ["x", "X", "F", "...", "-", ""]
-
-    # Convert specific columns to numeric (matching R package exactly)
-    numeric_columns = []
-
-    # Standard census columns that should be numeric
-    # Note: API may return column names with trailing spaces, so we need flexible matching
-    standard_numeric = [
-        "Population",
-        "Households",
-        "Dwellings",
-        "Area (sq km)",
-        "pop",
-        "dw",
-        "hh",
-        "a",
-    ]
-
-    # Create a mapping of actual column names to expected names for flexible matching
-    column_mapping = {}
-    for expected_col in standard_numeric:
-        # Check for exact match first
-        if expected_col in gdf.columns:
-            numeric_columns.append(expected_col)
-            continue
-
-        # Check for variations with trailing/leading spaces
-        for actual_col in gdf.columns:
-            if actual_col.strip() == expected_col:
-                numeric_columns.append(actual_col)
-                column_mapping[actual_col] = expected_col
-                break
-
-    # Vector columns (v_* pattern) - handle both short and descriptive names
-    for col in gdf.columns:
-        if col.startswith("v_CA") or col.startswith("v_"):
-            numeric_columns.append(col)
-
-    # Convert to numeric with census NA handling
-    for col in numeric_columns:
-        if col in gdf.columns:  # Additional safety check
-            # Replace census NA values with NaN, then convert to numeric
-            gdf[col] = gdf[col].replace(census_na_values, pd.NA)
-            gdf[col] = pd.to_numeric(gdf[col], errors="coerce")
-
-    # Convert categorical columns to pandas categorical (matching R factors)
-    categorical_columns = ["Type", "Region Name", "name", "t"]
-    for expected_col in categorical_columns:
-        # Check for exact match first
-        if expected_col in gdf.columns:
-            gdf[expected_col] = gdf[expected_col].astype("category")
-            continue
-
-        # Check for variations with trailing/leading spaces
-        for actual_col in gdf.columns:
-            if actual_col.strip() == expected_col:
-                gdf[actual_col] = gdf[actual_col].astype("category")
-                break
-
-    # Extract vector metadata and handle labels
-    gdf = _extract_vector_metadata(gdf, vectors, labels)
-
-    return gdf
+    # Apply shared normalization
+    return _normalize_census_dataframe(gdf, vectors, labels)
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -267,6 +267,97 @@ def test_column_name_handling_with_spaces(self):
         assert result_df["Type"].dtype.name == "category"
         assert result_df["Region Name"].dtype.name == "category"
 
+    def test_normalize_census_dataframe_census_na_values(self):
+        """Test that census NA markers become missing values in numeric columns."""
+        from pycancensus.core import _normalize_census_dataframe
+
+        # String-typed input mixing valid numbers with census NA markers ('x', 'F', '...', '-', '')
+        df = pd.DataFrame(
+            {
+                "Population": ["1000", "x", "2000", "F", "..."],
+                "v_CA21_1": ["100", "-", "200", "", "300"],
+            }
+        )
+
+        result = _normalize_census_dataframe(df, ["v_CA21_1"], "detailed")
+
+        # Check that census NA markers are now missing (pd.isna is true for NaN as well as pd.NA)
+        assert pd.isna(result["Population"].iloc[1])  # 'x'
+        assert pd.isna(result["Population"].iloc[3])  # 'F'
+        assert pd.isna(result["Population"].iloc[4])  # '...'
+        assert pd.isna(result["v_CA21_1"].iloc[1])  # '-'
+        assert pd.isna(result["v_CA21_1"].iloc[3])  # ''
+
+        # Check that valid values were parsed to numbers
+        assert result["Population"].iloc[0] == 1000
+        assert result["v_CA21_1"].iloc[0] == 100
+
+    def test_normalize_census_dataframe_geojson_short_names(self):
+        """Test that GeoJSON short column names get numeric/category dtypes."""
+        from pycancensus.core import _normalize_census_dataframe
+
+        # Create DataFrame with GeoJSON-style short column names (all values as strings)
+        df = pd.DataFrame(
+            {
+                "pop": ["1000", "2000"],
+                "dw": ["500", "600"],
+                "hh": ["400", "500"],
+                "a": ["10.5", "20.5"],
+                "name": ["Region A", "Region B"],
+                "t": ["CSD", "CMA"],
+            }
+        )
+
+        result = _normalize_census_dataframe(df, None, "detailed")  # vectors=None
+
+        # Check that the short-name numeric columns (pop, dw, hh, a) are converted
+        assert pd.api.types.is_numeric_dtype(result["pop"])
+        assert pd.api.types.is_numeric_dtype(result["dw"])
+        assert pd.api.types.is_numeric_dtype(result["hh"])
+        assert pd.api.types.is_numeric_dtype(result["a"])
+
+        # Check that the short-name categorical columns (name, t) are converted
+        assert result["name"].dtype.name == "category"
+        assert result["t"].dtype.name == "category"
+
+    def test_normalize_produces_equivalent_results(self):
+        """Test that long (CSV) and short (GeoJSON) column names normalize alike."""
+        from pycancensus.core import _normalize_census_dataframe
+
+        # Same values under the two naming conventions: long names first, short names below
+        csv_style = pd.DataFrame(
+            {
+                "Population": ["1000", "x"],
+                "Type": ["CSD", "CMA"],
+                "Region Name": ["A", "B"],
+                "v_CA21_1": ["100", "200"],
+            }
+        )
+
+        geojson_style = pd.DataFrame(
+            {
+                "pop": ["1000", "x"],
+                "t": ["CSD", "CMA"],
+                "name": ["A", "B"],
+                "v_CA21_1": ["100", "200"],
+            }
+        )
+
+        csv_result = _normalize_census_dataframe(
+            csv_style.copy(), ["v_CA21_1"], "short"
+        )
+        geo_result = _normalize_census_dataframe(
+            geojson_style.copy(), ["v_CA21_1"], "short"
+        )
+
+        # Both should have numeric vector columns
+        assert pd.api.types.is_numeric_dtype(csv_result["v_CA21_1"])
+        assert pd.api.types.is_numeric_dtype(geo_result["v_CA21_1"])
+
+        # Both should treat the 'x' marker in the population column as missing
+        assert pd.isna(csv_result["Population"].iloc[1])
+        assert pd.isna(geo_result["pop"].iloc[1])
+
 
 class TestCache:
     """Test caching functionality."""