@@ -356,58 +356,88 @@ def _extract_vector_metadata(df, vectors, labels):
356356 return df
357357
358358
359- def _process_csv_response (csv_text , vectors , labels ):
360- """Process CSV API response into a pandas DataFrame."""
361- # Read all columns as strings initially (like R package)
362- df = pd .read_csv (io .StringIO (csv_text ), dtype = str , encoding = "utf-8" )
359+ def _normalize_census_dataframe (
360+ df : Union [pd .DataFrame , gpd .GeoDataFrame ],
361+ vectors : Optional [List [str ]],
362+ labels : str ,
363+ ) -> Union [pd .DataFrame , gpd .GeoDataFrame ]:
364+ """
365+ Normalize a census DataFrame or GeoDataFrame.
363366
364- # Fix column names by removing trailing/leading spaces (critical fix for API compatibility)
365- df .columns = df .columns .str .strip ()
367+ Applies consistent data type conversions and metadata extraction:
368+ - Converts census NA values ('x', 'X', 'F', '...', '-', '') to pd.NA
369+ - Converts numeric columns (population, households, etc.) to numeric dtype
370+ - Converts categorical columns (Type, Region Name) to category dtype
371+ - Extracts and stores vector metadata
366372
367- # Define census-specific NA values (matching R package)
368- census_na_values = ["x" , "X" , "F" , "..." , "-" , "" ]
373+ Parameters
374+ ----------
375+ df : pd.DataFrame or gpd.GeoDataFrame
376+ The data to normalize.
377+ vectors : list of str, optional
378+ Vector codes that were requested (for metadata extraction).
379+ labels : str
380+ Label format - 'detailed' or 'short'.
369381
370- # Convert specific columns to numeric (matching R package exactly)
371- numeric_columns = []
382+ Returns
383+ -------
384+ pd.DataFrame or gpd.GeoDataFrame
385+ The normalized data with proper dtypes.
386+ """
387+ # Census-specific NA values (matching R package)
388+ census_na_values = ["x" , "X" , "F" , "..." , "-" , "" ]
372389
373390 # Standard census columns that should be numeric
374- # Note: API may return column names with trailing spaces, so we need flexible matching
375- standard_numeric = ["Population" , "Households" , "Dwellings" , "Area (sq km)" ]
391+ # Include both long names (CSV endpoint) and short names (GeoJSON endpoint)
392+ standard_numeric = [
393+ "Population" ,
394+ "Households" ,
395+ "Dwellings" ,
396+ "Area (sq km)" ,
397+ "pop" , # GeoJSON short name
398+ "dw" , # GeoJSON short name
399+ "hh" , # GeoJSON short name
400+ "a" , # GeoJSON short name
401+ ]
376402
377- # Create a mapping of actual column names to expected names for flexible matching
378- column_mapping = {}
403+ # Find numeric columns to convert
404+ numeric_columns = []
379405 for expected_col in standard_numeric :
380- # Check for exact match first
406+ # Check for exact match
381407 if expected_col in df .columns :
382408 numeric_columns .append (expected_col )
383409 continue
384-
385- # Check for variations with trailing/leading spaces
410+ # Check for variations with trailing/leading spaces (API quirk)
386411 for actual_col in df .columns :
387412 if actual_col .strip () == expected_col :
388413 numeric_columns .append (actual_col )
389- column_mapping [actual_col ] = expected_col
390414 break
391415
392- # Vector columns (v_* pattern) - handle both short and descriptive names
416+ # Vector columns (v_* pattern) are always numeric
393417 for col in df .columns :
394- if col .startswith ("v_CA" ) or col . startswith ( " v_" ):
418+ if col .startswith ("v_" ):
395419 numeric_columns .append (col )
396420
397421 # Convert to numeric with census NA handling
398422 for col in numeric_columns :
399- # Replace census NA values with NaN, then convert to numeric
400- df [col ] = df [col ].replace (census_na_values , pd .NA )
401- df [col ] = pd .to_numeric (df [col ], errors = "coerce" )
423+ if col in df .columns :
424+ df [col ] = df [col ].replace (census_na_values , pd .NA )
425+ df [col ] = pd .to_numeric (df [col ], errors = "coerce" )
426+
427+ # Standard categorical columns
428+ # Include both long names (CSV endpoint) and short names (GeoJSON endpoint)
429+ categorical_columns = [
430+ "Type" ,
431+ "Region Name" ,
432+ "name" , # GeoJSON short name
433+ "t" , # GeoJSON short name
434+ ]
402435
403- # Convert categorical columns to pandas categorical (matching R factors)
404- categorical_columns = ["Type" , "Region Name" ]
405436 for expected_col in categorical_columns :
406- # Check for exact match first
437+ # Check for exact match
407438 if expected_col in df .columns :
408439 df [expected_col ] = df [expected_col ].astype ("category" )
409440 continue
410-
411441 # Check for variations with trailing/leading spaces
412442 for actual_col in df .columns :
413443 if actual_col .strip () == expected_col :
@@ -420,17 +450,27 @@ def _process_csv_response(csv_text, vectors, labels):
420450 return df
421451
422452
def _process_csv_response(csv_text, vectors, labels):
    """
    Turn the text of a CSV API response into a normalized DataFrame.

    Parameters
    ----------
    csv_text : str
        Raw CSV payload; it is wrapped in an in-memory text buffer for parsing.
    vectors : list of str or None
        Passed through unchanged to ``_normalize_census_dataframe``.
    labels : str
        Passed through unchanged to ``_normalize_census_dataframe``.

    Returns
    -------
    pd.DataFrame
        Whatever ``_normalize_census_dataframe`` returns for the parsed table.
    """
    buffer = io.StringIO(csv_text)

    # Parse every column as text first; dtype conversion is left entirely to
    # the shared normalization step (mirrors the R package's approach).
    parsed = pd.read_csv(buffer, dtype=str, encoding="utf-8")

    # Headers can arrive padded with whitespace, so trim them before any
    # name-based column matching happens downstream.
    parsed.columns = parsed.columns.str.strip()

    return _normalize_census_dataframe(parsed, vectors, labels)
463+
464+
def _process_json_response(data, vectors, labels):
    """
    Turn a decoded JSON API response into a normalized DataFrame.

    Parameters
    ----------
    data : mapping
        Decoded response body; its ``"data"`` entry is handed to the
        ``pd.DataFrame`` constructor.
    vectors : list of str or None
        Passed through unchanged to ``_normalize_census_dataframe``.
    labels : str
        Passed through unchanged to ``_normalize_census_dataframe``.

    Returns
    -------
    pd.DataFrame
        Whatever ``_normalize_census_dataframe`` returns for the built table.

    Raises
    ------
    ValueError
        If ``data`` has no ``"data"`` entry.
    """
    if "data" in data:
        records = data["data"]
        frame = pd.DataFrame(records)
        # Same shared normalization path used by the CSV and GeoJSON handlers.
        return _normalize_census_dataframe(frame, vectors, labels)

    raise ValueError("Invalid API response: missing 'data' field")
434474
435475
436476def _process_geojson_response (data , vectors , labels ):
@@ -440,70 +480,5 @@ def _process_geojson_response(data, vectors, labels):
440480
441481 gdf = gpd .GeoDataFrame .from_features (data ["features" ], crs = "EPSG:4326" )
442482
443- # Apply the same numeric conversion logic as CSV processing
444- # This was missing and causing all columns to remain as strings
445-
446- # Define census-specific NA values (matching R package)
447- census_na_values = ["x" , "X" , "F" , "..." , "-" , "" ]
448-
449- # Convert specific columns to numeric (matching R package exactly)
450- numeric_columns = []
451-
452- # Standard census columns that should be numeric
453- # Note: API may return column names with trailing spaces, so we need flexible matching
454- standard_numeric = [
455- "Population" ,
456- "Households" ,
457- "Dwellings" ,
458- "Area (sq km)" ,
459- "pop" ,
460- "dw" ,
461- "hh" ,
462- "a" ,
463- ]
464-
465- # Create a mapping of actual column names to expected names for flexible matching
466- column_mapping = {}
467- for expected_col in standard_numeric :
468- # Check for exact match first
469- if expected_col in gdf .columns :
470- numeric_columns .append (expected_col )
471- continue
472-
473- # Check for variations with trailing/leading spaces
474- for actual_col in gdf .columns :
475- if actual_col .strip () == expected_col :
476- numeric_columns .append (actual_col )
477- column_mapping [actual_col ] = expected_col
478- break
479-
480- # Vector columns (v_* pattern) - handle both short and descriptive names
481- for col in gdf .columns :
482- if col .startswith ("v_CA" ) or col .startswith ("v_" ):
483- numeric_columns .append (col )
484-
485- # Convert to numeric with census NA handling
486- for col in numeric_columns :
487- if col in gdf .columns : # Additional safety check
488- # Replace census NA values with NaN, then convert to numeric
489- gdf [col ] = gdf [col ].replace (census_na_values , pd .NA )
490- gdf [col ] = pd .to_numeric (gdf [col ], errors = "coerce" )
491-
492- # Convert categorical columns to pandas categorical (matching R factors)
493- categorical_columns = ["Type" , "Region Name" , "name" , "t" ]
494- for expected_col in categorical_columns :
495- # Check for exact match first
496- if expected_col in gdf .columns :
497- gdf [expected_col ] = gdf [expected_col ].astype ("category" )
498- continue
499-
500- # Check for variations with trailing/leading spaces
501- for actual_col in gdf .columns :
502- if actual_col .strip () == expected_col :
503- gdf [actual_col ] = gdf [actual_col ].astype ("category" )
504- break
505-
506- # Extract vector metadata and handle labels
507- gdf = _extract_vector_metadata (gdf , vectors , labels )
508-
509- return gdf
483+ # Apply shared normalization
484+ return _normalize_census_dataframe (gdf , vectors , labels )
0 commit comments