1919
2020class TestCancensusCompatibility :
2121 """Test pycancensus compatibility with cancensus R library results."""
22-
22+
2323 @pytest .fixture (autouse = True )
2424 def setup_test_env (self ):
2525 """Setup test environment."""
@@ -29,63 +29,64 @@ def setup_test_env(self):
2929 pc .set_api_key (test_api_key )
3030 else :
3131 pytest .skip ("CANCENSUS_API_KEY not set" )
32-
32+
3333 # Setup temporary cache directory
3434 self .temp_cache = tempfile .mkdtemp ()
3535 pc .set_cache_path (self .temp_cache )
36-
36+
3737 yield
38-
38+
3939 # Cleanup
4040 if os .path .exists (self .temp_cache ):
4141 shutil .rmtree (self .temp_cache )
42-
42+
def test_get_census_basic_functionality(self):
    """Check that a minimal ``get_census`` call returns a cancensus-shaped table.

    Requests total population (``v_CA21_1``) for British Columbia at the
    province level and verifies that the result is a non-empty DataFrame
    carrying the identifier columns plus the requested vector column.
    """
    # Deliberately small query so the test stays fast.
    data = pc.get_census(
        dataset="CA21",
        regions={"PR": "59"},  # British Columbia
        vectors=["v_CA21_1"],  # Total population
        level="PR",
    )

    assert isinstance(data, pd.DataFrame)

    # Check for rows first: the positional ``iloc[0]`` lookup below would
    # raise IndexError on an empty frame instead of failing as an assertion.
    assert len(data) > 0

    # Structure expected from the cancensus output format.
    for column in ("GeoUID", "Type", "Region Name", "v_CA21_1"):
        assert column in data.columns, f"missing column: {column}"

    # Data types: integer counts, string identifiers.
    assert pd.api.types.is_integer_dtype(data["v_CA21_1"])
    assert isinstance(data["GeoUID"].iloc[0], str)

    # The requested vector must carry at least one real value.
    assert not data["v_CA21_1"].isna().all()
67-
67+
def test_get_census_with_geometry(self):
    """Check that ``geo_format="geopandas"`` yields a populated GeoDataFrame.

    The test is skipped when geopandas is not installed.
    """
    # Resolve the optional dependency before doing any work, so a missing
    # geopandas skips the test instead of surfacing after the request and
    # the geometry assertions have already run.
    gpd = pytest.importorskip("geopandas")

    data = pc.get_census(
        dataset="CA21",
        regions={"CSD": "5915022"},  # Vancouver
        vectors=["v_CA21_1"],
        level="CSD",
        geo_format="geopandas",
    )

    # Geometry must be present and not entirely missing.
    assert hasattr(data, "geometry")
    assert not data.geometry.isna().all()

    # The result must be a real GeoDataFrame, not a plain DataFrame.
    assert isinstance(data, gpd.GeoDataFrame)
88-
89+
8990 def test_list_functions_compatibility (self ):
9091 """Test list functions return expected formats."""
9192 # Test datasets
@@ -94,34 +95,34 @@ def test_list_functions_compatibility(self):
9495 assert "dataset" in datasets .columns
9596 assert "description" in datasets .columns
9697 assert len (datasets ) > 0
97-
98+
9899 # Test vectors
99100 vectors = pc .list_census_vectors ("CA21" )
100101 assert isinstance (vectors , pd .DataFrame )
101102 assert "vector" in vectors .columns
102103 assert "type" in vectors .columns
103104 assert "label" in vectors .columns
104105 assert len (vectors ) > 0
105-
106+
106107 # Test regions
107108 regions = pc .list_census_regions ("CA21" )
108109 assert isinstance (regions , pd .DataFrame )
109110 assert "region" in regions .columns
110111 assert "name" in regions .columns
111112 assert len (regions ) > 0
112-
113+
def test_search_functions(self):
    """Keyword searches over vectors and regions return non-empty DataFrames."""
    # (search function name on ``pc``, search term); vectors first, then
    # regions. The function is looked up lazily, right before it is called.
    searches = (
        ("search_census_vectors", "population"),
        ("search_census_regions", "Vancouver"),
    )
    for function_name, term in searches:
        hits = getattr(pc, function_name)("CA21", term)
        assert isinstance(hits, pd.DataFrame)
        assert len(hits) > 0
124-
125+
125126 def test_error_handling (self ):
126127 """Test error handling matches expected patterns."""
127128 # Invalid dataset
@@ -130,103 +131,101 @@ def test_error_handling(self):
130131 dataset = "INVALID" ,
131132 regions = {"PR" : "59" },
132133 vectors = ["v_CA21_1" ],
133- level = "PR"
134+ level = "PR" ,
134135 )
135-
136+
136137 # Invalid region
137138 with pytest .raises ((ValueError , Exception )):
138139 pc .get_census (
139140 dataset = "CA21" ,
140141 regions = {"PR" : "99999" }, # Non-existent region
141142 vectors = ["v_CA21_1" ],
142- level = "PR"
143+ level = "PR" ,
143144 )
144-
145+
145146 # Invalid vector
146147 with pytest .raises ((ValueError , Exception )):
147148 pc .get_census (
148149 dataset = "CA21" ,
149150 regions = {"PR" : "59" },
150151 vectors = ["v_INVALID_999" ],
151- level = "PR"
152+ level = "PR" ,
152153 )
153-
154+
def test_caching_functionality(self):
    """A repeated request populates the cache and returns identical data."""

    def fetch_bc_population():
        # Build fresh argument objects on every call, as two separate
        # literal calls would.
        return pc.get_census(
            dataset="CA21",
            regions={"PR": "59"},
            vectors=["v_CA21_1"],
            level="PR",
        )

    # Start from an empty cache.
    pc.clear_cache()

    first_result = fetch_bc_population()

    # The first request must have left at least one cache entry behind.
    assert len(pc.list_cache()) > 0

    second_result = fetch_bc_population()

    # Both requests must produce the same frame.
    pd.testing.assert_frame_equal(first_result, second_result)
181-
176+
def test_data_type_consistency(self):
    """Check column dtypes and that NA markers are not left as strings.

    Population counts must be numeric, ``GeoUID`` must be string-like, and
    no string column may contain a raw marker (``x``, ``X``, ``F``,
    ``...``) where a missing value should be.
    """
    data = pc.get_census(
        dataset="CA21",
        regions={"CSD": "5915022"},  # Vancouver
        vectors=["v_CA21_1", "v_CA21_8", "v_CA21_434"],  # Mix of data types
        level="CSD",
    )

    # Population counts should be integers or floats.
    assert pd.api.types.is_numeric_dtype(data["v_CA21_1"])

    # Geographic identifiers should be strings.
    geo_uid = data["GeoUID"]
    assert pd.api.types.is_string_dtype(geo_uid) or pd.api.types.is_object_dtype(
        geo_uid
    )

    # Exact-match lookup of the marker strings. This avoids the pandas
    # "pattern has match groups" warning that ``str.contains`` emits for a
    # regex with a capturing group, and states the intent directly.
    na_markers = ["x", "X", "F", "..."]
    for col in data.columns:
        values = data[col]
        if values.dtype == "object" or pd.api.types.is_string_dtype(values):
            leaked = values.astype(str).isin(na_markers)
            assert not leaked.any(), f"Unparsed NA marker found in column {col}"
201+
def test_multiple_regions_functionality(self):
    """Requesting two CSDs returns exactly one row per requested region."""
    requested_csds = ["5915022", "3520005"]  # Vancouver, Toronto
    data = pc.get_census(
        dataset="CA21",
        regions={"CSD": requested_csds},
        vectors=["v_CA21_1"],
        level="CSD",
    )

    # One row per city.
    assert len(data) == 2

    returned_uids = data["GeoUID"].values
    assert "5915022" in returned_uids
    assert "3520005" in returned_uids
215-
214+
def test_hierarchical_region_retrieval(self):
    """Census-tract rows can be retrieved for a region given at CMA level."""
    # Ask for CT-level rows inside the Vancouver CMA.
    tracts = pc.get_census(
        dataset="CA21",
        regions={"CMA": "59933"},  # Vancouver CMA
        vectors=["v_CA21_1"],
        level="CT",
    )

    # The test expects well over 100 tracts for this CMA.
    tract_count = len(tracts)
    assert tract_count > 100

    # Every CT GeoUID is expected to be 10 characters long.
    uid_lengths = tracts["GeoUID"].str.len()
    assert all(uid_lengths == 10)
229-
228+
230229 @pytest .mark .slow
231230 def test_large_dataset_handling (self ):
232231 """Test handling of larger datasets."""
@@ -235,9 +234,9 @@ def test_large_dataset_handling(self):
235234 dataset = "CA21" ,
236235 regions = {"PR" : "59" }, # BC
237236 vectors = ["v_CA21_1" , "v_CA21_2" , "v_CA21_3" ],
238- level = "DA" # Dissemination Areas - lots of records
237+ level = "DA" , # Dissemination Areas - lots of records
239238 )
240-
239+
241240 # Should handle large datasets without issues
242241 assert len (data ) > 10000 # BC has many DAs
243242 assert not data .empty
@@ -246,7 +245,7 @@ def test_large_dataset_handling(self):
246245
247246class TestDataQuality :
248247 """Test data quality and consistency."""
249-
248+
250249 @pytest .fixture (autouse = True )
251250 def setup_test_env (self ):
252251 """Setup test environment."""
@@ -255,62 +254,59 @@ def setup_test_env(self):
255254 pc .set_api_key (test_api_key )
256255 else :
257256 pytest .skip ("CANCENSUS_API_KEY not set" )
258-
257+
def test_data_consistency_across_levels(self):
    """CSD populations inside a CMA add up to the CMA total within 1%."""
    vancouver_cma = {"CMA": "59933"}
    population_vector = "v_CA21_1"  # Total population

    # Same region and vector, fetched at two geographic levels.
    cma_data = pc.get_census(
        dataset="CA21",
        regions=dict(vancouver_cma),
        vectors=[population_vector],
        level="CMA",
    )
    csd_data = pc.get_census(
        dataset="CA21",
        regions=dict(vancouver_cma),
        vectors=[population_vector],
        level="CSD",
    )

    cma_pop = cma_data[population_vector].iloc[0]
    csd_pop_sum = csd_data[population_vector].sum()

    # Relative difference; small rounding differences are tolerated.
    relative_gap = abs(cma_pop - csd_pop_sum) / cma_pop
    assert (
        relative_gap < 0.01
    ), f"Population inconsistency: CMA={cma_pop}, CSD sum={csd_pop_sum}"
281+
def test_geographic_identifier_format(self):
    """Check GeoUID length and character set at each geographic level.

    For every level, data is requested for British Columbia and each
    returned ``GeoUID`` must have the expected length and consist of
    digits, optionally followed by a decimal suffix.
    """
    test_cases = [
        ("PR", "CA21", 2),  # Province: 2 digits
        # CMA identifiers in this file are 5 characters (e.g. "59933" for
        # Vancouver), i.e. the province prefix plus a 3-digit CMA code.
        ("CMA", "CA21", 5),
        ("CD", "CA21", 4),  # CD: 4 digits
        ("CSD", "CA21", 7),  # CSD: 7 digits
        ("CT", "CA21", 10),  # CT: 10 characters
    ]

    for level, dataset, expected_length in test_cases:
        data = pc.get_census(
            dataset=dataset,
            regions={"PR": "59"},  # BC
            vectors=["v_CA21_1"],
            level=level,
        )
        geo_uids = data["GeoUID"]

        # Check GeoUID length.
        has_expected_length = geo_uids.str.len() == expected_length
        assert (
            has_expected_length.all()
        ), f"GeoUID length mismatch for {level}: expected {expected_length}"

        # Check GeoUID is numeric. A decimal suffix is accepted because
        # census-tract identifiers are presumably dotted (e.g. "9330001.00")
        # -- NOTE(review): confirm against real CT output.
        is_numeric_id = geo_uids.str.fullmatch(r"\d+(\.\d+)?", na=False)
        assert is_numeric_id.all(), f"Non-numeric GeoUID found for {level}"
313309
314310
if __name__ == "__main__":
    # Pass pytest's exit status on to the shell; otherwise running this file
    # directly exits with status 0 even when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
0 commit comments