@@ -225,96 +225,6 @@ def test_column_dtypes(
225225 assert noised_data [col .name ].dtype == expected_dtype
226226
227227
228- @pytest .mark .parametrize (
229- "dataset_name" ,
230- [
231- DATASET_SCHEMAS .census .name ,
232- DATASET_SCHEMAS .acs .name ,
233- DATASET_SCHEMAS .cps .name ,
234- DATASET_SCHEMAS .ssa .name ,
235- DATASET_SCHEMAS .tax_w2_1099 .name ,
236- DATASET_SCHEMAS .wic .name ,
237- DATASET_SCHEMAS .tax_1040 .name ,
238- ],
239- )
240- @pytest .mark .parametrize (
241- "engine" ,
242- [
243- "pandas" ,
244- "dask" ,
245- ],
246- )
247- def test_column_noising (
248- dataset_name : str ,
249- engine : str ,
250- config : dict [str , Any ],
251- request : FixtureRequest ,
252- fuzzy_checker : FuzzyChecker ,
253- ) -> None :
254- """Tests that columns are noised as expected"""
255- if "TODO" in dataset_name :
256- pytest .skip (reason = dataset_name )
257- original = initialize_dataset_with_sample (dataset_name )
258- if engine == "dask" :
259- generation_function = DATASET_GENERATION_FUNCS [dataset_name ]
260- noised_data = generation_function (
261- seed = SEED ,
262- year = None ,
263- config = config ,
264- engine = engine ,
265- ).compute ()
266- else :
267- noised_data = request .getfixturevalue (f"noised_sample_data_{ dataset_name } " )
268- check_noised , check_original , shared_idx = _get_common_datasets (original , noised_data )
269-
270- run_column_noising_tests (
271- dataset_name , config , fuzzy_checker , check_noised , check_original , shared_idx
272- )
273-
274-
275- @pytest .mark .parametrize (
276- "dataset_name" ,
277- [
278- DATASET_SCHEMAS .census .name ,
279- DATASET_SCHEMAS .acs .name ,
280- DATASET_SCHEMAS .cps .name ,
281- DATASET_SCHEMAS .ssa .name ,
282- DATASET_SCHEMAS .tax_w2_1099 .name ,
283- DATASET_SCHEMAS .wic .name ,
284- DATASET_SCHEMAS .tax_1040 .name ,
285- ],
286- )
287- @pytest .mark .parametrize (
288- "engine" ,
289- [
290- "pandas" ,
291- "dask" ,
292- ],
293- )
294- def test_row_noising_omit_row_or_do_not_respond (
295- dataset_name : str , engine : str , config : dict [str , Any ], request : FixtureRequest
296- ) -> None :
297- """Tests that omit_row and do_not_respond row noising are being applied"""
298- if "TODO" in dataset_name :
299- pytest .skip (reason = dataset_name )
300- idx_cols = IDX_COLS .get (dataset_name )
301- original = get_unnoised_data (dataset_name )
302- original_data = original .data .set_index (idx_cols )
303- if engine == "dask" :
304- generation_function = DATASET_GENERATION_FUNCS [dataset_name ]
305- noised_data = generation_function (
306- seed = SEED ,
307- year = None ,
308- config = config ,
309- engine = engine ,
310- ).compute ()
311- else :
312- noised_data = request .getfixturevalue (f"noised_sample_data_{ dataset_name } " )
313- noised_data = noised_data .set_index (idx_cols )
314-
315- run_omit_row_or_do_not_respond_tests (dataset_name , config , original_data , noised_data )
316-
317-
318228@pytest .mark .skip (reason = "TODO: Implement duplication row noising" )
319229@pytest .mark .parametrize (
320230 "dataset_name" ,
@@ -336,42 +246,13 @@ def test_row_noising_duplication(dataset_name: str) -> None:
336246@pytest .mark .parametrize (
337247 "dataset_name" ,
338248 [
339- DATASET_SCHEMAS .census .name ,
249+ # DATASET_SCHEMAS.census.name,
250+ # DATASET_SCHEMAS.tax_w2_1099.name,
251+ # DATASET_SCHEMAS.wic.name,
252+ # DATASET_SCHEMAS.tax_1040.name,
340253 DATASET_SCHEMAS .acs .name ,
341254 DATASET_SCHEMAS .cps .name ,
342255 DATASET_SCHEMAS .ssa .name ,
343- DATASET_SCHEMAS .tax_w2_1099 .name ,
344- DATASET_SCHEMAS .wic .name ,
345- DATASET_SCHEMAS .tax_1040 .name ,
346- ],
347- )
348- @pytest .mark .parametrize (
349- "engine" ,
350- [
351- "pandas" ,
352- "dask" ,
353- ],
354- )
355- def test_generate_dataset_with_year (dataset_name : str , engine : str ) -> None :
356- if "TODO" in dataset_name :
357- pytest .skip (reason = dataset_name )
358- year = 2030 # not default 2020
359- generation_function = DATASET_GENERATION_FUNCS [dataset_name ]
360- original = get_unnoised_data (dataset_name )
361- # Generate a new (non-fixture) noised dataset for a single year
362- noised_data = generation_function (year = year , engine = engine )
363- if engine == "dask" :
364- noised_data = noised_data .compute ()
365- assert not original .data .equals (noised_data )
366-
367-
368- @pytest .mark .parametrize (
369- "dataset_name" ,
370- [
371- DATASET_SCHEMAS .census .name ,
372- DATASET_SCHEMAS .tax_w2_1099 .name ,
373- DATASET_SCHEMAS .wic .name ,
374- DATASET_SCHEMAS .tax_1040 .name ,
375256 ],
376257)
377258@pytest .mark .parametrize (
@@ -391,16 +272,16 @@ def test_dataset_filter_by_year(
391272 pytest .skip (reason = dataset_name )
392273 year = 2030 # not default 2020
393274
394- # Generate a new (non-fixture) noised dataset for a single year but mocked such
275+ # Generate a new (non-fixture) dataset for a single year but mocked such
395276 # that no noise actually happens (otherwise the years would get noised and
396277 # we couldn't tell if the filter was working properly)
397278 mocker .patch ("pseudopeople.dataset.Dataset._noise_dataset" )
398279 generation_function = DATASET_GENERATION_FUNCS [dataset_name ]
399- noised_data = generation_function (year = year , engine = engine )
280+ data = generation_function (year = year , engine = engine )
400281 if engine == "dask" :
401- noised_data = noised_data .compute ()
282+ data = data .compute ()
402283 dataset = DATASET_SCHEMAS .get_dataset_schema (dataset_name )
403- assert (noised_data [dataset .date_column_name ] == year ).all ()
284+ assert (data [dataset .date_column_name ] == year ).all ()
404285
405286
406287@pytest .mark .parametrize (
0 commit comments