|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | from collections.abc import Callable |
| 4 | +from pathlib import Path |
4 | 5 | from typing import Any, Literal |
5 | 6 |
|
| 7 | +import dask.dataframe as dd |
6 | 8 | import numpy as np |
7 | 9 | import pandas as pd |
8 | | -import pytest |
9 | 10 | from _pytest.fixtures import FixtureRequest |
10 | 11 | from layered_config_tree import LayeredConfigTree |
11 | 12 | from pytest_mock import MockerFixture |
|
14 | 15 | from pseudopeople.configuration import Keys, get_configuration |
15 | 16 | from pseudopeople.configuration.entities import NO_NOISE |
16 | 17 | from pseudopeople.configuration.noise_configuration import NoiseConfiguration |
| 18 | +from pseudopeople.constants import paths |
17 | 19 | from pseudopeople.constants.metadata import DatasetNames |
18 | 20 | from pseudopeople.constants.noise_type_metadata import ( |
19 | 21 | GUARDIAN_DUPLICATION_ADDRESS_COLUMNS, |
20 | 22 | ) |
21 | 23 | from pseudopeople.dataset import Dataset |
22 | 24 | from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType |
23 | | -from pseudopeople.interface import generate_decennial_census, generate_social_security |
| 25 | +from pseudopeople.filter import get_generate_data_filters |
| 26 | +from pseudopeople.interface import ( |
| 27 | + generate_social_security, |
| 28 | + get_dataset_filepaths, |
| 29 | + validate_source_compatibility, |
| 30 | +) |
| 31 | +from pseudopeople.loader import load_standard_dataset |
24 | 32 | from pseudopeople.noise_entities import NOISE_TYPES |
25 | 33 | from pseudopeople.noise_functions import merge_dependents_and_guardians |
26 | 34 | from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS |
| 35 | +from pseudopeople.utilities import DASK_ENGINE, get_engine_from_string |
27 | 36 | from tests.integration.conftest import SEED, _get_common_datasets |
28 | | -from tests.integration.release.conftest import DATASET_ARG_TO_FULL_NAME_MAPPER |
| 37 | +from tests.integration.release.conftest import ( |
| 38 | + DATASET_ARG_TO_FULL_NAME_MAPPER, |
| 39 | + RI_FILEPATH, |
| 40 | +) |
29 | 41 | from tests.integration.release.utilities import ( |
30 | 42 | run_do_not_respond_tests, |
31 | 43 | run_guardian_duplication_tests, |
32 | 44 | run_omit_row_tests, |
33 | 45 | ) |
34 | | -from tests.utilities import initialize_dataset_with_sample, run_column_noising_tests |
| 46 | +from tests.utilities import ( |
| 47 | + get_single_noise_type_config, |
| 48 | + initialize_dataset_with_sample, |
| 49 | + run_column_noising_tests, |
| 50 | +) |
35 | 51 |
|
36 | 52 | ROW_TEST_FUNCTIONS = { |
37 | 53 | "omit_row": run_omit_row_tests, |
|
40 | 56 | } |
41 | 57 |
|
42 | 58 |
|
43 | | -def test_release_runs( |
| 59 | +def test_release_row_noising( |
44 | 60 | dataset_params: tuple[ |
45 | 61 | str, |
46 | 62 | Callable[..., pd.DataFrame], |
47 | | - str | None, |
| 63 | + Path | str | None, |
48 | 64 | int | None, |
49 | 65 | str | None, |
50 | 66 | Literal["pandas", "dask"], |
51 | 67 | ], |
52 | 68 | fuzzy_checker: FuzzyChecker, |
53 | | - mocker: MockerFixture, |
54 | 69 | ) -> None: |
55 | | - # keep all columns when generating unnoised data because some of them are used in testing |
56 | | - mocker.patch( |
57 | | - "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df |
58 | | - ) |
59 | | - |
60 | | - # create unnoised dataset |
61 | | - dataset_name, dataset_func, source, year, state, engine = dataset_params |
62 | | - unnoised_data_kwargs = { |
63 | | - "source": source, |
64 | | - "config": NO_NOISE, |
65 | | - "year": year, |
66 | | - "engine": engine, |
67 | | - } |
68 | | - if dataset_func != generate_social_security: |
69 | | - unnoised_data_kwargs["state"] = state |
70 | | - unnoised_data = dataset_func(**unnoised_data_kwargs) |
71 | | - |
72 | | - # In our standard noising process, i.e. when noising a shard of data, we |
73 | | - # 1) clean and reformat the data, 2) noise the data, and 3) do some post-processing. |
74 | | - # We're replicating steps 1 and 2 in this test and skipping 3. |
| 70 | + dataset_name, _, source, year, state, engine_name = dataset_params |
75 | 71 | full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_name] |
76 | 72 | dataset_schema = DATASET_SCHEMAS.get_dataset_schema(full_dataset_name) |
77 | | - dataset = Dataset(dataset_schema, unnoised_data, SEED) |
78 | | - # don't unnecessarily keep in memory |
79 | | - del unnoised_data |
80 | | - dataset._clean_input_data() |
81 | | - # convert datetime columns to datetime types for _reformat_dates_for_noising |
82 | | - # because the post-processing that occured in generating the unnoised data |
83 | | - # in step 3 mentioned above converts these columns to object dtypes |
84 | | - for col in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]: |
85 | | - if col in dataset.data: |
86 | | - dataset.data[col] = pd.to_datetime( |
87 | | - dataset.data[col], format=dataset_schema.date_format |
88 | | - ) |
89 | | - copy_col = "copy_" + col |
90 | | - if copy_col in dataset.data: |
91 | | - dataset.data[copy_col] = pd.to_datetime( |
92 | | - dataset.data[copy_col], format=dataset_schema.date_format |
93 | | - ) |
94 | | - dataset._reformat_dates_for_noising() |
95 | | - |
96 | 73 | config = get_configuration() |
| 74 | +
| 75 | + |
| 76 | +    # Resolve the data source (defaulting to the bundled sample data), validate it
| 77 | + if source is None: |
| 78 | + source = paths.SAMPLE_DATA_ROOT |
| 79 | + elif isinstance(source, str) or isinstance(source, Path): |
| 80 | + source = Path(source) |
| 81 | + validate_source_compatibility(source, dataset_schema) |
| 82 | + |
| 83 | + engine = get_engine_from_string(engine_name) |
| 84 | + |
| 85 | + data_file_paths = get_dataset_filepaths(Path(source), dataset_schema.name) |
| 86 | + filters = get_generate_data_filters(dataset_schema, year, state) |
| 87 | + unnoised_data = [load_standard_dataset(path, filters, engine) for path in data_file_paths] |
| 88 | + |
| 89 | + if engine == DASK_ENGINE: |
| 90 | + # TODO: [MIC-5960] move this compute to later in the code |
| 91 | + dataset_data: list[pd.DataFrame] = [data.compute() for data in unnoised_data if len(data) != 0] # type: ignore [operator] |
| 92 | + else: |
| 93 | + dataset_data = [data for data in unnoised_data if len(data) != 0] # type: ignore [misc] |
| 94 | + |
| 95 | + if str(source) == RI_FILEPATH and (dataset_name == "acs" or dataset_name == "cps"): |
| 96 | + dataset_data = [pd.concat(dataset_data).reset_index()] |
| 97 | + |
| 98 | + datasets = [Dataset(dataset_schema, data, SEED) for data in dataset_data] |
97 | 99 |
|
98 | 100 | for noise_type in NOISE_TYPES: |
99 | | - original_data = dataset.data.copy() |
| 101 | + prenoised_dataframes = [dataset.data.copy() for dataset in datasets] |
100 | 102 | if isinstance(noise_type, RowNoiseType): |
101 | | - if config.has_noise_type(dataset.dataset_schema.name, noise_type.name): |
102 | | - noise_type(dataset, config) |
| 103 | + if config.has_noise_type(dataset_schema.name, noise_type.name): |
| 104 | + [noise_type(dataset, config) for dataset in datasets] |
103 | 105 | test_function = ROW_TEST_FUNCTIONS[noise_type.name] |
104 | 106 | test_function( |
105 | | - original_data, dataset.data, config, full_dataset_name, fuzzy_checker |
| 107 | + prenoised_dataframes, datasets, config, full_dataset_name, fuzzy_checker |
106 | 108 | ) |
107 | 109 |
|
108 | 110 |
|
|
0 commit comments