|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
| 3 | +from collections.abc import Callable |
3 | 4 | from typing import Any |
4 | 5 |
|
5 | 6 | import numpy as np |
6 | 7 | import pandas as pd |
7 | 8 | import pytest |
8 | 9 | from _pytest.fixtures import FixtureRequest |
9 | 10 | from layered_config_tree import LayeredConfigTree |
| 11 | +from pytest_mock import MockerFixture |
10 | 12 | from vivarium_testing_utils import FuzzyChecker |
11 | 13 |
|
12 | | -from pseudopeople.configuration import Keys |
| 14 | +from pseudopeople.configuration import Keys, get_configuration |
| 15 | +from pseudopeople.configuration.entities import NO_NOISE |
13 | 16 | from pseudopeople.configuration.noise_configuration import NoiseConfiguration |
| 17 | +from pseudopeople.constants.metadata import DatasetNames |
| 18 | +from pseudopeople.constants.noise_type_metadata import ( |
| 19 | + GUARDIAN_DUPLICATION_ADDRESS_COLUMNS, |
| 20 | +) |
14 | 21 | from pseudopeople.dataset import Dataset |
| 22 | +from pseudopeople.interface import generate_decennial_census |
15 | 23 | from pseudopeople.noise_entities import NOISE_TYPES |
| 24 | +from pseudopeople.noise_functions import merge_dependents_and_guardians |
16 | 25 | from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS |
17 | 26 | from tests.integration.conftest import SEED, _get_common_datasets |
18 | 27 | from tests.utilities import ( |
@@ -166,3 +175,145 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None: |
166 | 175 | .all() |
167 | 176 | .all() |
168 | 177 | ) |
| 178 | + |
| 179 | + |
| 180 | +@pytest.mark.parametrize( |
| 181 | + "probabilities", |
| 182 | + [ |
| 183 | + { |
| 184 | + "row_probability_in_households_under_18": 1.0, |
| 185 | + "row_probability_in_college_group_quarters_under_24": 1.0, |
| 186 | + }, |
| 187 | + { |
| 188 | + "row_probability_in_households_under_18": 0.7, |
| 189 | + "row_probability_in_college_group_quarters_under_24": 0.8, |
| 190 | + }, |
| 191 | + ], |
| 192 | +) |
| 193 | +def test_guardian_duplication( |
| 194 | + dataset_params: tuple[str | int | Callable[..., pd.DataFrame] | None, ...], |
| 195 | + dataset_name: str, |
| 196 | + probabilities: dict[str, float], |
| 197 | + fuzzy_checker: FuzzyChecker, |
| 198 | + mocker: MockerFixture, |
| 199 | +) -> None: |
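| | +    """Check guardian-duplication row noise levels and that duplicated rows copy guardian addresses."""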
| 200 | +    if dataset_name != DatasetNames.CENSUS:
| 201 | +        pytest.skip("guardian duplication noise only applies to the decennial census")
| 202 | + |
| 203 | + # patch these to avoid updating dtypes and dropping columns we need for testing |
| 204 | + mocker.patch("pseudopeople.dataset.coerce_dtypes", side_effect=lambda df, _: df) |
| 205 | + mocker.patch( |
| 206 | + "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df |
| 207 | + ) |
| 208 | + # allow all irrelevant probabilities to be 0 in our config |
| 209 | + mocker.patch( |
| 210 | + "pseudopeople.configuration.generator.validate_overrides", |
| 211 | + side_effect=lambda *args: None, |
| 212 | + ) |
| 213 | + # allow our noise levels to be high in testing |
| 214 | +    mocker.patch(
| 215 | +        "pseudopeople.configuration.generator.validate_noise_level_proportions",
| 216 | +        side_effect=lambda *args: None,
| 217 | +    )
| 218 | + |
| 219 | + # get unnoised data |
| 220 | + _, _, source, year, state, engine = dataset_params |
| 221 | + unnoised = generate_decennial_census( |
| 222 | + source=source, config=NO_NOISE, year=year, state=state, engine=engine |
| 223 | + ) |
| 224 | + |
| 225 | + # get noised data using custom config |
| 226 | + config_dict = get_single_noise_type_config( |
| 227 | + dataset_name, NOISE_TYPES.duplicate_with_guardian.name |
| 228 | + ) |
| 229 | + for probability_key in [ |
| 230 | + Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24, |
| 231 | + Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18, |
| 232 | + ]: |
| 233 | + config_dict[dataset_name][Keys.ROW_NOISE][NOISE_TYPES.duplicate_with_guardian.name][ |
| 234 | + probability_key |
| 235 | + ] = probabilities[probability_key] |
| 236 | + config = NoiseConfiguration(LayeredConfigTree(config_dict)) |
| 237 | + noised = generate_decennial_census( |
| 238 | + source=source, config=config.to_dict(), year=year, state=state, engine=engine |
| 239 | + ) |
| 240 | + |
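| | +    # rows with a repeated simulant_id are the dependents duplicated into a guardian's household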
| 241 | +    duplicated = noised.loc[noised["simulant_id"].duplicated()].copy()
| 242 | +    duplicated["age"] = duplicated["age"].astype(int)
| 243 | + |
| 244 | + # add old housing type data to duplicated simulants |
| 245 | +    old_housing_data = unnoised[["simulant_id", "housing_type"]].rename(
| 246 | +        columns={"housing_type": "unnoised_housing_type"}
| 247 | +    )
| 248 | +    duplicated = duplicated.merge(old_housing_data, on="simulant_id")
| 249 | + |
| 250 | + # separate tests for household under 18 and for college under 24 |
| 251 | + for probability_name, age, housing_type in zip( |
| 252 | + [ |
| 253 | + Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18, |
| 254 | + Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24, |
| 255 | + ], |
| 256 | + [18, 24], |
| 257 | + ["Household", "College"], |
| 258 | + ): |
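| | +        # dependents under the age cutoff in this housing type who have a tracked guardian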
| 259 | + group_data = unnoised.loc[ |
| 260 | + (unnoised["age"].astype(int) < age) |
| 261 | + & (unnoised["housing_type"] == housing_type) |
| 262 | + & (unnoised["guardian_1"].notna()) |
| 263 | + ] |
| 264 | + merged_data = merge_dependents_and_guardians(group_data, unnoised) |
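| | +        # only dependents living apart from a guardian with a known household can be duplicated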
| 265 | + sims_eligible_for_duplication = merged_data.index[ |
| 266 | + ( |
| 267 | + (merged_data["household_id"] != merged_data["guardian_1_household_id"]) |
| 268 | + & (merged_data["guardian_1_household_id"].notna()) |
| 269 | + ) |
| 270 | + | ( |
| 271 | + (merged_data["household_id"] != merged_data["guardian_2_household_id"]) |
| 272 | + & (merged_data["guardian_2_household_id"].notna()) |
| 273 | + ) |
| 274 | + ] |
| 275 | + duplicated_in_group = duplicated.loc[ |
| 276 | + (duplicated["age"] < age) & (duplicated["unnoised_housing_type"] == housing_type) |
| 277 | + ] |
| 278 | + |
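| | +        # observed duplication rate among eligible dependents should match the configured probability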
| 279 | + fuzzy_checker.fuzzy_assert_proportion( |
| 280 | + name="test_duplicate_guardian", |
| 281 | + observed_numerator=len(duplicated_in_group), |
| 282 | + observed_denominator=len(sims_eligible_for_duplication), |
| 283 | + target_proportion=probabilities[probability_name], |
| 284 | +            name_additional=f"noised_data_{probability_name}",
| 285 | + ) |
| 286 | +    # each dependent should appear at most twice, i.e., be duplicated at most once
| 287 | +    assert noised["simulant_id"].value_counts().max() == 2
| 288 | + |
| 289 | + # Check address information is copied in new rows |
| 290 | + guardians = unnoised.loc[ |
| 291 | + unnoised["simulant_id"].isin(unnoised["guardian_1"]) |
| 292 | + | unnoised["simulant_id"].isin(unnoised["guardian_2"]) |
| 293 | + ] |
| 294 | +    simulant_ids = set(unnoised["simulant_id"])
| 295 | + |
| 296 | +    for _, dependent in duplicated.iterrows():
| 297 | +        guardian_ids = [dependent["guardian_1"], dependent["guardian_2"]]
| 298 | +
| 299 | +        for column in GUARDIAN_DUPLICATION_ADDRESS_COLUMNS:
| 300 | +            guardians_values = []
| 301 | +            for guardian in guardian_ids:
| 302 | +                # addresses are only copied from guardians present in the data;
| 303 | +                # pd.notna is more robust than an identity check against np.nan
| 304 | +                if pd.notna(guardian) and guardian in simulant_ids:
| 305 | +                    guardians_values.append(
| 306 | +                        guardians.loc[guardians["simulant_id"] == guardian, column].values[0]
| 307 | +                    )
| 308 | +
| 309 | +            assert dependent[column] in guardians_values