-
Notifications
You must be signed in to change notification settings - Fork 4
Hjafari/feature/mic 5517 guardian test #498
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ccd619b
e0f4307
ced0250
a1cfc65
a806f2c
3727a55
b15ee9d
670d5f7
f174ab7
01ecac7
5023d98
2d7607c
3e04940
0745388
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -93,6 +93,38 @@ def apply_do_not_respond( | |
| # return dataset | ||
|
|
||
|
|
||
| # Helper function to format group dataframe and merging with their dependents | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole function was copied without changes |
||
| def merge_dependents_and_guardians( | ||
| dependents_df: pd.DataFrame, full_data: pd.DataFrame | ||
| ) -> pd.DataFrame: | ||
| # Merge dependents with their guardians. We have to merge twice to check | ||
| # if either guardian is living at a separate location from the dependent. | ||
| guardian_1s = full_data.loc[ | ||
| full_data["simulant_id"].isin(full_data["guardian_1"]), | ||
| GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"], | ||
| ].add_prefix("guardian_1_") | ||
| dependents_and_guardians_df = dependents_df.merge( | ||
| guardian_1s, | ||
| how="left", | ||
| left_on=["guardian_1", "year"], | ||
| right_on=["guardian_1_simulant_id", "guardian_1_year"], | ||
| ) | ||
| del guardian_1s | ||
| guardian_2s = full_data.loc[ | ||
| full_data["simulant_id"].isin(full_data["guardian_2"]), | ||
| GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"], | ||
| ].add_prefix("guardian_2_") | ||
| dependents_and_guardians_df = dependents_and_guardians_df.merge( | ||
| guardian_2s, | ||
| how="left", | ||
| left_on=["guardian_2", "year"], | ||
| right_on=["guardian_2_simulant_id", "guardian_2_year"], | ||
| ) | ||
| del guardian_2s | ||
|
|
||
| return dependents_and_guardians_df | ||
|
|
||
|
|
||
| def duplicate_with_guardian( | ||
| dataset: Dataset, | ||
| configuration: NoiseConfiguration, | ||
|
|
@@ -109,37 +141,6 @@ def duplicate_with_guardian( | |
| :param to_noise_index: pd.Index of rows to be noised | ||
| """ | ||
|
|
||
| # Helper function to format group dataframe and merging with their dependents | ||
| def _merge_dependents_and_guardians( | ||
| dependents_df: pd.DataFrame, full_data: pd.DataFrame | ||
| ) -> pd.DataFrame: | ||
| # Merge dependents with their guardians. We have to merge twice to check | ||
| # if either guardian is living at a separate location from the dependent. | ||
| guardian_1s = full_data.loc[ | ||
| full_data["simulant_id"].isin(full_data["guardian_1"]), | ||
| GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"], | ||
| ].add_prefix("guardian_1_") | ||
| dependents_and_guardians_df = dependents_df.merge( | ||
| guardian_1s, | ||
| how="left", | ||
| left_on=["guardian_1", "year"], | ||
| right_on=["guardian_1_simulant_id", "guardian_1_year"], | ||
| ) | ||
| del guardian_1s | ||
| guardian_2s = full_data.loc[ | ||
| full_data["simulant_id"].isin(full_data["guardian_2"]), | ||
| GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"], | ||
| ].add_prefix("guardian_2_") | ||
| dependents_and_guardians_df = dependents_and_guardians_df.merge( | ||
| guardian_2s, | ||
| how="left", | ||
| left_on=["guardian_2", "year"], | ||
| right_on=["guardian_2_simulant_id", "guardian_2_year"], | ||
| ) | ||
| del guardian_2s | ||
|
|
||
| return dependents_and_guardians_df | ||
|
|
||
| # Get dict of group type and formatted dataframe for that group that should be noised | ||
| formatted_group_data = {} | ||
| # Get dataframe for each dependent group to merge with guardians | ||
|
|
@@ -157,10 +158,10 @@ def _merge_dependents_and_guardians( | |
| # Merge dependents with their guardians | ||
| formatted_group_data[ | ||
| Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18 | ||
| ] = _merge_dependents_and_guardians(in_households_under_18, dataset.data) | ||
| ] = merge_dependents_and_guardians(in_households_under_18, dataset.data) | ||
| formatted_group_data[ | ||
| Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24 | ||
| ] = _merge_dependents_and_guardians(in_college_under_24, dataset.data) | ||
| ] = merge_dependents_and_guardians(in_college_under_24, dataset.data) | ||
| # Note: We have two dicts (configuration and formatted_group_data) at this point that have | ||
| # the key for the group and then a dataframe for that group or the group and the configured | ||
| # noise level | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,18 +1,27 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Callable | ||
| from typing import Any | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
| from _pytest.fixtures import FixtureRequest | ||
| from layered_config_tree import LayeredConfigTree | ||
| from pytest_mock import MockerFixture | ||
| from vivarium_testing_utils import FuzzyChecker | ||
|
|
||
| from pseudopeople.configuration import Keys | ||
| from pseudopeople.configuration import Keys, get_configuration | ||
| from pseudopeople.configuration.entities import NO_NOISE | ||
| from pseudopeople.configuration.noise_configuration import NoiseConfiguration | ||
| from pseudopeople.constants.metadata import DatasetNames | ||
| from pseudopeople.constants.noise_type_metadata import ( | ||
| GUARDIAN_DUPLICATION_ADDRESS_COLUMNS, | ||
| ) | ||
| from pseudopeople.dataset import Dataset | ||
| from pseudopeople.interface import generate_decennial_census | ||
| from pseudopeople.noise_entities import NOISE_TYPES | ||
| from pseudopeople.noise_functions import merge_dependents_and_guardians | ||
| from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS | ||
| from tests.integration.conftest import SEED, _get_common_datasets | ||
| from tests.utilities import ( | ||
|
|
@@ -166,3 +175,145 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None: | |
| .all() | ||
| .all() | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "probabilities", | ||
| [ | ||
| { | ||
| "row_probability_in_households_under_18": 1.0, | ||
| "row_probability_in_college_group_quarters_under_24": 1.0, | ||
| }, | ||
| { | ||
| "row_probability_in_households_under_18": 0.7, | ||
| "row_probability_in_college_group_quarters_under_24": 0.8, | ||
| }, | ||
| ], | ||
| ) | ||
| def test_guardian_duplication( | ||
| dataset_params: tuple[str | int | Callable[..., pd.DataFrame] | None, ...], | ||
| dataset_name: str, | ||
| probabilities: dict[str, float], | ||
| fuzzy_checker: FuzzyChecker, | ||
| mocker: MockerFixture, | ||
| ) -> None: | ||
| if dataset_name != DatasetNames.CENSUS: | ||
| return | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add comments about why you're patching |
||
| # patch these to avoid updating dtypes and dropping columns we need for testing | ||
| mocker.patch("pseudopeople.dataset.coerce_dtypes", side_effect=lambda df, _: df) | ||
| mocker.patch( | ||
| "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df | ||
| ) | ||
| # allow all irrelevant probabilities to be 0 in our config | ||
| mocker.patch( | ||
| "pseudopeople.configuration.generator.validate_overrides", | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Check whether these validations are being tested
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do you need to patch over these?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For age differences, by default we can't have a non-zero probability of keeping the ages the same.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd say you should set that noise type to not noise then |
||
| side_effect=lambda *args: None, | ||
| ) | ||
| # allow our noise levels to be high in testing | ||
| mocker.patch( | ||
| "pseudopeople.configuration.generator.validate_noise_level_proportions", | ||
| lambda *args: None, | ||
| ) | ||
|
|
||
| # get unnoised data | ||
| _, _, source, year, state, engine = dataset_params | ||
| unnoised = generate_decennial_census( | ||
| source=source, config=NO_NOISE, year=year, state=state, engine=engine | ||
| ) | ||
|
|
||
| # get noised data using custom config | ||
| config_dict = get_single_noise_type_config( | ||
| dataset_name, NOISE_TYPES.duplicate_with_guardian.name | ||
| ) | ||
| for probability_key in [ | ||
| Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24, | ||
| Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18, | ||
| ]: | ||
| config_dict[dataset_name][Keys.ROW_NOISE][NOISE_TYPES.duplicate_with_guardian.name][ | ||
| probability_key | ||
| ] = probabilities[probability_key] | ||
| config = NoiseConfiguration(LayeredConfigTree(config_dict)) | ||
| noised = generate_decennial_census( | ||
| source=source, config=config.to_dict(), year=year, state=state, engine=engine | ||
| ) | ||
|
|
||
| duplicated = noised.loc[noised["simulant_id"].duplicated()] | ||
| duplicated["age"] = duplicated["age"].astype(int) | ||
|
|
||
| # add old housing type data to duplicated simulants | ||
| old_housing_data = unnoised[["simulant_id", "housing_type"]].rename( | ||
| {"housing_type": "unnoised_housing_type"}, axis=1 | ||
| ) | ||
| duplicated = duplicated.merge(old_housing_data) | ||
|
|
||
| # separate tests for household under 18 and for college under 24 | ||
| for probability_name, age, housing_type in zip( | ||
| [ | ||
| Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18, | ||
| Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24, | ||
| ], | ||
| [18, 24], | ||
| ["Household", "College"], | ||
| ): | ||
| group_data = unnoised.loc[ | ||
| (unnoised["age"].astype(int) < age) | ||
| & (unnoised["housing_type"] == housing_type) | ||
| & (unnoised["guardian_1"].notna()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because guardian_1 is never nan, right? But guardian_2 might be?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes if "guardian_1" is notna then there is at least one guardian but not necessarily guardian 2 |
||
| ] | ||
| merged_data = merge_dependents_and_guardians(group_data, unnoised) | ||
| sims_eligible_for_duplication = merged_data.index[ | ||
| ( | ||
| (merged_data["household_id"] != merged_data["guardian_1_household_id"]) | ||
| & (merged_data["guardian_1_household_id"].notna()) | ||
| ) | ||
| | ( | ||
| (merged_data["household_id"] != merged_data["guardian_2_household_id"]) | ||
| & (merged_data["guardian_2_household_id"].notna()) | ||
| ) | ||
| ] | ||
| duplicated_in_group = duplicated.loc[ | ||
| (duplicated["age"] < age) & (duplicated["unnoised_housing_type"] == housing_type) | ||
| ] | ||
|
|
||
| fuzzy_checker.fuzzy_assert_proportion( | ||
| name="test_duplicate_guardian", | ||
| observed_numerator=len(duplicated_in_group), | ||
| observed_denominator=len(sims_eligible_for_duplication), | ||
| target_proportion=probabilities[probability_name], | ||
| name_additional=f"noised_data", | ||
| ) | ||
| # Only duplicate a dependent one time | ||
| assert noised["simulant_id"].value_counts().max() == 2 | ||
|
|
||
| # Check address information is copied in new rows | ||
| guardians = unnoised.loc[ | ||
| unnoised["simulant_id"].isin(unnoised["guardian_1"]) | ||
| | unnoised["simulant_id"].isin(unnoised["guardian_2"]) | ||
| ] | ||
| simulant_ids = unnoised["simulant_id"].values | ||
|
|
||
| for i in duplicated.index: | ||
| dependent = duplicated.loc[i] | ||
|
|
||
| for column in GUARDIAN_DUPLICATION_ADDRESS_COLUMNS: | ||
| guardian_1 = dependent["guardian_1"] | ||
| guardian_2 = dependent["guardian_2"] | ||
|
|
||
| if guardian_2 is np.nan: | ||
| guardians_values = [ | ||
| guardians.loc[guardians["simulant_id"] == guardian_1, column].values[0] | ||
| ] | ||
| else: # dependent has both guardians | ||
| guardians_values = [] | ||
| for guardian in [guardian_1, guardian_2]: | ||
| if ( | ||
| guardian in simulant_ids | ||
| ): # duplicates will not have addresses copied from guardians not in data | ||
| guardians_values += [ | ||
| guardians.loc[ | ||
| guardians["simulant_id"] == guardian, column | ||
| ].values[0] | ||
| ] | ||
|
|
||
| assert dependent[column] in guardians_values | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -182,7 +182,8 @@ def get_single_noise_type_config( | |
| if isinstance(probability, list): | ||
| new_probability = [0.0 for x in probability] | ||
| elif isinstance(probability, dict): | ||
| new_probability = {key: 0.0 for key in probability.keys()} | ||
| # NOTE: this will fail default config validations | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you explain this? Why would the key be an integer? I don't really understand your note, either.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This structure is for "possible age differences" where the keys can be -2 or 1 to indicate what int to add to a simulant's actual age and the value is the probability of picking each age difference. |
||
| new_probability = {0: 1.0} | ||
| else: | ||
| new_probability = 0.0 | ||
| config_dict[dataset_name][Keys.COLUMN_NOISE][col][noise_type][ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.