Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion src/pseudopeople/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_noised_data(
self._reformat_dates_for_noising()
self._noise_dataset(configuration, noise_types, progress_bar=progress_bar)
self.data = coerce_dtypes(self.data, self.dataset_schema)
self.data = self.data[[c.name for c in self.dataset_schema.columns]]
self.data = Dataset.drop_non_schema_columns(self.data, self.dataset_schema)
return self.data

def _noise_dataset(
Expand Down Expand Up @@ -164,6 +164,25 @@ def _reformat_dates_for_noising(self) -> None:

self.data = data

@staticmethod
def drop_non_schema_columns(
    data: pd.DataFrame, dataset_schema: DatasetSchema
) -> pd.DataFrame:
    """Restrict a dataframe to the columns defined by the dataset schema.

    Parameters
    ----------
    data
        The pd.DataFrame to update.
    dataset_schema
        A DatasetSchema whose columns attribute lists the columns of interest.

    Returns
    -------
    A pd.DataFrame containing only the schema columns, in schema order.
    """
    schema_column_names = [column.name for column in dataset_schema.columns]
    return data[schema_column_names]

@staticmethod
def is_missing(data: pd.DataFrame) -> pd.DataFrame:
"""Returns a boolean dataframe with the same columns, index, and shape of
Expand Down
67 changes: 34 additions & 33 deletions src/pseudopeople/noise_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,38 @@ def apply_do_not_respond(
# return dataset


# Helper function to format a group dataframe by merging dependents with their guardians
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole function was copied without changes

def merge_dependents_and_guardians(
    dependents_df: pd.DataFrame, full_data: pd.DataFrame
) -> pd.DataFrame:
    """Merge dependents with both of their guardians' address information.

    We merge once per guardian column so we can check whether either guardian
    lives at a separate location from the dependent; guardian columns are
    prefixed (``guardian_1_`` / ``guardian_2_``) to keep them distinct.
    """
    merged = dependents_df
    for guardian_col in ("guardian_1", "guardian_2"):
        # Rows of full_data belonging to simulants who act as this guardian,
        # restricted to address columns (plus the id needed for the join).
        guardian_rows = full_data.loc[
            full_data["simulant_id"].isin(full_data[guardian_col]),
            GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
        ].add_prefix(f"{guardian_col}_")
        merged = merged.merge(
            guardian_rows,
            how="left",
            left_on=[guardian_col, "year"],
            right_on=[f"{guardian_col}_simulant_id", f"{guardian_col}_year"],
        )
        # Free the intermediate frame before building the next one.
        del guardian_rows

    return merged


def duplicate_with_guardian(
dataset: Dataset,
configuration: NoiseConfiguration,
Expand All @@ -109,37 +141,6 @@ def duplicate_with_guardian(
:param to_noise_index: pd.Index of rows to be noised
"""

# Helper function to format group dataframe and merging with their dependents
def _merge_dependents_and_guardians(
dependents_df: pd.DataFrame, full_data: pd.DataFrame
) -> pd.DataFrame:
# Merge dependents with their guardians. We have to merge twice to check
# if either guardian is living at a separate location from the dependent.
guardian_1s = full_data.loc[
full_data["simulant_id"].isin(full_data["guardian_1"]),
GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
].add_prefix("guardian_1_")
dependents_and_guardians_df = dependents_df.merge(
guardian_1s,
how="left",
left_on=["guardian_1", "year"],
right_on=["guardian_1_simulant_id", "guardian_1_year"],
)
del guardian_1s
guardian_2s = full_data.loc[
full_data["simulant_id"].isin(full_data["guardian_2"]),
GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
].add_prefix("guardian_2_")
dependents_and_guardians_df = dependents_and_guardians_df.merge(
guardian_2s,
how="left",
left_on=["guardian_2", "year"],
right_on=["guardian_2_simulant_id", "guardian_2_year"],
)
del guardian_2s

return dependents_and_guardians_df

# Get dict of group type and formatted dataframe for that group that should be noised
formatted_group_data = {}
# Get dataframe for each dependent group to merge with guardians
Expand All @@ -157,10 +158,10 @@ def _merge_dependents_and_guardians(
# Merge dependents with their guardians
formatted_group_data[
Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18
] = _merge_dependents_and_guardians(in_households_under_18, dataset.data)
] = merge_dependents_and_guardians(in_households_under_18, dataset.data)
formatted_group_data[
Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24
] = _merge_dependents_and_guardians(in_college_under_24, dataset.data)
] = merge_dependents_and_guardians(in_college_under_24, dataset.data)
# Note: We have two dicts (configuration and formatted_group_data) at this point that have
# the key for the group and then a dataframe for that group or the group and the configured
# noise level
Expand Down
153 changes: 152 additions & 1 deletion tests/integration/release/test_release.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
from __future__ import annotations

from collections.abc import Callable
from typing import Any

import numpy as np
import pandas as pd
import pytest
from _pytest.fixtures import FixtureRequest
from layered_config_tree import LayeredConfigTree
from pytest_mock import MockerFixture
from vivarium_testing_utils import FuzzyChecker

from pseudopeople.configuration import Keys
from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.configuration.entities import NO_NOISE
from pseudopeople.configuration.noise_configuration import NoiseConfiguration
from pseudopeople.constants.metadata import DatasetNames
from pseudopeople.constants.noise_type_metadata import (
GUARDIAN_DUPLICATION_ADDRESS_COLUMNS,
)
from pseudopeople.dataset import Dataset
from pseudopeople.interface import generate_decennial_census
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.noise_functions import merge_dependents_and_guardians
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.integration.conftest import SEED, _get_common_datasets
from tests.utilities import (
Expand Down Expand Up @@ -166,3 +175,145 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
.all()
.all()
)


@pytest.mark.parametrize(
    "probabilities",
    [
        {
            "row_probability_in_households_under_18": 1.0,
            "row_probability_in_college_group_quarters_under_24": 1.0,
        },
        {
            "row_probability_in_households_under_18": 0.7,
            "row_probability_in_college_group_quarters_under_24": 0.8,
        },
    ],
)
def test_guardian_duplication(
    dataset_params: tuple[str | int | Callable[..., pd.DataFrame] | None, ...],
    dataset_name: str,
    probabilities: dict[str, float],
    fuzzy_checker: FuzzyChecker,
    mocker: MockerFixture,
) -> None:
    """End-to-end check of the duplicate_with_guardian noise type on the census.

    Verifies that (1) the proportion of duplicated dependents matches the
    configured row probabilities, (2) no dependent is duplicated more than
    once, and (3) duplicated rows copy address information from a guardian.
    """
    if dataset_name != DatasetNames.CENSUS:
        # skip (rather than silently pass) so reports show this wasn't exercised
        pytest.skip("duplicate_with_guardian only applies to the decennial census")

    # patch these to avoid updating dtypes and dropping columns we need for testing
    mocker.patch("pseudopeople.dataset.coerce_dtypes", side_effect=lambda df, _: df)
    mocker.patch(
        "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df
    )
    # allow all irrelevant probabilities to be 0 in our config
    mocker.patch(
        "pseudopeople.configuration.generator.validate_overrides",
        side_effect=lambda *args: None,
    )
    # allow our noise levels to be high in testing
    mocker.patch(
        "pseudopeople.configuration.generator.validate_noise_level_proportions",
        lambda *args: None,
    )

    # get unnoised data
    _, _, source, year, state, engine = dataset_params
    unnoised = generate_decennial_census(
        source=source, config=NO_NOISE, year=year, state=state, engine=engine
    )

    # get noised data using custom config
    config_dict = get_single_noise_type_config(
        dataset_name, NOISE_TYPES.duplicate_with_guardian.name
    )
    for probability_key in [
        Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24,
        Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18,
    ]:
        config_dict[dataset_name][Keys.ROW_NOISE][NOISE_TYPES.duplicate_with_guardian.name][
            probability_key
        ] = probabilities[probability_key]
    config = NoiseConfiguration(LayeredConfigTree(config_dict))
    noised = generate_decennial_census(
        source=source, config=config.to_dict(), year=year, state=state, engine=engine
    )

    # the second occurrence of each simulant_id is the duplicated row;
    # copy so the astype assignment below doesn't warn about a chained view
    duplicated = noised.loc[noised["simulant_id"].duplicated()].copy()
    duplicated["age"] = duplicated["age"].astype(int)

    # add old housing type data to duplicated simulants
    old_housing_data = unnoised[["simulant_id", "housing_type"]].rename(
        {"housing_type": "unnoised_housing_type"}, axis=1
    )
    duplicated = duplicated.merge(old_housing_data)

    # separate tests for household under 18 and for college under 24
    for probability_name, age, housing_type in zip(
        [
            Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18,
            Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24,
        ],
        [18, 24],
        ["Household", "College"],
    ):
        # guardian_1 is set whenever a simulant has any guardian; guardian_2 may be NaN
        group_data = unnoised.loc[
            (unnoised["age"].astype(int) < age)
            & (unnoised["housing_type"] == housing_type)
            & (unnoised["guardian_1"].notna())
        ]
        merged_data = merge_dependents_and_guardians(group_data, unnoised)
        # eligible dependents have at least one guardian in a different household
        sims_eligible_for_duplication = merged_data.index[
            (
                (merged_data["household_id"] != merged_data["guardian_1_household_id"])
                & (merged_data["guardian_1_household_id"].notna())
            )
            | (
                (merged_data["household_id"] != merged_data["guardian_2_household_id"])
                & (merged_data["guardian_2_household_id"].notna())
            )
        ]
        duplicated_in_group = duplicated.loc[
            (duplicated["age"] < age) & (duplicated["unnoised_housing_type"] == housing_type)
        ]

        fuzzy_checker.fuzzy_assert_proportion(
            name="test_duplicate_guardian",
            observed_numerator=len(duplicated_in_group),
            observed_denominator=len(sims_eligible_for_duplication),
            target_proportion=probabilities[probability_name],
            name_additional="noised_data",  # plain string; was an f-string with no placeholders
        )
    # Only duplicate a dependent one time
    assert noised["simulant_id"].value_counts().max() == 2

    # Check address information is copied in new rows
    guardians = unnoised.loc[
        unnoised["simulant_id"].isin(unnoised["guardian_1"])
        | unnoised["simulant_id"].isin(unnoised["guardian_2"])
    ]
    simulant_ids = unnoised["simulant_id"].values

    for i in duplicated.index:
        dependent = duplicated.loc[i]

        for column in GUARDIAN_DUPLICATION_ADDRESS_COLUMNS:
            guardian_1 = dependent["guardian_1"]
            guardian_2 = dependent["guardian_2"]

            # pd.isna handles any missing marker; `is np.nan` relies on object identity
            if pd.isna(guardian_2):
                guardians_values = [
                    guardians.loc[guardians["simulant_id"] == guardian_1, column].values[0]
                ]
            else:  # dependent has both guardians
                guardians_values = []
                for guardian in [guardian_1, guardian_2]:
                    if (
                        guardian in simulant_ids
                    ):  # duplicates will not have addresses copied from guardians not in data
                        guardians_values += [
                            guardians.loc[
                                guardians["simulant_id"] == guardian, column
                            ].values[0]
                        ]

            assert dependent[column] in guardians_values
3 changes: 2 additions & 1 deletion tests/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ def get_single_noise_type_config(
if isinstance(probability, list):
new_probability = [0.0 for x in probability]
elif isinstance(probability, dict):
new_probability = {key: 0.0 for key in probability.keys()}
# NOTE: this will fail default config validations
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain this? Why would the key be an integer? I don't really understand your note, either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This structure is for "possible age differences" where the keys can be -2 or 1 to indicate what int to add to a simulant's actual age and the value is the probability of picking each age difference.

new_probability = {0: 1.0}
else:
new_probability = 0.0
config_dict[dataset_name][Keys.COLUMN_NOISE][col][noise_type][
Expand Down