Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion src/pseudopeople/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_noised_data(
self._reformat_dates_for_noising()
self._noise_dataset(configuration, noise_types, progress_bar=progress_bar)
self.data = coerce_dtypes(self.data, self.dataset_schema)
self.data = self.data[[c.name for c in self.dataset_schema.columns]]
self.data = Dataset.drop_non_schema_columns(self.data, self.dataset_schema)
return self.data

def _noise_dataset(
Expand Down Expand Up @@ -164,6 +164,25 @@ def _reformat_dates_for_noising(self) -> None:

self.data = data

@staticmethod
def drop_non_schema_columns(
    data: pd.DataFrame, dataset_schema: DatasetSchema
) -> pd.DataFrame:
    """Restrict a dataframe to the columns defined by the dataset schema.

    Parameters
    ----------
    data
        The pd.DataFrame to update.
    dataset_schema
        A DatasetSchema whose columns attribute lists the columns of interest.

    Returns
    -------
    A pd.DataFrame containing only the schema columns, in schema order.
    """
    schema_column_names = [column.name for column in dataset_schema.columns]
    return data[schema_column_names]

@staticmethod
def is_missing(data: pd.DataFrame) -> pd.DataFrame:
"""Returns a boolean dataframe with the same columns, index, and shape of
Expand Down
67 changes: 34 additions & 33 deletions src/pseudopeople/noise_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,38 @@ def apply_do_not_respond(
# return dataset


# Helper function to format a group dataframe by merging dependents with their guardians
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole function was copied without changes

def merge_dependents_and_guardians(
    dependents_df: pd.DataFrame, full_data: pd.DataFrame
) -> pd.DataFrame:
    """Merge dependents with both of their guardians' address information.

    We merge once per guardian column so we can check whether either guardian
    lives at a separate location from the dependent; guardian columns are
    prefixed (``guardian_1_`` / ``guardian_2_``) to keep them distinct.
    """
    merged = dependents_df
    for guardian_col in ("guardian_1", "guardian_2"):
        # Rows of full_data belonging to simulants who act as this guardian,
        # restricted to address columns (plus the id needed for the join).
        guardian_rows = full_data.loc[
            full_data["simulant_id"].isin(full_data[guardian_col]),
            GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
        ].add_prefix(f"{guardian_col}_")
        merged = merged.merge(
            guardian_rows,
            how="left",
            left_on=[guardian_col, "year"],
            right_on=[f"{guardian_col}_simulant_id", f"{guardian_col}_year"],
        )
        # Free the intermediate frame before building the next one.
        del guardian_rows

    return merged


def duplicate_with_guardian(
dataset: Dataset,
configuration: NoiseConfiguration,
Expand All @@ -109,37 +141,6 @@ def duplicate_with_guardian(
:param to_noise_index: pd.Index of rows to be noised
"""

# Helper function to format group dataframe and merging with their dependents
def _merge_dependents_and_guardians(
dependents_df: pd.DataFrame, full_data: pd.DataFrame
) -> pd.DataFrame:
# Merge dependents with their guardians. We have to merge twice to check
# if either guardian is living at a separate location from the dependent.
guardian_1s = full_data.loc[
full_data["simulant_id"].isin(full_data["guardian_1"]),
GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
].add_prefix("guardian_1_")
dependents_and_guardians_df = dependents_df.merge(
guardian_1s,
how="left",
left_on=["guardian_1", "year"],
right_on=["guardian_1_simulant_id", "guardian_1_year"],
)
del guardian_1s
guardian_2s = full_data.loc[
full_data["simulant_id"].isin(full_data["guardian_2"]),
GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
].add_prefix("guardian_2_")
dependents_and_guardians_df = dependents_and_guardians_df.merge(
guardian_2s,
how="left",
left_on=["guardian_2", "year"],
right_on=["guardian_2_simulant_id", "guardian_2_year"],
)
del guardian_2s

return dependents_and_guardians_df

# Get dict of group type and formatted dataframe for that group that should be noised
formatted_group_data = {}
# Get dataframe for each dependent group to merge with guardians
Expand All @@ -157,10 +158,10 @@ def _merge_dependents_and_guardians(
# Merge dependents with their guardians
formatted_group_data[
Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18
] = _merge_dependents_and_guardians(in_households_under_18, dataset.data)
] = merge_dependents_and_guardians(in_households_under_18, dataset.data)
formatted_group_data[
Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24
] = _merge_dependents_and_guardians(in_college_under_24, dataset.data)
] = merge_dependents_and_guardians(in_college_under_24, dataset.data)
# Note: We have two dicts (configuration and formatted_group_data) at this point that have
# the key for the group and then a dataframe for that group or the group and the configured
# noise level
Expand Down
153 changes: 152 additions & 1 deletion tests/integration/release/test_release.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
from __future__ import annotations

from collections.abc import Callable
from typing import Any

import numpy as np
import pandas as pd
import pytest
from _pytest.fixtures import FixtureRequest
from layered_config_tree import LayeredConfigTree
from pytest_mock import MockerFixture
from vivarium_testing_utils import FuzzyChecker

from pseudopeople.configuration import Keys
from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.configuration.entities import NO_NOISE
from pseudopeople.configuration.noise_configuration import NoiseConfiguration
from pseudopeople.constants.metadata import DatasetNames
from pseudopeople.constants.noise_type_metadata import (
GUARDIAN_DUPLICATION_ADDRESS_COLUMNS,
)
from pseudopeople.dataset import Dataset
from pseudopeople.interface import generate_decennial_census
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.noise_functions import merge_dependents_and_guardians
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.integration.conftest import SEED, _get_common_datasets
from tests.utilities import (
Expand Down Expand Up @@ -166,3 +175,145 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
.all()
.all()
)


@pytest.mark.parametrize(
    "probabilities",
    [
        {
            "row_probability_in_households_under_18": 1.0,
            "row_probability_in_college_group_quarters_under_24": 1.0,
        },
        {
            "row_probability_in_households_under_18": 0.7,
            "row_probability_in_college_group_quarters_under_24": 0.8,
        },
    ],
)
def test_guardian_duplication(
    dataset_params: tuple[str | int | Callable[..., pd.DataFrame] | None, ...],
    dataset_name: str,
    probabilities: dict[str, float],
    fuzzy_checker: FuzzyChecker,
    mocker: MockerFixture,
) -> None:
    """End-to-end check of the duplicate_with_guardian noise type on the census.

    Verifies that (1) the proportion of duplicated dependents matches the
    configured row probabilities, (2) no dependent is duplicated more than
    once, and (3) duplicated rows copy address information from a guardian.
    """
    if dataset_name != DatasetNames.CENSUS:
        # skip (rather than silently pass) so reports show this wasn't exercised
        pytest.skip("duplicate_with_guardian only applies to the decennial census")

    # patch these to avoid updating dtypes and dropping columns we need for testing
    mocker.patch("pseudopeople.dataset.coerce_dtypes", side_effect=lambda df, _: df)
    mocker.patch(
        "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df
    )
    # allow all irrelevant probabilities to be 0 in our config
    mocker.patch(
        "pseudopeople.configuration.generator.validate_overrides",
        side_effect=lambda *args: None,
    )
    # allow our noise levels to be high in testing
    mocker.patch(
        "pseudopeople.configuration.generator.validate_noise_level_proportions",
        lambda *args: None,
    )

    # get unnoised data
    _, _, source, year, state, engine = dataset_params
    unnoised = generate_decennial_census(
        source=source, config=NO_NOISE, year=year, state=state, engine=engine
    )

    # get noised data using custom config
    config_dict = get_single_noise_type_config(
        dataset_name, NOISE_TYPES.duplicate_with_guardian.name
    )
    for probability_key in [
        Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24,
        Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18,
    ]:
        config_dict[dataset_name][Keys.ROW_NOISE][NOISE_TYPES.duplicate_with_guardian.name][
            probability_key
        ] = probabilities[probability_key]
    config = NoiseConfiguration(LayeredConfigTree(config_dict))
    noised = generate_decennial_census(
        source=source, config=config.to_dict(), year=year, state=state, engine=engine
    )

    # the second occurrence of each simulant_id is the duplicated row;
    # copy so the astype assignment below doesn't warn about a chained view
    duplicated = noised.loc[noised["simulant_id"].duplicated()].copy()
    duplicated["age"] = duplicated["age"].astype(int)

    # add old housing type data to duplicated simulants
    old_housing_data = unnoised[["simulant_id", "housing_type"]].rename(
        {"housing_type": "unnoised_housing_type"}, axis=1
    )
    duplicated = duplicated.merge(old_housing_data)

    # separate tests for household under 18 and for college under 24
    for probability_name, age, housing_type in zip(
        [
            Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18,
            Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24,
        ],
        [18, 24],
        ["Household", "College"],
    ):
        # guardian_1 is set whenever a simulant has any guardian; guardian_2 may be NaN
        group_data = unnoised.loc[
            (unnoised["age"].astype(int) < age)
            & (unnoised["housing_type"] == housing_type)
            & (unnoised["guardian_1"].notna())
        ]
        merged_data = merge_dependents_and_guardians(group_data, unnoised)
        # eligible dependents have at least one guardian in a different household
        sims_eligible_for_duplication = merged_data.index[
            (
                (merged_data["household_id"] != merged_data["guardian_1_household_id"])
                & (merged_data["guardian_1_household_id"].notna())
            )
            | (
                (merged_data["household_id"] != merged_data["guardian_2_household_id"])
                & (merged_data["guardian_2_household_id"].notna())
            )
        ]
        duplicated_in_group = duplicated.loc[
            (duplicated["age"] < age) & (duplicated["unnoised_housing_type"] == housing_type)
        ]

        fuzzy_checker.fuzzy_assert_proportion(
            name="test_duplicate_guardian",
            observed_numerator=len(duplicated_in_group),
            observed_denominator=len(sims_eligible_for_duplication),
            target_proportion=probabilities[probability_name],
            name_additional="noised_data",  # plain string; was an f-string with no placeholders
        )
    # Only duplicate a dependent one time
    assert noised["simulant_id"].value_counts().max() == 2

    # Check address information is copied in new rows
    guardians = unnoised.loc[
        unnoised["simulant_id"].isin(unnoised["guardian_1"])
        | unnoised["simulant_id"].isin(unnoised["guardian_2"])
    ]
    simulant_ids = unnoised["simulant_id"].values

    for i in duplicated.index:
        dependent = duplicated.loc[i]

        for column in GUARDIAN_DUPLICATION_ADDRESS_COLUMNS:
            guardian_1 = dependent["guardian_1"]
            guardian_2 = dependent["guardian_2"]

            # pd.isna handles any missing marker; `is np.nan` relies on object identity
            if pd.isna(guardian_2):
                guardians_values = [
                    guardians.loc[guardians["simulant_id"] == guardian_1, column].values[0]
                ]
            else:  # dependent has both guardians
                guardians_values = []
                for guardian in [guardian_1, guardian_2]:
                    if (
                        guardian in simulant_ids
                    ):  # duplicates will not have addresses copied from guardians not in data
                        guardians_values += [
                            guardians.loc[
                                guardians["simulant_id"] == guardian, column
                            ].values[0]
                        ]

            assert dependent[column] in guardians_values
3 changes: 2 additions & 1 deletion tests/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ def get_single_noise_type_config(
if isinstance(probability, list):
new_probability = [0.0 for x in probability]
elif isinstance(probability, dict):
new_probability = {key: 0.0 for key in probability.keys()}
# NOTE: this will fail default config validations
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain this? Why would the key be an integer? I don't really understand your note, either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This structure is for "possible age differences" where the keys can be -2 or 1 to indicate what int to add to a simulant's actual age and the value is the probability of picking each age difference.

new_probability = {0: 1.0}
else:
new_probability = 0.0
config_dict[dataset_name][Keys.COLUMN_NOISE][col][noise_type][
Expand Down