
Commit d3faa25

Hjafari/feature/mic 5517 guardian test (#498)
Category: test
JIRA issue: MIC-5517

Expand the duplicate guardian test to full scale.

Testing: ran the test on the census dataset with RI and US data.

1 parent b1952c3 · commit d3faa25

File tree

4 files changed: +208 -36 lines changed


src/pseudopeople/dataset.py

Lines changed: 20 additions & 1 deletion
@@ -70,7 +70,7 @@ def get_noised_data(
         self._reformat_dates_for_noising()
         self._noise_dataset(configuration, noise_types, progress_bar=progress_bar)
         self.data = coerce_dtypes(self.data, self.dataset_schema)
-        self.data = self.data[[c.name for c in self.dataset_schema.columns]]
+        self.data = Dataset.drop_non_schema_columns(self.data, self.dataset_schema)
         return self.data

     def _noise_dataset(

@@ -164,6 +164,25 @@ def _reformat_dates_for_noising(self) -> None:

         self.data = data

+    @staticmethod
+    def drop_non_schema_columns(
+        data: pd.DataFrame, dataset_schema: DatasetSchema
+    ) -> pd.DataFrame:
+        """Returns data with only the columns in the dataset schema.
+
+        Parameters
+        ----------
+        data
+            The pd.DataFrame to update.
+        dataset_schema
+            A DatasetSchema which contains the columns of interest in its columns attribute.
+
+        Returns
+        -------
+        A pd.DataFrame with only the columns in the dataset schema.
+        """
+        return data[[c.name for c in dataset_schema.columns]]
+
     @staticmethod
     def is_missing(data: pd.DataFrame) -> pd.DataFrame:
         """Returns a boolean dataframe with the same columns, index, and shape of

src/pseudopeople/noise_functions.py

Lines changed: 34 additions & 33 deletions
@@ -93,6 +93,38 @@ def apply_do_not_respond(
     # return dataset


+# Helper function to format a group dataframe and merge dependents with their guardians
+def merge_dependents_and_guardians(
+    dependents_df: pd.DataFrame, full_data: pd.DataFrame
+) -> pd.DataFrame:
+    # Merge dependents with their guardians. We have to merge twice to check
+    # if either guardian is living at a separate location from the dependent.
+    guardian_1s = full_data.loc[
+        full_data["simulant_id"].isin(full_data["guardian_1"]),
+        GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
+    ].add_prefix("guardian_1_")
+    dependents_and_guardians_df = dependents_df.merge(
+        guardian_1s,
+        how="left",
+        left_on=["guardian_1", "year"],
+        right_on=["guardian_1_simulant_id", "guardian_1_year"],
+    )
+    del guardian_1s
+    guardian_2s = full_data.loc[
+        full_data["simulant_id"].isin(full_data["guardian_2"]),
+        GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
+    ].add_prefix("guardian_2_")
+    dependents_and_guardians_df = dependents_and_guardians_df.merge(
+        guardian_2s,
+        how="left",
+        left_on=["guardian_2", "year"],
+        right_on=["guardian_2_simulant_id", "guardian_2_year"],
+    )
+    del guardian_2s
+
+    return dependents_and_guardians_df
+
+
 def duplicate_with_guardian(
     dataset: Dataset,
     configuration: NoiseConfiguration,

@@ -109,37 +141,6 @@ def duplicate_with_guardian(
     :param to_noise_index: pd.Index of rows to be noised
     """

-    # Helper function to format group dataframe and merging with their dependents
-    def _merge_dependents_and_guardians(
-        dependents_df: pd.DataFrame, full_data: pd.DataFrame
-    ) -> pd.DataFrame:
-        # Merge dependents with their guardians. We have to merge twice to check
-        # if either guardian is living at a separate location from the dependent.
-        guardian_1s = full_data.loc[
-            full_data["simulant_id"].isin(full_data["guardian_1"]),
-            GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
-        ].add_prefix("guardian_1_")
-        dependents_and_guardians_df = dependents_df.merge(
-            guardian_1s,
-            how="left",
-            left_on=["guardian_1", "year"],
-            right_on=["guardian_1_simulant_id", "guardian_1_year"],
-        )
-        del guardian_1s
-        guardian_2s = full_data.loc[
-            full_data["simulant_id"].isin(full_data["guardian_2"]),
-            GUARDIAN_DUPLICATION_ADDRESS_COLUMNS + ["simulant_id"],
-        ].add_prefix("guardian_2_")
-        dependents_and_guardians_df = dependents_and_guardians_df.merge(
-            guardian_2s,
-            how="left",
-            left_on=["guardian_2", "year"],
-            right_on=["guardian_2_simulant_id", "guardian_2_year"],
-        )
-        del guardian_2s
-
-        return dependents_and_guardians_df
-
     # Get dict of group type and formatted dataframe for that group that should be noised
     formatted_group_data = {}
     # Get dataframe for each dependent group to merge with guardians

@@ -157,10 +158,10 @@ def _merge_dependents_and_guardians(
     # Merge dependents with their guardians
     formatted_group_data[
         Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18
-    ] = _merge_dependents_and_guardians(in_households_under_18, dataset.data)
+    ] = merge_dependents_and_guardians(in_households_under_18, dataset.data)
     formatted_group_data[
         Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24
-    ] = _merge_dependents_and_guardians(in_college_under_24, dataset.data)
+    ] = merge_dependents_and_guardians(in_college_under_24, dataset.data)
     # Note: We have two dicts (configuration and formatted_group_data) at this point that have
     # the key for the group and then a dataframe for that group or the group and the configured
     # noise level
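
A minimal sketch of the double merge the promoted helper performs, on toy data. ADDRESS_COLUMNS below stands in for GUARDIAN_DUPLICATION_ADDRESS_COLUMNS and is assumed to include "year", since the merge keys on guardian_1_year and guardian_2_year; only the guardian_1 merge is shown:

import pandas as pd

# Stand-in for GUARDIAN_DUPLICATION_ADDRESS_COLUMNS (assumed to include "year").
ADDRESS_COLUMNS = ["household_id", "year"]

full_data = pd.DataFrame(
    {
        "simulant_id": ["dep_1", "g_1"],
        "guardian_1": ["g_1", None],
        "guardian_2": [None, None],
        "household_id": ["hh_A", "hh_B"],
        "year": [2020, 2020],
    }
)
dependents_df = full_data[full_data["guardian_1"].notna()]

# Attach guardian_1's address columns to each dependent, as the helper does.
guardian_1s = full_data.loc[
    full_data["simulant_id"].isin(full_data["guardian_1"]),
    ADDRESS_COLUMNS + ["simulant_id"],
].add_prefix("guardian_1_")
merged = dependents_df.merge(
    guardian_1s,
    how="left",
    left_on=["guardian_1", "year"],
    right_on=["guardian_1_simulant_id", "guardian_1_year"],
)

# dep_1 lives in hh_A while guardian g_1 lives in hh_B, so comparing household_id
# against guardian_1_household_id flags this dependent as duplication-eligible.
print(merged[["simulant_id", "household_id", "guardian_1_household_id"]])

Promoting the helper from a closure inside duplicate_with_guardian to module scope changes no behavior; it simply makes the merge importable, which the new integration test uses to compute its denominator of duplication-eligible dependents.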

tests/integration/release/test_release.py

Lines changed: 152 additions & 1 deletion
@@ -1,18 +1,27 @@
 from __future__ import annotations

+from collections.abc import Callable
 from typing import Any

 import numpy as np
 import pandas as pd
 import pytest
 from _pytest.fixtures import FixtureRequest
 from layered_config_tree import LayeredConfigTree
+from pytest_mock import MockerFixture
 from vivarium_testing_utils import FuzzyChecker

-from pseudopeople.configuration import Keys
+from pseudopeople.configuration import Keys, get_configuration
+from pseudopeople.configuration.entities import NO_NOISE
 from pseudopeople.configuration.noise_configuration import NoiseConfiguration
+from pseudopeople.constants.metadata import DatasetNames
+from pseudopeople.constants.noise_type_metadata import (
+    GUARDIAN_DUPLICATION_ADDRESS_COLUMNS,
+)
 from pseudopeople.dataset import Dataset
+from pseudopeople.interface import generate_decennial_census
 from pseudopeople.noise_entities import NOISE_TYPES
+from pseudopeople.noise_functions import merge_dependents_and_guardians
 from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
 from tests.integration.conftest import SEED, _get_common_datasets
 from tests.utilities import (

@@ -166,3 +175,145 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
         .all()
         .all()
     )
+
+
+@pytest.mark.parametrize(
+    "probabilities",
+    [
+        {
+            "row_probability_in_households_under_18": 1.0,
+            "row_probability_in_college_group_quarters_under_24": 1.0,
+        },
+        {
+            "row_probability_in_households_under_18": 0.7,
+            "row_probability_in_college_group_quarters_under_24": 0.8,
+        },
+    ],
+)
+def test_guardian_duplication(
+    dataset_params: tuple[str | int | Callable[..., pd.DataFrame] | None, ...],
+    dataset_name: str,
+    probabilities: dict[str, float],
+    fuzzy_checker: FuzzyChecker,
+    mocker: MockerFixture,
+) -> None:
+    if dataset_name != DatasetNames.CENSUS:
+        return
+
+    # Patch these to avoid updating dtypes and dropping columns we need for testing
+    mocker.patch("pseudopeople.dataset.coerce_dtypes", side_effect=lambda df, _: df)
+    mocker.patch(
+        "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df
+    )
+    # Allow all irrelevant probabilities to be 0 in our config
+    mocker.patch(
+        "pseudopeople.configuration.generator.validate_overrides",
+        side_effect=lambda *args: None,
+    )
+    # Allow our noise levels to be high in testing
+    mocker.patch(
+        "pseudopeople.configuration.generator.validate_noise_level_proportions",
+        lambda *args: None,
+    )
+
+    # Get unnoised data
+    _, _, source, year, state, engine = dataset_params
+    unnoised = generate_decennial_census(
+        source=source, config=NO_NOISE, year=year, state=state, engine=engine
+    )
+
+    # Get noised data using a custom config
+    config_dict = get_single_noise_type_config(
+        dataset_name, NOISE_TYPES.duplicate_with_guardian.name
+    )
+    for probability_key in [
+        Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24,
+        Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18,
+    ]:
+        config_dict[dataset_name][Keys.ROW_NOISE][NOISE_TYPES.duplicate_with_guardian.name][
+            probability_key
+        ] = probabilities[probability_key]
+    config = NoiseConfiguration(LayeredConfigTree(config_dict))
+    noised = generate_decennial_census(
+        source=source, config=config.to_dict(), year=year, state=state, engine=engine
+    )
+
+    duplicated = noised.loc[noised["simulant_id"].duplicated()]
+    duplicated["age"] = duplicated["age"].astype(int)
+
+    # Add old housing type data to duplicated simulants
+    old_housing_data = unnoised[["simulant_id", "housing_type"]].rename(
+        {"housing_type": "unnoised_housing_type"}, axis=1
+    )
+    duplicated = duplicated.merge(old_housing_data)
+
+    # Separate tests for households under 18 and for college under 24
+    for probability_name, age, housing_type in zip(
+        [
+            Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18,
+            Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24,
+        ],
+        [18, 24],
+        ["Household", "College"],
+    ):
+        group_data = unnoised.loc[
+            (unnoised["age"].astype(int) < age)
+            & (unnoised["housing_type"] == housing_type)
+            & (unnoised["guardian_1"].notna())
+        ]
+        merged_data = merge_dependents_and_guardians(group_data, unnoised)
+        sims_eligible_for_duplication = merged_data.index[
+            (
+                (merged_data["household_id"] != merged_data["guardian_1_household_id"])
+                & (merged_data["guardian_1_household_id"].notna())
+            )
+            | (
+                (merged_data["household_id"] != merged_data["guardian_2_household_id"])
+                & (merged_data["guardian_2_household_id"].notna())
+            )
+        ]
+        duplicated_in_group = duplicated.loc[
+            (duplicated["age"] < age) & (duplicated["unnoised_housing_type"] == housing_type)
+        ]
+
+        fuzzy_checker.fuzzy_assert_proportion(
+            name="test_duplicate_guardian",
+            observed_numerator=len(duplicated_in_group),
+            observed_denominator=len(sims_eligible_for_duplication),
+            target_proportion=probabilities[probability_name],
+            name_additional="noised_data",
+        )
+    # Only duplicate a dependent one time
+    assert noised["simulant_id"].value_counts().max() == 2
+
+    # Check address information is copied in new rows
+    guardians = unnoised.loc[
+        unnoised["simulant_id"].isin(unnoised["guardian_1"])
+        | unnoised["simulant_id"].isin(unnoised["guardian_2"])
+    ]
+    simulant_ids = unnoised["simulant_id"].values
+
+    for i in duplicated.index:
+        dependent = duplicated.loc[i]
+
+        for column in GUARDIAN_DUPLICATION_ADDRESS_COLUMNS:
+            guardian_1 = dependent["guardian_1"]
+            guardian_2 = dependent["guardian_2"]
+
+            if guardian_2 is np.nan:
+                guardians_values = [
+                    guardians.loc[guardians["simulant_id"] == guardian_1, column].values[0]
+                ]
+            else:  # dependent has both guardians
+                guardians_values = []
+                for guardian in [guardian_1, guardian_2]:
+                    if (
+                        guardian in simulant_ids
+                    ):  # duplicates will not have addresses copied from guardians not in data
+                        guardians_values += [
+                            guardians.loc[
+                                guardians["simulant_id"] == guardian, column
+                            ].values[0]
+                        ]
+
+            assert dependent[column] in guardians_values
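
The heart of the new test is the fuzzy proportion assertion: among dependents eligible for duplication, roughly target_proportion should appear twice in the noised data. FuzzyChecker's actual statistical machinery lives in vivarium_testing_utils; the sketch below only illustrates the idea with a simple z-score tolerance and made-up counts:

import math


def fuzzy_proportion_ok(
    numerator: int, denominator: int, target: float, z: float = 3.0
) -> bool:
    # Accept the observed rate if it lies within z binomial standard errors
    # of the target rate. A rough stand-in for fuzzy_assert_proportion.
    observed = numerator / denominator
    standard_error = math.sqrt(target * (1 - target) / denominator)
    return abs(observed - target) <= z * standard_error


# e.g. 712 duplicated out of 1000 eligible dependents against a 0.7 target:
print(fuzzy_proportion_ok(712, 1000, 0.7))  # True -- within sampling noise
print(fuzzy_proportion_ok(600, 1000, 0.7))  # False -- too far below target

Under the 1.0 parametrization every eligible dependent should be duplicated, so the observed proportion is expected to hit the target exactly.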

tests/utilities.py

Lines changed: 2 additions & 1 deletion
@@ -182,7 +182,8 @@ def get_single_noise_type_config(
         if isinstance(probability, list):
             new_probability = [0.0 for x in probability]
         elif isinstance(probability, dict):
-            new_probability = {key: 0.0 for key in probability.keys()}
+            # NOTE: this will fail default config validations
+            new_probability = {0: 1.0}
         else:
             new_probability = 0.0
         config_dict[dataset_name][Keys.COLUMN_NOISE][col][noise_type][
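
For context on the NOTE added above: get_single_noise_type_config builds a config in which only the named noise type is left active, zeroing the rest. A sketch of the dict branch's before-and-after behavior, with illustrative key names:

# Dict-valued probabilities, e.g. duplicate_with_guardian's per-group levels.
probability = {
    "row_probability_in_households_under_18": 0.02,
    "row_probability_in_college_group_quarters_under_24": 0.05,
}

# Before: zero out every group, which the default config validation accepted.
old_probability = {key: 0.0 for key in probability}

# After: a single {0: 1.0} sentinel. As the NOTE warns, this fails the default
# validations, which is why test_guardian_duplication patches validate_overrides
# and validate_noise_level_proportions before building its config.
new_probability = {0: 1.0}
print(old_probability, new_probability)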
