Skip to content

Commit 086af43

Browse files
author
Hussain Jafari
committed
pseudocode
1 parent e12237c commit 086af43

File tree

2 files changed

+95
-5
lines changed

2 files changed

+95
-5
lines changed

tests/conftest.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,6 @@
1717
from loguru import logger
1818
from vivarium_testing_utils import FuzzyChecker
1919

20-
from pseudopeople.configuration import Keys, get_configuration
21-
from pseudopeople.noise_entities import NOISE_TYPES
22-
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
23-
from tests.integration.conftest import CELL_PROBABILITY
24-
2520

2621
def pytest_addoption(parser: argparsing.Parser) -> None:
2722
parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")

tests/integration/release/test_release.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,101 @@
3232
)
3333

3434

35+
def get_high_config() -> dict[str, Any]:
    """Return a custom configuration dict (for noising) with elevated probabilities.

    Starts from the default configuration and raises every row-noise
    probability and every column cell probability to ``HIGH_PROBABILITY``.
    The ``duplicate_with_guardian`` row-noise type is deliberately excluded
    from the rebuilt row-noise mapping.

    Returns
    -------
        A nested configuration dict keyed by dataset name, suitable for
        passing to the noising functions.
    """
    HIGH_PROBABILITY = 0.03
    config = get_configuration().to_dict()  # default config

    # Increase row noise probabilities and column cell_probabilities to 3%
    for dataset_name in config:
        dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
        # NOTE(review): replacing the whole ROW_NOISE mapping discards any
        # existing duplicate_with_guardian settings — presumably intentional
        # since that noise type is filtered out below; confirm.
        # Use dataset_name consistently as the key (the original mixed
        # dataset_schema.name and dataset_name, which refer to the same entry).
        config[dataset_name][Keys.ROW_NOISE] = {
            noise_type.name: {
                Keys.ROW_PROBABILITY: HIGH_PROBABILITY,
            }
            for noise_type in dataset_schema.row_noise_types
            if noise_type != NOISE_TYPES.duplicate_with_guardian
        }
        for col in [c for c in dataset_schema.columns if c.noise_types]:
            config[dataset_name][Keys.COLUMN_NOISE][col.name] = {
                noise_type.name: {
                    Keys.CELL_PROBABILITY: HIGH_PROBABILITY,
                }
                for noise_type in col.noise_types
            }

    return config
59+
60+
61+
def test_release(
    dataset_params: tuple[
        str,
        Callable[..., pd.DataFrame],
        str | None,
        int | None,
        str | None,
        Literal["pandas", "dask"],
        # Fix: the original annotation listed six elements but the body
        # unpacks seven (noising_level was missing from the tuple type).
        str,
    ],
) -> None:
    """End-to-end release test: noise a full unnoised dataset and verify it.

    Generates an unnoised dataset, replays the standard noising pipeline
    (clean/reformat, then apply every noise type), checks missingness
    bookkeeping after each noise type, and finally compares the fully
    noised, post-processed data against a fresh unnoised copy.
    """
    # create unnoised dataset
    dataset_name, dataset_func, source, year, state, engine, noising_level = dataset_params
    unnoised_data_kwargs = {
        "source": source,
        "config": NO_NOISE,
        "year": year,
        "engine": engine,
    }
    # SSA generation does not accept a state filter.
    if dataset_func != generate_social_security:
        unnoised_data_kwargs["state"] = state
    unnoised_data = dataset_func(**unnoised_data_kwargs)

    # In our standard noising process, i.e. when noising a shard of data, we
    # 1) clean and reformat the data, 2) noise the data, and 3) do some post-processing.
    # We're replicating steps 1 and 2 in this test and skipping 3.
    dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
    dataset = Dataset(dataset_schema, unnoised_data, SEED)
    # don't unnecessarily keep in memory
    del unnoised_data
    dataset._clean_input_data()
    # convert datetime columns to datetime types for _reformat_dates_for_noising
    # because the post-processing that occurred in generating the unnoised data
    # in step 3 mentioned above converts these columns to object dtypes
    for col in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]:
        if col in dataset.data:
            dataset.data[col] = pd.to_datetime(dataset.data[col])
            dataset.data["copy_" + col] = pd.to_datetime(dataset.data["copy_" + col])
    dataset._reformat_dates_for_noising()

    # Use double quotes for string literals, consistent with the rest of the file.
    if noising_level == "default":
        config = get_configuration()
    else:
        config = get_high_config()

    for noise_type in NOISE_TYPES:
        original_data = dataset.data.copy()
        noise_type(dataset, config)
        run_noising_test(noise_type, original_data, dataset.data, OTHER_PARAMS)
        with check:
            # TODO: possible to replace missingness with smaller data structure?
            assert dataset.missingness.equals(dataset.is_missing(dataset.data))

    del original_data

    # dataset.data is now completely noised data
    dataset.post_process_data()
    unnoised_data = dataset_func(**unnoised_data_kwargs)

    run_final_tests(unnoised_data, dataset.data)
119+
120+
121+
def run_noising_test(noise_type, original_data, noised_data, other_params):
    """Dispatch the appropriate verification for a single applied noise type.

    Fix: the original definition took no parameters, but the call site invokes
    it as ``run_noising_test(noise_type, original_data, dataset.data,
    OTHER_PARAMS)`` — the signature now matches the caller.

    NOTE(review): this is pseudocode (per the commit message) —
    ``row_noise``, ``test_functions_dict``, and ``run_column_tests`` are not
    defined anywhere visible; confirm against the final implementation.
    ``other_params`` is accepted but not yet used.
    """
    if noise_type is row_noise:
        # Row-noise types have a dedicated comparison function.
        test_function = test_functions_dict[noise_type]
        test_function(original_data, noised_data)
    else:  # is column noise
        run_column_tests(noise_type)
127+
128+
129+
35130
def test_column_noising(
36131
unnoised_dataset: Dataset,
37132
noised_data: pd.DataFrame,

0 commit comments

Comments
 (0)