Skip to content

Commit d417357

Browse files
create and mark release sample data tests (#480)
Category: test. JIRA issue: MIC-5623. Adds three integration tests to the release test suite and marks them with the release marker. Testing: tests passed and behaved as expected both with and without the --release flag.
1 parent 931d723 commit d417357

File tree

11 files changed

+512
-252
lines changed

11 files changed

+512
-252
lines changed

tests/conftest.py

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,17 @@
1717
from loguru import logger
1818
from vivarium_testing_utils import FuzzyChecker
1919

20+
from pseudopeople.configuration import Keys, get_configuration
21+
from pseudopeople.noise_entities import NOISE_TYPES
22+
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
23+
from tests.integration.conftest import CELL_PROBABILITY
24+
2025

2126
def pytest_addoption(parser: argparsing.Parser) -> None:
2227
parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")
28+
parser.addoption(
29+
"--release", action="store_true", default=False, help="run release tests"
30+
)
2331
parser.addoption(
2432
"--limit",
2533
action="store",
@@ -38,15 +46,24 @@ def pytest_configure(config: Config) -> None:
3846

3947

4048
def pytest_collection_modifyitems(config: Config, items: list[Function]) -> None:
49+
skip_release = pytest.mark.skip(reason="need --release to run")
50+
if not config.getoption("--release"):
51+
for item in items:
52+
if "release" in item.keywords:
53+
item.add_marker(skip_release)
54+
4155
if config.getoption("--runslow"):
4256
# --runslow given in cli: do not skip slow tests
4357
return
58+
4459
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
4560
for item in items:
4661
# Automatically tag all tests in the tests/integration dir as slow
47-
if item.parent and Path(item.parent.path).parent.stem == "integration":
48-
item.add_marker(pytest.mark.slow)
49-
if "slow" in item.keywords:
62+
test_in_slow_directory = (
63+
item.parent and Path(item.parent.path).parent.stem == "integration"
64+
)
65+
test_is_slow = "slow" in item.keywords
66+
if test_in_slow_directory or test_is_slow:
5067
item.add_marker(skip_slow)
5168

5269
# Limit the number of permutations of parametrised tests to run.
@@ -88,3 +105,50 @@ def fuzzy_checker(output_directory: Path) -> Generator[FuzzyChecker, None, None]
88105
yield checker
89106

90107
checker.save_diagnostic_output(output_directory)
108+
109+
110+
@pytest.fixture(scope="session")
def config() -> dict[str, Any]:
    """Session-scoped noising configuration used by the integration tests.

    Starts from the package's default configuration and bumps every row-noise
    probability to 5% and every column cell probability to ``CELL_PROBABILITY``
    (25%), then applies two deliberate exceptions:

    * guardian-based row duplication is disabled for the census dataset, and
    * the SSA ``ssa_event_type`` column is left completely unnoised,

    so that noised output can still be joined back to the unnoised truth data.
    """
    ROW_PROBABILITY = 0.05
    noising_config = get_configuration().to_dict()  # package default config

    # Raise row noise to 5% and cell noise to 25% for every dataset/column.
    for dataset_name in noising_config:
        dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)

        row_noise_settings = {}
        for noise_type in dataset_schema.row_noise_types:
            # Guardian duplication is configured separately below.
            if noise_type == NOISE_TYPES.duplicate_with_guardian:
                continue
            row_noise_settings[noise_type.name] = {Keys.ROW_PROBABILITY: ROW_PROBABILITY}
        noising_config[dataset_schema.name][Keys.ROW_NOISE] = row_noise_settings

        for column in (c for c in dataset_schema.columns if c.noise_types):
            noising_config[dataset_name][Keys.COLUMN_NOISE][column.name] = {
                noise_type.name: {Keys.CELL_PROBABILITY: CELL_PROBABILITY}
                for noise_type in column.noise_types
            }

    # FIXME: Remove when record_id is added as the truth deck for datasets.
    # For integration tests, we will NOT duplicate rows with guardian duplication.
    # This is because we want to be able to compare the noised and unnoised data
    # and a big assumption we make is that simulant_id and household_id are the
    # truth decks in our datasets.
    guardian_noise_name = NOISE_TYPES.duplicate_with_guardian.name
    noising_config[DATASET_SCHEMAS.census.name][Keys.ROW_NOISE][guardian_noise_name] = {
        Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18: 0.0,
        Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24: 0.0,
    }

    # Update SSA dataset to noise 'ssn' but NOT noise 'ssa_event_type' since that
    # will be used as an identifier along with simulant_id
    # TODO: Noise ssa_event_type when record IDs are implemented (MIC-4039)
    ssa_column_noise = noising_config[DATASET_SCHEMAS.ssa.name][Keys.COLUMN_NOISE]
    ssa_column_noise[COLUMNS.ssa_event_type.name] = {
        noise_type.name: {Keys.CELL_PROBABILITY: 0}
        for noise_type in COLUMNS.ssa_event_type.noise_types
    }
    return noising_config

tests/constants.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import Callable
4+
from functools import partial
5+
from typing import Any
6+
7+
import pandas as pd
8+
9+
from pseudopeople.interface import (
10+
generate_american_community_survey,
11+
generate_current_population_survey,
12+
generate_decennial_census,
13+
generate_social_security,
14+
generate_taxes_1040,
15+
generate_taxes_w2_and_1099,
16+
generate_women_infants_and_children,
17+
)
18+
from pseudopeople.noise_entities import NOISE_TYPES
19+
from pseudopeople.schema_entities import DATASET_SCHEMAS
20+
from pseudopeople.utilities import (
21+
count_number_of_tokens_per_string,
22+
load_ocr_errors,
23+
load_phonetic_errors,
24+
load_qwerty_errors_data,
25+
)
26+
27+
CELL_PROBABILITY = 0.25
28+
29+
DATASET_GENERATION_FUNCS: dict[str, Callable[..., Any]] = {
30+
DATASET_SCHEMAS.census.name: generate_decennial_census,
31+
DATASET_SCHEMAS.acs.name: generate_american_community_survey,
32+
DATASET_SCHEMAS.cps.name: generate_current_population_survey,
33+
DATASET_SCHEMAS.ssa.name: generate_social_security,
34+
DATASET_SCHEMAS.tax_w2_1099.name: generate_taxes_w2_and_1099,
35+
DATASET_SCHEMAS.wic.name: generate_women_infants_and_children,
36+
DATASET_SCHEMAS.tax_1040.name: generate_taxes_1040,
37+
}
38+
39+
TOKENS_PER_STRING_MAPPER: dict[str, Callable[..., pd.Series[int]]] = {
40+
NOISE_TYPES.make_ocr_errors.name: partial(
41+
count_number_of_tokens_per_string, pd.Series(load_ocr_errors().index)
42+
),
43+
NOISE_TYPES.make_phonetic_errors.name: partial(
44+
count_number_of_tokens_per_string,
45+
pd.Series(load_phonetic_errors().index),
46+
),
47+
NOISE_TYPES.write_wrong_digits.name: lambda x: x.astype(str)
48+
.str.replace(r"[^\d]", "", regex=True)
49+
.str.len(),
50+
NOISE_TYPES.make_typos.name: partial(
51+
count_number_of_tokens_per_string, pd.Series(load_qwerty_errors_data().index)
52+
),
53+
}

tests/integration/conftest.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pseudopeople.noise_entities import NOISE_TYPES
2424
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
2525
from pseudopeople.utilities import coerce_dtypes
26+
from tests.utilities import initialize_dataset_with_sample
2627

2728
ROW_PROBABILITY = 0.05
2829
CELL_PROBABILITY = 0.25
@@ -209,19 +210,11 @@ def noised_sample_data_taxes_1040(config: dict[str, Any]) -> pd.DataFrame:
209210

210211

211212
def get_unnoised_data(dataset_name: str) -> Dataset:
    """Load the sample data for ``dataset_name`` as an un-noised ``Dataset``.

    The sample data is loaded via the shared test helper and its columns are
    coerced to the dtypes declared by the dataset's schema before returning.
    """
    dataset = initialize_dataset_with_sample(dataset_name)
    dataset.data = coerce_dtypes(dataset.data, dataset.dataset_schema)
    return dataset
215216

216217

217-
def _initialize_dataset_with_sample(dataset_name: str) -> Dataset:
218-
dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
219-
data_path = paths.SAMPLE_DATA_ROOT / dataset_name / f"{dataset_name}.parquet"
220-
dataset = Dataset(dataset_schema, pd.read_parquet(data_path), SEED)
221-
222-
return dataset
223-
224-
225218
def _get_common_datasets(
226219
unnoised_dataset: Dataset, noised_dataset: pd.DataFrame
227220
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Index[int]]:

tests/integration/test_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
66
from pseudopeople.noise_entities import NOISE_TYPES
77
from pseudopeople.schema_entities import DATASET_SCHEMAS
8-
from tests.integration.conftest import _initialize_dataset_with_sample
8+
from tests.utilities import initialize_dataset_with_sample
99

1010

1111
@pytest.mark.parametrize(
@@ -22,7 +22,7 @@
2222
)
2323
def test_dataset_missingness(dataset_name: str) -> None:
2424
"""Tests that missingness is accurate with dataset.data."""
25-
dataset = _initialize_dataset_with_sample(dataset_name)
25+
dataset = initialize_dataset_with_sample(dataset_name)
2626
# We must manually clean the data for noising since we are recreating our main noising loop
2727
dataset._clean_input_data()
2828
dataset._reformat_dates_for_noising()

0 commit comments

Comments
 (0)