Skip to content

Commit d417357

Browse files
create and mark release sample data tests (#480)
Category: test. JIRA issue: MIC-5623. Adds three integration tests to the release test suite and marks them with the release marker. Testing: tests passed and behaved as expected both with and without the --release flag.
1 parent 931d723 commit d417357

File tree

11 files changed

+512
-252
lines changed

11 files changed

+512
-252
lines changed

tests/conftest.py

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,17 @@
1717
from loguru import logger
1818
from vivarium_testing_utils import FuzzyChecker
1919

20+
from pseudopeople.configuration import Keys, get_configuration
21+
from pseudopeople.noise_entities import NOISE_TYPES
22+
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
23+
from tests.integration.conftest import CELL_PROBABILITY
24+
2025

2126
def pytest_addoption(parser: argparsing.Parser) -> None:
2227
parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")
28+
parser.addoption(
29+
"--release", action="store_true", default=False, help="run release tests"
30+
)
2331
parser.addoption(
2432
"--limit",
2533
action="store",
@@ -38,15 +46,24 @@ def pytest_configure(config: Config) -> None:
3846

3947

4048
def pytest_collection_modifyitems(config: Config, items: list[Function]) -> None:
49+
skip_release = pytest.mark.skip(reason="need --release to run")
50+
if not config.getoption("--release"):
51+
for item in items:
52+
if "release" in item.keywords:
53+
item.add_marker(skip_release)
54+
4155
if config.getoption("--runslow"):
4256
# --runslow given in cli: do not skip slow tests
4357
return
58+
4459
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
4560
for item in items:
4661
# Automatically tag all tests in the tests/integration dir as slow
47-
if item.parent and Path(item.parent.path).parent.stem == "integration":
48-
item.add_marker(pytest.mark.slow)
49-
if "slow" in item.keywords:
62+
test_in_slow_directory = (
63+
item.parent and Path(item.parent.path).parent.stem == "integration"
64+
)
65+
test_is_slow = "slow" in item.keywords
66+
if test_in_slow_directory or test_is_slow:
5067
item.add_marker(skip_slow)
5168

5269
# Limit the number of permutations of parametrised tests to run.
@@ -88,3 +105,50 @@ def fuzzy_checker(output_directory: Path) -> Generator[FuzzyChecker, None, None]
88105
yield checker
89106

90107
checker.save_diagnostic_output(output_directory)
108+
109+
110+
@pytest.fixture(scope="session")
def config() -> dict[str, Any]:
    """Session-scoped noising configuration used by the integration tests.

    Starts from the package's default configuration and bumps every row-noise
    probability to 5% and every column cell probability to ``CELL_PROBABILITY``
    (25%), then applies two deliberate exceptions:

    * guardian-based row duplication is disabled for the census dataset, and
    * the SSA ``ssa_event_type`` column is left completely unnoised,

    so that noised output can still be joined back to the unnoised truth data.
    """
    ROW_PROBABILITY = 0.05
    noising_config = get_configuration().to_dict()  # package default config

    # Raise row noise to 5% and cell noise to 25% for every dataset/column.
    for dataset_name in noising_config:
        dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)

        row_noise_settings = {}
        for noise_type in dataset_schema.row_noise_types:
            # Guardian duplication is configured separately below.
            if noise_type == NOISE_TYPES.duplicate_with_guardian:
                continue
            row_noise_settings[noise_type.name] = {Keys.ROW_PROBABILITY: ROW_PROBABILITY}
        noising_config[dataset_schema.name][Keys.ROW_NOISE] = row_noise_settings

        for column in (c for c in dataset_schema.columns if c.noise_types):
            noising_config[dataset_name][Keys.COLUMN_NOISE][column.name] = {
                noise_type.name: {Keys.CELL_PROBABILITY: CELL_PROBABILITY}
                for noise_type in column.noise_types
            }

    # FIXME: Remove when record_id is added as the truth deck for datasets.
    # For integration tests, we will NOT duplicate rows with guardian duplication.
    # This is because we want to be able to compare the noised and unnoised data
    # and a big assumption we make is that simulant_id and household_id are the
    # truth decks in our datasets.
    guardian_noise_name = NOISE_TYPES.duplicate_with_guardian.name
    noising_config[DATASET_SCHEMAS.census.name][Keys.ROW_NOISE][guardian_noise_name] = {
        Keys.ROW_PROBABILITY_IN_HOUSEHOLDS_UNDER_18: 0.0,
        Keys.ROW_PROBABILITY_IN_COLLEGE_GROUP_QUARTERS_UNDER_24: 0.0,
    }

    # Update SSA dataset to noise 'ssn' but NOT noise 'ssa_event_type' since that
    # will be used as an identifier along with simulant_id
    # TODO: Noise ssa_event_type when record IDs are implemented (MIC-4039)
    ssa_column_noise = noising_config[DATASET_SCHEMAS.ssa.name][Keys.COLUMN_NOISE]
    ssa_column_noise[COLUMNS.ssa_event_type.name] = {
        noise_type.name: {Keys.CELL_PROBABILITY: 0}
        for noise_type in COLUMNS.ssa_event_type.noise_types
    }
    return noising_config

tests/constants.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import Callable
4+
from functools import partial
5+
from typing import Any
6+
7+
import pandas as pd
8+
9+
from pseudopeople.interface import (
10+
generate_american_community_survey,
11+
generate_current_population_survey,
12+
generate_decennial_census,
13+
generate_social_security,
14+
generate_taxes_1040,
15+
generate_taxes_w2_and_1099,
16+
generate_women_infants_and_children,
17+
)
18+
from pseudopeople.noise_entities import NOISE_TYPES
19+
from pseudopeople.schema_entities import DATASET_SCHEMAS
20+
from pseudopeople.utilities import (
21+
count_number_of_tokens_per_string,
22+
load_ocr_errors,
23+
load_phonetic_errors,
24+
load_qwerty_errors_data,
25+
)
26+
27+
CELL_PROBABILITY = 0.25
28+
29+
DATASET_GENERATION_FUNCS: dict[str, Callable[..., Any]] = {
30+
DATASET_SCHEMAS.census.name: generate_decennial_census,
31+
DATASET_SCHEMAS.acs.name: generate_american_community_survey,
32+
DATASET_SCHEMAS.cps.name: generate_current_population_survey,
33+
DATASET_SCHEMAS.ssa.name: generate_social_security,
34+
DATASET_SCHEMAS.tax_w2_1099.name: generate_taxes_w2_and_1099,
35+
DATASET_SCHEMAS.wic.name: generate_women_infants_and_children,
36+
DATASET_SCHEMAS.tax_1040.name: generate_taxes_1040,
37+
}
38+
39+
TOKENS_PER_STRING_MAPPER: dict[str, Callable[..., pd.Series[int]]] = {
40+
NOISE_TYPES.make_ocr_errors.name: partial(
41+
count_number_of_tokens_per_string, pd.Series(load_ocr_errors().index)
42+
),
43+
NOISE_TYPES.make_phonetic_errors.name: partial(
44+
count_number_of_tokens_per_string,
45+
pd.Series(load_phonetic_errors().index),
46+
),
47+
NOISE_TYPES.write_wrong_digits.name: lambda x: x.astype(str)
48+
.str.replace(r"[^\d]", "", regex=True)
49+
.str.len(),
50+
NOISE_TYPES.make_typos.name: partial(
51+
count_number_of_tokens_per_string, pd.Series(load_qwerty_errors_data().index)
52+
),
53+
}

tests/integration/conftest.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pseudopeople.noise_entities import NOISE_TYPES
2424
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
2525
from pseudopeople.utilities import coerce_dtypes
26+
from tests.utilities import initialize_dataset_with_sample
2627

2728
ROW_PROBABILITY = 0.05
2829
CELL_PROBABILITY = 0.25
@@ -209,19 +210,11 @@ def noised_sample_data_taxes_1040(config: dict[str, Any]) -> pd.DataFrame:
209210

210211

211212
def get_unnoised_data(dataset_name: str) -> Dataset:
    """Load the sample data for ``dataset_name`` as an un-noised ``Dataset``.

    The sample data is loaded via the shared test helper and its columns are
    coerced to the dtypes declared by the dataset's schema before returning.
    """
    dataset = initialize_dataset_with_sample(dataset_name)
    dataset.data = coerce_dtypes(dataset.data, dataset.dataset_schema)
    return dataset
215216

216217

217-
def _initialize_dataset_with_sample(dataset_name: str) -> Dataset:
218-
dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
219-
data_path = paths.SAMPLE_DATA_ROOT / dataset_name / f"{dataset_name}.parquet"
220-
dataset = Dataset(dataset_schema, pd.read_parquet(data_path), SEED)
221-
222-
return dataset
223-
224-
225218
def _get_common_datasets(
226219
unnoised_dataset: Dataset, noised_dataset: pd.DataFrame
227220
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Index[int]]:

tests/integration/test_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
66
from pseudopeople.noise_entities import NOISE_TYPES
77
from pseudopeople.schema_entities import DATASET_SCHEMAS
8-
from tests.integration.conftest import _initialize_dataset_with_sample
8+
from tests.utilities import initialize_dataset_with_sample
99

1010

1111
@pytest.mark.parametrize(
@@ -22,7 +22,7 @@
2222
)
2323
def test_dataset_missingness(dataset_name: str) -> None:
2424
"""Tests that missingness is accurate with dataset.data."""
25-
dataset = _initialize_dataset_with_sample(dataset_name)
25+
dataset = initialize_dataset_with_sample(dataset_name)
2626
# We must manually clean the data for noising since we are recreating our main noising loop
2727
dataset._clean_input_data()
2828
dataset._reformat_dates_for_noising()

0 commit comments

Comments
 (0)