11from __future__ import annotations
22
33from collections .abc import Callable
4+ import math
45import numpy .typing as npt
56from pathlib import Path
67from typing import TYPE_CHECKING , Any , Literal
6364}
6465
6566
def get_high_noise_config() -> NoiseConfiguration:
    """Return a noise configuration with elevated noise probabilities.

    Starts from the default configuration returned by ``get_configuration()``
    and, for every dataset in it:

    * sets the row probability of every row noise type except
      ``NOISE_TYPES.duplicate_with_guardian`` to 0.05 (that noise type is
      left out of the row-noise mapping entirely);
    * sets the cell probability of every noise type on every column that has
      noise types to 0.2.

    The SSA ``ssa_event_type`` column is then overridden to a cell
    probability of 0 for all of its noise types.

    Returns:
        A ``NoiseConfiguration`` wrapping a ``LayeredConfigTree`` built from
        the modified configuration dict.
    """
    config = get_configuration().to_dict()  # default config

    # Increase row noise probabilities to 5% and column cell probabilities
    # to 20%.
    # NOTE(review): an earlier comment here said 25% while the code used 0.2;
    # the value is kept as-is -- confirm which one is intended.
    for dataset_name in config:
        dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
        config[dataset_schema.name][Keys.ROW_NOISE] = {
            noise_type.name: {
                Keys.ROW_PROBABILITY: 0.05,
            }
            for noise_type in dataset_schema.row_noise_types
            if noise_type != NOISE_TYPES.duplicate_with_guardian
        }
        for col in dataset_schema.columns:
            if not col.noise_types:
                # Columns without any noise types get no column-noise entry.
                continue
            config[dataset_name][Keys.COLUMN_NOISE][col.name] = {
                noise_type.name: {
                    Keys.CELL_PROBABILITY: 0.2,
                }
                for noise_type in col.noise_types
            }

    # Update SSA dataset to noise 'ssn' but NOT noise 'ssa_event_type' since that
    # will be used as an identifier along with simulant_id
    # TODO: Noise ssa_event_type when record IDs are implemented (MIC-4039)
    config[DATASET_SCHEMAS.ssa.name][Keys.COLUMN_NOISE][COLUMNS.ssa_event_type.name] = {
        noise_type.name: {
            Keys.CELL_PROBABILITY: 0,
        }
        for noise_type in COLUMNS.ssa_event_type.noise_types
    }
    return NoiseConfiguration(LayeredConfigTree(config))
99+
100+
66101def test_release_row_noising (
67102 dataset_params : tuple [
68103 str ,
@@ -77,7 +112,8 @@ def test_release_row_noising(
77112 dataset_name , _ , source , year , state , engine_name = dataset_params
78113 full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER [dataset_name ]
79114 dataset_schema = DATASET_SCHEMAS .get_dataset_schema (full_dataset_name )
80- config = get_configuration ()
115+ #config = get_configuration()
116+ config = get_high_noise_config ()
81117
82118 if source is None :
83119 filename = None
@@ -133,11 +169,12 @@ def test_release_row_noising(
133169 if config .has_noise_type (
134170 dataset_schema .name , noise_type .name , column
135171 ):
136- if column == COLUMNS .ssa_event_type .name :
137- pass
138- else :
139- [noise_type (dataset , config , column ) for dataset in datasets ]
140- run_column_noising_test (prenoised_dataframes , datasets , config , full_dataset_name , noise_type .name , column , fuzzy_checker , filename )
172+ #if column == COLUMNS.ssa_event_type.name:
173+ # pass
174+ #else:
175+ for dataset in datasets :
176+ noise_type (dataset , config , column )
177+ run_column_noising_test (prenoised_dataframes , datasets , config , full_dataset_name , noise_type .name , column , fuzzy_checker , filename )
141178
142179
143180def run_column_noising_test (
0 commit comments