Skip to content

Commit fd477bf

Browse files
author
Hussain Jafari
committed
use high noise config
1 parent 961715d commit fd477bf

File tree

1 file changed

+43
-6
lines changed

1 file changed

+43
-6
lines changed

tests/integration/release/test_release.py

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from collections.abc import Callable
4+
import math
45
import numpy.typing as npt
56
from pathlib import Path
67
from typing import TYPE_CHECKING, Any, Literal
@@ -63,6 +64,40 @@
6364
}
6465

6566

67+
def get_high_noise_config() -> NoiseConfiguration:
    """Return a ``NoiseConfiguration`` with elevated noise levels.

    Starts from the default configuration (``get_configuration()``) converted
    to a plain dict and overrides it as follows:

    * each dataset's row-noise section is replaced by one that assigns a 5%
      row probability to every row noise type of that dataset except
      ``duplicate_with_guardian``;
    * every noise type of every column that has noise types is assigned a
      25% cell probability;
    * the SSA ``ssa_event_type`` column is then set back to a cell
      probability of 0 for all of its noise types.

    Returns:
        The overridden configuration, wrapped in a ``LayeredConfigTree`` and
        then in a ``NoiseConfiguration``.
    """
    row_probability = 0.05
    cell_probability = 0.25

    config = get_configuration().to_dict()  # default config

    # Increase row noise probabilities to 5% and column cell_probabilities to 25%
    for dataset_name in config:
        dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
        # Index by the key being iterated so we only ever overwrite entries
        # that already exist in ``config``.
        dataset_config = config[dataset_name]

        dataset_config[Keys.ROW_NOISE] = {
            noise_type.name: {Keys.ROW_PROBABILITY: row_probability}
            for noise_type in dataset_schema.row_noise_types
            if noise_type != NOISE_TYPES.duplicate_with_guardian
        }

        for col in dataset_schema.columns:
            if not col.noise_types:
                continue
            dataset_config[Keys.COLUMN_NOISE][col.name] = {
                noise_type.name: {Keys.CELL_PROBABILITY: cell_probability}
                for noise_type in col.noise_types
            }

    # Update SSA dataset to noise 'ssn' but NOT noise 'ssa_event_type' since that
    # will be used as an identifier along with simulant_id
    # TODO: Noise ssa_event_type when record IDs are implemented (MIC-4039)
    config[DATASET_SCHEMAS.ssa.name][Keys.COLUMN_NOISE][COLUMNS.ssa_event_type.name] = {
        noise_type.name: {Keys.CELL_PROBABILITY: 0}
        for noise_type in COLUMNS.ssa_event_type.noise_types
    }
    return NoiseConfiguration(LayeredConfigTree(config))
99+
100+
66101
def test_release_row_noising(
67102
dataset_params: tuple[
68103
str,
@@ -77,7 +112,8 @@ def test_release_row_noising(
77112
dataset_name, _, source, year, state, engine_name = dataset_params
78113
full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_name]
79114
dataset_schema = DATASET_SCHEMAS.get_dataset_schema(full_dataset_name)
80-
config = get_configuration()
115+
#config = get_configuration()
116+
config = get_high_noise_config()
81117

82118
if source is None:
83119
filename = None
@@ -133,11 +169,12 @@ def test_release_row_noising(
133169
if config.has_noise_type(
134170
dataset_schema.name, noise_type.name, column
135171
):
136-
if column == COLUMNS.ssa_event_type.name:
137-
pass
138-
else:
139-
[noise_type(dataset, config, column) for dataset in datasets]
140-
run_column_noising_test(prenoised_dataframes, datasets, config, full_dataset_name, noise_type.name, column, fuzzy_checker, filename)
172+
#if column == COLUMNS.ssa_event_type.name:
173+
# pass
174+
#else:
175+
for dataset in datasets:
176+
noise_type(dataset, config, column)
177+
run_column_noising_test(prenoised_dataframes, datasets, config, full_dataset_name, noise_type.name, column, fuzzy_checker, filename)
141178

142179

143180
def run_column_noising_test(

0 commit comments

Comments
 (0)