@@ -27,7 +27,7 @@
 from tests.constants import TOKENS_PER_STRING_MAPPER
 from tests.integration.conftest import SEED, IDX_COLS
 from tests.integration.release.conftest import DATASET_ARG_TO_FULL_NAME_MAPPER
-from tests.integration.release.utilities import run_omit_row_tests, run_do_not_respond_tests, run_guardian_duplication_tests
+from tests.integration.release.utilities import load_unnoised_data, run_omit_row_tests, run_do_not_respond_tests, run_guardian_duplication_tests
 from tests.unit.test_configuration import COLUMN_NOISE_TYPES
 from tests.utilities import (
     get_single_noise_type_config,
@@ -38,7 +38,7 @@
 ROW_TEST_FUNCTIONS = {'omit_row': run_omit_row_tests,
                       'do_not_respond': run_do_not_respond_tests,
                       'duplicate_with_guardian': run_guardian_duplication_tests}
-NEW_PROBABILITY = 0.03
+NEW_PROBABILITY = 0.2


 def get_high_noise_config(
@@ -49,7 +49,7 @@ def get_high_noise_config(

     for noise_type, probabilities in config_dict[dataset_name][Keys.ROW_NOISE].items():
         for probability_name, probability in probabilities.items():
-            config_dict[dataset_name][Keys.ROW_NOISE][noise_type][probability_name] = .03
+            config_dict[dataset_name][Keys.ROW_NOISE][noise_type][probability_name] = NEW_PROBABILITY

     for col, noise_types in config_dict[dataset_name][Keys.COLUMN_NOISE].items():
         for noise_type, probabilities in noise_types.items():
@@ -62,7 +62,7 @@ def get_high_noise_config(
                 # NOTE: this will fail default config validations
                 new_probability = {1: 1.0}
             else:
-                new_probability = .03
+                new_probability = NEW_PROBABILITY
             config_dict[dataset_name][Keys.COLUMN_NOISE][col][noise_type][
                 probability_name
             ] = new_probability
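As an aside (not part of the diff): the loops above walk a nested config dict keyed by dataset name, then Keys.ROW_NOISE / Keys.COLUMN_NOISE, then noise type, then probability name. A minimal sketch of that assumed shape, with hypothetical dataset, column, and noise-type names:

# Illustrative only; the dataset/column/noise-type names here are hypothetical,
# and the string keys stand in for Keys.ROW_NOISE / Keys.COLUMN_NOISE.
config_dict = {
    "decennial_census": {                  # dataset_name
        "row_noise": {                     # Keys.ROW_NOISE
            "omit_row": {"row_probability": 0.2},
        },
        "column_noise": {                  # Keys.COLUMN_NOISE
            "first_name": {
                "make_typos": {"cell_probability": 0.2, "token_probability": 0.1},
            },
        },
    },
}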
@@ -98,6 +98,10 @@ def test_release_runs(
     if dataset_func != generate_social_security:
         unnoised_data_kwargs["state"] = state
     unnoised_data = dataset_func(**unnoised_data_kwargs)
+    full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_name]
+    dataset_schema = DATASET_SCHEMAS.get_dataset_schema(full_dataset_name)
+    #new_unnoised_data = load_unnoised_data(dataset_schema, source, year, state, engine)
+    #breakpoint()

     if source is None:
         population = 'sample'
@@ -106,20 +110,17 @@ def test_release_runs(
     else:
         population = 'usa'
     timestr = time.strftime("%Y%m%d-%H%M%S")
-    filename = f"/ihme/homes/hjafari/ppl_runs_new/{timestr}_{dataset_name}_{population}.o"
+    filename = f"/ihme/homes/hjafari/ppl_runs_new/{timestr}_{dataset_name}_{population}_{NEW_PROBABILITY * 100}_percent.o"

     # In our standard noising process, i.e. when noising a shard of data, we
     # 1) clean and reformat the data, 2) noise the data, and 3) do some post-processing.
     # We're replicating steps 1 and 2 in this test and skipping 3.
-    full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_name]
-    dataset_schema = DATASET_SCHEMAS.get_dataset_schema(full_dataset_name)
     dataset = Dataset(dataset_schema, unnoised_data, SEED)
-    # don't unnecessarily keep in memory
+    # don't unnecessarily keep in memory now that we have the data in dataset
     del unnoised_data
     dataset._clean_input_data()
     # convert datetime columns to datetime types for _reformat_dates_for_noising
-    # because the post-processing that occured in generating the unnoised data
-    # in step 3 mentioned above converts these columns to object dtypes
+    # because we coerce these types into object types when loading unnoised data
     for col in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]:
         if col in dataset.data:
             dataset.data[col] = pd.to_datetime(
@@ -133,7 +134,9 @@ def test_release_runs(
     dataset._reformat_dates_for_noising()

     config = NoiseConfiguration(LayeredConfigTree(get_high_noise_config(full_dataset_name)))
-
+
+
+
     for noise_type in NOISE_TYPES:
         original_data = dataset.data.copy()
         # if isinstance(noise_type, RowNoiseType):
@@ -145,12 +148,15 @@ def test_release_runs(
         for column in dataset.data.columns:
             if config.has_noise_type(
                 dataset.dataset_schema.name, noise_type.name, column
-            ):
-                noise_type(dataset, config, column)
-                run_column_noising_test(original_data, dataset.data, config, full_dataset_name, noise_type.name, column, fuzzy_checker, filename)
+            ):  # and noise_type.name == 'copy_from_household_member' and column == 'age':
+                if column == COLUMNS.ssa_event_type.name:
+                    pass
+                else:
+                    noise_type(dataset, config, column)
+                    run_column_noising_test(original_data, dataset.data, config, full_dataset_name, noise_type.name, column, fuzzy_checker, filename)
         with check:
-            # TODO: possible to replace missingness with smaller data structure?
             try:
+                # TODO: possible to replace missingness with smaller data structure?
                 assert dataset.missingness.equals(dataset.is_missing(dataset.data))
             except:
                 breakpoint()
@@ -164,6 +170,72 @@ def test_release_runs(
     # run_final_tests(unnoised_data, dataset.data)


+def test_release_runs(
+    dataset_params: tuple[
+        str,
+        Callable[..., pd.DataFrame],
+        str | None,
+        int | None,
+        str | None,
+        Literal["pandas", "dask"],
+    ],
+    fuzzy_checker: FuzzyChecker,
+    mocker: MockerFixture,
+) -> None:
+    # keep all columns when generating unnoised data because some of them are used in testing
+    mocker.patch(
+        "pseudopeople.dataset.Dataset.drop_non_schema_columns", side_effect=lambda df, _: df
+    )
+
+    # create unnoised dataset
+    dataset_name, dataset_func, source, year, state, engine = dataset_params
+    unnoised_data_kwargs = {
+        "source": source,
+        "config": NO_NOISE,
+        "year": year,
+        "engine": engine,
+        "concat_output": False,
+    }
+    if dataset_func != generate_social_security:
+        unnoised_data_kwargs["state"] = state
+    unnoised_data = dataset_func(**unnoised_data_kwargs)
+    #full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_name]
+    #dataset_schema = DATASET_SCHEMAS.get_dataset_schema(full_dataset_name)
+    #new_unnoised_data = load_unnoised_data(dataset_schema, source, year, state, engine)
+
+    # In our standard noising process, i.e. when noising a shard of data, we
+    # 1) clean and reformat the data, 2) noise the data, and 3) do some post-processing.
+    # We're replicating steps 1 and 2 in this test and skipping 3.
+    full_dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_name]
+    dataset_schema = DATASET_SCHEMAS.get_dataset_schema(full_dataset_name)
+    dataset = Dataset(dataset_schema, unnoised_data, SEED)
+    # don't unnecessarily keep in memory now that we have the data in dataset
+    del unnoised_data
+    dataset._clean_input_data()
+    # convert datetime columns to datetime types for _reformat_dates_for_noising
+    # because we coerce these types into object types when loading unnoised data
+    for col in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]:
+        if col in dataset.data:
+            dataset.data[col] = pd.to_datetime(
+                dataset.data[col], format=dataset_schema.date_format
+            )
+        copy_col = "copy_" + col
+        if copy_col in dataset.data:
+            dataset.data[copy_col] = pd.to_datetime(
+                dataset.data[copy_col], format=dataset_schema.date_format
+            )
+    # TODO: mock this to do nothing
+    dataset._reformat_dates_for_noising()
+
+    # TODO: generate dictionary of noise configs with key as noise type and value is config
+
+    # for noise_type, config in config_dict:
+    #     # TODO: case pandas vs dask
+    #     pre_noised = [x.copy() for x in unnoised_data]
+    #     noised_data = dataset_func(source=prenoised_data, config)
+    #     check_noise(pre_noised, noised_data)
+
+
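A hedged sketch of how the commented-out TODO loop above might be fleshed out. It assumes get_single_noise_type_config (imported at the top of the file) takes a dataset name and noise-type name and returns a per-noise-type config dict, and that a check_noise helper exists; both signatures are assumptions, not confirmed by this diff:

# Sketch under stated assumptions; not the PR's actual implementation.
noise_configs = {
    noise_type.name: get_single_noise_type_config(full_dataset_name, noise_type.name)
    for noise_type in NOISE_TYPES
}  # assumed signature for get_single_noise_type_config
for noise_type_name, noise_config in noise_configs.items():
    # TODO from the diff: handle pandas vs dask engines here
    pre_noised = [shard.copy() for shard in unnoised_data]  # one copy per shard
    noised_data = dataset_func(source=pre_noised, config=noise_config)
    check_noise(pre_noised, noised_data)  # hypothetical helper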
 def _get_common_datasets(
     unnoised_dataset: Dataset, noised_dataset: pd.DataFrame
 ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Index[int]]:
@@ -218,12 +290,9 @@ def run_column_noising_test(
 ) -> None:
     dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
     original_dataset = Dataset(dataset_schema, original_data, SEED)
-    try:
-        check_noised, check_original, shared_idx = _get_common_datasets(
-            original_dataset, noised_data
-        )
-    except:
-        breakpoint()
+    check_noised, check_original, shared_idx = _get_common_datasets(
+        original_dataset, noised_data
+    )
     # TODO: remove population param which was just used in testing the tests
     check_column_noising(dataset_name, config, fuzzy_checker, check_noised, check_original, shared_idx, noise_type, column, filename)

@@ -305,10 +374,7 @@ def check_column_noising(

     # This is accumulating not_noised over all noise types
     expected_noise = avg_probability_any_token_noised * expected_noise
-    # if no_differences:
-    #     with open(filename, "a") as f:
-    #         info = f"no differences for NOISE_TYPE_{noise_type}_COL_{col.name} and expected noise level of {expected_noise} or {expected_noise * len(check_original.loc[to_compare_idx, col.name])} simulants\n"
-    #         f.write(info)
+    open(filename, 'a').close()
     try:
         fuzzy_checker.fuzzy_assert_proportion(
             name=noise_type,
@@ -332,7 +398,7 @@ def check_column_noising(
             f.write(info)
     else:
         with open(filename, "a") as f:
-            info = f"NOISE_TYPE_{noise_type}_COL_{col.name} issue fuzzy checking: expected {expected_noise} but got {noise_level / len(check_original.loc[to_compare_idx, col.name])}\n"
+            info = f"NOISE_TYPE_{noise_type}_COL_{col.name} issue fuzzy checking: expected {expected_noise} but got {noise_level / len(check_original.loc[to_compare_idx, col.name])} from {noise_level} / {len(check_original.loc[to_compare_idx, col.name])}\n"
             f.write(info)

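To read the expanded failure line above (numbers illustrative): if noise_level were 150 noised cells out of len(check_original.loc[to_compare_idx, col.name]) = 1000 comparable cells, the observed rate would be 0.15 against an expected NEW_PROBABILITY of 0.2; logging the raw numerator and denominator alongside the ratio makes it easier to tell whether a miss comes from a small sample or a genuinely wrong noise rate.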