@@ -128,6 +128,19 @@ def test_get_joined_batches_iter_success(transformed_parquet_dataset):
128128 assert joined_batch .schema .names == COLLATED_DATASET_SCHEMA .names
129129
130130
def test_get_deduped_batches_iter_success(collated_with_dupe_dataset_directory):
    """Deduping keeps only the most recent, non-deleted version of each record."""
    batches = get_deduped_batches_iter(collated_with_dupe_dataset_directory)
    first_batch = next(batches).to_pandas()

    # record 'def456' is absent: its most recent version carries action=delete
    remaining_ids = set(first_batch.timdex_record_id)
    assert remaining_ids == {"abc123", "ghi789"}
    assert len(first_batch) == 2

    # record 'ghi789' resolves to its most recent (2024-10-02) version
    indexed = first_batch.set_index("timdex_record_id")
    assert json.loads(indexed.loc["ghi789"].record_a)["material"] == "stucco"
143+
def test_validate_output_success(collated_dataset_directory):
    """validate_output completes without raising on a well-formed dataset."""
    dataset_path = collated_dataset_directory
    validate_output(dataset_path=dataset_path)
@@ -198,16 +211,3 @@ def test_get_transform_version_success(transformed_directories, output_filename)
def test_get_transform_version_raise_error():
    """An unparseable transformed filepath raises ValueError with the expected message."""
    expected_message = "Transformed filepath is invalid."
    with pytest.raises(ValueError, match=expected_message):
        get_transform_version("invalid")
201-
202-
203- def test_get_deduped_batches_iter_success (collated_with_dupe_dataset_directory ):
204- deduped_batches_iter = get_deduped_batches_iter (collated_with_dupe_dataset_directory )
205- deduped_df = next (deduped_batches_iter ).to_pandas ()
206-
207- # assert record 'def456' was dropped because most recent is action=delete
208- assert len (deduped_df ) == 2
209- assert set (deduped_df .timdex_record_id ) == {"abc123" , "ghi789" }
210-
211- # assert record 'ghi789' has most recent 2024-10-02 version
212- deduped_record = deduped_df .set_index ("timdex_record_id" ).loc ["ghi789" ]
213- assert json .loads (deduped_record .record_a )["material" ] == "stucco"
0 commit comments