File tree — 3 files changed: +21 −5 lines changed
tests/integration/release
Original file line number | Diff line number | Diff line change @@ -168,6 +168,19 @@ def _reformat_dates_for_noising(self) -> None:
168168 def drop_non_schema_columns (
169169 data : pd .DataFrame , dataset_schema : DatasetSchema
170170 ) -> pd .DataFrame :
171+ """Returns data with only the columns in the dataset schema.
172+
173+ Parameters
174+ ----------
175+ data
176+ The pd.DataFrame to update.
177+ dataset_schema
178+ A DatasetSchema which contains the columns of interest in its column attribute.
179+
180+ Returns
181+ -------
182+ A pd.DataFrame with the columns in the dataset schema.
183+ """
171184 return data [[c .name for c in dataset_schema .columns ]]
172185
173186 @staticmethod
Original file line number Diff line number Diff line change @@ -216,7 +216,6 @@ def duplicate_with_guardian(
216216 if index_to_copy .empty :
217217 continue
218218 noised_group_df = group_df .loc [index_to_copy ]
219- noised_group_df ["old_housing_type" ] = noised_group_df ["housing_type" ]
220219 noised_group_df [GUARDIAN_DUPLICATION_ADDRESS_COLUMNS ] = group_df .loc [
221220 index_to_copy ,
222221 [f"{ guardian } _" + column for column in GUARDIAN_DUPLICATION_ADDRESS_COLUMNS ],
@@ -231,9 +230,7 @@ def duplicate_with_guardian(
231230 ].map (HOUSING_TYPE_GUARDIAN_DUPLICATION_RELATONSHIP_MAP )
232231
233232 # Clean columns
234- duplicated_rows_df = duplicated_rows_df [
235- list (dataset .data .columns ) + ["old_housing_type" ]
236- ]
233+ duplicated_rows_df = duplicated_rows_df [dataset .data .columns ]
237234
238235 # Add duplicated rows to the original data and make sure that households
239236 # are grouped together by sorting by date and household_id
Original file line number Diff line number Diff line change @@ -241,6 +241,12 @@ def test_guardian_duplication(
241241 duplicated = noised .loc [noised ["simulant_id" ].duplicated ()]
242242 duplicated ["age" ] = duplicated ["age" ].astype (int )
243243
244+ # add old housing type data to duplicated simulants
245+ old_housing_data = unnoised [["simulant_id" , "housing_type" ]].rename (
246+ {"housing_type" : "unnoised_housing_type" }, axis = 1
247+ )
248+ duplicated = duplicated .merge (old_housing_data )
249+
244250 # separate tests for household under 18 and for college under 24
245251 for probability_name , age , housing_type in zip (
246252 [
@@ -267,7 +273,7 @@ def test_guardian_duplication(
267273 )
268274 ]
269275 duplicated_in_group = duplicated .loc [
270- (duplicated ["age" ] < age ) & (duplicated ["old_housing_type" ] == housing_type )
276+ (duplicated ["age" ] < age ) & (duplicated ["unnoised_housing_type" ] == housing_type )
271277 ]
272278
273279 fuzzy_checker .fuzzy_assert_proportion (
You can’t perform that action at this time.
0 commit comments