|
14 | 14 | from pseudopeople.constants import paths |
15 | 15 | from pseudopeople.dataset import noise_data |
16 | 16 | from pseudopeople.exceptions import DataSourceError |
17 | | -from pseudopeople.filter import DataFilter |
| 17 | +from pseudopeople.filter import DataFilter, get_data_filters |
18 | 18 | from pseudopeople.loader import load_standard_dataset |
19 | 19 | from pseudopeople.schema_entities import DATASET_SCHEMAS, DatasetSchema |
20 | 20 | from pseudopeople.utilities import ( |
@@ -196,6 +196,9 @@ def _generate_dataset( |
196 | 196 |
|
197 | 197 |
|
198 | 198 | def validate_source_compatibility(source: Path, dataset_schema: DatasetSchema) -> None: |
| 199 | + """Validate that a given source is compatible with the provided dataset schema by checking that |
| 200 | + 1) data exist for said schema in the provided source path and that 2) the data is the expected version |
| 201 | + as specified in its CHANGELOG.""" |
199 | 202 | # TODO [MIC-4546]: Clean this up w/ metadata and update test_interface.py tests to be generic |
200 | 203 | directories = [x.name for x in source.iterdir() if x.is_dir()] |
201 | 204 | if dataset_schema.name not in directories: |
@@ -341,12 +344,7 @@ def generate_decennial_census( |
341 | 344 | The simulated population has no data for this dataset in the |
342 | 345 | specified year or state. |
343 | 346 | """ |
344 | | - filters: list[DataFilter] = [] |
345 | | - if year is not None: |
346 | | - filters.append(DataFilter(DATASET_SCHEMAS.census.date_column_name, "==", year)) |
347 | | - if state is not None: |
348 | | - state_column_name = cast(str, DATASET_SCHEMAS.census.state_column_name) |
349 | | - filters.append(DataFilter(state_column_name, "==", get_state_abbreviation(state))) |
| 347 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.census, year, state) |
350 | 348 | return _generate_dataset( |
351 | 349 | DATASET_SCHEMAS.census, |
352 | 350 | source, |
@@ -472,26 +470,9 @@ def generate_american_community_survey( |
472 | 470 | The simulated population has no data for this dataset in the |
473 | 471 | specified year or state. |
474 | 472 | """ |
475 | | - filters = [] |
| 473 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.acs, year, state) |
476 | 474 | if year is not None: |
477 | | - try: |
478 | | - date_lower_filter = DataFilter( |
479 | | - DATASET_SCHEMAS.acs.date_column_name, |
480 | | - ">=", |
481 | | - pd.Timestamp(year=year, month=1, day=1), |
482 | | - ) |
483 | | - date_upper_filter = DataFilter( |
484 | | - DATASET_SCHEMAS.acs.date_column_name, |
485 | | - "<=", |
486 | | - pd.Timestamp(year=year, month=12, day=31), |
487 | | - ) |
488 | | - filters.extend([date_lower_filter, date_upper_filter]) |
489 | | - except (pd.errors.OutOfBoundsDatetime, ValueError): |
490 | | - raise ValueError(f"Invalid year provided: '{year}'") |
491 | 475 | seed = seed * 10_000 + year |
492 | | - if state is not None: |
493 | | - state_column = cast(str, DATASET_SCHEMAS.acs.state_column_name) |
494 | | - filters.append(DataFilter(state_column, "==", get_state_abbreviation(state))) |
495 | 476 | return _generate_dataset( |
496 | 477 | DATASET_SCHEMAS.acs, source, seed, config, filters, verbose, engine_name=engine |
497 | 478 | ) |
@@ -612,26 +593,9 @@ def generate_current_population_survey( |
612 | 593 | The simulated population has no data for this dataset in the |
613 | 594 | specified year or state. |
614 | 595 | """ |
615 | | - filters = [] |
| 596 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.cps, year, state) |
616 | 597 | if year is not None: |
617 | | - try: |
618 | | - date_lower_filter = DataFilter( |
619 | | - DATASET_SCHEMAS.acs.date_column_name, |
620 | | - ">=", |
621 | | - pd.Timestamp(year=year, month=1, day=1), |
622 | | - ) |
623 | | - date_upper_filter = DataFilter( |
624 | | - DATASET_SCHEMAS.acs.date_column_name, |
625 | | - "<=", |
626 | | - pd.Timestamp(year=year, month=12, day=31), |
627 | | - ) |
628 | | - filters.extend([date_lower_filter, date_upper_filter]) |
629 | | - except (pd.errors.OutOfBoundsDatetime, ValueError): |
630 | | - raise ValueError(f"Invalid year provided: '{year}'") |
631 | 598 | seed = seed * 10_000 + year |
632 | | - if state is not None: |
633 | | - state_column = cast(str, DATASET_SCHEMAS.cps.state_column_name) |
634 | | - filters.append(DataFilter(state_column, "==", get_state_abbreviation(state))) |
635 | 599 | return _generate_dataset( |
636 | 600 | DATASET_SCHEMAS.cps, source, seed, config, filters, verbose, engine_name=engine |
637 | 601 | ) |
@@ -743,13 +707,9 @@ def generate_taxes_w2_and_1099( |
743 | 707 | The simulated population has no data for this dataset in the |
744 | 708 | specified year or state. |
745 | 709 | """ |
746 | | - filters = [] |
| 710 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.tax_w2_1099, year, state) |
747 | 711 | if year is not None: |
748 | | - filters.append(DataFilter(DATASET_SCHEMAS.tax_w2_1099.date_column_name, "==", year)) |
749 | 712 | seed = seed * 10_000 + year |
750 | | - if state is not None: |
751 | | - state_column = cast(str, DATASET_SCHEMAS.tax_w2_1099.state_column_name) |
752 | | - filters.append(DataFilter(state_column, "==", get_state_abbreviation(state))) |
753 | 713 | return _generate_dataset( |
754 | 714 | DATASET_SCHEMAS.tax_w2_1099, |
755 | 715 | source, |
@@ -878,13 +838,9 @@ def generate_women_infants_and_children( |
878 | 838 | The simulated population has no data for this dataset in the |
879 | 839 | specified year or state. |
880 | 840 | """ |
881 | | - filters = [] |
| 841 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.wic, year, state) |
882 | 842 | if year is not None: |
883 | | - filters.append(DataFilter(DATASET_SCHEMAS.wic.date_column_name, "==", year)) |
884 | 843 | seed = seed * 10_000 + year |
885 | | - if state is not None: |
886 | | - state_column = cast(str, DATASET_SCHEMAS.wic.state_column_name) |
887 | | - filters.append(DataFilter(state_column, "==", get_state_abbreviation(state))) |
888 | 844 | return _generate_dataset( |
889 | 845 | DATASET_SCHEMAS.wic, source, seed, config, filters, verbose, engine_name=engine |
890 | 846 | ) |
@@ -984,18 +940,8 @@ def generate_social_security( |
984 | 940 | The simulated population has no data for this dataset in the |
985 | 941 | specified year or any prior years. |
986 | 942 | """ |
987 | | - filters = [] |
| 943 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.ssa, year) |
988 | 944 | if year is not None: |
989 | | - try: |
990 | | - filters.append( |
991 | | - DataFilter( |
992 | | - DATASET_SCHEMAS.ssa.date_column_name, |
993 | | - "<=", |
994 | | - pd.Timestamp(year=year, month=12, day=31), |
995 | | - ) |
996 | | - ) |
997 | | - except (pd.errors.OutOfBoundsDatetime, ValueError): |
998 | | - raise ValueError(f"Invalid year provided: '{year}'") |
999 | 945 | seed = seed * 10_000 + year |
1000 | 946 | return _generate_dataset( |
1001 | 947 | DATASET_SCHEMAS.ssa, source, seed, config, filters, verbose, engine_name=engine |
@@ -1108,13 +1054,9 @@ def generate_taxes_1040( |
1108 | 1054 | The simulated population has no data for this dataset in the |
1109 | 1055 | specified year or state. |
1110 | 1056 | """ |
1111 | | - filters = [] |
| 1057 | + filters: Sequence[DataFilter] = get_data_filters(DATASET_SCHEMAS.tax_1040, year, state) |
1112 | 1058 | if year is not None: |
1113 | | - filters.append(DataFilter(DATASET_SCHEMAS.tax_1040.date_column_name, "==", year)) |
1114 | 1059 | seed = seed * 10_000 + year |
1115 | | - if state is not None: |
1116 | | - state_column = cast(str, DATASET_SCHEMAS.tax_1040.state_column_name) |
1117 | | - filters.append(DataFilter(state_column, "==", get_state_abbreviation(state))) |
1118 | 1060 | return _generate_dataset( |
1119 | 1061 | DATASET_SCHEMAS.tax_1040, |
1120 | 1062 | source, |
|
0 commit comments