support only a path to data root directory (#57)

stevebachmeier · web-flow · commit 8a02afe7a1c8 · 2023-04-13T12:06:02.000-06:00
diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py
@@ -6,7 +6,6 @@
 from vivarium import ConfigTree
 from vivarium.framework.randomness import RandomnessStream
 
-from pseudopeople import schema_entities
 from pseudopeople.utilities import get_index_to_noise
 
 
@@ -82,7 +81,7 @@ def __call__(
             column.loc[to_noise_idx], configuration, randomness_stream, additional_key
         )
 
-        # Coerce noised column dtype back to original column's if it's changed
+        # Coerce noised column dtype back to original column's if it has changed
         if noised_data.dtype.name != column.dtype.name:
             noised_data = noised_data.astype(column.dtype)
 
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
@@ -1,8 +1,9 @@
 from pathlib import Path
-from typing import List, Union
+from typing import Union
 
 import pandas as pd
 import pyarrow.parquet as pq
+from loguru import logger
 
 from pseudopeople.configuration import get_configuration
 from pseudopeople.constants import paths
@@ -12,7 +13,7 @@
 
 def _generate_form(
     form: Form,
-    source: Union[Path, str, pd.DataFrame],
+    source: Union[Path, str],
     seed: int,
     configuration: Union[Path, str, dict],
     year_filter: dict,
@@ -23,7 +24,7 @@ def _generate_form(
     :param form:
         Form needing to be noised
     :param source:
-        Clean data input which needs to be noised
+        Root directory of clean data input which needs to be noised
     :param seed:
         Seed for controlling randomness
     :param configuration:
@@ -32,73 +33,88 @@ def _generate_form(
         Noised form data in a pd.DataFrame
     """
     configuration_tree = get_configuration(configuration)
+    # TODO: we should save outputs of the simulation with filenames that are
+    #  consistent with the names of the forms if possible.
+    form_file_name = {
+        FORMS.acs.name: "household_survey_observer_acs",
+        FORMS.cps.name: "household_survey_observer_cps",
+        FORMS.tax_w2_1099.name: "tax_w2_observer",
+        FORMS.wic.name: "wic_observer",
+    }.get(form.name, f"{form.name}_observer")
     if source is None:
-        # TODO: hard-coding the .parquet extension for now. This will go away
-        #  once we only support passing the root directory of the data.
-        # TODO: we should save outputs of the simulation with filenames that are
-        #  consistent with the names of the forms if possible.
-        form_file_name = {
-            FORMS.acs.name: "household_survey_observer_acs",
-            FORMS.cps.name: "household_survey_observer_cps",
-            FORMS.tax_w2_1099.name: "tax_w2_observer",
-            FORMS.wic.name: "wic_observer",
-        }.get(form.name, f"{form.name}_observer")
-
-        source = paths.SAMPLE_DATA_ROOT / form_file_name / f"{form_file_name}.parquet"
-    if isinstance(source, str):
-        source = Path(source)
-    if isinstance(source, pd.DataFrame):
-        data = source
-    elif isinstance(source, Path):
-        if source.suffix == ".hdf":
-            with pd.HDFStore(str(source), mode="r") as hdf_store:
+        source = paths.SAMPLE_DATA_ROOT
+    source = Path(source) / form_file_name
+    data_paths = [x for x in source.glob(f"{form_file_name}*")]
+    if not data_paths:
+        logger.warning(
+            f"No datasets found at directory {str(source)}. "
+            "Please provide the path to the unmodified root data directory."
+        )
+        return None
+    suffix = set(x.suffix for x in data_paths)
+    if len(suffix) > 1:
+        raise TypeError(
+            f"Only one type of file extension expected but more than one found: {suffix}. "
+            "Please provide the path to the unmodified root data directory."
+        )
+    noised_form = []
+    columns_to_keep = [c for c in form.columns]
+    for data_path in data_paths:
+        if data_path.suffix == ".hdf":
+            with pd.HDFStore(str(data_path), mode="r") as hdf_store:
                 data = hdf_store.select("data", where=year_filter["hdf"])
-            hdf_store.close()
-        elif source.suffix == ".parquet":
-            data = pq.read_table(source, filters=year_filter["parquet"]).to_pandas()
+        elif data_path.suffix == ".parquet":
+            data = pq.read_table(data_path, filters=year_filter["parquet"]).to_pandas()
         else:
             raise ValueError(
                 "Source path must either be a .hdf or a .parquet file. Provided "
-                f"{source.suffix}"
+                f"{data_path.suffix}"
             )
         if not isinstance(data, pd.DataFrame):
-            raise TypeError(f"File located at {source} must contain a pandas DataFrame.")
-    else:
-        raise TypeError(
-            f"Source {source} must be either a pandas DataFrame or a path to a "
-            "file containing a pandas DataFrame."
-        )
+            raise TypeError(
+                f"File located at {data_path} must contain a pandas DataFrame. "
+                "Please provide the path to the unmodified root data directory."
+            )
 
-    columns_to_keep = [c for c in form.columns]
-    # Coerce dtypes
+        # Coerce dtypes prior to noising, both to catch issues early and to
+        # move most columns from dtype 'category' to 'object' (strings)
+        for col in columns_to_keep:
+            if col.dtype_name != data[col.name].dtype.name:
+                data[col.name] = data[col.name].astype(col.dtype_name)
+
+        noised_data = noise_form(form, data, configuration_tree, seed)
+        noised_data = _extract_columns(columns_to_keep, noised_data)
+        noised_form.append(noised_data)
+
+    noised_form = pd.concat(noised_form, ignore_index=True)
+
+    # Known pandas bug: pd.concat does not preserve category dtypes so we coerce
+    # again after concat (https://github.com/pandas-dev/pandas/issues/51362)
     for col in columns_to_keep:
-        if col.dtype_name != data[col.name].dtype.name:
-            data[col.name] = data[col.name].astype(col.dtype_name)
-    noised_form = noise_form(form, data, configuration_tree, seed)
-    noised_form = _extract_columns(columns_to_keep, noised_form)
+        if col.dtype_name != noised_form[col.name].dtype.name:
+            noised_form[col.name] = noised_form[col.name].astype(col.dtype_name)
+
     return noised_form
 
 
 def _extract_columns(columns_to_keep, noised_form):
+    """Helper function for test mocking purposes"""
     if columns_to_keep:
         noised_form = noised_form[[c.name for c in columns_to_keep]]
     return noised_form
 
 
 # TODO: add year as parameter to select the year of the decennial census to generate (MIC-3909)
-# TODO: add default path: have the package install the small data in a known location and then
-#  to make this parameter optional, with the default being the location of the small data that
-#  is installed with the package (MIC-3884)
 def generate_decennial_census(
-    source: Union[Path, str, pd.DataFrame] = None,
+    source: Union[Path, str] = None,
     seed: int = 0,
     configuration: Union[Path, str, dict] = None,
     year: int = 2020,
 ) -> pd.DataFrame:
     """
     Generates noised decennial census data from un-noised data.
 
-    :param source: A path to or pd.DataFrame of the un-noised source census data
+    :param source: A path to the root directory of the un-noised source census data
     :param seed: An integer seed for randomness
     :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :param year: The year from the data to noise
@@ -112,15 +128,15 @@ def generate_decennial_census(
 
 
 def generate_american_communities_survey(
-    source: Union[Path, str, pd.DataFrame] = None,
+    source: Union[Path, str] = None,
     seed: int = 0,
     configuration: Union[Path, str, dict] = None,
     year: int = 2020,
 ) -> pd.DataFrame:
     """
     Generates noised American Communities Survey (ACS) data from un-noised data.
 
-    :param source: A path to or pd.DataFrame of the un-noised source ACS data
+    :param source: A path to the root directory of the un-noised source ACS data
     :param seed: An integer seed for randomness
     :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :param year: The year from the data to noise
@@ -140,15 +156,15 @@ def generate_american_communities_survey(
 
 
 def generate_current_population_survey(
-    source: Union[Path, str, pd.DataFrame] = None,
+    source: Union[Path, str] = None,
     seed: int = 0,
     configuration: Union[Path, str, dict] = None,
     year: int = 2020,
 ) -> pd.DataFrame:
     """
     Generates noised Current Population Survey (CPS) data from un-noised data.
 
-    :param source: A path to or pd.DataFrame of the un-noised source CPS data
+    :param source: A path to the root directory of the un-noised source CPS data
     :param seed: An integer seed for randomness
     :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :param year: The year from the data to noise
@@ -168,15 +184,15 @@ def generate_current_population_survey(
 
 
 def generate_taxes_w2_and_1099(
-    source: Union[Path, str, pd.DataFrame] = None,
+    source: Union[Path, str] = None,
     seed: int = 0,
     configuration: Union[Path, str, dict] = None,
     year: int = 2020,
 ) -> pd.DataFrame:
     """
     Generates noised W2 and 1099 data from un-noised data.
 
-    :param source: A path to or pd.DataFrame of the un-noised source W2 and 1099 data
+    :param source: A path to the root directory of the un-noised source W2 and 1099 data
     :param seed: An integer seed for randomness
     :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :param year: The year from the data to noise
@@ -191,15 +207,15 @@ def generate_taxes_w2_and_1099(
 
 
 def generate_women_infants_and_children(
-    source: Union[Path, str, pd.DataFrame] = None,
+    source: Union[Path, str] = None,
     seed: int = 0,
     configuration: Union[Path, str, dict] = None,
     year: int = 2020,
 ) -> pd.DataFrame:
     """
     Generates noised Women Infants and Children (WIC) data from un-noised data.
 
-    :param source: A path to or pd.DataFrame of the un-noised source WIC data
+    :param source: A path to the root directory of the un-noised source WIC data
     :param seed: An integer seed for randomness
     :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :param year: The year from the data to noise
@@ -214,15 +230,15 @@ def generate_women_infants_and_children(
 
 
 def generate_social_security(
-    source: Union[Path, str, pd.DataFrame] = None,
+    source: Union[Path, str] = None,
     seed: int = 0,
     configuration: Union[Path, str, dict] = None,
     year: int = 2020,
 ) -> pd.DataFrame:
     """
     Generates noised Social Security (SSA) data from un-noised data.
 
-    :param source: A path to or pd.DataFrame of the un-noised source SSA data
+    :param source: A path to the root directory of the un-noised source SSA data
     :param seed: An integer seed for randomness
     :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :param year: The year up to which to noise from the data
diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py
@@ -15,29 +15,64 @@
 )
 from pseudopeople.schema_entities import COLUMNS, FORMS
 
+# TODO: Move into a metadata file and import metadata into prl
+DATA_COLUMNS = ["year", "event_date", "survey_date", "tax_year"]
+
 
 @pytest.mark.parametrize(
-    "data_dir_name, noising_function",
+    "data_dir_name, noising_function, use_sample_data",
     [
-        ("decennial_census_observer", generate_decennial_census),
-        ("household_survey_observer_acs", generate_american_communities_survey),
-        ("household_survey_observer_cps", generate_current_population_survey),
-        ("social_security_observer", generate_social_security),
-        ("tax_w2_observer", generate_taxes_w2_and_1099),
-        ("wic_observer", generate_women_infants_and_children),
-        ("tax 1040", "todo"),
+        ("decennial_census_observer", generate_decennial_census, True),
+        ("decennial_census_observer", generate_decennial_census, False),
+        ("household_survey_observer_acs", generate_american_communities_survey, True),
+        ("household_survey_observer_acs", generate_american_communities_survey, False),
+        ("household_survey_observer_cps", generate_current_population_survey, True),
+        ("household_survey_observer_cps", generate_current_population_survey, False),
+        ("social_security_observer", generate_social_security, True),
+        ("social_security_observer", generate_social_security, False),
+        ("tax_w2_observer", generate_taxes_w2_and_1099, True),
+        ("tax_w2_observer", generate_taxes_w2_and_1099, False),
+        ("wic_observer", generate_women_infants_and_children, True),
+        ("wic_observer", generate_women_infants_and_children, False),
+        ("tax 1040", "todo", True),
+        ("tax 1040", "todo", False),
     ],
 )
-def test_generate_form(data_dir_name: str, noising_function: Callable):
+def test_generate_form(
+    data_dir_name: str, noising_function: Callable, use_sample_data: bool, tmpdir
+):
+    """Tests that noised forms are generated as expected. The 'use_sample_data'
+    parameter determines whether to use the sample data (if True) or
+    a non-default root directory with multiple datasets to combine (if False).
+    """
     if noising_function == "todo":
         pytest.skip(reason=f"TODO: implement form {data_dir_name}")
-    # todo fix hard-coding in MIC-3960
-    data_path = paths.SAMPLE_DATA_ROOT / data_dir_name / f"{data_dir_name}.parquet"
-    data = pd.read_parquet(data_path)
 
-    noised_data = noising_function(seed=0)
-    noised_data_same_seed = noising_function(seed=0)
-    noised_data_different_seed = noising_function(seed=1)
+    sample_data_path = list(
+        (paths.SAMPLE_DATA_ROOT / data_dir_name).glob(f"{data_dir_name}*")
+    )[0]
+
+    # Load the unnoised sample data
+    if sample_data_path.suffix == ".parquet":
+        data = pd.read_parquet(sample_data_path)
+    elif sample_data_path.suffix == ".hdf":
+        data = pd.read_hdf(sample_data_path)
+    else:
+        raise NotImplementedError(
+            f"Expected hdf or parquet but got {sample_data_path.suffix}"
+        )
+
+    # Configure if default (sample data) is used or a different root directory
+    if use_sample_data:
+        source = None  # will default to using sample data
+    else:
+        source = _generate_non_default_data_root(
+            data_dir_name, tmpdir, sample_data_path, data
+        )
+
+    noised_data = noising_function(seed=0, source=source)
+    noised_data_same_seed = noising_function(seed=0, source=source)
+    noised_data_different_seed = noising_function(seed=1, source=source)
 
     assert not data.equals(noised_data)
     assert noised_data.equals(noised_data_same_seed)
@@ -52,6 +87,38 @@ def test_generate_form(data_dir_name: str, noising_function: Callable):
         assert noised_data[col].dtype == expected_dtype
 
 
+def _generate_non_default_data_root(data_dir_name, tmpdir, sample_data_path, data):
+    """Helper function to break the single sample dataset into two and save
+    out to tmpdir to be used as a non-default 'source' argument
+    """
+    outdir = tmpdir.mkdir(data_dir_name)
+    suffix = sample_data_path.suffix
+    split_idx = int(len(data) / 2)
+    if suffix == ".parquet":
+        data[:split_idx].to_parquet(outdir / f"{data_dir_name}_1{suffix}")
+        data[split_idx:].to_parquet(outdir / f"{data_dir_name}_2{suffix}")
+    elif suffix == ".hdf":
+        data[:split_idx].to_hdf(
+            outdir / f"{data_dir_name}_1{suffix}",
+            "data",
+            format="table",
+            complib="bzip2",
+            complevel=9,
+            data_columns=DATA_COLUMNS,
+        )
+        data[split_idx:].to_hdf(
+            outdir / f"{data_dir_name}_2{suffix}",
+            "data",
+            format="table",
+            complib="bzip2",
+            complevel=9,
+            data_columns=DATA_COLUMNS,
+        )
+    else:
+        raise NotImplementedError(f"Requires hdf or parquet, got {suffix}")
+    return tmpdir
+
+
 # TODO [MIC-4000]: add test that each col to get noised actually does get noised