Skip to content

Commit 8eb7e7e

Browse files
Merge branch 'develop' into noise-function-docs
2 parents 5d2489c + a299584 commit 8eb7e7e

File tree

11 files changed

+496
-203
lines changed

11 files changed

+496
-203
lines changed

README.rst

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ or build it from source with
1919

2020
``> python setup.py install``
2121

22-
This will make the ``pseudopeople`` library available to python and install a
23-
command-line executable called ``...`` that you can use to verify your
24-
installation with
25-
26-
``> ... test``
22+
Documentation
23+
======================
24+
You can view documentation at https://pseudopeople.readthedocs.io/en/latest/

src/pseudopeople/entity_types.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
from typing import Any, Callable, Dict
33

44
import pandas as pd
5+
from loguru import logger
56
from vivarium import ConfigTree
67
from vivarium.framework.randomness import RandomnessStream
78

9+
from pseudopeople import schema_entities
810
from pseudopeople.utilities import get_index_to_noise
911

1012

@@ -53,6 +55,7 @@ class ColumnNoiseType:
5355
noise_function: Callable[[pd.Series, ConfigTree, RandomnessStream, Any], pd.Series]
5456
row_noise_level: float = 0.01
5557
token_noise_level: float = 0.1
58+
noise_level_scaling_function: Callable[[str], float] = lambda x: 1.0
5659
additional_parameters: Dict[str, Any] = None
5760

5861
def __call__(
@@ -62,18 +65,27 @@ def __call__(
6265
randomness_stream: RandomnessStream,
6366
additional_key: Any,
6467
) -> pd.Series:
65-
# TODO: this is a temporary hack to account for all string columns having been made categorical
66-
# We should record expected output dtype in the columns data structure
67-
if column.dtype.name == "category":
68-
column = column.astype(str)
69-
else:
70-
column = column.copy()
71-
noise_level = configuration.row_noise_level
68+
column = column.copy()
69+
noise_level = configuration.row_noise_level * self.noise_level_scaling_function(
70+
column.name
71+
)
7272
to_noise_idx = get_index_to_noise(
7373
column, noise_level, randomness_stream, f"{self.name}_{additional_key}"
7474
)
75+
if to_noise_idx.empty:
76+
logger.debug(
77+
f"No cells chosen to noise for noise function {self.name} on column {column.name}. "
78+
"This is likely due to a combination of the configuration noise levels and the input data."
79+
)
80+
return column
7581
noised_data = self.noise_function(
7682
column.loc[to_noise_idx], configuration, randomness_stream, additional_key
7783
)
84+
85+
# Coerce noised column dtype back to original column's if it's changed
86+
if noised_data.dtype.name != column.dtype.name:
87+
noised_data = noised_data.astype(column.dtype)
88+
7889
column.loc[to_noise_idx] = noised_data
90+
7991
return column

src/pseudopeople/interface.py

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from pathlib import Path
2-
from typing import Union
2+
from typing import List, Union
33

44
import pandas as pd
5+
import pyarrow.parquet as pq
56

67
from pseudopeople.configuration import get_configuration
78
from pseudopeople.constants import paths
@@ -14,6 +15,7 @@ def _generate_form(
1415
source: Union[Path, str, pd.DataFrame],
1516
seed: int,
1617
configuration: Union[Path, str, dict],
18+
year_filter: dict,
1719
) -> pd.DataFrame:
1820
"""
1921
Helper for generating noised forms from clean data.
@@ -49,9 +51,11 @@ def _generate_form(
4951
data = source
5052
elif isinstance(source, Path):
5153
if source.suffix == ".hdf":
52-
data = pd.read_hdf(source)
54+
with pd.HDFStore(str(source), mode="r") as hdf_store:
55+
data = hdf_store.select("data", where=year_filter["hdf"])
56+
hdf_store.close()
5357
elif source.suffix == ".parquet":
54-
data = pd.read_parquet(source)
58+
data = pq.read_table(source, filters=year_filter["parquet"]).to_pandas()
5559
else:
5660
raise ValueError(
5761
"Source path must either be a .hdf or a .parquet file. Provided "
@@ -64,7 +68,21 @@ def _generate_form(
6468
f"Source {source} must be either a pandas DataFrame or a path to a "
6569
"file containing a pandas DataFrame."
6670
)
67-
return noise_form(form, data, configuration_tree, seed)
71+
72+
columns_to_keep = [c for c in form.columns]
73+
# Coerce dtypes
74+
for col in columns_to_keep:
75+
if col.dtype_name != data[col.name].dtype.name:
76+
data[col.name] = data[col.name].astype(col.dtype_name)
77+
noised_form = noise_form(form, data, configuration_tree, seed)
78+
noised_form = _extract_columns(columns_to_keep, noised_form)
79+
return noised_form
80+
81+
82+
def _extract_columns(columns_to_keep, noised_form):
83+
if columns_to_keep:
84+
noised_form = noised_form[[c.name for c in columns_to_keep]]
85+
return noised_form
6886

6987

7088
# TODO: remove this note once MIC-3909 is confirmed closed; the `year` parameter below now selects the decennial census year to generate
@@ -75,93 +93,146 @@ def generate_decennial_census(
7593
source: Union[Path, str, pd.DataFrame] = None,
7694
seed: int = 0,
7795
configuration: Union[Path, str, dict] = None,
96+
year: int = 2020,
7897
) -> pd.DataFrame:
7998
"""
8099
Generates noised decennial census data from un-noised data.
81100
82101
:param source: A path to or pd.DataFrame of the un-noised source census data
83102
:param seed: An integer seed for randomness
84103
:param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
104+
:param year: The year of the source data to noise
85105
:return: A pd.DataFrame of noised census data
86106
"""
87-
return _generate_form(FORMS.census, source, seed, configuration)
107+
year_filter = {"hdf": None, "parquet": None}
108+
if year:
109+
year_filter["hdf"] = [f"{FORMS.census.date_column} == {year}."]
110+
year_filter["parquet"] = [(FORMS.census.date_column, "==", year)]
111+
return _generate_form(FORMS.census, source, seed, configuration, year_filter)
88112

89113

90114
def generate_american_communities_survey(
91115
source: Union[Path, str, pd.DataFrame] = None,
92116
seed: int = 0,
93117
configuration: Union[Path, str, dict] = None,
118+
year: int = 2020,
94119
) -> pd.DataFrame:
95120
"""
96121
Generates noised American Community Survey (ACS) data from un-noised data.
97122
98123
:param source: A path to or pd.DataFrame of the un-noised source ACS data
99124
:param seed: An integer seed for randomness
100125
:param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
126+
:param year: The year of the source data to noise
101127
:return: A pd.DataFrame of noised ACS data
102128
"""
103-
return _generate_form(FORMS.acs, source, seed, configuration)
129+
year_filter = {"hdf": None, "parquet": None}
130+
if year:
131+
year_filter["hdf"] = [
132+
f"{FORMS.acs.date_column} >= '{year}-01-01' and {FORMS.acs.date_column} <= '{year}-12-31'"
133+
]
134+
year_filter["parquet"] = [
135+
(FORMS.acs.date_column, ">=", pd.Timestamp(f"{year}-01-01")),
136+
(FORMS.acs.date_column, "<=", pd.Timestamp(f"{year}-12-31")),
137+
]
138+
seed = seed * 10_000 + year
139+
return _generate_form(FORMS.acs, source, seed, configuration, year_filter)
104140

105141

106142
def generate_current_population_survey(
107143
source: Union[Path, str, pd.DataFrame] = None,
108144
seed: int = 0,
109145
configuration: Union[Path, str, dict] = None,
146+
year: int = 2020,
110147
) -> pd.DataFrame:
111148
"""
112149
Generates noised Current Population Survey (CPS) data from un-noised data.
113150
114151
:param source: A path to or pd.DataFrame of the un-noised source CPS data
115152
:param seed: An integer seed for randomness
116153
:param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
154+
:param year: The year of the source data to noise
117155
:return: A pd.DataFrame of noised CPS data
118156
"""
119-
return _generate_form(FORMS.cps, source, seed, configuration)
157+
year_filter = {"hdf": None, "parquet": None}
158+
if year:
159+
year_filter["hdf"] = [
160+
f"{FORMS.cps.date_column} >= '{year}-01-01' and {FORMS.cps.date_column} <= '{year}-12-31'"
161+
]
162+
year_filter["parquet"] = [
163+
(FORMS.cps.date_column, ">=", pd.Timestamp(f"{year}-01-01")),
164+
(FORMS.cps.date_column, "<=", pd.Timestamp(f"{year}-12-31")),
165+
]
166+
seed = seed * 10_000 + year
167+
return _generate_form(FORMS.cps, source, seed, configuration, year_filter)
120168

121169

122170
def generate_taxes_w2_and_1099(
123171
source: Union[Path, str, pd.DataFrame] = None,
124172
seed: int = 0,
125173
configuration: Union[Path, str, dict] = None,
174+
year: int = 2020,
126175
) -> pd.DataFrame:
127176
"""
128177
Generates noised W2 and 1099 data from un-noised data.
129178
130179
:param source: A path to or pd.DataFrame of the un-noised source W2 and 1099 data
131180
:param seed: An integer seed for randomness
132181
:param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
182+
:param year: The year of the source data to noise
133183
:return: A pd.DataFrame of noised W2 and 1099 data
134184
"""
135-
return _generate_form(FORMS.tax_w2_1099, source, seed, configuration)
185+
year_filter = {"hdf": None, "parquet": None}
186+
if year:
187+
year_filter["hdf"] = [f"{FORMS.tax_w2_1099.date_column} == {year}."]
188+
year_filter["parquet"] = [(FORMS.tax_w2_1099.date_column, "==", year)]
189+
seed = seed * 10_000 + year
190+
return _generate_form(FORMS.tax_w2_1099, source, seed, configuration, year_filter)
136191

137192

138193
def generate_women_infants_and_children(
139194
source: Union[Path, str, pd.DataFrame] = None,
140195
seed: int = 0,
141196
configuration: Union[Path, str, dict] = None,
197+
year: int = 2020,
142198
) -> pd.DataFrame:
143199
"""
144200
Generates noised Women Infants and Children (WIC) data from un-noised data.
145201
146202
:param source: A path to or pd.DataFrame of the un-noised source WIC data
147203
:param seed: An integer seed for randomness
148204
:param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
205+
:param year: The year of the source data to noise
149206
:return: A pd.DataFrame of noised WIC data
150207
"""
151-
return _generate_form(FORMS.wic, source, seed, configuration)
208+
year_filter = {"hdf": None, "parquet": None}
209+
if year:
210+
year_filter["hdf"] = [f"{FORMS.wic.date_column} == {year}."]
211+
year_filter["parquet"] = [(FORMS.wic.date_column, "==", year)]
212+
seed = seed * 10_000 + year
213+
return _generate_form(FORMS.wic, source, seed, configuration, year_filter)
152214

153215

154216
def generate_social_security(
155217
source: Union[Path, str, pd.DataFrame] = None,
156218
seed: int = 0,
157219
configuration: Union[Path, str, dict] = None,
220+
year: int = 2020,
158221
) -> pd.DataFrame:
159222
"""
160223
Generates noised Social Security (SSA) data from un-noised data.
161224
162225
:param source: A path to or pd.DataFrame of the un-noised source SSA data
163226
:param seed: An integer seed for randomness
164227
:param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
228+
:param year: The latest year of source data to noise; records dated after this year are excluded
165229
:return: A pd.DataFrame of noised SSA data
166230
"""
167-
return _generate_form(FORMS.ssa, source, seed, configuration)
231+
year_filter = {"hdf": None, "parquet": None}
232+
if year:
233+
year_filter["hdf"] = [f"{FORMS.ssa.date_column} <= {year}."]
234+
year_filter["parquet"] = [
235+
(FORMS.ssa.date_column, "<=", pd.Timestamp(f"{year}-12-31"))
236+
]
237+
seed = seed * 10_000 + year
238+
return _generate_form(FORMS.ssa, source, seed, configuration, year_filter)

src/pseudopeople/noise_entities.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from typing import NamedTuple
22

3-
from pseudopeople import noise_functions
3+
from pseudopeople import noise_functions, utilities
44
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
55

66

@@ -24,6 +24,7 @@ class __NoiseTypes(NamedTuple):
2424
incorrect_selection: ColumnNoiseType = ColumnNoiseType(
2525
"incorrect_selection",
2626
noise_functions.generate_incorrect_selections,
27+
noise_level_scaling_function=utilities.noise_scaling_incorrect_selection,
2728
token_noise_level=None,
2829
)
2930
# copy_from_within_household: ColumnNoiseType = ColumnNoiseType(

0 commit comments

Comments
 (0)