11from pathlib import Path
2- from typing import Union
2+ from typing import List , Union
33
44import pandas as pd
5+ import pyarrow .parquet as pq
56
67from pseudopeople .configuration import get_configuration
78from pseudopeople .constants import paths
@@ -14,6 +15,7 @@ def _generate_form(
1415 source : Union [Path , str , pd .DataFrame ],
1516 seed : int ,
1617 configuration : Union [Path , str , dict ],
18+ year_filter : dict ,
1719) -> pd .DataFrame :
1820 """
1921 Helper for generating noised forms from clean data.
@@ -49,9 +51,11 @@ def _generate_form(
4951 data = source
5052 elif isinstance (source , Path ):
5153 if source .suffix == ".hdf" :
52- data = pd .read_hdf (source )
54+ with pd .HDFStore (str (source ), mode = "r" ) as hdf_store :
55+ data = hdf_store .select ("data" , where = year_filter ["hdf" ])
56+ hdf_store .close ()
5357 elif source .suffix == ".parquet" :
54- data = pd . read_parquet (source )
58+ data = pq . read_table (source , filters = year_filter [ "parquet" ]). to_pandas ( )
5559 else :
5660 raise ValueError (
5761 "Source path must either be a .hdf or a .parquet file. Provided "
@@ -64,7 +68,21 @@ def _generate_form(
6468 f"Source { source } must be either a pandas DataFrame or a path to a "
6569 "file containing a pandas DataFrame."
6670 )
67- return noise_form (form , data , configuration_tree , seed )
71+
72+ columns_to_keep = [c for c in form .columns ]
73+ # Coerce dtypes
74+ for col in columns_to_keep :
75+ if col .dtype_name != data [col .name ].dtype .name :
76+ data [col .name ] = data [col .name ].astype (col .dtype_name )
77+ noised_form = noise_form (form , data , configuration_tree , seed )
78+ noised_form = _extract_columns (columns_to_keep , noised_form )
79+ return noised_form
80+
81+
def _extract_columns(columns_to_keep, noised_form):
    """
    Restrict a noised form to a chosen set of columns.

    :param columns_to_keep: Collection of column objects exposing a ``name`` attribute;
        when empty (falsy) the form is returned untouched
    :param noised_form: The noised data to subset
    :return: ``noised_form`` limited to the named columns, in the order given by
        ``columns_to_keep``, or ``noised_form`` itself when nothing is requested
    """
    # Nothing requested: hand the input back unchanged rather than selecting zero columns.
    if not columns_to_keep:
        return noised_form
    wanted_names = [column.name for column in columns_to_keep]
    return noised_form[wanted_names]
6886
6987
7088# TODO: add year as parameter to select the year of the decennial census to generate (MIC-3909)
def generate_decennial_census(
    source: Union[Path, str, pd.DataFrame] = None,
    seed: int = 0,
    configuration: Union[Path, str, dict] = None,
    year: int = 2020,
) -> pd.DataFrame:
    """
    Produce noised decennial census data from un-noised data.

    :param source: The un-noised source census data, either a pd.DataFrame or a path to it
    :param seed: An integer seed for randomness
    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
    :param year: The year from the data to noise; a falsy value (e.g. None) leaves both year filters as None
    :return: A pd.DataFrame of noised census data
    """
    if year:
        date_column = FORMS.census.date_column
        year_filter = {
            # The year is rendered with a trailing "." (e.g. "2020.") —
            # presumably so the HDF query compares against a float; TODO confirm.
            "hdf": [f"{date_column} == {year}."],
            "parquet": [(date_column, "==", year)],
        }
    else:
        year_filter = {"hdf": None, "parquet": None}
    # NOTE(review): unlike the other generate_* functions in this module, the
    # year is not folded into the seed here — confirm that is intentional.
    return _generate_form(FORMS.census, source, seed, configuration, year_filter)
88112
89113
def generate_american_communities_survey(
    source: Union[Path, str, pd.DataFrame] = None,
    seed: int = 0,
    configuration: Union[Path, str, dict] = None,
    year: int = 2020,
) -> pd.DataFrame:
    """
    Produce noised American Communities Survey (ACS) data from un-noised data.

    :param source: The un-noised source ACS data, either a pd.DataFrame or a path to it
    :param seed: An integer seed for randomness; combined with ``year`` when a year is given
    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
    :param year: The year from the data to noise; a falsy value (e.g. None) leaves both year filters as None
    :return: A pd.DataFrame of noised ACS data
    """
    year_filter = {"hdf": None, "parquet": None}
    if not year:
        return _generate_form(FORMS.acs, source, seed, configuration, year_filter)

    date_column = FORMS.acs.date_column
    first_day = f"{year}-01-01"
    last_day = f"{year}-12-31"
    # NOTE(review): the upper bound is midnight on Dec 31; if the column carries
    # a time of day, later rows on that date would be excluded — confirm.
    year_filter["hdf"] = [
        f"{date_column} >= '{first_day}' and {date_column} <= '{last_day}'"
    ]
    year_filter["parquet"] = [
        (date_column, ">=", pd.Timestamp(first_day)),
        (date_column, "<=", pd.Timestamp(last_day)),
    ]
    # Derive a year-specific seed from the caller's seed.
    year_seed = seed * 10_000 + year
    return _generate_form(FORMS.acs, source, year_seed, configuration, year_filter)
104140
105141
def generate_current_population_survey(
    source: Union[Path, str, pd.DataFrame] = None,
    seed: int = 0,
    configuration: Union[Path, str, dict] = None,
    year: int = 2020,
) -> pd.DataFrame:
    """
    Produce noised Current Population Survey (CPS) data from un-noised data.

    :param source: The un-noised source CPS data, either a pd.DataFrame or a path to it
    :param seed: An integer seed for randomness; combined with ``year`` when a year is given
    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
    :param year: The year from the data to noise; a falsy value (e.g. None) leaves both year filters as None
    :return: A pd.DataFrame of noised CPS data
    """
    hdf_filter = None
    parquet_filter = None
    if year:
        date_column = FORMS.cps.date_column
        year_start, year_end = f"{year}-01-01", f"{year}-12-31"
        # NOTE(review): the upper bound is midnight on Dec 31; if the column
        # carries a time of day, later rows on that date would be excluded — confirm.
        hdf_filter = [
            f"{date_column} >= '{year_start}' and {date_column} <= '{year_end}'"
        ]
        parquet_filter = [
            (date_column, ">=", pd.Timestamp(year_start)),
            (date_column, "<=", pd.Timestamp(year_end)),
        ]
        # Derive a year-specific seed from the caller's seed.
        seed = seed * 10_000 + year
    year_filter = {"hdf": hdf_filter, "parquet": parquet_filter}
    return _generate_form(FORMS.cps, source, seed, configuration, year_filter)
120168
121169
def generate_taxes_w2_and_1099(
    source: Union[Path, str, pd.DataFrame] = None,
    seed: int = 0,
    configuration: Union[Path, str, dict] = None,
    year: int = 2020,
) -> pd.DataFrame:
    """
    Produce noised W2 and 1099 data from un-noised data.

    :param source: The un-noised source W2 and 1099 data, either a pd.DataFrame or a path to it
    :param seed: An integer seed for randomness; combined with ``year`` when a year is given
    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
    :param year: The year from the data to noise; a falsy value (e.g. None) leaves both year filters as None
    :return: A pd.DataFrame of noised W2 and 1099 data
    """
    form = FORMS.tax_w2_1099
    if not year:
        no_filter = {"hdf": None, "parquet": None}
        return _generate_form(form, source, seed, configuration, no_filter)

    date_column = form.date_column
    year_filter = {
        # The year is rendered with a trailing "." (e.g. "2020.") —
        # presumably so the HDF query compares against a float; TODO confirm.
        "hdf": [f"{date_column} == {year}."],
        "parquet": [(date_column, "==", year)],
    }
    # Derive a year-specific seed from the caller's seed.
    year_seed = seed * 10_000 + year
    return _generate_form(form, source, year_seed, configuration, year_filter)
136191
137192
def generate_women_infants_and_children(
    source: Union[Path, str, pd.DataFrame] = None,
    seed: int = 0,
    configuration: Union[Path, str, dict] = None,
    year: int = 2020,
) -> pd.DataFrame:
    """
    Produce noised Women Infants and Children (WIC) data from un-noised data.

    :param source: The un-noised source WIC data, either a pd.DataFrame or a path to it
    :param seed: An integer seed for randomness; combined with ``year`` when a year is given
    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
    :param year: The year from the data to noise; a falsy value (e.g. None) leaves both year filters as None
    :return: A pd.DataFrame of noised WIC data
    """
    hdf_filter, parquet_filter = None, None
    if year:
        date_column = FORMS.wic.date_column
        # The year is rendered with a trailing "." (e.g. "2020.") —
        # presumably so the HDF query compares against a float; TODO confirm.
        hdf_filter = [f"{date_column} == {year}."]
        parquet_filter = [(date_column, "==", year)]
        # Derive a year-specific seed from the caller's seed.
        seed = seed * 10_000 + year
    return _generate_form(
        FORMS.wic,
        source,
        seed,
        configuration,
        {"hdf": hdf_filter, "parquet": parquet_filter},
    )
152214
153215
def generate_social_security(
    source: Union[Path, str, pd.DataFrame] = None,
    seed: int = 0,
    configuration: Union[Path, str, dict] = None,
    year: int = 2020,
) -> pd.DataFrame:
    """
    Generates noised Social Security (SSA) data from un-noised data.

    Unlike the single-year generators, this keeps every record dated on or
    before the end of ``year``.

    :param source: A path to or pd.DataFrame of the un-noised source SSA data
    :param seed: An integer seed for randomness; combined with ``year`` when a year is given
    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
    :param year: The year up to which to noise from the data; a falsy value (e.g. None) leaves both year filters as None
    :return: A pd.DataFrame of noised SSA data
    """
    year_filter = {"hdf": None, "parquet": None}
    if year:
        date_column = FORMS.ssa.date_column
        last_day = f"{year}-12-31"
        # Both backends must select the same rows. The parquet filter compares
        # the column to a Timestamp, so the HDF query uses the matching quoted
        # date (as the ACS/CPS generators do) rather than a bare float year.
        # NOTE(review): assumes the SSA date column is datetime-like in both
        # storage formats — confirm against the HDF schema.
        year_filter["hdf"] = [f"{date_column} <= '{last_day}'"]
        year_filter["parquet"] = [(date_column, "<=", pd.Timestamp(last_day))]
        # Derive a year-specific seed from the caller's seed.
        seed = seed * 10_000 + year
    return _generate_form(FORMS.ssa, source, seed, configuration, year_filter)
0 commit comments