3131"""
3232
3333from pathlib import Path
34- from typing import Dict , List , Union
34+ from typing import Dict , List , Optional , Union
3535
3636import pandas as pd
3737import pyarrow .parquet as pq
3838from loguru import logger
39+ from tqdm import tqdm
3940
4041from pseudopeople .configuration import get_configuration
4142from pseudopeople .constants import paths
@@ -90,8 +91,14 @@ def _generate_dataset(
9091 "Please provide the path to the unmodified root data directory."
9192 )
9293 noised_dataset = []
93- for data_path in data_paths :
94- logger .info (f"Loading data from { data_path } ." )
94+ iterator = (
95+ tqdm (data_paths , desc = "Noising data" , leave = False )
96+ if len (data_paths ) > 1
97+ else data_paths
98+ )
99+ # for data_path in tqdm(data_paths, desc="Noising data", leave=False):
100+ for data_path in iterator :
101+ logger .debug (f"Loading data from { data_path } ." )
95102 data = _load_data_from_path (data_path , year_filter )
96103
97104 data = _reformat_dates_for_noising (data , dataset )
@@ -106,6 +113,8 @@ def _generate_dataset(
106113 # again after concat (https://github.com/pandas-dev/pandas/issues/51362)
107114 noised_dataset = _coerce_dtypes (noised_dataset , dataset )
108115
116+ logger .debug ("*** Finished ***" )
117+
109118 return noised_dataset
110119
111120
@@ -162,7 +171,7 @@ def generate_decennial_census(
162171 source : Union [Path , str ] = None ,
163172 seed : int = 0 ,
164173 config : Union [Path , str , Dict [str , Dict ]] = None ,
165- year : int = 2020 ,
174+ year : Optional [ int ] = 2020 ,
166175 verbose : bool = False ,
167176) -> pd .DataFrame :
168177 """
@@ -175,8 +184,9 @@ def generate_decennial_census(
175184 :param config: An optional override to the default configuration. Can be a path
176185 to a configuration YAML file or a dictionary.
177186 :param year: The year (format YYYY) to include in the dataset. Must be a decennial
178- year (e.g. 2020, 2030, 2040). Will return an empty pd.DataFrame if there is no
179- data with this year.
187+ year (e.g. 2020, 2030, 2040). Will return an empty pd.DataFrame if there are no
188+ data with this year. If None is provided, data from all years are
189+ included in the dataset.
180190 :param verbose: Log with verbosity if True.
181191 :return: A pd.DataFrame of simulated decennial census data.
182192 :raises ConfigurationError: An incorrect config is provided.
@@ -193,7 +203,7 @@ def generate_american_community_survey(
193203 source : Union [Path , str ] = None ,
194204 seed : int = 0 ,
195205 config : Union [Path , str , Dict [str , Dict ]] = None ,
196- year : int = 2020 ,
206+ year : Optional [ int ] = 2020 ,
197207 verbose : bool = False ,
198208) -> pd .DataFrame :
199209 """
@@ -212,7 +222,8 @@ def generate_american_community_survey(
212222 :param config: An optional override to the default configuration. Can be a path
213223 to a configuration YAML file or a dictionary.
214224 :param year: The survey date year (format YYYY) to include in the dataset. Will
215- return an empty pd.DataFrame if there is no data with this year.
225+ return an empty pd.DataFrame if there are no data with this year. If None is
226+ provided, data from all years are included in the dataset.
216227 :param verbose: Log with verbosity if True.
217228 :return: A pd.DataFrame of simulated ACS data.
218229 :raises ConfigurationError: An incorrect config is provided.
@@ -235,7 +246,7 @@ def generate_current_population_survey(
235246 source : Union [Path , str ] = None ,
236247 seed : int = 0 ,
237248 config : Union [Path , str , Dict [str , Dict ]] = None ,
238- year : int = 2020 ,
249+ year : Optional [ int ] = 2020 ,
239250 verbose : bool = False ,
240251) -> pd .DataFrame :
241252 """
@@ -255,7 +266,8 @@ def generate_current_population_survey(
255266 :param config: An optional override to the default configuration. Can be a path
256267 to a configuration YAML file or a dictionary.
257268 :param year: The survey date year (format YYYY) to include in the dataset. Will
258- return an empty pd.DataFrame if there is no data with this year.
269+ return an empty pd.DataFrame if there are no data with this year. If None is
270+ provided, data from all years are included in the dataset.
259271 :param verbose: Log with verbosity if True.
260272 :return: A pd.DataFrame of simulated CPS data.
261273 :raises ConfigurationError: An incorrect config is provided.
@@ -278,7 +290,7 @@ def generate_taxes_w2_and_1099(
278290 source : Union [Path , str ] = None ,
279291 seed : int = 0 ,
280292 config : Union [Path , str , Dict [str , Dict ]] = None ,
281- year : int = 2020 ,
293+ year : Optional [ int ] = 2020 ,
282294 verbose : bool = False ,
283295) -> pd .DataFrame :
284296 """
@@ -291,7 +303,8 @@ def generate_taxes_w2_and_1099(
291303 :param config: An optional override to the default configuration. Can be a path
292304 to a configuration YAML file or a dictionary.
293305 :param year: The tax year (format YYYY) to include in the dataset. Will return
294- an empty pd.DataFrame if there is no data with this year.
306+ an empty pd.DataFrame if there are no data with this year. If None is provided,
307+ data from all years are included in the dataset.
295308 :param verbose: Log with verbosity if True.
296309 :return: A pd.DataFrame of simulated W2 and 1099 tax data.
297310 :raises ConfigurationError: An incorrect config is provided.
@@ -309,7 +322,7 @@ def generate_women_infants_and_children(
309322 source : Union [Path , str ] = None ,
310323 seed : int = 0 ,
311324 config : Union [Path , str , Dict [str , Dict ]] = None ,
312- year : int = 2020 ,
325+ year : Optional [ int ] = 2020 ,
313326 verbose : bool = False ,
314327) -> pd .DataFrame :
315328 """
@@ -327,7 +340,8 @@ def generate_women_infants_and_children(
327340 :param config: An optional override to the default configuration. Can be a path
328341 to a configuration YAML file or a dictionary.
329342 :param year: The year (format YYYY) to include in the dataset. Will return an
330- empty pd.DataFrame if there is no data with this year.
343+ empty pd.DataFrame if there are no data with this year. If None is provided,
344+ data from all years are included in the dataset.
331345 :param verbose: Log with verbosity if True.
332346 :return: A pd.DataFrame of simulated WIC data.
333347 :raises ConfigurationError: An incorrect config is provided.
@@ -345,7 +359,7 @@ def generate_social_security(
345359 source : Union [Path , str ] = None ,
346360 seed : int = 0 ,
347361 config : Union [Path , str , Dict [str , Dict ]] = None ,
348- year : int = 2020 ,
362+ year : Optional [ int ] = 2020 ,
349363 verbose : bool = False ,
350364) -> pd .DataFrame :
351365 """
@@ -358,8 +372,9 @@ def generate_social_security(
358372 :param config: An optional override to the default configuration. Can be a path
359373 to a configuration YAML file or a dictionary.
360374 :param year: The latest year (format YYYY) to include in the dataset; will also
361- include all previous years. Will return an empty pd.DataFrame if there is no
362- data on or before this year.
375+ include all previous years. Will return an empty pd.DataFrame if there are no
376+ data on or before this year. If None is provided, data from all years are
377+ included in the dataset.
363378 :param verbose: Log with verbosity if True.
364379 :return: A pd.DataFrame of simulated SSA data.
365380 :raises ConfigurationError: An incorrect config is provided.
0 commit comments