Skip to content

Commit efa6aa9

Browse files
Merge pull request #50 from ihmeuw/develop
release v0.4.0
2 parents d2a4109 + 6baccff commit efa6aa9

27 files changed

+1207
-1311
lines changed

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
**0.4.0 - 04/11/23**
2+
- Generate default configuration instead of maintaining a static file
3+
- Read sample data if no data argument is provided
4+
- Update sample datasets
5+
16
**0.3.2 - 04/10/23**
27

38
- Update sample datasets

src/pseudopeople/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
__summary__ = "pseudopeople is package which adds noise to simulated census-scale data using standard scientific Python tools."
1414
__uri__ = "https://github.com/ihmeuw/pseudopeople"
1515

16-
__version__ = "0.3.2"
16+
__version__ = "0.4.0"
1717

1818
__author__ = "The pseudopeople developers"
1919
__email__ = "vivarium.dev@gmail.com"

src/pseudopeople/configuration.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
import copy
import math
from pathlib import Path
from typing import Dict, Optional, Union

import yaml
from vivarium.config_tree import ConfigTree

from pseudopeople.schema_entities import FORMS, NOISE_TYPES
8+
9+
10+
class Keys:
    """Namespace for the repeated, non-form key names used in the configuration file"""

    # Second-layer keys, found directly beneath a form name,
    # eg <form>: row_noise: {...} and <form>: column_noise: {...}
    ROW_NOISE = "row_noise"
    COLUMN_NOISE = "column_noise"

    # Parameter keys written for individual noise types
    PROBABILITY = "probability"
    ROW_NOISE_LEVEL = "row_noise_level"
    TOKEN_NOISE_LEVEL = "token_noise_level"
18+
19+
20+
# Define non-baseline default items
21+
# NOTE: default values are defined in entity_types.RowNoiseType and entity_types.ColumnNoiseType
22+
# Each listed form overrides only the probability of its omission row noise;
# the pairs below are (form, omission probability).
DEFAULT_NOISE_VALUES = {
    form.name: {
        Keys.ROW_NOISE: {
            NOISE_TYPES.omission.name: {
                Keys.PROBABILITY: omission_probability,
            },
        },
    }
    for form, omission_probability in (
        (FORMS.census, 0.0145),
        (FORMS.acs, 0.0145),
        (FORMS.cps, 0.2905),
    )
}
45+
46+
47+
def get_configuration(
    user_configuration: Optional[Union[Path, str, Dict]] = None
) -> ConfigTree:
    """
    Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML.

    Three layers are populated in order: "baseline" (generated from the noise
    types attached to every form in ``FORMS``), "default" (the values in
    ``DEFAULT_NOISE_VALUES``) and "user" (the optional overrides). The
    resulting tree is validated before it is returned.

    :param user_configuration: A path to the YAML file or a dictionary defining user overrides for the defaults.
        ``None``, an empty dictionary or an empty YAML file means "no overrides".
    :return: a ConfigTree object of the noising configuration
    :raises NotImplementedError: if the user's age-miswriting perturbations are neither a list nor a dict
    :raises ValueError: if the final configuration fails validation
    """

    default_config_layers = [
        "baseline",
        "default",
        "user",
    ]
    noising_configuration = ConfigTree(
        layers=default_config_layers,
    )

    # Instantiate the configuration with baseline values
    noising_configuration.update(_generate_baseline_configuration(), layer="baseline")

    # Update configuration with non-baseline default values
    noising_configuration.update(DEFAULT_NOISE_VALUES, layer="default")

    # Update configuration with user-supplied values
    if user_configuration:
        if isinstance(user_configuration, (Path, str)):
            with open(user_configuration, "r", encoding="utf-8") as f:
                # NOTE(review): full_load accepts more than plain scalars, lists
                # and mappings. If configuration files never need that,
                # yaml.safe_load is the safer choice for user-supplied files.
                # An empty file loads as None, so fall back to "no overrides".
                user_configuration = yaml.full_load(f) or {}
        user_configuration = format_user_configuration(
            user_configuration, noising_configuration
        )
        noising_configuration.update(user_configuration, layer="user")

    validate_noising_configuration(noising_configuration)

    return noising_configuration


def _generate_baseline_configuration() -> Dict:
    """Build the "baseline" layer dictionary from the noise types attached to each form in FORMS."""
    baseline_dict = {}

    for form in FORMS:
        # Row noise: only noise types that define a probability get an entry
        row_noise_dict = {
            row_noise.name: {Keys.PROBABILITY: row_noise.probability}
            for row_noise in form.row_noise_types
            if row_noise.probability is not None
        }

        # Column noise: one entry per column, holding each applicable noise
        # type that defines at least one parameter
        column_dict = {}
        for column in form.columns:
            column_noise_dict = {}
            for noise_type in column.noise_types:
                column_noise_type_dict = {}
                if noise_type.row_noise_level is not None:
                    column_noise_type_dict[Keys.ROW_NOISE_LEVEL] = noise_type.row_noise_level
                if noise_type.token_noise_level is not None:
                    column_noise_type_dict[Keys.TOKEN_NOISE_LEVEL] = noise_type.token_noise_level
                if noise_type.additional_parameters is not None:
                    column_noise_type_dict.update(noise_type.additional_parameters)
                if column_noise_type_dict:
                    column_noise_dict[noise_type.name] = column_noise_type_dict
            if column_noise_dict:
                column_dict[column.name] = column_noise_dict

        # Compile, leaving out empty sections so they never appear in the tree
        form_dict = {}
        if row_noise_dict:
            form_dict[Keys.ROW_NOISE] = row_noise_dict
        if column_dict:
            form_dict[Keys.COLUMN_NOISE] = column_dict

        # Add the form's dictionary to baseline
        if form_dict:
            baseline_dict[form.name] = form_dict

    return baseline_dict
128+
129+
130+
def format_user_configuration(user_dict: Dict, default_config) -> Dict:
    """Reshape the user's configuration where needed so that it can be
    applied as an update to the noising configuration.

    At present the only reshaping performed is of the age-miswriting
    perturbations.
    """
    return _format_age_miswriting_perturbations(user_dict, default_config)
137+
138+
139+
def _format_age_miswriting_perturbations(user_dict: Dict, default_config: ConfigTree) -> Dict:
140+
# Format any age perturbation lists as a dictionary with uniform probabilites
141+
for form in user_dict:
142+
user_perturbations = (
143+
user_dict[form]
144+
.get("column_noise", {})
145+
.get("age", {})
146+
.get("age_miswriting", {})
147+
.get("possible_perturbations", {})
148+
)
149+
if not user_perturbations:
150+
continue
151+
formatted = {}
152+
default_perturbations = default_config[form]["column_noise"]["age"]["age_miswriting"][
153+
"possible_perturbations"
154+
]
155+
# Replace default configuration with 0 probabilities
156+
for perturbation in default_perturbations:
157+
formatted[perturbation] = 0
158+
if isinstance(user_perturbations, list):
159+
# Add user perturbations with uniform probabilities
160+
uniform_prob = 1 / len(user_perturbations)
161+
for perturbation in user_perturbations:
162+
formatted[perturbation] = uniform_prob
163+
elif isinstance(user_perturbations, dict):
164+
for perturbation, prob in user_perturbations.items():
165+
formatted[perturbation] = prob
166+
else:
167+
raise NotImplementedError(
168+
"age.age_miswriting.possible_perturbations can only be a list or dict, "
169+
f"received type {type(user_perturbations)}"
170+
)
171+
user_dict[form]["column_noise"]["age"]["age_miswriting"][
172+
"possible_perturbations"
173+
] = formatted
174+
175+
return user_dict
176+
177+
178+
def validate_noising_configuration(config: ConfigTree) -> None:
    """Run the validation checks against the final noising ConfigTree object.

    Only the age-miswriting perturbations are checked at present.
    """
    # TODO: validate omissions = [0, 0.5]
    _validate_age_miswriting(config)
182+
183+
184+
def _validate_age_miswriting(config: ConfigTree) -> None:
    """Check every ``possible_perturbations`` entry found in the configuration.

    Each entry must not contain the perturbation 0, and its probabilities
    must sum to 1.

    :param config: the noising configuration to check
    :raises ValueError: if an entry contains 0 or its probabilities do not sum to 1
    """
    for form_perturbations in _extract_values(config, "possible_perturbations"):
        form_perturbations_dict = form_perturbations.to_dict()
        if 0 in form_perturbations_dict:
            # TODO: Find a way to report specific location in config file
            raise ValueError("Cannot include 0 in age_miswriting.possible_perturbations")
        total_probability = sum(form_perturbations_dict.values())
        # Compare with a tolerance: accumulated floating-point rounding can
        # leave a valid set of probabilities a hair away from exactly 1
        if not math.isclose(total_probability, 1):
            raise ValueError(
                "The provided possible_perturbation probabilities must sum to 1 but they "
                f"currently sum to {total_probability}: {form_perturbations_dict}",
            )
196+
197+
198+
def _extract_values(config: Union[ConfigTree, Dict], key: str):
    """Collect, at any depth of a dict or ConfigTree, every value stored under ``key``.

    Values are returned in the order they are encountered. Any value that is
    itself a dict or ConfigTree is searched as well, including one that was
    just collected.
    """
    matches = []
    for name, value in config.items():
        if name == key:
            matches.append(value)
        if isinstance(value, (dict, ConfigTree)):
            matches.extend(_extract_values(value, key))

    return matches

src/pseudopeople/constants/paths.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,3 @@
99
QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml"
1010

1111
SAMPLE_DATA_ROOT = DATA_ROOT / "sample_forms"
12-
SAMPLE_DECENNIAL_CENSUS = SAMPLE_DATA_ROOT / "decennial_census_observer.parquet"
13-
SAMPLE_TAXES_W2_AND_1099 = SAMPLE_DATA_ROOT / "tax_w2_observer.parquet"
14-
SAMPLE_AMERICAN_COMMUNITIES_SURVEY = (
15-
SAMPLE_DATA_ROOT / "household_survey_observer_acs.parquet"
16-
)
17-
SAMPLE_CURRENT_POPULATION_SURVEY = SAMPLE_DATA_ROOT / "household_survey_observer_cps.parquet"
18-
SAMPLE_SOCIAL_SECURITY = SAMPLE_DATA_ROOT / "social_security_observer.parquet"
19-
SAMPLE_WOMEN_INFANTS_AND_CHILDREN = SAMPLE_DATA_ROOT / "wic_observer.parquet"

src/pseudopeople/data/sample_forms/decennial_census_observer.parquet renamed to src/pseudopeople/data/sample_forms/decennial_census_observer/decennial_census_observer.parquet

960 KB
Binary file not shown.
Binary file not shown.

src/pseudopeople/data/sample_forms/household_survey_observer_cps.parquet renamed to src/pseudopeople/data/sample_forms/household_survey_observer_cps/household_survey_observer_cps.parquet

98.5 KB
Binary file not shown.

src/pseudopeople/data/sample_forms/social_security_observer.parquet renamed to src/pseudopeople/data/sample_forms/social_security_observer/social_security_observer.parquet

838 KB
Binary file not shown.

src/pseudopeople/data/sample_forms/tax_1040_observer.parquet renamed to src/pseudopeople/data/sample_forms/tax_1040_observer/tax_1040_observer.parquet

4.58 MB
Binary file not shown.

0 commit comments

Comments
 (0)