Skip to content

Commit 91aefa7

Browse files
This is a major commit before the beta release. It includes the following:
1) the setup.py has been modified so that popgen can be distributed as a Python package 2) weights from the reweighting step can now be output either by geo or summed across all geographies 3) reweighting performance measures are now output at a frequency specified by the user 4) ipf performance measures are now output at a frequency specified by the user 5) the entropy approach to reweighting is now included - this has also been refactored so that its performance is now comparable to the IPU procedure 6) ipf region outputs were output incorrectly - this was a bug and is fixed in this release 7) in the earlier version, results for different scenarios would have been overwritten. Also, when you ran the same project twice, results would have been overwritten. This doesn't happen now - every time a scenario is run, PopGen stores the results in a folder whose name is built from the scenario name and the clock time at the time of exporting results. Also, the portion of the configuration file that defines the specific scenario is now copied into the results folder - this way users can keep track of different population synthesis scenario runs just within the PopGen environment and do not necessarily have to keep separate notes. 8) a simple example project is added under tutorials - users can review this tutorial to understand how the configuration and data files are specified 9) when synthetic population files are generated, three different ids are now added: unique_id_in_geo - assigns a unique id to the housing unit, and this carries over to both the housing and person files; unique_housing_id - a unique id that identifies the particular synthetic housing unit uniquely across all geographies; unique_person_id - a unique id that identifies a synthetic person uniquely across all geographies 10) geos_to_synthesize is now implemented - this allows users to specify the "region" ids for which population synthesis must be carried out. 
Note that "geo" ids cannot be specified, because users could specify a subset of geographies and still choose to match regional constraints - to avoid such inconsistent setups, only "region" ids can be specified. 11) code has been cleaned to conform to PEP 8 standards - at least to the extent prompted by atom-pep8-linter
1 parent 48e08b4 commit 91aefa7

22 files changed

+792
-297
lines changed

popgen/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
__author__ = 'Karthik Konduri'
2-
__email__ = '[email protected]'
3-
__version__ = '2.0'
1+
from .project import Project
2+
3+
__version__ = '2.0.b1'

popgen/config.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
1+
import yaml
2+
3+
4+
class ConfigError(Exception):
5+
pass
6+
7+
18
def wrap_config_value(value):
29
"""The method is used to wrap YAML elements as Config objects. So the
310
YAML properties can be accessed using attribute access.
4-
E.g. If config object - x for the following YAML is given as:
11+
E.g. If config object - x for is specificed as the following YAML:
512
613
attribbute1:
714
attribute2 : 'Value'
815
916
then attribute access x.attribute1.attribute2 is used to access "Value".
17+
Also, x.attribute can be used to access the dictionary {attribute: 'value'}
1018
"""
1119
if isinstance(value, basestring):
1220
return value
@@ -38,8 +46,8 @@ def return_value(self, key):
3846
try:
3947
value = self._data[key]
4048
except KeyError, e:
41-
raise KeyError("Key - %s doesn't exist in the YAML configuration"
42-
% key)
49+
raise ConfigError(
50+
"Key - %s doesn't exist in the YAML configuration" % key)
4351
return value
4452

4553
def __len__(self):
@@ -57,6 +65,12 @@ def return_list(self):
5765
def return_dict(self):
5866
return self._data
5967

68+
def write_to_file(self, filepath):
69+
with open(filepath, 'w') as outfile:
70+
outfile.write(yaml.dump(self._data,
71+
default_flow_style=False))
72+
73+
6074
if __name__ == "__main__":
6175
import yaml
6276

popgen/data.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import pandas as pd
44
import numpy as np
55

6+
from config import ConfigError
7+
68

79
class DB(object):
810
"""This class returns a Inputs object that can be used to handle all
@@ -45,9 +47,11 @@ def load_data(self):
4547
self.region_marginals = self.get_data(region_marginals_config,
4648
header=[0, 1])
4749

50+
self._enumerate_geo_ids()
51+
4852
def get_data(self, config, header=0):
4953
config_dict = config.return_dict()
50-
#print config_dict, type(config_dict)
54+
# print config_dict, type(config_dict)
5155
data_dict = {}
5256
for item in config_dict:
5357
full_location = os.path.abspath(config_dict[item])
@@ -56,30 +60,38 @@ def get_data(self, config, header=0):
5660
data_dict[item].loc[:,
5761
data_dict[item].index.name] = (data_dict[item]
5862
.index.values)
59-
#print data_dict[item]
60-
#print data_dict.keys()
63+
# print data_dict[item]
64+
# print data_dict.keys()
6165
return data_dict
6266

63-
def enumerate_geo_ids(self):
67+
def _enumerate_geo_ids(self):
6468
geo_to_sample = self.geo["geo_to_sample"]
65-
self.geo_ids = geo_to_sample.index.values
66-
self.sample_geo_ids = np.unique(geo_to_sample[self._inputs_config
67-
.column_names
68-
.sample_geo].values)
69-
69+
self.geo_ids_all = geo_to_sample.index.tolist()
70+
# self.sample_geo_ids = np.unique(geo_to_sample[self._inputs_config
71+
# .column_names
72+
# .sample_geo].values)
7073
region_to_geo = self.geo["region_to_geo"]
71-
self.region_ids = np.unique(region_to_geo.index.values)
74+
self.region_ids_all = np.unique(region_to_geo.index.values).tolist()
7275

73-
#region_to_sample = self.geo["region_to_sample"]
74-
#self.region_ids = np.unique(region_to_sample.index.values)
76+
# region_to_sample = self.geo["region_to_sample"]
77+
# self.region_ids = np.unique(region_to_sample.index.values)
7578

7679
def get_geo_ids_for_region(self, region_id):
7780
geo_name = self._inputs_config.column_names.geo
78-
return self.geo["region_to_geo"].loc[region_id, geo_name].copy()
79-
80-
def enumerate_geo_ids_to_synthesize(self):
81-
#TODO: Implement this to only synthesize a few geographies
82-
pass
81+
return (
82+
self.geo["region_to_geo"].loc[region_id, geo_name].copy().tolist())
83+
84+
def enumerate_geo_ids_for_scenario(self, scenario_config):
85+
try:
86+
self.region_ids = scenario_config.geos_to_synthesize.region.ids
87+
self.geo_ids = []
88+
for region_id in self.region_ids:
89+
self.geo_ids += self.get_geo_ids_for_region(region_id)
90+
except ConfigError, e:
91+
print "KeyError", e
92+
self.geo_ids = self.geo_ids_all
93+
# self.sample_geo_ids = self.sample_geo_ids_all
94+
self.region_ids = self.region_ids_all
8395

8496
def return_variables_cats(self, entity, variable_names):
8597
variables_cats = {}
@@ -95,12 +107,10 @@ def check_data(self):
95107
self.check_sample_marginals_consistency()
96108
self.check_marginals()
97109

98-
def check_sample_margianls_consistency(self):
99-
#TODO: check consistency in variables across files
100-
#TODO: check consistency in categories across files
101-
pass
102-
103-
def check_marginals(self):
104-
#TODO: check consistency in marginals across
105-
#TODO: check geo ids, sample geo ids, region ids across files
110+
def check(self):
111+
# TODO: check if the ids entered are consistent with the region ids
112+
# TODO: check consistency in variables across files
113+
# TODO: check consistency in categories across files
114+
# TODO: check consistency in marginals across
115+
# TODO: check geo ids, sample geo ids, region ids across files
106116
pass

popgen/draw.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,16 @@ def __init__(self, scenario_config, geo_ids, geo_row_idx, geo_frequencies,
2121
self.pvalue_tolerance = (
2222
self.scenario_config.parameters.draws.pvalue_tolerance)
2323
self.geo_id_rows_syn_dict = {}
24+
self.performance_columns = ["p_value", "iterations",
25+
"chi_sq_stat"]
2426
self.draws_performance = pd.DataFrame(
25-
index=self.geo_ids, columns=["p_value", "iterations",
26-
"chi_sq_stat"])
27+
index=self.geo_ids, columns=self.performance_columns)
2728

2829
def draw_population(self):
2930
np.random.seed(self.seed)
30-
#print "Drawing Households"
31-
performance_columns = ["p_value", "iterations", "chi_sq_stat"]
32-
for geo_id in self.geo_ids:
33-
#print "For geo:", geo_id
31+
# print "Drawing Households"
32+
for geo_id in self.geo_ids[:20]:
33+
# print "For geo:", geo_id
3434
geo_sample_weights = self.region_sample_weights.loc[:, geo_id]
3535
geo_cumulative_weights = (self._return_cumulative_probability(
3636
geo_sample_weights))
@@ -42,7 +42,7 @@ def draw_population(self):
4242

4343
p_value_max = -1
4444
for iter in range(self.iterations):
45-
#print "Iter is:", iter, self.iterations
45+
# print "Iter is:", iter, self.iterations
4646
seed = self.seed + iter
4747
geo_id_rows_syn = self._pick_households(
4848
geo_id_frequencies, geo_cumulative_weights)
@@ -54,7 +54,7 @@ def draw_population(self):
5454
geo_id_rows_syn, iter,
5555
stat, True)
5656
self.draws_performance.loc[geo_id,
57-
performance_columns] = (
57+
self.performance_columns] = (
5858
p_value_max, iter, stat_max)
5959
break
6060
elif p_value > p_value_max:
@@ -63,15 +63,15 @@ def draw_population(self):
6363
geo_id_rows_syn, iter,
6464
stat, False)
6565
self.draws_performance.loc[geo_id,
66-
performance_columns] = (
66+
self.performance_columns] = (
6767
p_value_max, iter, stat_max)
6868

69-
#print "Max found:", max_found, geo_id_frequencies.sum()
70-
#print "Max iter: %d, %f, %f" % (iter_max, p_value_max, stat_max)
71-
#self.syn_population.add_records_for_geo_id(
69+
# print "Max found:", max_found, geo_id_frequencies.sum()
70+
# print "Max iter: %d, %f, %f" % (iter_max, p_value_max, stat_max)
71+
# self.syn_population.add_records_for_geo_id(
7272
# geo_id, geo_id_rows_syn_max)
7373
self.geo_id_rows_syn_dict[geo_id] = geo_id_rows_syn_max
74-
#print self.draws_performance
74+
# print self.draws_performance
7575

7676
def _return_cumulative_probability(self, geo_sample_weights):
7777
geo_cumulative_weights = {}
@@ -81,7 +81,7 @@ def _return_cumulative_probability(self, geo_sample_weights):
8181
weights = geo_sample_weights.take(rows)
8282
geo_cumulative_weights[column] = (weights / weights.sum()).cumsum()
8383

84-
#print geo_cumulative_weights[column]
84+
# print geo_cumulative_weights[column]
8585
return geo_cumulative_weights
8686

8787
def _pick_households(self, geo_id_frequencies, geo_cumulative_weights):

0 commit comments

Comments
 (0)