Merge pull request #61 from UDST/large_mnl_sim_w_interactions

mxndrwgrdnr · web-flow · commit 21d2acc13d0c · 2018-11-15T18:47:46.000-08:00
latest updates to segmented MNL
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='urbansim_templates',
-    version='0.1.dev16',
+    version='0.1.dev17',
     description='UrbanSim extension for managing model steps',
     author='UrbanSim Inc.',
     author_email='info@urbansim.com',
diff --git a/urbansim_templates/__init__.py b/urbansim_templates/__init__.py
@@ -1 +1 @@
-version = __version__ = '0.1.dev16'
+version = __version__ = '0.1.dev17'
diff --git a/urbansim_templates/models/large_multinomial_logit.py b/urbansim_templates/models/large_multinomial_logit.py
@@ -446,13 +446,34 @@ def fit(self, mct=None):
         self.mergedchoicetable = mct
             
     
-    def run(self):
+    def run(self, chooser_batch_size=None, interaction_terms=None):
         """
         Run the model step: simulate choices and use them to update an Orca column.
         
         The simulated choices are saved to the class object for diagnostics ('choices').
         If choices are unconstrained, the probabilities of sampled alternatives are saved
         as well ('probabilities').
+
+        Parameters
+        ----------
+        chooser_batch_size : int, optional
+            This parameter gets passed to 
+            choicemodels.tools.simulation.iterative_lottery_choices and is a temporary
+            workaround for dealing with memory issues that arise from generating massive
+            merged choice tables for simulations that involve large numbers of choosers,
+            large numbers of alternatives, and large numbers of predictors. It allows the
+            user to specify a batch size for simulating choices one chunk at a time. 
+
+        interaction_terms : pandas.Series, pandas.DataFrame, or list of either, optional
+            Additional column(s) of interaction terms whose values depend on the combination 
+            of observation and alternative, to be merged onto the final data table. If passed
+            as a Series or DataFrame, it should include a two-level MultiIndex. One level's 
+            name and values should match an index or column from the observations table, and 
+            the other should match an index or column from the alternatives table. 
+        
+        Returns
+        -------
+        None
         
         """
         obs = self._get_df(tables=self.out_choosers, fallback_tables=self.choosers, 
@@ -465,15 +486,18 @@ def run(self):
                 fitted_parameters = self.fitted_parameters)
 
         def mct(obs, alts):
-            return MergedChoiceTable(obs, alts, sample_size=self.alt_sample_size)
+            return MergedChoiceTable(
+                obs, alts, sample_size=self.alt_sample_size,
+                interaction_terms=interaction_terms)
         
         def probs(mct):
             return model.probabilities(mct)
 
         if (self.constrained_choices == True):
             choices = iterative_lottery_choices(obs, alts, mct_callable=mct, 
                     probs_callable=probs, alt_capacity=self.alt_capacity,
-                    chooser_size=self.chooser_size, max_iter=self.max_iter)
+                    chooser_size=self.chooser_size, max_iter=self.max_iter,
+                    chooser_batch_size=chooser_batch_size)
             
         else:
             probs = probs(mct(obs, alts))
diff --git a/urbansim_templates/models/segmented_large_multinomial_logit.py b/urbansim_templates/models/segmented_large_multinomial_logit.py
@@ -12,10 +12,11 @@
 from ..utils import update_name
 from .. import modelmanager
 from . import LargeMultinomialLogitStep
+from .shared import TemplateStep
 
 
 @modelmanager.template
-class SegmentedLargeMultinomialLogitStep():
+class SegmentedLargeMultinomialLogitStep(TemplateStep):
     """
     This template automatically generates a set of LargeMultinomialLogitStep submodels
     corresponding to "segments" or categories of choosers. The submodels can be directly 
@@ -132,13 +133,15 @@ def get_segmentation_column(self):
         # TO DO - this doesn't filter for columns in the model expression; is there
         #   centralized functionality for this merge that we should be using instead?
         
-        obs = orca.get_table(self.defaults.choosers).to_frame()
-        obs = apply_filter_query(obs, self.defaults.chooser_filters)
-        
-        alts = orca.get_table(self.defaults.alternatives).to_frame()
-        alts = apply_filter_query(alts, self.defaults.alt_filters)
+        obs = self._get_df(
+            tables=self.defaults.choosers,
+            filters=self.defaults.chooser_filters)
+
+        alts = self._get_df(
+            tables=self.defaults.alternatives,
+            filters=self.defaults.alt_filters)
 
-        df = pd.merge(obs, alts, how='inner', 
+        df = pd.merge(obs, alts, how='inner',
                       left_on=self.defaults.choice_column, right_index=True)
         
         return df[self.segmentation_column]
@@ -222,6 +225,8 @@ def fit_all(self):
             self.build_submodels()
         
         for k, m in self.submodels.items():
+            print(' SEGMENT: {0} = {1} '.format(
+                self.segmentation_column, str(k)).center(70, '#'))
             m.fit()
         
         self.name = update_name(self.template, self.name)
diff --git a/urbansim_templates/tests/test_segmented_large_multinomial_logit.py b/urbansim_templates/tests/test_segmented_large_multinomial_logit.py
@@ -32,6 +32,51 @@ def orca_session():
     orca.add_table('alts', alts)
 
 
+@pytest.fixture
+def orca_session_alts_as_list():
+    """
+    Set up an Orca session with a chooser table and two alternatives tables joined by a broadcast.
+    
+    """
+    d1 = {'oid': np.arange(100),
+          'group': np.random.choice(['A', 'B', 'C'], size=100),
+          'int_group': np.random.choice([3, 4], size=100),
+          'obsval': np.random.random(100),
+          'choice': np.random.choice(np.arange(20), size=100)}
+
+    d2 = {'aid': np.arange(20),
+          'altval': np.random.random(20)}
+
+    d3 = {'aid': np.arange(20),
+          'altval_2': np.random.random(20)}
+
+    obs = pd.DataFrame(d1).set_index('oid')
+    orca.add_table('obs', obs)
+
+    d2_df = pd.DataFrame(d2).set_index('aid')
+    orca.add_table('d2', d2_df)
+
+    d3_df = pd.DataFrame(d3).set_index('aid')
+    orca.add_table('d3', d3_df)
+
+    orca.broadcast('d3', 'd2', cast_index=True, onto_index=True)
+
+
+@pytest.fixture
+def m_alts_as_list(orca_session_alts_as_list):
+    """
+    Set up a partially configured model step with multiple
+    tables of alternatives.
+    """
+    m = SegmentedLargeMultinomialLogitStep()
+    m.defaults.choosers = 'obs'
+    m.defaults.alternatives = ['d2', 'd3']
+    m.defaults.choice_column = 'choice'
+    m.defaults.model_expression = 'obsval + altval + altval_2'
+    m.segmentation_column = 'group'
+    return m
+
+
 @pytest.fixture
 def m(orca_session):
     """
@@ -55,6 +100,25 @@ def test_template_validity():
     assert validate_template(SegmentedLargeMultinomialLogitStep)
 
 
+def test_basic_operation(m):
+    """
+    Test basic operation of the template.
+    
+    """
+    m.fit_all()
+    m.to_dict()
+    assert len(m.submodels) == 3
+
+def test_basic_operation_alts_as_list(m_alts_as_list):
+    """
+    Test basic operation of the template with alternatives given as a list of tables.
+    
+    """
+    m = m_alts_as_list
+    m.fit_all()
+    m.to_dict()
+    assert len(m.submodels) == 3
+
 def test_basic_operation(m):
     """
     Test basic operation of the template.
@@ -103,6 +167,20 @@ def test_alternative_filters(m):
     assert len1 == len2
 
 
+def test_alternative_filters_for_alts_as_list(m_alts_as_list):
+    """
+    Test that default alternative filters work when alternatives are a list of tables.
+    
+    """
+    m = m_alts_as_list
+    m.defaults.alt_filters = 'altval_2 < 0.5'
+    
+    m.build_submodels()
+    for k, v in m.submodels.items():
+        alts = v._get_df(tables = v.alternatives, filters = v.alt_filters)
+        assert alts['altval_2'].max() < 0.5
+
+
 def test_submodel_filters(m):
     """
     Test that submodel filters generate the correct data subset.

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-version = __version__ = '0.1.dev16'`
	`1`	`+version = __version__ = '0.1.dev17'`