Add treatment recommendations to causal analysis

kbattocchi · kbattocchi · commit 64c4d953ee4d · 2021-05-19T02:12:32.000-04:00
diff --git a/econml/solutions/causal_analysis/_causal_analysis.py b/econml/solutions/causal_analysis/_causal_analysis.py
@@ -997,10 +997,12 @@ def _tree(self, is_policy, Xtest, feature_index, *, treatment_cost=0,
         if is_policy:
             intrp.interpret(result.estimator, Xtest,
                             sample_treatment_costs=treatment_cost)
+            treat = intrp.treat(Xtest)
         else:  # no treatment cost for CATE trees
             intrp.interpret(result.estimator, Xtest)
+            treat = None
 
-        return intrp, result.X_transformer.get_feature_names(self.feature_names_), treatment_names
+        return intrp, result.X_transformer.get_feature_names(self.feature_names_), treatment_names, treat
 
     # TODO: it seems like it would be better to just return the tree itself rather than plot it;
     #       however, the tree can't store the feature and treatment names we compute here...
@@ -1027,18 +1029,21 @@ def plot_policy_tree(self, Xtest, feature_index, *, treatment_cost=0,
             Confidence level of the confidence intervals displayed in the leaf nodes.
             A (1-alpha)*100% confidence interval is displayed.
         """
-        intrp, feature_names, treatment_names = self._tree(True, Xtest, feature_index,
-                                                           treatment_cost=treatment_cost,
-                                                           max_depth=max_depth,
-                                                           min_samples_leaf=min_samples_leaf,
-                                                           min_impurity_decrease=min_value_increase,
-                                                           alpha=alpha)
+        intrp, feature_names, treatment_names, _ = self._tree(True, Xtest, feature_index,
+                                                              treatment_cost=treatment_cost,
+                                                              max_depth=max_depth,
+                                                              min_samples_leaf=min_samples_leaf,
+                                                              min_impurity_decrease=min_value_increase,
+                                                              alpha=alpha)
         return intrp.plot(feature_names=feature_names, treatment_names=treatment_names)
 
-    def _policy_tree_string(self, Xtest, feature_index, *, treatment_cost=0,
+    def _policy_tree_output(self, Xtest, feature_index, *, treatment_cost=0,
                             max_depth=3, min_samples_leaf=2, min_value_increase=1e-4, alpha=.1):
         """
-        Get a recommended policy tree in graphviz format as a string.
+        Get a tuple of policy outputs.
+
+        The first item in the tuple is the recommended policy tree in graphviz format as a string.
+        The second item is the recommended treatment for each sample as a list.
 
         Parameters
         ----------
@@ -1060,18 +1065,18 @@ def _policy_tree_string(self, Xtest, feature_index, *, treatment_cost=0,
 
         Returns
         -------
-        tree : string
-            The policy tree represented as a graphviz string
+        tree : tuple of string, list of int
+            The policy tree represented as a graphviz string and the recommended treatment for each row
         """
 
-        intrp, feature_names, treatment_names = self._tree(True, Xtest, feature_index,
-                                                           treatment_cost=treatment_cost,
-                                                           max_depth=max_depth,
-                                                           min_samples_leaf=min_samples_leaf,
-                                                           min_impurity_decrease=min_value_increase,
-                                                           alpha=alpha)
+        intrp, feature_names, treatment_names, treat = self._tree(True, Xtest, feature_index,
+                                                                  treatment_cost=treatment_cost,
+                                                                  max_depth=max_depth,
+                                                                  min_samples_leaf=min_samples_leaf,
+                                                                  min_impurity_decrease=min_value_increase,
+                                                                  alpha=alpha)
         return intrp.export_graphviz(feature_names=feature_names,
-                                     treatment_names=treatment_names)
+                                     treatment_names=treatment_names), treat.tolist()
 
     # TODO: it seems like it would be better to just return the tree itself rather than plot it;
     #       however, the tree can't store the feature and treatment names we compute here...
@@ -1099,11 +1104,11 @@ def plot_heterogeneity_tree(self, Xtest, feature_index, *,
             A (1-alpha)*100% confidence interval is displayed.
         """
 
-        intrp, feature_names, treatment_names = self._tree(False, Xtest, feature_index,
-                                                           max_depth=max_depth,
-                                                           min_samples_leaf=min_samples_leaf,
-                                                           min_impurity_decrease=min_impurity_decrease,
-                                                           alpha=alpha)
+        intrp, feature_names, treatment_names, _ = self._tree(False, Xtest, feature_index,
+                                                              max_depth=max_depth,
+                                                              min_samples_leaf=min_samples_leaf,
+                                                              min_impurity_decrease=min_impurity_decrease,
+                                                              alpha=alpha)
         return intrp.plot(feature_names=feature_names,
                           treatment_names=treatment_names)
 
@@ -1131,10 +1136,10 @@ def _heterogeneity_tree_string(self, Xtest, feature_index, *,
             A (1-alpha)*100% confidence interval is displayed.
         """
 
-        intrp, feature_names, treatment_names = self._tree(False, Xtest, feature_index,
-                                                           max_depth=max_depth,
-                                                           min_samples_leaf=min_samples_leaf,
-                                                           min_impurity_decrease=min_impurity_decrease,
-                                                           alpha=alpha)
+        intrp, feature_names, treatment_names, _ = self._tree(False, Xtest, feature_index,
+                                                              max_depth=max_depth,
+                                                              min_samples_leaf=min_samples_leaf,
+                                                              min_impurity_decrease=min_impurity_decrease,
+                                                              alpha=alpha)
         return intrp.export_graphviz(feature_names=feature_names,
                                      treatment_names=treatment_names)
diff --git a/econml/tests/test_causal_analysis.py b/econml/tests/test_causal_analysis.py
@@ -44,13 +44,13 @@ def test_basic_array(self):
                 coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
                 loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])
 
-                ca._policy_tree_string(X, 1)
+                ca._policy_tree_output(X, 1)
                 ca._heterogeneity_tree_string(X, 1)
                 ca._heterogeneity_tree_string(X, 3)
 
                 # Can't handle multi-dimensional treatments
                 with self.assertRaises(AssertionError):
-                    ca._policy_tree_string(X, 3)
+                    ca._policy_tree_output(X, 3)
 
                 # global shape is (d_y, sum(d_t))
                 assert glo_point_est.shape == coh_point_est.shape == (1, 5)
@@ -133,13 +133,13 @@ def test_basic_pandas(self):
                 assert glo_point_est.shape == coh_point_est.shape == (1, 5)
                 assert loc_point_est.shape == (2,) + glo_point_est.shape
 
-                ca._policy_tree_string(X, inds[1])
+                ca._policy_tree_output(X, inds[1])
                 ca._heterogeneity_tree_string(X, inds[1])
                 ca._heterogeneity_tree_string(X, inds[3])
 
                 # Can't handle multi-dimensional treatments
                 with self.assertRaises(AssertionError):
-                    ca._policy_tree_string(X, inds[3])
+                    ca._policy_tree_output(X, inds[3])
 
                 if not classification:
                     # ExitStack can be used as a "do nothing" ContextManager
@@ -199,13 +199,13 @@ def test_automl_first_stage(self):
             coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
             loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])
 
-            ca._policy_tree_string(X, 1)
+            ca._policy_tree_output(X, 1)
             ca._heterogeneity_tree_string(X, 1)
             ca._heterogeneity_tree_string(X, 3)
 
             # Can't handle multi-dimensional treatments
             with self.assertRaises(AssertionError):
-                ca._policy_tree_string(X, 3)
+                ca._policy_tree_output(X, 3)
 
             # global shape is (d_y, sum(d_t))
             assert glo_point_est.shape == coh_point_est.shape == (1, 5)
@@ -279,7 +279,7 @@ def test_one_feature(self):
         assert glo_point_est.shape == coh_point_est.shape == (1, 1)
         assert loc_point_est.shape == (2,) + glo_point_est.shape
 
-        ca._policy_tree_string(X, inds[0])
+        ca._policy_tree_output(X, inds[0])
         ca._heterogeneity_tree_string(X, inds[0])
 
     def test_final_models(self):
@@ -302,13 +302,13 @@ def test_final_models(self):
                 coh_dict = ca._cohort_causal_effect_dict(X[:2])
                 loc_dict = ca._local_causal_effect_dict(X[:2])
 
-                ca._policy_tree_string(X, 1)
+                ca._policy_tree_output(X, 1)
                 ca._heterogeneity_tree_string(X, 1)
                 ca._heterogeneity_tree_string(X, 3)
 
                 # Can't handle multi-dimensional treatments
                 with self.assertRaises(AssertionError):
-                    ca._policy_tree_string(X, 3)
+                    ca._policy_tree_output(X, 3)
 
                 if not classification:
                     # ExitStack can be used as a "do nothing" ContextManager
@@ -370,13 +370,13 @@ def test_forest_with_pandas(self):
         assert glo_point_est.shape == coh_point_est.shape == (1, 5)
         assert loc_point_est.shape == (2,) + glo_point_est.shape
 
-        ca._policy_tree_string(X, inds[1])
+        ca._policy_tree_output(X, inds[1])
         ca._heterogeneity_tree_string(X, inds[1])
         ca._heterogeneity_tree_string(X, inds[3])
 
         # Can't handle multi-dimensional treatments
         with self.assertRaises(AssertionError):
-            ca._policy_tree_string(X, inds[3])
+            ca._policy_tree_output(X, inds[3])
 
     def test_warm_start(self):
         for classification in [True, False]: