Changed default identification behavior with unobserved variables (#354)

amit-sharma · web-flow · commit 0e6b0c95736f · 2022-01-09T18:26:23.000+05:30
Earlier, DoWhy would automatically add an unobserved variable to the graph, but this created confusion.

New behavior:
DoWhy no longer adds an unobserved variable automatically; the user has to provide it in the graph.
Identification now proceeds strictly as per the user-provided graph, which also simplifies the code.

* proceed_when_unidentifiable is now redundant and will be removed in a future version.

* fixed some identification errors with unobserved variables

* removed the unobserved confounder node (U) from the default graph in datasets.py

* fixed some notebook errors

* added error messages when identification has failed
diff --git a/docs/source/example_notebooks/dowhy-simple-iv-example.ipynb b/docs/source/example_notebooks/dowhy-simple-iv-example.ipynb
diff --git a/docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb b/docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb
@@ -210,7 +210,7 @@
     "1. Activity prior to the treatment (assumed a cause of the treatment)\n",
     "2. Activity after the treatment (is the outcome of applying treatment)\n",
     "\n",
-    "Of course, many important variables that affect signup and total spend are missing (e.g., the type of products bought, length of a user's account, geography, etc.). So we'll need a node denoting `Unobserved Confounders`. \n",
+    "Of course, many important variables that affect signup and total spend are missing (e.g., the type of products bought, length of a user's account, geography, etc.). The analysis therefore rests on the critical assumption that these missing variables do not confound the effect, an assumption that needs to be tested later using refutation tests. \n",
     "\n",
     "Below is the causal graph for a user who signed up in month `i=3`. The analysis will be similar for any `i`. "
    ]
@@ -271,12 +271,10 @@
     "pre_spends;\n",
     "post_spends;\n",
     "Z->treatment;\n",
-    "U[label=\"Unobserved Confounders\"]; \n",
     "pre_spends -> treatment;\n",
     "treatment->post_spends;\n",
     "signup_month->post_spends;\n",
     "signup_month->treatment;\n",
-    "U->treatment; U->pre_spends; U->post_spends;\n",
     "}\"\"\"\n",
     "\n",
     "# Post-process the data based on the graph and the month of the treatment (signup)\n",
diff --git a/docs/source/example_notebooks/dowhy_ihdp_data_example.ipynb b/docs/source/example_notebooks/dowhy_ihdp_data_example.ipynb
diff --git a/dowhy/causal_estimator.py b/dowhy/causal_estimator.py
@@ -95,6 +95,25 @@ def __init__(self, data, identified_estimand, treatment, outcome,
 
         self.logger = logging.getLogger(__name__)
 
+        # Setting treatment and outcome values
+        if self._data is not None:
+            self._treatment = self._data[self._treatment_name]
+            self._outcome = self._data[self._outcome_name]
+
+        # Now saving the effect modifiers
+        if self._effect_modifier_names:
+            # only add the observed nodes
+            self._effect_modifier_names = [cname
+                    for cname in self._effect_modifier_names
+                    if cname in self._data.columns]
+            if len(self._effect_modifier_names) > 0:
+                self._effect_modifiers = self._data[self._effect_modifier_names]
+                self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
+                self.logger.debug("Effect modifiers: " +
+                                  ",".join(self._effect_modifier_names))
+            else:
+                self._effect_modifier_names = None
+
         # Checking if some parameters were set, otherwise setting to default values
         if not hasattr(self, 'num_null_simulations'):
             self.num_null_simulations = CausalEstimator.DEFAULT_NUMBER_OF_SIMULATIONS_STAT_TEST
@@ -109,17 +128,6 @@ def __init__(self, data, identified_estimand, treatment, outcome,
         # Estimate conditional estimates by default
         if not hasattr(self, 'need_conditional_estimates'):
             self.need_conditional_estimates = bool(self._effect_modifier_names)
-        # Setting more values
-        if self._data is not None:
-            self._treatment = self._data[self._treatment_name]
-            self._outcome = self._data[self._outcome_name]
-
-        # Now saving the effect modifiers
-        if self._effect_modifier_names:
-            self._effect_modifiers = self._data[self._effect_modifier_names]
-            self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
-            self.logger.debug("Effect modifiers: " +
-                              ",".join(self._effect_modifier_names))
 
     @staticmethod
     def get_estimator_object(new_data, identified_estimand, estimate):
@@ -765,6 +773,9 @@ def interpret(self, method_name=None, **kwargs):
 
     def __str__(self):
         s = "*** Causal Estimate ***\n"
+        # No estimand was identified (identification failed)
+        if self.target_estimand is None:
+            return "Estimation failed! No relevant identified estimand available for this estimation method."
         s += "\n## Identified estimand\n{0}".format(self.target_estimand.__str__(only_target_estimand=True))
         s += "\n## Realized estimand\n{0}".format(self.realized_estimand_expr)
         if hasattr(self, "estimator"):
diff --git a/dowhy/causal_identifier.py b/dowhy/causal_identifier.py
@@ -89,9 +89,11 @@ def identify_ate_effect(self, optimize_backdoor):
                 estimands_dict)
         # Setting default "backdoor" identification adjustment set
         default_backdoor_id = self.get_default_backdoor_set_id(backdoor_variables_dict)
-        estimands_dict["backdoor"] = estimands_dict.get(str(default_backdoor_id), None)
-        backdoor_variables_dict["backdoor"] = backdoor_variables_dict.get(str(default_backdoor_id), None)
-
+        if len(backdoor_variables_dict) > 0:
+            estimands_dict["backdoor"] = estimands_dict.get(str(default_backdoor_id), None)
+            backdoor_variables_dict["backdoor"] = backdoor_variables_dict.get(str(default_backdoor_id), None)
+        else:
+            estimands_dict["backdoor"] = None
         ### 2. INSTRUMENTAL VARIABLE IDENTIFICATION
         # Now checking if there is also a valid iv estimand
         instrument_names = self._graph.get_instruments(self.treatment_name,
@@ -250,7 +252,7 @@ def identify_nde_effect(self):
         return estimand
 
     def identify_backdoor(self, treatment_name, outcome_name,
-            include_unobserved=True, dseparation_algo="default"):
+            include_unobserved=False, dseparation_algo="default"):
         backdoor_sets = []
         backdoor_paths = None
         bdoor_graph = None
@@ -310,7 +312,6 @@ def identify_backdoor(self, treatment_name, outcome_name,
                         max_iterations= CausalIdentifier.MAX_BACKDOOR_ITERATIONS)
         else:
             raise ValueError(f"Identifier method {method_name} not supported. Try one of the following: {CausalIdentifier.METHOD_NAMES}")
-
         return backdoor_sets
 
     def find_valid_adjustment_sets(self, treatment_name, outcome_name,
@@ -319,6 +320,7 @@ def find_valid_adjustment_sets(self, treatment_name, outcome_name,
             method_name, max_iterations):
         num_iterations = 0
         found_valid_adjustment_set = False
+        all_nodes_observed = self._graph.all_observed(self._graph.get_all_nodes())
         # If `minimal-adjustment` method is specified, start the search from the set with minimum size. Otherwise, start from the largest.
         set_sizes = range(1, len(filt_eligible_variables) + 1, 1) if method_name == CausalIdentifier.BACKDOOR_MIN else range(len(filt_eligible_variables), 0, -1)
         for size_candidate_set in set_sizes:
@@ -341,7 +343,7 @@ def find_valid_adjustment_sets(self, treatment_name, outcome_name,
                 break
             # If all variables are observed, and the biggest eligible set
             # does not satisfy backdoor, then none of its subsets will.
-            if method_name in {CausalIdentifier.BACKDOOR_DEFAULT, CausalIdentifier.BACKDOOR_MAX} and self._graph.all_observed(filt_eligible_variables):
+            if method_name in {CausalIdentifier.BACKDOOR_DEFAULT, CausalIdentifier.BACKDOOR_MAX} and all_nodes_observed:
                 break
             if num_iterations > max_iterations:
                 self.logger.warning(f"Max number of iterations {max_iterations} reached. Could not find a valid backdoor set.")
@@ -386,26 +388,8 @@ def build_backdoor_estimands_dict(self, treatment_name, outcome_name,
                 for bset in backdoor_sets
                 if self._graph.all_observed(bset["backdoor_set"]) ]
         else: # there is unobserved confounding
-            self.logger.warning("If this is observed data (not from a randomized experiment), there might always be missing confounders. Causal effect cannot be identified perfectly.")
-            response = False # user response
-            if proceed_when_unidentifiable:
-                self.logger.info(
-                    "Continuing by ignoring these unobserved confounders because proceed_when_unidentifiable flag is True."
-                )
-            else:
-                response= cli.query_yes_no(
-                    "WARN: Do you want to continue by ignoring any unobserved confounders? (use proceed_when_unidentifiable=True to disable this prompt)",
-                    default=None
-                )
-                if response is False:
-                    self.logger.warn("Identification failed due to unobserved variables.")
-                    backdoor_sets_arr = []
-            if proceed_when_unidentifiable or response is True:
-                # Just removing the unobserved variable
-                backdoor_sets_arr = []
-                for bset in backdoor_sets:
-                    curr_set = list(self._graph.filter_unobserved_variables(bset["backdoor_set"]))
-                    backdoor_sets_arr.append(curr_set)
+            self.logger.warning("Backdoor identification failed.")
+            backdoor_sets_arr = []
 
         for i in range(len(backdoor_sets_arr)):
             backdoor_estimand_expr = self.construct_backdoor_estimand(
@@ -718,7 +702,7 @@ def get_backdoor_variables(self, key=None):
         if key is None:
             if self.identifier_method and self.identifier_method.startswith("backdoor"):
                 return self.backdoor_variables[self.identifier_method]
-            elif self.backdoor_variables is not None:
+            elif self.backdoor_variables is not None and len(self.backdoor_variables) > 0:
                 return self.backdoor_variables[self.default_backdoor_id]
             else:
                 return []
@@ -786,7 +770,7 @@ def __str__(self, only_target_estimand=False, show_all_backdoor_sets=False):
                 s += " (Default)"
             s += "\n"
             if v is None:
-                s += "No such variable found!\n"
+                s += "No such variable(s) found!\n"
             else:
                 sp_expr_str = sp.pretty(v["estimand"], use_unicode=True)
                 s += "Estimand expression:\n{0}\n".format(sp_expr_str)
diff --git a/dowhy/causal_model.py b/dowhy/causal_model.py
@@ -268,7 +268,6 @@ def estimate_effect(self, identified_estimand, method_name=None,
             else: # For older dowhy methods
                 # Process the dowhy estimators
                 causal_estimator_class = causal_estimators.get_class_object(estimator_name + "_estimator")
-
         if identified_estimand.no_directed_path:
             self.logger.warning("No directed path from {0} to {1}.".format(
                 self._treatment,
@@ -278,7 +277,7 @@ def estimate_effect(self, identified_estimand, method_name=None,
                 treatment_value=treatment_value)
         # Check if estimator's target estimand is identified
         elif identified_estimand.estimands[identifier_name] is None:
-            self.logger.warning("No valid identified estimand available.")
+            self.logger.error("No valid identified estimand available.")
             estimate = CausalEstimate(None, None, None,
                                   control_value=control_value,
                                   treatment_value=treatment_value)
@@ -390,6 +389,9 @@ def refute_estimate(self, estimand, estimate, method_name=None, **kwargs):
         :returns: an instance of the RefuteResult class
 
         """
+        if estimate is None or estimate.value is None:
+            self.logger.error("Aborting refutation! No estimate is provided.")
+            raise ValueError("Aborting refutation! No valid estimate is provided.")
         if method_name is None:
             pass
         else:
diff --git a/dowhy/datasets.py b/dowhy/datasets.py
@@ -271,14 +271,10 @@ def _compute_y(t, W, beta, c2):
 
 def create_dot_graph(treatments, outcome, common_causes,
         instruments, effect_modifiers=[], frontdoor_variables=[]):
-    dot_graph = ('digraph {{'
-                 ' U[label="Unobserved Confounders"];'
-                 ' U->{0};'
-                 ).format(outcome)
+    dot_graph = 'digraph {'
     for currt in treatments:
         if len(frontdoor_variables) == 0:
             dot_graph += '{0}->{1};'.format(currt, outcome)
-        dot_graph +=  'U->{0};'.format(currt)
         dot_graph +=  " ".join([v + "-> " + currt + ";" for v in common_causes])
         dot_graph += " ".join([v + "-> " + currt + ";" for v in instruments])
         dot_graph += " ".join([currt + "-> " + v + ";" for v in frontdoor_variables])
@@ -296,17 +292,14 @@ def create_gml_graph(treatments, outcome, common_causes,
         instruments, effect_modifiers=[], frontdoor_variables=[]):
     gml_graph = ('graph[directed 1'
                  'node[ id "{0}" label "{0}"]'
-                 'node[ id "{1}" label "{1}"]'
-                 'edge[source "{1}" target "{0}"]'
-                 ).format(outcome, "Unobserved Confounders")
+                 ).format(outcome)
 
     gml_graph +=  " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in common_causes])
     gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in instruments])
     gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in frontdoor_variables])
     for currt in treatments:
         gml_graph += ('node[ id "{0}" label "{0}"]'
-                     'edge[source "{1}" target "{0}"]'
-                     ).format(currt, "Unobserved Confounders")
+                     ).format(currt)
         if len(frontdoor_variables) == 0:
             gml_graph += 'edge[source "{0}" target "{1}"]'.format(currt, outcome)
         gml_graph +=  " ".join(['edge[ source "{0}" target "{1}"]'.format(v, currt) for v in common_causes])
diff --git a/tests/causal_identifiers/example_graphs.py b/tests/causal_identifiers/example_graphs.py
@@ -264,9 +264,43 @@
         biased_sets = [],
         minimal_adjustment_sets = [{"Z"}],
         maximal_adjustment_sets = [{"Z"}]
-    )
+    ),
 
+    "mbias_with_unobserved": dict(
+        graph_str = """graph[directed 1 node[id "X" label "X"]
+                node[id "Y" label "Y"]
+                node[id "U1" label "U1"]
+                node[id "U2" label "U2"]
+                node[id "Z" label "Z"]
+                node[id "M" label "M"]
+                edge[source "X" target "Y"]
+                edge[source "U1" target "X"]
+                edge[source "U1" target "M"]
+                edge[source "U2" target "M"]
+                edge[source "U2" target "Y"]
+                edge[source "Z" target "X"]]
+                """,
+        observed_variables = ["X", "Y", "Z", "M"],
+        biased_sets = [{"Z", "M"}, {"M"}],
+        minimal_adjustment_sets = [{}],
+        maximal_adjustment_sets = [{"Z"}]
+    ),
 
+    "iv": dict(
+        graph_str = """graph[directed 1 node[id "X" label "X"]
+                node[id "Y" label "Y"]
+                node[id "U" label "U"]
+                node[id "Z" label "Z"]
+                edge[source "X" target "Y"]
+                edge[source "U" target "X"]
+                edge[source "U" target "Y"]
+                edge[source "Z" target "X"]]
+                """,
+        observed_variables = ["X", "Y", "Z"],
+        biased_sets = [{"Z"}],
+        minimal_adjustment_sets = [],
+        maximal_adjustment_sets = []
+    )
 }