Skip to content

Commit 0e6b0c9

Browse files
authored
Changed default identification behavior with unobserved variables (#354)
Previously, DoWhy automatically added an unobserved confounder variable to the graph, which caused confusion. New behavior: DoWhy no longer adds an unobserved variable; the user must include it in the graph explicitly, and identification proceeds according to the user-provided graph. The code has been simplified accordingly. * proceed_when_unidentifiable is now redundant and will be removed in a future version. * Fixed some identification errors with unobserved variables. * Removed the unobserved confounder from the default graph in datasets.py. * Fixed some notebook errors. * Added error messages for when identification fails.
1 parent 8452f39 commit 0e6b0c9

8 files changed

+829
-536
lines changed

docs/source/example_notebooks/dowhy-simple-iv-example.ipynb

+31-35
Large diffs are not rendered by default.

docs/source/example_notebooks/dowhy_example_effect_of_memberrewards_program.ipynb

+1-3
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@
210210
"1. Activity prior to the treatment (assumed a cause of the treatment)\n",
211211
"2. Activity after the treatment (is the outcome of applying treatment)\n",
212212
"\n",
213-
"Of course, many important variables that affect signup and total spend are missing (e.g., the type of products bought, length of a user's account, geography, etc.). So we'll need a node denoting `Unobserved Confounders`. \n",
213+
"Of course, many important variables that affect signup and total spend are missing (e.g., the type of products bought, length of a user's account, geography, etc.). This is a critical assumption in the analysis, one that needs to be tested later using refutation tests. \n",
214214
"\n",
215215
"Below is the causal graph for a user who signed up in month `i=3`. The analysis will be similar for any `i`. "
216216
]
@@ -271,12 +271,10 @@
271271
"pre_spends;\n",
272272
"post_spends;\n",
273273
"Z->treatment;\n",
274-
"U[label=\"Unobserved Confounders\"]; \n",
275274
"pre_spends -> treatment;\n",
276275
"treatment->post_spends;\n",
277276
"signup_month->post_spends;\n",
278277
"signup_month->treatment;\n",
279-
"U->treatment; U->pre_spends; U->post_spends;\n",
280278
"}\"\"\"\n",
281279
"\n",
282280
"# Post-process the data based on the graph and the month of the treatment (signup)\n",

docs/source/example_notebooks/dowhy_ihdp_data_example.ipynb

+721-446
Large diffs are not rendered by default.

dowhy/causal_estimator.py

+22-11
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,25 @@ def __init__(self, data, identified_estimand, treatment, outcome,
9595

9696
self.logger = logging.getLogger(__name__)
9797

98+
# Setting treatment and outcome values
99+
if self._data is not None:
100+
self._treatment = self._data[self._treatment_name]
101+
self._outcome = self._data[self._outcome_name]
102+
103+
# Now saving the effect modifiers
104+
if self._effect_modifier_names:
105+
# only add the observed nodes
106+
self._effect_modifier_names = [cname
107+
for cname in self._effect_modifier_names
108+
if cname in self._data.columns]
109+
if len(self._effect_modifier_names) > 0:
110+
self._effect_modifiers = self._data[self._effect_modifier_names]
111+
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
112+
self.logger.debug("Effect modifiers: " +
113+
",".join(self._effect_modifier_names))
114+
else:
115+
self._effect_modifier_names = None
116+
98117
# Checking if some parameters were set, otherwise setting to default values
99118
if not hasattr(self, 'num_null_simulations'):
100119
self.num_null_simulations = CausalEstimator.DEFAULT_NUMBER_OF_SIMULATIONS_STAT_TEST
@@ -109,17 +128,6 @@ def __init__(self, data, identified_estimand, treatment, outcome,
109128
# Estimate conditional estimates by default
110129
if not hasattr(self, 'need_conditional_estimates'):
111130
self.need_conditional_estimates = bool(self._effect_modifier_names)
112-
# Setting more values
113-
if self._data is not None:
114-
self._treatment = self._data[self._treatment_name]
115-
self._outcome = self._data[self._outcome_name]
116-
117-
# Now saving the effect modifiers
118-
if self._effect_modifier_names:
119-
self._effect_modifiers = self._data[self._effect_modifier_names]
120-
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
121-
self.logger.debug("Effect modifiers: " +
122-
",".join(self._effect_modifier_names))
123131

124132
@staticmethod
125133
def get_estimator_object(new_data, identified_estimand, estimate):
@@ -765,6 +773,9 @@ def interpret(self, method_name=None, **kwargs):
765773

766774
def __str__(self):
767775
s = "*** Causal Estimate ***\n"
776+
# No estimand was identified (identification failed)
777+
if self.target_estimand is None:
778+
return "Estimation failed! No relevant identified estimand available for this estimation method."
768779
s += "\n## Identified estimand\n{0}".format(self.target_estimand.__str__(only_target_estimand=True))
769780
s += "\n## Realized estimand\n{0}".format(self.realized_estimand_expr)
770781
if hasattr(self, "estimator"):

dowhy/causal_identifier.py

+12-28
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,11 @@ def identify_ate_effect(self, optimize_backdoor):
8989
estimands_dict)
9090
# Setting default "backdoor" identification adjustment set
9191
default_backdoor_id = self.get_default_backdoor_set_id(backdoor_variables_dict)
92-
estimands_dict["backdoor"] = estimands_dict.get(str(default_backdoor_id), None)
93-
backdoor_variables_dict["backdoor"] = backdoor_variables_dict.get(str(default_backdoor_id), None)
94-
92+
if len(backdoor_variables_dict) > 0:
93+
estimands_dict["backdoor"] = estimands_dict.get(str(default_backdoor_id), None)
94+
backdoor_variables_dict["backdoor"] = backdoor_variables_dict.get(str(default_backdoor_id), None)
95+
else:
96+
estimands_dict["backdoor"] = None
9597
### 2. INSTRUMENTAL VARIABLE IDENTIFICATION
9698
# Now checking if there is also a valid iv estimand
9799
instrument_names = self._graph.get_instruments(self.treatment_name,
@@ -250,7 +252,7 @@ def identify_nde_effect(self):
250252
return estimand
251253

252254
def identify_backdoor(self, treatment_name, outcome_name,
253-
include_unobserved=True, dseparation_algo="default"):
255+
include_unobserved=False, dseparation_algo="default"):
254256
backdoor_sets = []
255257
backdoor_paths = None
256258
bdoor_graph = None
@@ -310,7 +312,6 @@ def identify_backdoor(self, treatment_name, outcome_name,
310312
max_iterations= CausalIdentifier.MAX_BACKDOOR_ITERATIONS)
311313
else:
312314
raise ValueError(f"Identifier method {method_name} not supported. Try one of the following: {CausalIdentifier.METHOD_NAMES}")
313-
314315
return backdoor_sets
315316

316317
def find_valid_adjustment_sets(self, treatment_name, outcome_name,
@@ -319,6 +320,7 @@ def find_valid_adjustment_sets(self, treatment_name, outcome_name,
319320
method_name, max_iterations):
320321
num_iterations = 0
321322
found_valid_adjustment_set = False
323+
all_nodes_observed = self._graph.all_observed(self._graph.get_all_nodes())
322324
# If `minimal-adjustment` method is specified, start the search from the set with minimum size. Otherwise, start from the largest.
323325
set_sizes = range(1, len(filt_eligible_variables) + 1, 1) if method_name == CausalIdentifier.BACKDOOR_MIN else range(len(filt_eligible_variables), 0, -1)
324326
for size_candidate_set in set_sizes:
@@ -341,7 +343,7 @@ def find_valid_adjustment_sets(self, treatment_name, outcome_name,
341343
break
342344
# If all variables are observed, and the biggest eligible set
343345
# does not satisfy backdoor, then none of its subsets will.
344-
if method_name in {CausalIdentifier.BACKDOOR_DEFAULT, CausalIdentifier.BACKDOOR_MAX} and self._graph.all_observed(filt_eligible_variables):
346+
if method_name in {CausalIdentifier.BACKDOOR_DEFAULT, CausalIdentifier.BACKDOOR_MAX} and all_nodes_observed:
345347
break
346348
if num_iterations > max_iterations:
347349
self.logger.warning(f"Max number of iterations {max_iterations} reached. Could not find a valid backdoor set.")
@@ -386,26 +388,8 @@ def build_backdoor_estimands_dict(self, treatment_name, outcome_name,
386388
for bset in backdoor_sets
387389
if self._graph.all_observed(bset["backdoor_set"]) ]
388390
else: # there is unobserved confounding
389-
self.logger.warning("If this is observed data (not from a randomized experiment), there might always be missing confounders. Causal effect cannot be identified perfectly.")
390-
response = False # user response
391-
if proceed_when_unidentifiable:
392-
self.logger.info(
393-
"Continuing by ignoring these unobserved confounders because proceed_when_unidentifiable flag is True."
394-
)
395-
else:
396-
response= cli.query_yes_no(
397-
"WARN: Do you want to continue by ignoring any unobserved confounders? (use proceed_when_unidentifiable=True to disable this prompt)",
398-
default=None
399-
)
400-
if response is False:
401-
self.logger.warn("Identification failed due to unobserved variables.")
402-
backdoor_sets_arr = []
403-
if proceed_when_unidentifiable or response is True:
404-
# Just removing the unobserved variable
405-
backdoor_sets_arr = []
406-
for bset in backdoor_sets:
407-
curr_set = list(self._graph.filter_unobserved_variables(bset["backdoor_set"]))
408-
backdoor_sets_arr.append(curr_set)
391+
self.logger.warning("Backdoor identification failed.")
392+
backdoor_sets_arr = []
409393

410394
for i in range(len(backdoor_sets_arr)):
411395
backdoor_estimand_expr = self.construct_backdoor_estimand(
@@ -718,7 +702,7 @@ def get_backdoor_variables(self, key=None):
718702
if key is None:
719703
if self.identifier_method and self.identifier_method.startswith("backdoor"):
720704
return self.backdoor_variables[self.identifier_method]
721-
elif self.backdoor_variables is not None:
705+
elif self.backdoor_variables is not None and len(self.backdoor_variables) > 0:
722706
return self.backdoor_variables[self.default_backdoor_id]
723707
else:
724708
return []
@@ -786,7 +770,7 @@ def __str__(self, only_target_estimand=False, show_all_backdoor_sets=False):
786770
s += " (Default)"
787771
s += "\n"
788772
if v is None:
789-
s += "No such variable found!\n"
773+
s += "No such variable(s) found!\n"
790774
else:
791775
sp_expr_str = sp.pretty(v["estimand"], use_unicode=True)
792776
s += "Estimand expression:\n{0}\n".format(sp_expr_str)

dowhy/causal_model.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,6 @@ def estimate_effect(self, identified_estimand, method_name=None,
268268
else: # For older dowhy methods
269269
# Process the dowhy estimators
270270
causal_estimator_class = causal_estimators.get_class_object(estimator_name + "_estimator")
271-
272271
if identified_estimand.no_directed_path:
273272
self.logger.warning("No directed path from {0} to {1}.".format(
274273
self._treatment,
@@ -278,7 +277,7 @@ def estimate_effect(self, identified_estimand, method_name=None,
278277
treatment_value=treatment_value)
279278
# Check if estimator's target estimand is identified
280279
elif identified_estimand.estimands[identifier_name] is None:
281-
self.logger.warning("No valid identified estimand available.")
280+
self.logger.error("No valid identified estimand available.")
282281
estimate = CausalEstimate(None, None, None,
283282
control_value=control_value,
284283
treatment_value=treatment_value)
@@ -390,6 +389,9 @@ def refute_estimate(self, estimand, estimate, method_name=None, **kwargs):
390389
:returns: an instance of the RefuteResult class
391390
392391
"""
392+
if estimate is None or estimate.value is None:
393+
self.logger.error("Aborting refutation! No estimate is provided.")
394+
raise ValueError("Aborting refutation! No valid estimate is provided.")
393395
if method_name is None:
394396
pass
395397
else:

dowhy/datasets.py

+3-10
Original file line numberDiff line numberDiff line change
@@ -271,14 +271,10 @@ def _compute_y(t, W, beta, c2):
271271

272272
def create_dot_graph(treatments, outcome, common_causes,
273273
instruments, effect_modifiers=[], frontdoor_variables=[]):
274-
dot_graph = ('digraph {{'
275-
' U[label="Unobserved Confounders"];'
276-
' U->{0};'
277-
).format(outcome)
274+
dot_graph = 'digraph {'
278275
for currt in treatments:
279276
if len(frontdoor_variables) == 0:
280277
dot_graph += '{0}->{1};'.format(currt, outcome)
281-
dot_graph += 'U->{0};'.format(currt)
282278
dot_graph += " ".join([v + "-> " + currt + ";" for v in common_causes])
283279
dot_graph += " ".join([v + "-> " + currt + ";" for v in instruments])
284280
dot_graph += " ".join([currt + "-> " + v + ";" for v in frontdoor_variables])
@@ -296,17 +292,14 @@ def create_gml_graph(treatments, outcome, common_causes,
296292
instruments, effect_modifiers=[], frontdoor_variables=[]):
297293
gml_graph = ('graph[directed 1'
298294
'node[ id "{0}" label "{0}"]'
299-
'node[ id "{1}" label "{1}"]'
300-
'edge[source "{1}" target "{0}"]'
301-
).format(outcome, "Unobserved Confounders")
295+
).format(outcome)
302296

303297
gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in common_causes])
304298
gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in instruments])
305299
gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in frontdoor_variables])
306300
for currt in treatments:
307301
gml_graph += ('node[ id "{0}" label "{0}"]'
308-
'edge[source "{1}" target "{0}"]'
309-
).format(currt, "Unobserved Confounders")
302+
).format(currt)
310303
if len(frontdoor_variables) == 0:
311304
gml_graph += 'edge[source "{0}" target "{1}"]'.format(currt, outcome)
312305
gml_graph += " ".join(['edge[ source "{0}" target "{1}"]'.format(v, currt) for v in common_causes])

tests/causal_identifiers/example_graphs.py

+35-1
Original file line numberDiff line numberDiff line change
@@ -264,9 +264,43 @@
264264
biased_sets = [],
265265
minimal_adjustment_sets = [{"Z"}],
266266
maximal_adjustment_sets = [{"Z"}]
267-
)
267+
),
268268

269+
"mbias_with_unobserved": dict(
270+
graph_str = """graph[directed 1 node[id "X" label "X"]
271+
node[id "Y" label "Y"]
272+
node[id "U1" label "U1"]
273+
node[id "U2" label "U2"]
274+
node[id "Z" label "Z"]
275+
node[id "M" label "M"]
276+
edge[source "X" target "Y"]
277+
edge[source "U1" target "X"]
278+
edge[source "U1" target "M"]
279+
edge[source "U2" target "M"]
280+
edge[source "U2" target "Y"]
281+
edge[source "Z" target "X"]]
282+
""",
283+
observed_variables = ["X", "Y", "Z", "M"],
284+
biased_sets = [{"Z", "M"}, {"M"}],
285+
minimal_adjustment_sets = [{}],
286+
maximal_adjustment_sets = [{"Z"}]
287+
),
269288

289+
"iv": dict(
290+
graph_str = """graph[directed 1 node[id "X" label "X"]
291+
node[id "Y" label "Y"]
292+
node[id "U" label "U"]
293+
node[id "Z" label "Z"]
294+
edge[source "X" target "Y"]
295+
edge[source "U" target "X"]
296+
edge[source "U" target "Y"]
297+
edge[source "Z" target "X"]]
298+
""",
299+
observed_variables = ["X", "Y", "Z"],
300+
biased_sets = [{"Z"}],
301+
minimal_adjustment_sets = [],
302+
maximal_adjustment_sets = []
303+
)
270304
}
271305

272306

0 commit comments

Comments
 (0)