Skip to content

Issue with Logistic Regression in Mediation Analysis #1335

@icallal

Description

@icallal

Describe the bug
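DoWhy does not appear to allow generalized_linear_model_estimator.GeneralizedLinearModelEstimator to be used as the second-stage model in mediation analysis: estimation fails with `KeyError: None` while looking up the backdoor variables (a "missing backdoor" error), even though backdoor paths exist in the graph.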

Steps to reproduce the behavior

data = dowhy.datasets.linear_dataset(10, num_common_causes=1, num_samples=10000,
                                     num_instruments=0, num_effect_modifiers=0,
                                     num_treatments=1,
                                     num_frontdoor_variables=1,
                                     treatment_is_binary=False,
                                    outcome_is_binary=True)
df = data['df']
print(df.head())

# Natural indirect effect (nie)
identified_estimand_nie = model.identify_effect(estimand_type="nonparametric-nie",
                                            proceed_when_unidentifiable=True)
print(identified_estimand_nie)

import dowhy.causal_estimators.linear_regression_estimator
causal_estimate_nie = model.estimate_effect(identified_estimand_nie,
                                        method_name="mediation.two_stage_regression",
                                       confidence_intervals=False,
                                       test_significance=False,
                                        method_params = {
                                            'first_stage_model': dowhy.causal_estimators.linear_regression_estimator.LinearRegressionEstimator,
                                            'second_stage_model': dowhy.causal_estimators.generalized_linear_model_estimator.GeneralizedLinearModelEstimator(identified_estimand = identified_estimand_nie, glm_family=Binomial()),
                                        }
                                       )
print(causal_estimate_nie)

Error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[29], line 36
     33 print(identified_estimand_nie)
     35 import dowhy.causal_estimators.linear_regression_estimator
---> 36 causal_estimate_nie = model.estimate_effect(identified_estimand_nie,
     37                                         method_name="mediation.two_stage_regression",
     38                                        confidence_intervals=False,
     39                                        test_significance=False,
     40                                         method_params = {
     41                                             'first_stage_model': dowhy.causal_estimators.linear_regression_estimator.LinearRegressionEstimator,
     42                                             'second_stage_model': dowhy.causal_estimators.generalized_linear_model_estimator.GeneralizedLinearModelEstimator(identified_estimand = identified_estimand_nie, glm_family=Binomial()),
     43                                         }
     44                                        )
     45 print(causal_estimate_nie)

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_model.py:359, in CausalModel.estimate_effect(self, identified_estimand, method_name, control_value, treatment_value, test_significance, evaluate_effect_strength, confidence_intervals, target_units, effect_modifiers, fit_estimator, method_params)
    348         causal_estimator = causal_estimator_class(
    349             identified_estimand,
    350             test_significance=test_significance,
   (...)
    354             **extra_args,
    355         )
    357         self._estimator_cache[method_name] = causal_estimator
--> 359 return estimate_effect(
    360     self._data,
    361     self._treatment,
    362     self._outcome,
    363     identifier_name,
    364     causal_estimator,
    365     control_value,
    366     treatment_value,
    367     target_units,
    368     effect_modifiers,
    369     fit_estimator,
    370     method_params,
    371 )

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_estimator.py:752, in estimate_effect(data, treatment, outcome, identifier_name, estimator, control_value, treatment_value, target_units, effect_modifiers, fit_estimator, method_params)
    747     return CausalEstimate(
    748         None, None, None, None, None, None, control_value=control_value, treatment_value=treatment_value
    749     )
    751 if fit_estimator:
--> 752     estimator.fit(
    753         data=data,
    754         effect_modifier_names=effect_modifiers,
    755         **method_params["fit_params"] if "fit_params" in method_params else {},
    756     )
    758 estimate = estimator.estimate_effect(
    759     data,
    760     treatment_value=treatment_value,
   (...)
    763     confidence_intervals=estimator._confidence_intervals,
    764 )
    766 if estimator._significance_test:

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_estimators/two_stage_regression_estimator.py:234, in TwoStageRegressionEstimator.fit(self, data, effect_modifier_names, **_)
    231 elif self._target_estimand.identifier_method == "mediation":
    232     self._second_stage_model._target_estimand.treatment_variable = parse_state(self._mediators_names)
--> 234 self._second_stage_model.fit(
    235     data,
    236     effect_modifier_names=effect_modifier_names,
    237 )
    239 if self._target_estimand.estimand_type == EstimandType.NONPARAMETRIC_NDE:
    240     self._second_stage_model_nde._target_estimand.identifier_method = "backdoor"

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_estimators/generalized_linear_model_estimator.py:105, in GeneralizedLinearModelEstimator.fit(self, data, effect_modifier_names)
     91 def fit(
     92     self,
     93     data: pd.DataFrame,
     94     effect_modifier_names: Optional[List[str]] = None,
     95 ):
     96     """
     97     Fits the estimator with data for effect estimation
     98     :param data: data frame containing the data
   (...)
    103                 methods support this currently.
    104     """
--> 105     return super().fit(
    106         data,
    107         effect_modifier_names=effect_modifier_names,
    108     )

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_estimators/regression_estimator.py:90, in RegressionEstimator.fit(self, data, effect_modifier_names)
     87 self.reset_encoders()  # Forget any existing encoders
     88 self._set_effect_modifiers(data, effect_modifier_names)
---> 90 self.logger.debug("Adjustment set variables used:" + ",".join(self._target_estimand.get_adjustment_set()))
     91 self._observed_common_causes_names = self._target_estimand.get_adjustment_set()
     92 if len(self._observed_common_causes_names) > 0:

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_identifier/identified_estimand.py:98, in IdentifiedEstimand.get_adjustment_set(self, key)
     96 if self.identifier_method == "general_adjustment":
     97     return self.get_general_adjustment_variables(key)
---> 98 return self.get_backdoor_variables(key)

File ~/.conda/envs/lowpyAD/lib/python3.10/site-packages/dowhy/causal_identifier/identified_estimand.py:62, in IdentifiedEstimand.get_backdoor_variables(self, key)
     60     return self.backdoor_variables[self.identifier_method]
     61 elif self.backdoor_variables is not None and len(self.backdoor_variables) > 0:
---> 62     return self.backdoor_variables[self.default_backdoor_id]
     63 else:
     64     return []

KeyError: None

Expected behavior
Logistic regression should be allowed as the second-stage model when the outcome is binary.

Version information:

  • 0.13

Additional context
Add any other context about the problem here.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions