diff --git a/causalml/inference/meta/base.py b/causalml/inference/meta/base.py
index 50d7a019..715241af 100644
--- a/causalml/inference/meta/base.py
+++ b/causalml/inference/meta/base.py
@@ -2,6 +2,9 @@ import logging
 
 import numpy as np
 import pandas as pd
+from joblib import Parallel, delayed
+from sklearn.base import clone
+from tqdm import tqdm
 
 from causalml.inference.meta.explainer import Explainer
 from causalml.inference.meta.utils import check_p_conditions, convert_pd_to_np
@@ -10,6 +13,31 @@
 logger = logging.getLogger("causalml")
 
 
+def _fit_bootstrap_clone(learner_template, X, treatment, y, p, seed, bootstrap_size):
+    """Module-level bootstrap helper for joblib pickling compatibility.
+
+    Args:
+        learner_template: the fitted learner to clone as a template
+        X: feature matrix
+        treatment: treatment vector
+        y: outcome vector
+        p: propensity scores or None
+        seed (int): random seed for this bootstrap iteration
+        bootstrap_size (int): number of samples to draw
+    Returns:
+        A fitted clone of learner_template trained on a bootstrap sample.
+    """
+    rng = np.random.RandomState(seed)
+    idxs = rng.choice(np.arange(X.shape[0]), size=bootstrap_size)
+    X_b = X[idxs]
+    treatment_b = treatment[idxs]
+    y_b = y[idxs]
+    p_b = {group: _p[idxs] for group, _p in p.items()} if p is not None else None
+    learner_b = clone(learner_template, safe=False)
+    learner_b.fit(X=X_b, treatment=treatment_b, y=y_b, p=p_b)
+    return learner_b
+
+
 class BaseLearner(metaclass=ABCMeta):
     @classmethod
     @abstractmethod
@@ -70,6 +98,50 @@ def bootstrap(self, X, treatment, y, p=None, size=10000, rng=None):
         self.fit(X=X_b, treatment=treatment_b, y=y_b, p=p_b)
         return self.predict(X=X, p=p)
 
+    def fit_bootstrap_ensemble(
+        self,
+        X,
+        treatment,
+        y,
+        p=None,
+        n_bootstraps=200,
+        bootstrap_size=10000,
+        random_state=None,
+        n_jobs=1,
+    ):
+        """Train and store a bootstrap ensemble for post-fit CI estimation.
+
+        Fits n_bootstraps cloned copies of the entire learner on bootstrap samples
+        and stores them in self.bootstrap_models_. Used by predict(return_ci=True)
+        to compute percentile-based confidence intervals on new data without refitting.
+
+        This design follows EconML's BootstrapEstimator pattern: each bootstrap
+        clone is a full copy of the learner, making this method generic across all
+        meta-learners.
+
+        Note: storing N bootstrap clones can be memory-intensive for heavy base
+        learners. Monitor RAM for large n_bootstraps.
+
+        Args:
+            X (np.matrix or np.array or pd.Dataframe): a feature matrix
+            treatment (np.array or pd.Series): a treatment vector
+            y (np.array or pd.Series): an outcome vector
+            p: propensity scores, passed through to fit() if provided
+            n_bootstraps (int, optional): number of bootstrap iterations. Default: 200.
+            bootstrap_size (int, optional): number of samples per bootstrap. Default: 10000.
+            random_state (int, optional): random seed for reproducibility.
+            n_jobs (int, optional): number of parallel jobs. -1 uses all cores. Default: 1.
+        """
+
+        rng = np.random.RandomState(random_state)
+        seeds = rng.randint(0, np.iinfo(np.int32).max, size=n_bootstraps)
+        logger.info("Storing bootstrap ensemble ({} iterations)".format(n_bootstraps))
+
+        self.bootstrap_models_ = Parallel(n_jobs=n_jobs)(
+            delayed(_fit_bootstrap_clone)(self, X, treatment, y, p, s, bootstrap_size)
+            for s in tqdm(seeds)
+        )
+
     @staticmethod
     def _format_p(p, t_groups):
         """Format propensity scores into a dictionary of {treatment group: propensity scores}.
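Usage sketch (illustrative only, not part of the patch): how the generic ensemble method added above can be consumed directly from the base class, without the T-learner's predict(return_ci=True) integration below. synthetic_data and BaseTRegressor are existing causalml APIs; all parameter values here are arbitrary.

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from causalml.dataset import synthetic_data
    from causalml.inference.meta import BaseTRegressor

    # Small synthetic uplift dataset: outcome, features, treatment indicator.
    y, X, treatment, tau, b, e = synthetic_data(mode=1, n=2000, p=5)

    learner = BaseTRegressor(learner=LinearRegression(), control_name=0)

    # Build and store 50 bootstrap clones of the whole learner in parallel.
    learner.fit_bootstrap_ensemble(
        X, treatment, y, n_bootstraps=50, bootstrap_size=1000, random_state=42, n_jobs=2
    )

    # Percentile CIs come from stacking the clones' CATE predictions.
    te_b = np.stack([m.predict(X) for m in learner.bootstrap_models_], axis=-1)
    te_lower = np.percentile(te_b, 2.5, axis=-1)   # alpha/2 with alpha=0.05
    te_upper = np.percentile(te_b, 97.5, axis=-1)  # 1 - alpha/2
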
diff --git a/causalml/inference/meta/tlearner.py b/causalml/inference/meta/tlearner.py
index 7d0bc02d..185bcec2 100644
--- a/causalml/inference/meta/tlearner.py
+++ b/causalml/inference/meta/tlearner.py
@@ -65,6 +65,7 @@ def __init__(
 
         self.ate_alpha = ate_alpha
         self.control_name = control_name
+        self.bootstrap_models_ = None
 
     def __repr__(self):
         return "{}(model_c={}, model_t={})".format(
@@ -72,13 +73,36 @@ def __repr__(self):
         )
 
     @ignore_warnings(category=ConvergenceWarning)
-    def fit(self, X, treatment, y, p=None):
+    def fit(
+        self,
+        X,
+        treatment,
+        y,
+        p=None,
+        store_bootstraps=False,
+        n_bootstraps=200,
+        bootstrap_size=10000,
+        random_state=None,
+        n_jobs=1,
+    ):
         """Fit the inference model
 
         Args:
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
+            p: unused, kept for API consistency
+            store_bootstraps (bool, optional): if True, trains a bootstrap ensemble
+                during fit and stores it in self.bootstrap_models_ for post-fit CI
+                estimation via predict(return_ci=True). Default: False.
+            n_bootstraps (int, optional): number of bootstrap iterations. Default: 200.
+                Note: storing N bootstraps of a GBM-based learner with k treatment
+                groups holds 2*N*k model objects in memory. Monitor RAM for large N
+                or heavy base learners.
+            bootstrap_size (int, optional): number of samples per bootstrap. Default: 10000.
+            random_state (int, optional): random seed for reproducible bootstrap sampling.
+            n_jobs (int, optional): number of parallel jobs for bootstrap fitting.
+                -1 uses all available cores. Default: 1.
         """
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
@@ -100,8 +124,28 @@ def fit(self, X, treatment, y, p=None):
             treatment_mask = treatment == group
             self.models_t[group].fit(X[treatment_mask], y[treatment_mask])
 
+        if store_bootstraps:
+            self.fit_bootstrap_ensemble(
+                X=X,
+                treatment=treatment,
+                y=y,
+                n_bootstraps=n_bootstraps,
+                bootstrap_size=bootstrap_size,
+                random_state=random_state,
+                n_jobs=n_jobs,
+            )
+        else:
+            self.bootstrap_models_ = None
+
     def predict(
-        self, X, treatment=None, y=None, p=None, return_components=False, verbose=True
+        self,
+        X,
+        treatment=None,
+        y=None,
+        p=None,
+        return_components=False,
+        verbose=True,
+        return_ci=False,
     ):
         """Predict treatment effects.
 
@@ -109,11 +153,21 @@
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series, optional): a treatment vector
             y (np.array or pd.Series, optional): an outcome vector
-            return_components (bool, optional): whether to return outcome for treatment and control seperately
+            return_components (bool, optional): whether to return outcome for
+                treatment and control separately
             verbose (bool, optional): whether to output progress logs
+            return_ci (bool, optional): whether to return confidence intervals
+                using the stored bootstrap ensemble. Requires fit() to have been
+                called with store_bootstraps=True. CI width is controlled by
+                self.ate_alpha set at init time.
 
         Returns:
-            (numpy.ndarray): Predictions of treatment effects.
+            (numpy.ndarray): Predictions of treatment effects. If return_ci=True,
+                returns (te, te_lower, te_upper) each of shape [n_samples, n_treatment].
+                return_ci=True and return_components=True cannot be used together.
""" + if return_ci and return_components: + raise ValueError("return_ci and return_components cannot both be True.") + X, treatment, y = convert_pd_to_np(X, treatment, y) yhat_ts = {} @@ -142,6 +196,22 @@ def predict( for i, group in enumerate(self.t_groups): te[:, i] = yhat_ts[group] - yhat_c + if return_ci: + if self.bootstrap_models_ is None: + raise ValueError( + "No bootstrap ensemble found. Call fit(..., store_bootstraps=True) first." + ) + te_bootstraps = np.zeros( + (X.shape[0], self.t_groups.shape[0], len(self.bootstrap_models_)) + ) + for b, learner_b in enumerate(self.bootstrap_models_): + te_bootstraps[:, :, b] = learner_b.predict(X) + te_lower = np.percentile(te_bootstraps, (self.ate_alpha / 2) * 100, axis=2) + te_upper = np.percentile( + te_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=2 + ) + return te, te_lower, te_upper + if not return_components: return te else: diff --git a/tests/test_meta_learners.py b/tests/test_meta_learners.py index c067e449..6694569f 100644 --- a/tests/test_meta_learners.py +++ b/tests/test_meta_learners.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression @@ -1222,6 +1223,90 @@ def test_BaseDRClassifier(generate_classification_data): assert te_separate.shape == te.shape +def test_BaseTLearner_predict_return_ci(generate_regression_data): + y, X, treatment, tau, b, e = generate_regression_data() + + learner = BaseTRegressor(learner=LinearRegression(), control_name=0) + + # Test 1: store_bootstraps=True then predict with return_ci=True + learner.fit( + X, + treatment, + y, + store_bootstraps=True, + n_bootstraps=50, + bootstrap_size=500, + random_state=RANDOM_SEED, + ) + tau_pred, lb, ub = learner.predict(X, return_ci=True) + + assert tau_pred.shape == (X.shape[0], len(learner.t_groups)) + assert lb.shape == tau_pred.shape + assert ub.shape == tau_pred.shape + assert (lb <= ub).all() + + # Test 2: ValueError without store_bootstraps + learner2 = BaseTRegressor(learner=LinearRegression(), control_name=0) + learner2.fit(X, treatment, y) + with pytest.raises(ValueError): + learner2.predict(X, return_ci=True) + + # Test 3: ValueError when return_ci and return_components both True + with pytest.raises(ValueError): + learner.predict(X, return_ci=True, return_components=True) + + # Test 4: old API unchanged + tau_plain = learner.predict(X) + assert tau_plain.shape == (X.shape[0], len(learner.t_groups)) + + # Test 5: reproducibility via random_state + learner3 = BaseTRegressor(learner=LinearRegression(), control_name=0) + learner3.fit( + X, + treatment, + y, + store_bootstraps=True, + n_bootstraps=50, + bootstrap_size=500, + random_state=RANDOM_SEED, + ) + tau2, lb2, ub2 = learner3.predict(X, return_ci=True) + np.testing.assert_array_equal(lb, lb2) + np.testing.assert_array_equal(ub, ub2) + + # Test 6: parallel execution (n_jobs=2) produces same result as serial + learner_parallel = BaseTRegressor(learner=LinearRegression(), control_name=0) + learner_parallel.fit( + X, + treatment, + y, + store_bootstraps=True, + n_bootstraps=50, + bootstrap_size=500, + random_state=RANDOM_SEED, + n_jobs=2, + ) + tau_p, lb_p, ub_p = learner_parallel.predict(X, return_ci=True) + assert tau_p.shape == (X.shape[0], len(learner_parallel.t_groups)) + assert (lb_p <= ub_p).all() + + # Test 7: different random_state produces different bounds + learner_diff = BaseTRegressor(learner=LinearRegression(), control_name=0) + learner_diff.fit( + X, + treatment, + y, + 
store_bootstraps=True, + n_bootstraps=50, + bootstrap_size=500, + random_state=RANDOM_SEED + 1, + ) + _, lb_diff, ub_diff = learner_diff.predict(X, return_ci=True) + assert not np.array_equal( + lb, lb_diff + ), "Different seeds should produce different bounds" + + def test_multi_treatment_learners(): """Comprehensive multi-treatment (N=3) contract test for all meta-learners.
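For completeness, a minimal end-to-end sketch of the new user-facing flow exercised by the test above (illustrative, not part of the patch; the synthetic_data helper and all parameter values are assumptions):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from causalml.dataset import synthetic_data
    from causalml.inference.meta import BaseTRegressor

    y, X, treatment, tau, b, e = synthetic_data(mode=1, n=2000, p=5)

    # ate_alpha set at init controls the CI width used by predict(return_ci=True).
    learner = BaseTRegressor(learner=LinearRegression(), ate_alpha=0.05, control_name=0)

    # One call trains the learner and stores the 50-clone bootstrap ensemble.
    learner.fit(
        X, treatment, y,
        store_bootstraps=True, n_bootstraps=50, bootstrap_size=1000, random_state=42,
    )

    # Post-fit CIs, no refitting; each array is (n_samples, n_treatments).
    te, te_lower, te_upper = learner.predict(X, return_ci=True)

    # Sanity check: empirical coverage of the known simulated effect tau.
    coverage = ((te_lower[:, 0] <= tau) & (tau <= te_upper[:, 0])).mean()
    print("95% CI coverage of true tau: {:.2f}".format(coverage))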