uber · jeongyoonlee · Jul 5, 2025 · Jun 28, 2025 · Jun 28, 2025 · Jun 28, 2025
diff --git a/causalml/inference/meta/tmle.py b/causalml/inference/meta/tmle.py
@@ -11,7 +11,6 @@
     check_p_conditions,
     convert_pd_to_np,
 )
-from causalml.propensity import calibrate
 
 
 logger = logging.getLogger("causalml")
@@ -105,7 +104,6 @@ def __init__(
         ate_alpha=0.05,
         control_name=0,
         cv=None,
-        calibrate_propensity=True,
     ):
         """Initialize a TMLE learner.
 
@@ -119,7 +117,6 @@ def __init__(
         self.ate_alpha = ate_alpha
         self.control_name = control_name
         self.cv = cv
-        self.calibrate_propensity = calibrate_propensity
 
     def __repr__(self):
         return "{}(model={}, cv={})".format(
@@ -165,10 +162,6 @@ def estimate_ate(self, X, treatment, y, p, segment=None, return_ci=False):
             w_group = (treatment == group).astype(int)
             p_group = p[group]
 
-            if self.calibrate_propensity:
-                logger.info("Calibrating propensity scores.")
-                p_group = calibrate(p_group, w_group)
-
             yhat_c = np.zeros_like(y, dtype=float)
             yhat_t = np.zeros_like(y, dtype=float)
             if self.cv:

diff --git a/causalml/metrics/visualize.py b/causalml/metrics/visualize.py
@@ -342,7 +342,6 @@ def get_tmlegain(
     p_col="p",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
 ):
     """Get TMLE based average uplifts of model estimates of segments.
@@ -356,7 +355,6 @@ def get_tmlegain(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     Returns:
         (pandas.DataFrame): cumulative gains of model estimates based of TMLE
@@ -374,7 +372,7 @@ def get_tmlegain(
     inference_col = [x for x in inference_col if x in df.columns]
 
     # Initialize TMLE
-    tmle = TMLELearner(learner, cv=cv, calibrate_propensity=calibrate_propensity)
+    tmle = TMLELearner(learner, cv=cv)
     ate_all, ate_all_lb, ate_all_ub = tmle.estimate_ate(
         X=df[inference_col], p=df[p_col], treatment=df[treatment_col], y=df[outcome_col]
     )
@@ -454,7 +452,6 @@ def get_tmleqini(
     p_col="p",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
     normalize=False,
 ):
@@ -469,7 +466,6 @@ def get_tmleqini(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     Returns:
         (pandas.DataFrame): cumulative gains of model estimates based of TMLE
@@ -487,7 +483,7 @@ def get_tmleqini(
     inference_col = [x for x in inference_col if x in df.columns]
 
     # Initialize TMLE
-    tmle = TMLELearner(learner, cv=cv, calibrate_propensity=calibrate_propensity)
+    tmle = TMLELearner(learner, cv=cv)
     ate_all, ate_all_lb, ate_all_ub = tmle.estimate_ate(
         X=df[inference_col], p=df[p_col], treatment=df[treatment_col], y=df[outcome_col]
     )
@@ -696,7 +692,6 @@ def plot_tmlegain(
     p_col="tau",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
     figsize=(8, 8),
 ):
@@ -711,7 +706,6 @@ def plot_tmlegain(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     """
 
@@ -728,7 +722,6 @@ def plot_tmlegain(
         p_col=p_col,
         n_segment=n_segment,
         cv=cv,
-        calibrate_propensity=calibrate_propensity,
     )
 
 
@@ -741,7 +734,6 @@ def plot_tmleqini(
     p_col="tau",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
     figsize=(8, 8),
 ):
@@ -756,7 +748,6 @@ def plot_tmleqini(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     """
 
@@ -773,7 +764,6 @@ def plot_tmleqini(
         p_col=p_col,
         n_segment=n_segment,
         cv=cv,
-        calibrate_propensity=calibrate_propensity,
     )
 
 

diff --git a/causalml/propensity.py b/causalml/propensity.py
@@ -12,16 +12,19 @@
 
 
 class PropensityModel(metaclass=ABCMeta):
-    def __init__(self, clip_bounds=(1e-3, 1 - 1e-3), **model_kwargs):
+    def __init__(self, clip_bounds=(1e-3, 1 - 1e-3), calibrate=True, **model_kwargs):
         """
         Args:
             clip_bounds (tuple): lower and upper bounds for clipping propensity scores. Bounds should be implemented
                     such that: 0 < lower < upper < 1, to avoid division by zero in BaseRLearner.fit_predict() step.
+            calibrate (bool): whether to calibrate the predicted propensity scores with isotonic regression
             model_kwargs: Keyword arguments to be passed to the underlying classification model.
         """
         self.clip_bounds = clip_bounds
+        self.calibrate = calibrate
         self.model_kwargs = model_kwargs
         self.model = self._model
+        self.calibrator = None
 
     @property
     @abstractmethod
@@ -40,6 +43,15 @@ def fit(self, X, y):
             y (numpy.ndarray): a binary target vector
         """
         self.model.fit(X, y)
+        if self.calibrate:
+            # Fit a calibrator to the propensity scores with IsotonicRegression.
+            # Ref: https://scikit-learn.org/stable/modules/isotonic.html
+            self.calibrator = IsotonicRegression(
+                out_of_bounds="clip",
+                y_min=self.clip_bounds[0],
+                y_max=self.clip_bounds[1],
+            )
+            self.calibrator.fit(self.model.predict_proba(X)[:, 1], y)
 
     def predict(self, X):
         """
@@ -51,7 +63,11 @@ def predict(self, X):
         Returns:
             (numpy.ndarray): Propensity scores between 0 and 1.
         """
-        return np.clip(self.model.predict_proba(X)[:, 1], *self.clip_bounds)
+        p = self.model.predict_proba(X)[:, 1]
+        if self.calibrate:
+            p = self.calibrator.transform(p)
+
+        return np.clip(p, *self.clip_bounds)
 
     def fit_predict(self, X, y):
         """
@@ -66,7 +82,6 @@ def fit_predict(self, X, y):
         """
         self.fit(X, y)
         propensity_scores = self.predict(X)
-        logger.info("AUC score: {:.6f}".format(auc(y, propensity_scores)))
         return propensity_scores
 
 
@@ -112,12 +127,15 @@ class GradientBoostedPropensityModel(PropensityModel):
     https://xgboost.readthedocs.io/en/latest/python/python_api.html
     """
 
-    def __init__(self, early_stop=False, clip_bounds=(1e-3, 1 - 1e-3), **model_kwargs):
+    def __init__(
+        self,
+        early_stop=False,
+        clip_bounds=(1e-3, 1 - 1e-3),
+        calibrate=True,
+        **model_kwargs,
+    ):
         self.early_stop = early_stop
-
-        super(GradientBoostedPropensityModel, self).__init__(
-            clip_bounds, **model_kwargs
-        )
+        super().__init__(clip_bounds, calibrate, **model_kwargs)
 
     @property
     def _model(self):
@@ -156,50 +174,25 @@ def fit(self, X, y, stop_val_size=0.2):
                 y_train,
                 eval_set=[(X_val, y_val)],
             )
+            if self.calibrate:
+                self.calibrator = IsotonicRegression(
+                    out_of_bounds="clip",
+                    y_min=self.clip_bounds[0],
+                    y_max=self.clip_bounds[1],
+                )
+                self.calibrator.fit(self.model.predict_proba(X)[:, 1], y)
         else:
-            super(GradientBoostedPropensityModel, self).fit(X, y)
-
-    def predict(self, X):
-        """
-        Predict propensity scores.
-
-        Args:
-            X (numpy.ndarray): a feature matrix
-
-        Returns:
-            (numpy.ndarray): Propensity scores between 0 and 1.
-        """
-        if self.early_stop:
-            return np.clip(
-                self.model.predict_proba(X)[:, 1],
-                *self.clip_bounds,
-            )
-        else:
-            return super(GradientBoostedPropensityModel, self).predict(X)
-
-
-def calibrate(ps, treatment):
-    """Calibrate propensity scores with IsotonicRegression.
-
-    Ref: https://scikit-learn.org/stable/modules/isotonic.html
-
-    Args:
-        ps (numpy.array): a propensity score vector
-        treatment (numpy.array): a binary treatment vector (0: control, 1: treated)
-
-    Returns:
-        (numpy.array): a calibrated propensity score vector
-    """
-
-    two_eps = 2.0 * np.finfo(float).eps
-    pm_ir = IsotonicRegression(out_of_bounds="clip", y_min=two_eps, y_max=1.0 - two_eps)
-    ps_ir = pm_ir.fit_transform(ps, treatment)
-
-    return ps_ir
+            super().fit(X, y)
 
 
 def compute_propensity_score(
-    X, treatment, p_model=None, X_pred=None, treatment_pred=None, calibrate_p=True
+    X,
+    treatment,
+    p_model=None,
+    X_pred=None,
+    treatment_pred=None,
+    calibrate_p=True,
+    clip_bounds=(1e-3, 1 - 1e-3),
 ):
     """Generate propensity score if user didn't provide and optionally calibrate.
 
@@ -210,16 +203,18 @@ def compute_propensity_score(
         X_pred (np.matrix, optional): features for prediction
         treatment_pred (np.array or pd.Series, optional): a treatment vector for prediciton
         calibrate_p (bool, optional): whether calibrate the propensity score
+        clip_bounds (tuple, optional): lower and upper bounds used to clip the returned propensity scores. Bounds must
+                    satisfy 0 < lower < upper < 1, to avoid division by zero in the BaseRLearner.fit_predict() step.
 
     Returns:
         (tuple)
             - p (numpy.ndarray): propensity score
-            - p_model (PropensityModel): either the original p_model, a trained ElasticNetPropensityModel, or None if calibrate_p=True
+            - p_model (PropensityModel): the fitted propensity model, either the p_model passed in or a new ElasticNetPropensityModel
     """
     if treatment_pred is None:
         treatment_pred = treatment.copy()
     if p_model is None:
-        p_model = ElasticNetPropensityModel()
+        p_model = ElasticNetPropensityModel(calibrate=calibrate_p)
 
     p_model.fit(X, treatment)
 
@@ -231,14 +226,7 @@ def compute_propensity_score(
         logger.info("predict_proba not available, using predict instead")
         p = p_model.predict(X_pred)
 
-    if calibrate_p:
-        logger.info("Calibrating propensity scores. Returning p_model=None.")
-        p = calibrate(p, treatment_pred)
-        p_model = None
-
     # force the p values within the range
-    eps = np.finfo(float).eps
-    p = np.where(p < 0 + eps, 0 + eps * 1.001, p)
-    p = np.where(p > 1 - eps, 1 - eps * 1.001, p)
+    p = np.clip(p, clip_bounds[0], clip_bounds[1])
 
     return p, p_model