Fix CausalRandomForestRegressor predicting inf from division by zero (#589) (#883)

jeongyoonlee · claude · web-flow · commit 7975c30056ae · 2026-03-13T09:54:42.000-07:00
* Add .worktrees/ to .gitignore

* Fix CausalRandomForestRegressor predicting inf from division by zero (#589)

  Guard against zero treatment/control counts in the CausalMSE and TTest criterion functions. When a tree split creates a child node with no treatment or no control observations, the variance formula `var/count` produces infinity. The fix skips the impurity contribution for that treatment group (zero impurity), preventing the splitter from favoring degenerate splits.

  Affected methods:
  - CausalMSE.node_impurity()
  - CausalMSE.children_impurity()
  - TTest.children_impurity()

* Add regression test for inf predictions with sparse groups (#589)

  Test that CausalRandomForestRegressor.predict() returns finite values when imbalanced data causes zero-count treatment/control nodes.

* Add ttest criterion regression test for inf predictions (#589)

* Fix ttest criterion name: 'ttest' -> 't_test'

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/causalml/inference/tree/causal/_criterion.pyx b/causalml/inference/tree/causal/_criterion.pyx
@@ -463,7 +463,8 @@ cdef class CausalMSE(CausalRegressionCriterion):
             tr_var = self.state.node.outcome_var(tr_group_idx)
             tr_count = self.state.node.count_1d[tr_group_idx]
 
-            impurity += (tr_var / tr_count + ct_var / ct_count) - node_tau * node_tau
+            if tr_count > 0 and ct_count > 0:
+                impurity += (tr_var / tr_count + ct_var / ct_count) - node_tau * node_tau
 
         impurity /= (self.n_outputs - 1)
         impurity += self.get_groups_penalty(self.state.node)
@@ -500,8 +501,10 @@ cdef class CausalMSE(CausalRegressionCriterion):
             left_tr_var = self.state.left.outcome_var(tr_group_idx)
             left_tr_count = self.state.left.count_1d[tr_group_idx]
 
-            impurity_right[0] += (right_tr_var / right_tr_count + right_ct_var / right_ct_count) - right_tau * right_tau
-            impurity_left[0] += (left_tr_var / left_tr_count + left_ct_var / left_ct_count) - left_tau * left_tau
+            if right_tr_count > 0 and right_ct_count > 0:
+                impurity_right[0] += (right_tr_var / right_tr_count + right_ct_var / right_ct_count) - right_tau * right_tau
+            if left_tr_count > 0 and left_ct_count > 0:
+                impurity_left[0] += (left_tr_var / left_tr_count + left_ct_var / left_ct_count) - left_tau * left_tau
 
         impurity_right[0] /= (self.n_outputs - 1)
         impurity_left[0] /= (self.n_outputs - 1)
@@ -577,16 +580,22 @@ cdef class TTest(CausalRegressionCriterion):
             left_tr_var = self.state.left.outcome_var(tr_group_idx)
             left_tr_count = self.state.left.count_1d[tr_group_idx]
 
-            denom_left = sqrt(left_tr_var / left_tr_count + left_ct_var / left_ct_count)
-            denom_right = sqrt(right_tr_var / right_tr_count + right_ct_var / right_ct_count)
+            denom_left = 0.0
+            denom_right = 0.0
+            if left_tr_count > 0 and left_ct_count > 0:
+                denom_left = sqrt(left_tr_var / left_tr_count + left_ct_var / left_ct_count)
+            if right_tr_count > 0 and right_ct_count > 0:
+                denom_right = sqrt(right_tr_var / right_tr_count + right_ct_var / right_ct_count)
             if denom_left > 0.:
                 t_left_sum += left_tau / denom_left
             if denom_right > 0.:
                 t_right_sum += right_tau / denom_right
-    
+
             # Per-treatment squared difference in taus between sides
-            inv_n_sum = (1.0 / right_tr_count + 1.0 / right_ct_count +
-                        1.0 / left_tr_count + 1.0 / left_ct_count)
+            inv_n_sum = 0.0
+            if right_tr_count > 0 and right_ct_count > 0 and left_tr_count > 0 and left_ct_count > 0:
+                inv_n_sum = (1.0 / right_tr_count + 1.0 / right_ct_count +
+                            1.0 / left_tr_count + 1.0 / left_ct_count)
 
             # Pooled variance across four cells (left/right × tr/ct)
             pooled_var_t = 0.0
diff --git a/tests/test_causal_trees.py b/tests/test_causal_trees.py
@@ -275,3 +275,50 @@ def test_unbiased_sampling_error(
             crforest_test_var = crforest.calculate_error(X_train=X_train, X_test=X_test)
             assert (crforest_test_var > 0).all()
             assert crforest_test_var.shape[0] == y_test.shape[0]
+
+
+def test_CausalRandomForestRegressor_no_inf_predictions():
+    """Test that CausalRandomForestRegressor does not predict inf values
+    when some tree splits have zero-count treatment/control groups (#589)."""
+    np.random.seed(RANDOM_SEED)
+    n = 100
+    X = np.random.randn(n, 5)
+    # Heavily imbalanced: very few treated samples so tree splits
+    # can produce nodes with zero treatment count
+    treatment = np.array([0] * 90 + [1] * 10)
+    y = np.random.randn(n)
+
+    model = CausalRandomForestRegressor(
+        criterion="causal_mse",
+        control_name=0,
+        n_estimators=10,
+        min_samples_leaf=1,
+        random_state=RANDOM_SEED,
+    )
+    model.fit(X=X, treatment=treatment, y=y)
+    preds = model.predict(X=X)
+
+    assert np.all(np.isfinite(preds)), "Predictions contain inf or NaN values"
+
+
+def test_CausalRandomForestRegressor_no_inf_predictions_ttest():
+    """Test that CausalRandomForestRegressor with criterion='t_test' does not
+    predict inf values when some tree splits have zero-count
+    treatment/control groups (#589)."""
+    np.random.seed(RANDOM_SEED)
+    n = 100
+    X = np.random.randn(n, 5)
+    treatment = np.array([0] * 90 + [1] * 10)
+    y = np.random.randn(n)
+
+    model = CausalRandomForestRegressor(
+        criterion="t_test",
+        control_name=0,
+        n_estimators=10,
+        min_samples_leaf=1,
+        random_state=RANDOM_SEED,
+    )
+    model.fit(X=X, treatment=treatment, y=y)
+    preds = model.predict(X=X)
+
+    assert np.all(np.isfinite(preds)), "Predictions contain inf or NaN values"