2 changes: 1 addition & 1 deletion docs/source/user_guide/modeling_gcm/model_evaluation.rst
@@ -81,7 +81,7 @@ the chain structure example X→Y→Z again:

If non-root node and the data is categorical:
A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected.
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model. Here, different model classes are evaluated using the log loss metric and the best performing model class is selected.

In total, 3 nodes were analyzed:

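As a rough illustration of the sampling step described in this paragraph: the uniform noise N_i can be turned into a class by inverting the cumulative conditional probabilities. This is a minimal sketch, not the dowhy implementation, and the helper name sample_class is hypothetical.

import numpy as np

def sample_class(class_probabilities, noise):
    # class_probabilities: conditional probabilities for one sample, summing to 1.
    # noise: a draw from a uniform distribution on [0, 1], playing the role of N_i.
    cumulative = np.cumsum(class_probabilities)
    return int(np.searchsorted(cumulative, noise))

# With probabilities [0.2, 0.5, 0.3], a noise value of 0.65 falls into the
# second class's interval [0.2, 0.7), so class index 1 is returned.
print(sample_class(np.array([0.2, 0.5, 0.3]), 0.65))  # -> 1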
46 changes: 24 additions & 22 deletions dowhy/gcm/auto.py
@@ -11,7 +11,6 @@
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from dowhy.gcm import config
from dowhy.gcm.causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, DiscreteAdditiveNoiseModel
@@ -30,6 +29,7 @@
)
from dowhy.gcm.ml.classification import (
create_ada_boost_classifier,
create_decision_tree_classifier,
create_extra_trees_classifier,
create_gaussian_nb_classifier,
create_knn_classifier,
@@ -55,8 +55,9 @@
from dowhy.graph import get_ordered_predecessors, is_root_node

_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD = [
partial(create_logistic_regression_classifier, max_iter=10000),
create_hist_gradient_boost_classifier,
partial(create_logistic_regression_classifier, max_iter=10000),
create_decision_tree_classifier,
]
_LIST_OF_POTENTIAL_REGRESSORS_GOOD = [
create_linear_regressor,
@@ -152,9 +153,8 @@ def __str__(self):
summary_strings.append(
"A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).\n"
"Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a "
"class (category) using the conditional probability distribution produced by a "
"classification model."
"Here, different model classes are evaluated using the (negative) F1 score and the best"
"class (category) using the conditional probability distribution produced by a classification model. "
"Here, different model classes are evaluated using the log loss metric and the best"
" performing model class is selected."
)
summary_strings.append("\nIn total, %d nodes were analyzed:" % len(list(self._nodes)))
@@ -223,7 +223,7 @@ def assign_causal_mechanisms(
A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the
conditional probability distribution produced by a classification model. Here, different model classes are evaluated
using the (negative) F1 score and the best performing model class is selected.
using the log loss metric and the best performing model class is selected.

The current model zoo is:

@@ -528,20 +528,13 @@ def find_best_model(
metric_name = "given"

if metric is None:
metric_name = "(negative) F1"
if is_classification_problem:
metric = lambda y_true, y_preds: -metrics.f1_score(
y_true, y_preds, average="macro", zero_division=0
) # Higher score is better
metric_name = "log loss"
metric = metrics.log_loss # Lower score is better (better calibrated probabilities)
else:
metric_name = "mean squared error (MSE)"
metric = metrics.mean_squared_error

labelBinarizer = None
if is_classification_problem:
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit(Y)

if is_classification_problem:
if len(np.unique(Y)) == 1:
raise ValueError(
@@ -559,20 +552,29 @@ def estimate_average_score(prediction_model_factory: Callable[[], PredictionMode

with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
for train_indices, test_indices in kfolds:
if is_classification_problem and len(np.unique(Y[train_indices[:max_samples_per_split]])) == 1:
continue
if is_classification_problem:
unique_training_labels = np.unique(Y[train_indices[:max_samples_per_split]])
unique_test_labels = np.unique(Y[test_indices[:max_samples_per_split]])
if len(unique_training_labels) == 1 or len(unique_test_labels) == 1:
continue

model_instance = prediction_model_factory()
model_instance.fit(X[train_indices[:max_samples_per_split]], Y[train_indices[:max_samples_per_split]])

y_true = Y[test_indices[:max_samples_per_split]]
y_pred = model_instance.predict(X[test_indices[:max_samples_per_split]])
if labelBinarizer is not None:
y_true = labelBinarizer.transform(y_true)
y_pred = labelBinarizer.transform(y_pred)

average_result.append(metric(y_true, y_pred))
if is_classification_problem:
# For classification, use probabilities for log loss calculation
y_pred_proba = model_instance.predict_probabilities(X[test_indices[:max_samples_per_split]])
# Convert string labels to label indices for log_loss
label_to_idx = {label: idx for idx, label in enumerate(unique_test_labels)}
y_true_indices = np.array([label_to_idx[label] for label in y_true.flatten()])
average_result.append(metric(y_true_indices, y_pred_proba))
else:
y_pred = model_instance.predict(X[test_indices[:max_samples_per_split]])
average_result.append(metric(y_true, y_pred))

if len(average_result) == 0:
return float("inf")
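For intuition on the metric switch above, here is a simplified, self-contained sketch of the selection idea: score each candidate classifier by its mean cross-validated log loss on predicted probabilities and keep the one with the lowest score. It uses plain sklearn estimators instead of dowhy's prediction-model wrappers and omits the subsampling and label-presence checks from find_best_model, so it is illustrative only.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier


def mean_cv_log_loss(make_model, X, y, n_splits=5):
    scores = []
    for train_idx, test_idx in StratifiedKFold(n_splits=n_splits).split(X, y):
        model = make_model()
        model.fit(X[train_idx], y[train_idx])
        # log_loss scores probabilities; passing labels=model.classes_ keeps the
        # column order of predict_proba aligned with the true labels.
        proba = model.predict_proba(X[test_idx])
        scores.append(log_loss(y[test_idx], proba, labels=model.classes_))
    return float(np.mean(scores))


rng = np.random.default_rng(0)
X = rng.normal(size=(300, 4))
y = (X[:, 0] + X[:, 1] > 0).astype(str)

candidates = {
    "logistic regression": lambda: LogisticRegression(max_iter=10000),
    "decision tree": lambda: DecisionTreeClassifier(),
}
scores = {name: mean_cv_log_loss(factory, X, y) for name, factory in candidates.items()}
print(min(scores, key=scores.get), scores)  # the lower (better calibrated) log loss wins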
5 changes: 5 additions & 0 deletions dowhy/gcm/ml/classification.py
@@ -6,6 +6,7 @@
from packaging import version
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

from dowhy.gcm.ml.prediction_model import PredictionModel

@@ -107,3 +108,7 @@ def create_polynom_logistic_regression_classifier(
PolynomialFeatures(degree=degree, include_bias=False), LogisticRegression(**kwargs_logistic_regression)
)
)


def create_decision_tree_classifier() -> SklearnClassificationModel:
return SklearnClassificationModel(DecisionTreeClassifier())
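A short usage sketch of the new factory. This is illustrative only and assumes numeric features; the wrapper API (fit, predict_probabilities, sklearn_model) follows the members already used elsewhere in this diff.

import numpy as np

from dowhy.gcm.ml.classification import create_decision_tree_classifier

X = np.random.normal(size=(200, 3))
y = (X[:, 0] > 0).astype(str)

model = create_decision_tree_classifier()
model.fit(X, y)
print(model.predict_probabilities(X[:2]))  # class probabilities for the first two samples
print(type(model.sklearn_model).__name__)  # DecisionTreeClassifier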
104 changes: 97 additions & 7 deletions tests/gcm/test_auto.py
@@ -9,6 +9,7 @@
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, LogisticRegression, RidgeCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from dowhy import gcm
from dowhy.gcm import (
@@ -40,19 +41,84 @@ def _generate_non_linear_regression_data():


def _generate_linear_classification_data():
X = np.random.normal(0, 1, (1000, 5))
X = np.random.normal(0, 1, (100, 5))
Y = (np.sum(X * np.random.uniform(-5, 5, X.shape[1]), axis=1) > 0).astype(str)

return X, Y


def _generate_non_classification_data():
def _generate_non_linear_classification_data():
X = np.random.normal(0, 1, (1000, 5))
Y = (np.sum(np.exp(X), axis=1) > np.median(np.sum(np.exp(X), axis=1))).astype(str)

return X, Y


def _generate_linear_multiclass_classification_data_with_mixed_features():
"""Generate multi-class classification data with mixed categorical and numerical features (linear relationship)."""
n_samples = 100

# Numerical features
num_feat1 = np.random.normal(0, 1, n_samples)
num_feat2 = np.random.normal(0, 1, n_samples)

# Categorical features
cat_feat1 = np.random.choice(["TypeA", "TypeB"], n_samples)
cat_feat2 = np.random.choice(["Group1", "Group2", "Group3"], n_samples)

# Create target variable based on linear combination of features
# Convert categorical to numerical for decision making
cat1_numeric = np.where(cat_feat1 == "TypeA", 1, -1)
cat2_numeric = np.where(cat_feat2 == "Group1", 2, np.where(cat_feat2 == "Group2", 0, -2))

# Linear combination to determine class
decision_value = 2 * num_feat1 + 1.5 * num_feat2 + 0.8 * cat1_numeric + 0.5 * cat2_numeric

# Convert to 3 classes
Y = np.where(decision_value > 1, "Class_A", np.where(decision_value > -1, "Class_B", "Class_C"))

# Combine features
X = np.column_stack([num_feat1, num_feat2, cat_feat1, cat_feat2])

return X, Y


def _generate_non_linear_multiclass_classification_data_with_mixed_features():
"""Generate multi-class classification data with mixed categorical and numerical features (non-linear relationship)."""
n_samples = 1000

# Numerical features
num_feat1 = np.random.normal(0, 1, n_samples)
num_feat2 = np.random.normal(0, 1, n_samples)

# Categorical features
cat_feat1 = np.random.choice(["TypeA", "TypeB"], n_samples)
cat_feat2 = np.random.choice(["Group1", "Group2", "Group3"], n_samples)

# Create target variable based on non-linear combination of features
# Convert categorical to numerical for decision making
cat1_numeric = np.where(cat_feat1 == "TypeA", 1, -1)
cat2_numeric = np.where(cat_feat2 == "Group1", 2, np.where(cat_feat2 == "Group2", 0, -2))

# Non-linear combination: use exponentials and products
decision_value = (
np.exp(num_feat1 * 0.5)
+ np.sin(num_feat2 * 2)
+ num_feat1 * num_feat2 * 0.3
+ cat1_numeric * np.exp(num_feat2 * 0.2)
+ cat2_numeric * np.cos(num_feat1)
)

# Convert to 3 classes based on percentiles
p33, p67 = np.percentile(decision_value, [33, 67])
Y = np.where(decision_value > p67, "Class_A", np.where(decision_value > p33, "Class_B", "Class_C"))

# Combine features
X = np.column_stack([num_feat1, num_feat2, cat_feat1, cat_feat2])

return X, Y


@flaky(max_runs=3)
def test_given_linear_regression_problem_when_auto_assign_causal_models_with_good_quality_returns_linear_model():
X, Y = _generate_linear_regression_data()
@@ -148,7 +214,7 @@ def test_given_linear_classification_problem_when_auto_assign_causal_models_with

@flaky(max_runs=3)
def test_given_non_linear_classification_problem_when_auto_assign_causal_models_with_good_quality_returns_non_linear_model():
X, Y = _generate_non_classification_data()
X, Y = _generate_non_linear_classification_data()

causal_model = ProbabilisticCausalModel(
nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y"), ("X4", "Y")])
Expand All @@ -164,7 +230,7 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_

@flaky(max_runs=3)
def test_given_non_linear_classification_problem_when_auto_assign_causal_models_with_better_quality_returns_non_linear_model():
X, Y = _generate_non_classification_data()
X, Y = _generate_non_linear_classification_data()

causal_model = ProbabilisticCausalModel(
nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y"), ("X4", "Y")])
@@ -384,7 +450,7 @@ def test_given_continuous_data_when_print_auto_summary_then_returns_expected_for

If non-root node and the data is categorical:
A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected.
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model. Here, different model classes are evaluated using the log loss metric and the best performing model class is selected.

In total, 6 nodes were analyzed:

@@ -459,7 +525,7 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo

If non-root node and the data is categorical:
A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected.
Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model. Here, different model classes are evaluated using the log loss metric and the best performing model class is selected.

In total, 6 nodes were analyzed:

@@ -483,7 +549,7 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo
in summary_string
)
assert "This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N)." in summary_string
assert "For the model selection, the following models were evaluated on the (negative) F1 metric:" in summary_string
assert "For the model selection, the following models were evaluated on the log loss metric:" in summary_string
assert (
"""===Note===
Note, based on the selected auto assignment quality, the set of evaluated models changes.
@@ -602,3 +668,27 @@ def test_given_missing_data_mixed_numerical_and_categorical_when_auto_assign_mec

# Just check if it doesn't raise errors.
gcm.intrinsic_causal_influence(causal_model, "Z")


@flaky(max_runs=3)
def test_given_linear_multiclass_mixed_features_when_auto_assign_causal_models_with_good_quality_returns_linear_model():
X, Y = _generate_linear_multiclass_classification_data_with_mixed_features()

causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y")]))
data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
data.update({"Y": Y})
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
assert isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, LogisticRegression)


@flaky(max_runs=3)
def test_given_non_linear_multiclass_mixed_features_when_auto_assign_causal_models_with_good_quality_returns_non_linear_model():
X, Y = _generate_non_linear_multiclass_classification_data_with_mixed_features()

causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y")]))
data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
data.update({"Y": Y})
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
assert isinstance(
causal_model.causal_mechanism("Y").classifier_model.sklearn_model, DecisionTreeClassifier
) or isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier)