Commit 3c759f9

fix: Leverage get_scorer and the fact it is a _BaseScorer (#1723)
closes #1715 closes #1714 closes #1716 closes #1686

This PR:
- fixes the way we handle scikit-learn scorer names
- updates the API documentation to explain the difference between built-in metrics and scikit-learn scorer names
- fixes the existing tests
- adds more tests for metrics that take additional parameters

---------
Co-authored-by: Auguste Baum <[email protected]>
1 parent 7652cc1 commit 3c759f9
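
In practice, the change lets scikit-learn scorer names be passed to `report_metrics` alongside skore's built-in metric names. A minimal sketch of the new behaviour, assuming a fitted classifier and the top-level `skore.EstimatorReport` import used in the tests below; the synthetic dataset is only illustrative:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from skore import EstimatorReport

X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

report = EstimatorReport(classifier, X_test=X_test, y_test=y_test)

# Mix a built-in metric name with scikit-learn scorer names; the scorer names
# are resolved through sklearn.metrics.get_scorer and displayed with
# title-cased labels ("Log Loss", "ROC AUC", ...).
result = report.metrics.report_metrics(
    scoring=["accuracy", "neg_log_loss", "roc_auc"],
    indicator_favorability=True,
)
print(result)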

File tree

5 files changed, +111 -117 lines


skore/src/skore/sklearn/_comparison/metrics_accessor.py

Lines changed: 12 additions & 7 deletions
@@ -83,13 +83,18 @@ def report_metrics(
             provided when creating the report.
 
         scoring : list of str, callable, or scorer, default=None
-            The metrics to report. You can get the possible list of strings by calling
-            `report.metrics.help()`. When passing a callable, it should take as
-            arguments ``y_true``, ``y_pred`` as the two first arguments. Additional
-            arguments can be passed as keyword arguments and will be forwarded with
-            `scoring_kwargs`. If the callable API is too restrictive (e.g. need to pass
-            same parameter name with different values), you can use scikit-learn scorers
-            as provided by :func:`sklearn.metrics.make_scorer`.
+            The metrics to report. The possible values in the list are:
+
+            - if a string, either one of the built-in metrics or a scikit-learn scorer
+              name. You can get the possible list of string using
+              `report.metrics.help()` or :func:`sklearn.metrics.get_scorer_names` for
+              the built-in metrics or the scikit-learn scorers, respectively.
+            - if a callable, it should take as arguments `y_true`, `y_pred` as the two
+              first arguments. Additional arguments can be passed as keyword arguments
+              and will be forwarded with `scoring_kwargs`.
+            - if the callable API is too restrictive (e.g. need to pass
+              same parameter name with different values), you can use scikit-learn
+              scorers as provided by :func:`sklearn.metrics.make_scorer`.
 
         scoring_names : list of str, default=None
             Used to overwrite the default scoring names in the report. It should be of
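
The three forms enumerated by the updated docstring can be mixed freely. A short sketch of the callable and `make_scorer` variants, reusing the `report` object built in the earlier sketch; the `weighted_fbeta` helper is hypothetical and only illustrates how extra keyword arguments travel through `scoring_kwargs`:

from sklearn.metrics import fbeta_score, make_scorer

# A plain callable taking y_true and y_pred; extra keyword arguments are
# forwarded via `scoring_kwargs`.
def weighted_fbeta(y_true, y_pred, beta=2.0):
    return fbeta_score(y_true, y_pred, beta=beta)

report.metrics.report_metrics(scoring=[weighted_fbeta], scoring_kwargs={"beta": 0.5})

# When the callable API is too restrictive (e.g. the same parameter name with
# different values), wrap the metric with make_scorer instead.
report.metrics.report_metrics(
    scoring=[
        make_scorer(fbeta_score, beta=0.5, response_method="predict"),
        make_scorer(fbeta_score, beta=2.0, response_method="predict"),
    ]
)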

skore/src/skore/sklearn/_cross_validation/metrics_accessor.py

Lines changed: 12 additions & 7 deletions
@@ -83,13 +83,18 @@ def report_metrics(
             provided when creating the report.
 
         scoring : list of str, callable, or scorer, default=None
-            The metrics to report. You can get the possible list of string by calling
-            `report.metrics.help()`. When passing a callable, it should take as
-            arguments `y_true`, `y_pred` as the two first arguments. Additional
-            arguments can be passed as keyword arguments and will be forwarded with
-            `scoring_kwargs`. If the callable API is too restrictive (e.g. need to pass
-            same parameter name with different values), you can use scikit-learn scorers
-            as provided by :func:`sklearn.metrics.make_scorer`.
+            The metrics to report. The possible values in the list are:
+
+            - if a string, either one of the built-in metrics or a scikit-learn scorer
+              name. You can get the possible list of string using
+              `report.metrics.help()` or :func:`sklearn.metrics.get_scorer_names` for
+              the built-in metrics or the scikit-learn scorers, respectively.
+            - if a callable, it should take as arguments `y_true`, `y_pred` as the two
+              first arguments. Additional arguments can be passed as keyword arguments
+              and will be forwarded with `scoring_kwargs`.
+            - if the callable API is too restrictive (e.g. need to pass
+              same parameter name with different values), you can use scikit-learn
+              scorers as provided by :func:`sklearn.metrics.make_scorer`.
 
         scoring_names : list of str, default=None
             Used to overwrite the default scoring names in the report. It should be of
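
The same docstring applies to the cross-validation report. To discover which strings are valid, `report.metrics.help()` lists skore's built-in metric names, while `sklearn.metrics.get_scorer_names` (referenced above) lists the scikit-learn scorer names; a quick way to check a candidate string, shown as a hedged sketch:

from sklearn.metrics import get_scorer_names

scorer_names = set(get_scorer_names())
print("neg_log_loss" in scorer_names)  # True: usable as a scoring string
print("log_loss" in scorer_names)      # False: not a registered scorer name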

skore/src/skore/sklearn/_estimator/metrics_accessor.py

Lines changed: 41 additions & 60 deletions
@@ -90,13 +90,18 @@ def report_metrics(
             provided when creating the report.
 
         scoring : list of str, callable, or scorer, default=None
-            The metrics to report. You can get the possible list of string by calling
-            `report.metrics.help()`. When passing a callable, it should take as
-            arguments `y_true`, `y_pred` as the two first arguments. Additional
-            arguments can be passed as keyword arguments and will be forwarded with
-            `scoring_kwargs`. If the callable API is too restrictive (e.g. need to pass
-            same parameter name with different values), you can use scikit-learn scorers
-            as provided by :func:`sklearn.metrics.make_scorer`.
+            The metrics to report. The possible values in the list are:
+
+            - if a string, either one of the built-in metrics or a scikit-learn scorer
+              name. You can get the possible list of string using
+              `report.metrics.help()` or :func:`sklearn.metrics.get_scorer_names` for
+              the built-in metrics or the scikit-learn scorers, respectively.
+            - if a callable, it should take as arguments `y_true`, `y_pred` as the two
+              first arguments. Additional arguments can be passed as keyword arguments
+              and will be forwarded with `scoring_kwargs`.
+            - if the callable API is too restrictive (e.g. need to pass
+              same parameter name with different values), you can use scikit-learn
+              scorers as provided by :func:`sklearn.metrics.make_scorer`.
 
         scoring_names : list of str, default=None
             Used to overwrite the default scoring names in the report. It should be of
@@ -138,14 +143,13 @@ def report_metrics(
         Recall              0.93... (↗︎)
         ROC AUC             0.99... (↗︎)
         Brier score         0.03... (↘︎)
-
         >>> # Using scikit-learn metrics
         >>> report.metrics.report_metrics(
-        scoring=["neg_log_loss"],
-        indicator_favorability=True)
-                            LogisticRegression Favorability
-        Metric
-        Negative Log Loss   -0.10... (↘︎)
+        ...     scoring=["f1"], pos_label=1, indicator_favorability=True
+        ... )
+                                  LogisticRegression Favorability
+        Metric   Label / Average
+        F1 Score 1                0.96... (↗︎)
         """
         if data_source == "X_y":
             # optimization of the hash computation to avoid recomputing it
@@ -194,6 +198,28 @@ def report_metrics(
         scores = []
         favorability_indicator = []
         for metric_name, metric in zip(scoring_names, scoring):
+            if isinstance(metric, str) and not (
+                (metric.startswith("_") and metric[1:] in self._SCORE_OR_LOSS_INFO)
+                or metric in self._SCORE_OR_LOSS_INFO
+            ):
+                try:
+                    metric = metrics.get_scorer(metric)
+                except ValueError as err:
+                    raise ValueError(
+                        f"Invalid metric: {metric!r}. "
+                        f"Please use a valid metric from the "
+                        f"list of supported metrics: "
+                        f"{list(self._SCORE_OR_LOSS_INFO.keys())} "
+                        "or a valid scikit-learn scoring string."
+                    ) from err
+                if scoring_kwargs is not None:
+                    raise ValueError(
+                        "The `scoring_kwargs` parameter is not supported when "
+                        "`scoring` is a scikit-learn scorer name. Use the function "
+                        "`sklearn.metrics.make_scorer` to create a scorer with "
+                        "additional parameters."
+                    )
+
             # NOTE: we have to check specifically for `_BaseScorer` first because this
             # is also a callable but it has a special private API that we can leverage
             if isinstance(metric, _BaseScorer):
@@ -221,8 +247,8 @@
                 elif pos_label is not None:
                     metrics_kwargs["pos_label"] = pos_label
                 if metric_name is None:
-                    metric_name = metric._score_func.__name__
-                metric_favorability = "↗︎" if metric._sign == 1 else "↘︎"
+                    metric_name = metric._score_func.__name__.replace("_", " ").title()
+                metric_favorability = "(↗︎)" if metric._sign == 1 else "(↘︎)"
                 favorability_indicator.append(metric_favorability)
             elif isinstance(metric, str) or callable(metric):
                 if isinstance(metric, str):
@@ -248,51 +274,6 @@ def report_metrics(
                     if metric_name is None:
                         metric_name = f"{self._SCORE_OR_LOSS_INFO[metric]['name']}"
                     metric_favorability = self._SCORE_OR_LOSS_INFO[metric]["icon"]
-
-                    # Handle scikit-learn metrics by trying get_scorer
-                    else:
-                        from sklearn.metrics import get_scorer
-
-                        try:
-                            scorer = get_scorer(metric)
-                            metric_function = scorer._score_func
-                            response_method = scorer._response_method
-
-                            display_name = metric
-                            if metric.startswith("neg_"):
-                                display_name = metric[4:].replace("_", " ")
-                                metric_fn = partial(
-                                    self._custom_metric,
-                                    metric_function=metric_function,
-                                    response_method=response_method,
-                                )
-                                metrics_kwargs = {**scorer._kwargs}
-                                metrics_kwargs["data_source_hash"] = data_source_hash
-                                metric_favorability = "↘︎"
-                                favorability_indicator.append(metric_favorability)
-
-                            if metric_name is None:
-                                metric_name = display_name.title()
-
-                            metric_fn = partial(
-                                self._custom_metric,
-                                metric_function=metric_function,
-                                response_method=response_method,
-                            )
-                            metrics_kwargs = {**scorer._kwargs}
-                            metrics_kwargs["data_source_hash"] = data_source_hash
-                            metric_favorability = (
-                                "(↘︎)" if metric.startswith("neg_") else "(↗︎)"
-                            )
-                        except ValueError as err:
-                            raise ValueError(
-                                f"Invalid metric: {metric!r}. "
-                                f"Please use a valid metric from the "
-                                f"list of supported metrics: "
-                                f"{list(self._SCORE_OR_LOSS_INFO.keys())} "
-                                "or a valid scikit-learn scoring string."
-                            ) from err
-                        favorability_indicator.append(metric_favorability)
             else:
                 # Handle callable metrics
                 metric_fn = partial(self._custom_metric, metric_function=metric)
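
The new branch above routes any string that is not a built-in metric through `sklearn.metrics.get_scorer`, then reuses the returned scorer's private attributes (the `_BaseScorer` API the commit title refers to) to build the display name and favorability arrow. A standalone sketch of that derivation outside skore; note that `_score_func` and `_sign` are private scorer attributes:

from sklearn import metrics

# Resolve a scorer name, then derive the label and arrow the same way the
# accessor does.
scorer = metrics.get_scorer("neg_log_loss")
display_name = scorer._score_func.__name__.replace("_", " ").title()
favorability = "(↗︎)" if scorer._sign == 1 else "(↘︎)"
print(display_name, favorability)  # Log Loss (↘︎)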

skore/tests/unit/sklearn/cross_validation/test_cross_validation.py

Lines changed: 2 additions & 2 deletions
@@ -885,7 +885,7 @@ def test_cross_validation_report_custom_metric(binary_classification_data):
         response_method="predict",
     )
     assert result.shape == (1, 2)
-    assert result.index == ["accuracy_score"]
+    assert result.index == ["Accuracy Score"]
 
 
 @pytest.mark.parametrize(
@@ -936,7 +936,7 @@ def predict(self, X):
         response_method="predict",
     )
     assert result.shape == (1, 2)
-    assert result.index == ["accuracy_score"]
+    assert result.index == ["Accuracy Score"]
 
 
 def test_cross_validation_report_brier_score_requires_probabilities():
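
The expected index changes because custom metric names are now pretty-printed from the underlying function name, using the same `replace("_", " ").title()` transformation shown in the estimator accessor hunk above; for example:

from sklearn.metrics import accuracy_score

# "accuracy_score" becomes "Accuracy Score" in the report index.
assert accuracy_score.__name__.replace("_", " ").title() == "Accuracy Score"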

skore/tests/unit/sklearn/estimator/test_estimator.py

Lines changed: 44 additions & 41 deletions
@@ -15,7 +15,6 @@
 from sklearn.metrics import (
     accuracy_score,
     f1_score,
-    get_scorer,
     make_scorer,
     median_absolute_error,
     r2_score,
@@ -1349,49 +1348,30 @@ def test_estimator_report_average_return_float(binary_classification_data):
 def test_estimator_report_metric_with_neg_metrics(binary_classification_data):
     """Check that scikit-learn metrics with 'neg_' prefix are handled correctly."""
     classifier, X_test, y_test = binary_classification_data
-    report = EstimatorReport(
-        classifier,
-        X_test=X_test,
-        y_test=y_test,
-    )
-
-    # Use scikit-learn's get_scorer to handle neg_log_loss
-    scorer = get_scorer("neg_log_loss")
-    result = report.metrics.report_metrics(scoring=[scorer])
+    report = EstimatorReport(classifier, X_test=X_test, y_test=y_test)
 
-    # Check that the metric name is displayed properly (as 'log_loss')
-    assert "log_loss" in result.index
-
-    # Get the neg_log_loss score directly - use the fitted model from the report
-    neg_log_loss_value = get_scorer("neg_log_loss")(report.estimator_, X_test, y_test)
-
-    # Check that the reported log_loss matches the absolute value of neg_log_loss
-    log_loss_value = result.loc["log_loss", classifier.__class__.__name__]
-    assert np.isclose(log_loss_value, abs(neg_log_loss_value))
+    result = report.metrics.report_metrics(scoring=["neg_log_loss"])
+    assert "Log Loss" in result.index
+    assert result.loc["Log Loss", "RandomForestClassifier"] == pytest.approx(
+        report.metrics.log_loss()
+    )
 
 
 def test_estimator_report_with_sklearn_scoring_strings(binary_classification_data):
     """Test that scikit-learn metric strings can be passed to report_metrics."""
     classifier, X_test, y_test = binary_classification_data
-    class_report = EstimatorReport(
-        classifier,
-        X_test=X_test,
-        y_test=y_test,
-    )
+    class_report = EstimatorReport(classifier, X_test=X_test, y_test=y_test)
 
-    # Test single scikit-learn metric string
     result = class_report.metrics.report_metrics(scoring=["neg_log_loss"])
     assert "Log Loss" in result.index.get_level_values(0)
 
-    # Test with multiple scikit-learn metrics
     result_multi = class_report.metrics.report_metrics(
         scoring=["accuracy", "neg_log_loss", "roc_auc"], indicator_favorability=True
     )
     assert "Accuracy" in result_multi.index.get_level_values(0)
     assert "Log Loss" in result_multi.index.get_level_values(0)
     assert "ROC AUC" in result_multi.index.get_level_values(0)
 
-    # Test favorability indicators
     favorability = result_multi.loc["Accuracy"]["Favorability"]
     assert favorability == "(↗︎)"
     favorability = result_multi.loc["Log Loss"]["Favorability"]
@@ -1401,13 +1381,8 @@ def test_estimator_report_with_sklearn_scoring_strings(binary_classification_dat
 def test_estimator_report_with_sklearn_scoring_strings_regression(regression_data):
     """Test scikit-learn regression metric strings in report_metrics."""
     regressor, X_test, y_test = regression_data
-    reg_report = EstimatorReport(
-        regressor,
-        X_test=X_test,
-        y_test=y_test,
-    )
+    reg_report = EstimatorReport(regressor, X_test=X_test, y_test=y_test)
 
-    # Test regression metrics
     reg_result = reg_report.metrics.report_metrics(
         scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"],
         indicator_favorability=True,
@@ -1417,21 +1392,15 @@ def test_estimator_report_with_sklearn_scoring_strings_regression(regression_dat
     assert "Mean Absolute Error" in reg_result.index.get_level_values(0)
     assert "R²" in reg_result.index.get_level_values(0)
 
-    # Check favorability
     assert reg_result.loc["Mean Squared Error"]["Favorability"] == "(↘︎)"
     assert reg_result.loc["R²"]["Favorability"] == "(↗︎)"
 
 
 def test_estimator_report_with_scoring_strings_regression(regression_data):
     """Test scikit-learn regression metric strings in report_metrics."""
     regressor, X_test, y_test = regression_data
-    reg_report = EstimatorReport(
-        regressor,
-        X_test=X_test,
-        y_test=y_test,
-    )
+    reg_report = EstimatorReport(regressor, X_test=X_test, y_test=y_test)
 
-    # Test regression metrics
     reg_result = reg_report.metrics.report_metrics(
         scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"],
         indicator_favorability=True,
@@ -1441,6 +1410,40 @@ def test_estimator_report_with_scoring_strings_regression(regression_data):
     assert "Mean Absolute Error" in reg_result.index.get_level_values(0)
     assert "R²" in reg_result.index.get_level_values(0)
 
-    # Check favorability
     assert reg_result.loc["Mean Squared Error"]["Favorability"] == "(↘︎)"
     assert reg_result.loc["R²"]["Favorability"] == "(↗︎)"
+
+
+def test_estimator_report_sklearn_scorer_names_pos_label(binary_classification_data):
+    """Check that `pos_label` is dispatched with scikit-learn scorer names."""
+    classifier, X_test, y_test = binary_classification_data
+    report = EstimatorReport(classifier, X_test=X_test, y_test=y_test)
+
+    result = report.metrics.report_metrics(scoring=["f1"], pos_label=0)
+    assert "F1 Score" in result.index.get_level_values(0)
+    assert 0 in result.index.get_level_values(1)
+    f1_scorer = make_scorer(
+        f1_score, response_method="predict", average="binary", pos_label=0
+    )
+    assert result.loc[("F1 Score", 0), "RandomForestClassifier"] == pytest.approx(
+        f1_scorer(classifier, X_test, y_test)
+    )
+
+
+def test_estimator_report_sklearn_scorer_names_scoring_kwargs(
+    binary_classification_data,
+):
+    """Check that `scoring_kwargs` is not supported when `scoring` is a scikit-learn
+    scorer name.
+    """
+    classifier, X_test, y_test = binary_classification_data
+    report = EstimatorReport(classifier, X_test=X_test, y_test=y_test)
+
+    err_msg = (
+        "The `scoring_kwargs` parameter is not supported when `scoring` is a "
+        "scikit-learn scorer name."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        report.metrics.report_metrics(
+            scoring=["f1"], scoring_kwargs={"average": "macro"}
+        )
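
The two new tests pin down the scorer-name behaviour: `pos_label` is forwarded to scorer names such as "f1", while `scoring_kwargs` is rejected for them. A small sketch of the workaround suggested by the error message, assuming `report` is an `EstimatorReport` built on a fitted classifier as in the tests above:

from sklearn.metrics import f1_score, make_scorer

# Parameters such as `average` cannot go through `scoring_kwargs` for a scorer
# name, so build an explicit scorer with make_scorer instead.
macro_f1 = make_scorer(f1_score, response_method="predict", average="macro")
report.metrics.report_metrics(scoring=[macro_f1])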
