feat(CrossValidationReport): Add threshold averaging for roc plot

foster999 · foster999 · commit eb25f592e919 · 2025-05-23T12:18:16.000+01:00
diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
@@ -1100,6 +1100,7 @@ def _get_display(
         *,
         X: Optional[ArrayLike] = None,
         y: Optional[ArrayLike] = None,
+        average: Optional[Literal["threshold"]] = None,
         data_source: DataSource,
         response_method: str,
         display_class: type[
@@ -1208,6 +1209,7 @@ def _get_display(
             display = display_class._compute_data_for_display(
                 y_true=y_true,
                 y_pred=y_pred,
+                average=average,
                 report_type="cross-validation",
                 estimators=[
                     report.estimator_ for report in self._parent.estimator_reports_
@@ -1232,6 +1234,7 @@ def roc(
         data_source: DataSource = "test",
         X: Optional[ArrayLike] = None,
         y: Optional[ArrayLike] = None,
+        average: Optional[Literal["threshold"]] = None,
         pos_label: Optional[PositiveLabel] = None,
     ) -> RocCurveDisplay:
         """Plot the ROC curve.
@@ -1280,6 +1283,7 @@ def roc(
                 data_source=data_source,
                 X=X,
                 y=y,
+                average=average,
                 response_method=response_method,
                 display_class=RocCurveDisplay,
                 display_kwargs=display_kwargs,
diff --git a/skore/src/skore/sklearn/_plot/metrics/precision_recall_curve.py b/skore/src/skore/sklearn/_plot/metrics/precision_recall_curve.py
@@ -545,6 +545,7 @@ def _compute_data_for_display(
         cls,
         y_true: Sequence[YPlotData],
         y_pred: Sequence[YPlotData],
+        average: Optional[Literal["threshold"]] = None,
         *,
         report_type: Literal["comparison-estimator", "cross-validation", "estimator"],
         estimators: Sequence[BaseEstimator],
diff --git a/skore/src/skore/sklearn/_plot/metrics/roc_curve.py b/skore/src/skore/sklearn/_plot/metrics/roc_curve.py
@@ -618,6 +618,7 @@ def _compute_data_for_display(
         cls,
         y_true: Sequence[YPlotData],
         y_pred: Sequence[YPlotData],
+        average: Optional[Literal["threshold"]] = None,
         *,
         report_type: Literal["comparison-estimator", "cross-validation", "estimator"],
         estimators: Sequence[BaseEstimator],
@@ -672,21 +673,40 @@ def _compute_data_for_display(
 
         fpr: dict[PositiveLabel, list[ArrayLike]] = defaultdict(list)
         tpr: dict[PositiveLabel, list[ArrayLike]] = defaultdict(list)
+        thresholds: dict[PositiveLabel, list[ArrayLike]] = defaultdict(list)
         roc_auc: dict[PositiveLabel, list[float]] = defaultdict(list)
 
         if ml_task == "binary-classification":
+            pos_label_validated = cast(PositiveLabel, pos_label_validated)
             for y_true_i, y_pred_i in zip(y_true, y_pred):
-                fpr_i, tpr_i, _ = roc_curve(
+                fpr_i, tpr_i, thresholds_i = roc_curve(
                     y_true_i.y,
                     y_pred_i.y,
                     pos_label=pos_label,
                     drop_intermediate=drop_intermediate,
                 )
                 roc_auc_i = auc(fpr_i, tpr_i)
-                pos_label_validated = cast(PositiveLabel, pos_label_validated)
                 fpr[pos_label_validated].append(fpr_i)
                 tpr[pos_label_validated].append(tpr_i)
+                thresholds[pos_label_validated].append(thresholds_i)
                 roc_auc[pos_label_validated].append(roc_auc_i)
+            if average is not None:
+                if average == "threshold":
+                    average_fpr, average_tpr = cls._threshold_average(
+                        fpr[pos_label_validated],
+                        tpr[pos_label_validated],
+                        thresholds[pos_label_validated],
+                    )
+                else:
+                    raise TypeError(
+                        "'threshold' is the only supported option for `average`,"
+                        f"but got {average} instead"
+                    )
+                average_roc_auc = auc(average_fpr, average_tpr)
+                fpr[pos_label_validated] = [average_fpr]
+                tpr[pos_label_validated] = [average_tpr]
+                roc_auc[pos_label_validated] = [average_roc_auc]
+
         else:  # multiclass-classification
             # OvR fashion to collect fpr, tpr, and roc_auc
             for y_true_i, y_pred_i, est in zip(y_true, y_pred, estimators):
diff --git a/skore/src/skore/sklearn/_plot/utils.py b/skore/src/skore/sklearn/_plot/utils.py
@@ -6,6 +6,7 @@
 import numpy as np
 from matplotlib.axes import Axes
 from matplotlib.colors import Colormap
+from numpy.typing import ArrayLike
 from rich.console import Console
 from rich.panel import Panel
 from rich.tree import Tree
@@ -242,6 +243,45 @@ def _validate_from_predictions_params(
 
         return pos_label
 
+    @staticmethod
+    def _threshold_average(
+        xs: list[ArrayLike], ys: list[ArrayLike], thresholds: list[ArrayLike]
+    ) -> tuple[ArrayLike, ArrayLike]:
+        """Average several curves point-wise over their pooled thresholds.
+
+        Every distinct threshold found in any curve is visited in decreasing
+        order. For each one, every curve contributes the point whose threshold
+        is the largest one not exceeding it (or its lowest-threshold point when
+        there is none), and the contributed coordinates are averaged.
+
+        Parameters
+        ----------
+        xs : list of array-like of shape (n_thresholds,)
+            X coordinates of each curve, aligned with ``thresholds``.
+        ys : list of array-like of shape (n_thresholds,)
+            Y coordinates of each curve, aligned with ``thresholds``.
+        thresholds : list of array-like of shape (n_thresholds,)
+            Thresholds of each curve, sorted in decreasing order.
+
+        Returns
+        -------
+        average_x, average_y : list of float
+            Averaged coordinates, one entry per distinct pooled threshold.
+        """
+        # `np.unique` sorts ascending; flip it to walk thresholds high-to-low.
+        pooled = np.unique(np.concatenate(thresholds))[::-1]
+
+        picked_x, picked_y = [], []
+        for x, y, threshold in zip(xs, ys, thresholds):
+            # Work on ascending copies so `searchsorted` applies; clip at 0 so
+            # targets below every threshold reuse the lowest-threshold point.
+            ascending = np.asarray(threshold)[::-1]
+            idx = np.searchsorted(ascending, pooled, side="right") - 1
+            idx = np.maximum(idx, 0)
+            picked_x.append(np.asarray(x)[::-1][idx])
+            picked_y.append(np.asarray(y)[::-1][idx])
+        return np.mean(picked_x, axis=0).tolist(), np.mean(picked_y, axis=0).tolist()
+
 
 def _despine_matplotlib_axis(
     ax: Axes,
diff --git a/skore/tests/unit/sklearn/cross_validation/test_cross_validation.py b/skore/tests/unit/sklearn/cross_validation/test_cross_validation.py
@@ -319,11 +319,12 @@ def test_cross_validation_report_metrics_data_source_external(
 ########################################################################################
 
 
-def test_cross_validation_report_plot_roc(binary_classification_data):
+@pytest.mark.parametrize("average", [None, "threshold"])
+def test_cross_validation_report_plot_roc(binary_classification_data, average):
     """Check that the ROC plot method works."""
     estimator, X, y = binary_classification_data
     report = CrossValidationReport(estimator, X, y, cv_splitter=2)
-    assert isinstance(report.metrics.roc(), RocCurveDisplay)
+    assert isinstance(report.metrics.roc(average=average), RocCurveDisplay)
 
 
 @pytest.mark.parametrize("display", ["roc", "precision_recall"])
diff --git a/skore/tests/unit/sklearn/test_utils.py b/skore/tests/unit/sklearn/test_utils.py
@@ -1,6 +1,7 @@
 import numpy
 import pandas
 import pytest
+from numpy.testing import assert_array_equal
 from sklearn.cluster import KMeans
 from sklearn.datasets import (
     make_classification,
@@ -10,6 +11,7 @@
 from sklearn.dummy import DummyClassifier, DummyRegressor
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.multioutput import MultiOutputClassifier
+from skore.sklearn._plot.utils import _ClassifierCurveDisplayMixin
 from skore.sklearn.find_ml_task import _find_ml_task
 
 
@@ -118,3 +120,13 @@ def test_find_ml_task_pandas():
 
 def test_find_ml_task_string():
     assert _find_ml_task(["0", "1", "2"], None) == "multiclass-classification"
+
+
+def test_threshold_average():
+    """Check the averaged curve when folds have different thresholds."""
+    xs = [numpy.array([3, 2, 1]), numpy.array([3, 2, 1])]
+    ys = [numpy.array([30, 20, 10]), numpy.array([30, 20, 10])]
+    thresholds = [numpy.array([4, 3, 1]), numpy.array([5, 3, 2])]
+    x, y = _ClassifierCurveDisplayMixin._threshold_average(xs, ys, thresholds)
+    assert_array_equal(x, [3, 2.5, 2, 1, 1])
+    assert_array_equal(y, [30, 25, 20, 10, 10])