Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
153 commits
Select commit Hold shift + click to select a range
096e3e5
create CrossValidationComparisonReport
auguste-probabl Mar 24, 2025
08f78ef
rename estimator_reports_ to reports_
auguste-probabl Mar 26, 2025
c2d3bf7
Fix docstring examples
auguste-probabl Mar 26, 2025
12b0840
wip
auguste-probabl Mar 26, 2025
b917d6f
wip
auguste-probabl Mar 26, 2025
c9f4282
wip
auguste-probabl Mar 28, 2025
2ec229b
fix mypy, remove X and y from args
auguste-probabl Apr 1, 2025
4242a25
add test
auguste-probabl Apr 1, 2025
48e374a
fix imports in doctests
auguste-probabl Apr 1, 2025
175d1fa
turn off progress bar for internal CVReports
auguste-probabl Apr 1, 2025
d796300
fix doctests
auguste-probabl Apr 1, 2025
651898d
fix doctests
auguste-probabl Apr 2, 2025
5f42cfb
remove whitespace
auguste-probabl Apr 2, 2025
a1de5d0
remove X and y from metrics
auguste-probabl Apr 3, 2025
e6367c3
make default aggregate param ('mean', 'std')
auguste-probabl Apr 3, 2025
adf1461
fix doctest
auguste-probabl Apr 3, 2025
0f41c63
clean
auguste-probabl Apr 3, 2025
f26c7da
Add aggregate param to all metrics
auguste-probabl Apr 3, 2025
106a741
Replace models by Dummies in doctests, for faster testing
auguste-probabl Apr 3, 2025
08c1546
Factorize report in fixtures
auguste-probabl Apr 3, 2025
0b589a7
replace "random_state" with "seed"
auguste-probabl Apr 3, 2025
9653030
add timings
auguste-probabl Apr 3, 2025
f6798b8
Fix doctest
auguste-probabl Apr 4, 2025
4f04a13
Deduplicate report names
auguste-probabl Apr 4, 2025
aad6e12
Remove all doctests results in metrics_accessors
auguste-probabl Apr 4, 2025
59062ad
stop using `m1`, `m2`... as model names
auguste-probabl Apr 4, 2025
0fd33e3
rename
auguste-probabl Apr 4, 2025
f18e784
add `class_sep` in tests
auguste-probabl Apr 4, 2025
6f1d743
add test for regression
auguste-probabl Apr 4, 2025
ca89bc2
Add test
auguste-probabl Apr 4, 2025
135fae9
Add test
auguste-probabl Apr 4, 2025
ef74c24
make favorability work
auguste-probabl Apr 4, 2025
10a4477
replace custom 'timings' with report_metrics
auguste-probabl Apr 4, 2025
4a86bb4
add tests for each metric
auguste-probabl Apr 4, 2025
d23af94
move and rename tests
auguste-probabl Apr 4, 2025
b8fa36d
add docstrings
auguste-probabl Apr 4, 2025
dff1404
fix `timings` docstring
auguste-probabl Apr 4, 2025
327b1be
add test for `custom_metric`
auguste-probabl Apr 4, 2025
df819ef
test report itself
auguste-probabl Apr 4, 2025
1506d42
Remove plots implementation
auguste-probabl Apr 7, 2025
a04de43
test: fix tests
auguste-probabl Apr 7, 2025
df07fd7
avoid warnings about precision by changing DummyClassifier strategy
auguste-probabl Apr 7, 2025
18d33b3
simplify tests
auguste-probabl Apr 7, 2025
e50cac8
Back out "replace custom 'timings' with report_metrics"
auguste-probabl Apr 7, 2025
3d773e3
distinguish timing tests
auguste-probabl Apr 7, 2025
3c30987
Fix progress bar description
auguste-probabl Apr 7, 2025
878a149
ComparisonReport: Rename estimator_reports_ to reports_
auguste-probabl Apr 9, 2025
c5f9145
put validation logic in a static method
auguste-probabl Apr 9, 2025
28d3aaf
Put static methods before init
auguste-probabl Apr 9, 2025
13a3bef
ComparisonReport: put validation logic in static method
auguste-probabl Apr 9, 2025
e856fb8
make nested progress bar work
auguste-probabl Apr 9, 2025
3a018b9
Make it impossible to pass the same CVReport several times
auguste-probabl Apr 9, 2025
a179284
fix doctest
auguste-probabl Apr 9, 2025
5cf0c3b
Fix import
auguste-probabl Apr 10, 2025
3a56636
fix doctest
auguste-probabl Apr 9, 2025
e8e399e
move cross validation validation logic to ComparisonReport
auguste-probabl Apr 10, 2025
021f519
rename function
auguste-probabl Apr 10, 2025
c8286f6
pull out logic from _validate functions
auguste-probabl Apr 10, 2025
822d09b
add code path where reports_ are CrossValidationReports
auguste-probabl Apr 10, 2025
79f6571
inline ml task checking
auguste-probabl Apr 10, 2025
bb756d9
inline report_names deduplication
auguste-probabl Apr 10, 2025
458dc72
inline duplicates check
auguste-probabl Apr 10, 2025
c8a231e
inline _validate_* completely
auguste-probabl Apr 10, 2025
8272d23
encapsulate whole validation logic into static method
auguste-probabl Apr 10, 2025
b3e7fff
remove comments
auguste-probabl Apr 10, 2025
b6b6c8b
Add aggregate argument to ComparisonReport._MetricsAccessor
auguste-probabl Apr 10, 2025
aa2cc2e
Rename variable
auguste-probabl Apr 10, 2025
a48f4fc
Sync all Progresses
auguste-probabl Apr 10, 2025
ffbbff0
refactor results post-processing to static method
auguste-probabl Apr 10, 2025
0b811da
Copy-paste _combine_cross_validation_results
auguste-probabl Apr 10, 2025
39f18d4
introduce _reports_type private attribute
auguste-probabl Apr 10, 2025
7ea3276
combine results differently depending on compared reports type
auguste-probabl Apr 10, 2025
679e6f6
pass different kwargs depending on compared reports type
auguste-probabl Apr 10, 2025
ad5eff4
compute timings differently depending on compared reports type
auguste-probabl Apr 10, 2025
93fba0c
replace CrossValidationComparisonReport with ComparisonReport
auguste-probabl Apr 10, 2025
de94b97
rename dir
auguste-probabl Apr 10, 2025
6b8d537
move ComparisonReport of EstimatorReports tests to own directory
auguste-probabl Apr 10, 2025
c08d278
raise NotImplementedError for ComparisonReport[CVReport]
auguste-probabl Apr 10, 2025
c00bfda
showcase ComparisonReport[CrossValidation] in doctest
auguste-probabl Apr 10, 2025
635332b
remove CrossValidationComparison
auguste-probabl Apr 10, 2025
e5726aa
move _reports_type initialization to _validate_reports
auguste-probabl Apr 10, 2025
cb73925
fmt
auguste-probabl Apr 11, 2025
17d2357
mypy
auguste-probabl Apr 11, 2025
c3cb78b
refactor tests
auguste-probabl Apr 11, 2025
ce985e9
refactor tests
auguste-probabl Apr 11, 2025
560f738
properly catch passing `X_y` to ComparisonReport[CVReport].get_predic…
auguste-probabl Apr 11, 2025
14aa994
properly catch passing `X_y` to ComparisonReport[CVReport].metrics
auguste-probabl Apr 11, 2025
f211e50
fix
auguste-probabl Apr 11, 2025
c904f97
make sure that passing `aggregate` to ComparisonReport[EReport] works
auguste-probabl Apr 11, 2025
1338319
move reports_type initialization inside if block
auguste-probabl Apr 11, 2025
28835cb
remove dead code
auguste-probabl Apr 11, 2025
6865c74
increase coverage
auguste-probabl Apr 11, 2025
64286e9
remove useless type check
auguste-probabl Apr 11, 2025
df5917f
test that aggregate is used in cache
auguste-probabl Apr 11, 2025
af5ed29
make doctest more copy-pasteable
auguste-probabl Apr 11, 2025
d81c8a4
add reproducer for pickling issue
auguste-probabl Apr 14, 2025
573f247
add non-regression for pickling issue for cvreports just to be sure
auguste-probabl Apr 14, 2025
6e624fe
fix bug
auguste-probabl Apr 14, 2025
5b37f44
change the format of report_metrics to be more legible
auguste-probabl Apr 11, 2025
fe6b665
Update skore/src/skore/sklearn/_comparison/report.py
auguste-probabl Apr 18, 2025
8c82921
fix
auguste-probabl Apr 18, 2025
88a29e4
update docstring
auguste-probabl Apr 18, 2025
d23df29
update docstrings
auguste-probabl Apr 18, 2025
c234727
change impl of deduplication
auguste-probabl Apr 18, 2025
ce2d793
add docstring
auguste-probabl Apr 18, 2025
1ffebf7
update error message
auguste-probabl Apr 18, 2025
0c9dbd6
change error message
auguste-probabl Apr 18, 2025
bbe115f
add fixme comment
auguste-probabl Apr 18, 2025
d40adaf
test joblib roundtrip
auguste-probabl Apr 18, 2025
a33bb72
add failing test for checking that `aggregate` is used
auguste-probabl Apr 18, 2025
46eca9a
fix: pass `aggregate` down
auguste-probabl Apr 18, 2025
d7c5d72
refine docstrings
auguste-probabl Apr 18, 2025
a680c93
make all setups as identical as possible
auguste-probabl Apr 18, 2025
2aace72
create report fixture
auguste-probabl Apr 18, 2025
1e8a0dd
create report_classification fixture
auguste-probabl Apr 18, 2025
e53e0db
create report_regression fixture
auguste-probabl Apr 18, 2025
0ea12c6
create estimator_reports fixture
auguste-probabl Apr 18, 2025
abbf420
remove copy()
auguste-probabl Apr 18, 2025
3594b04
remove copy
auguste-probabl Apr 18, 2025
887ebbf
reshape metrics df when aggregate is not None
auguste-probabl Apr 18, 2025
ecc77d8
Update skore/src/skore/sklearn/_comparison/metrics_accessor.py
auguste-probabl Apr 22, 2025
683ffa9
merge main
auguste-probabl Apr 22, 2025
df117f1
Add X to ComparisonReport.get_predictions
auguste-probabl Apr 22, 2025
4ab4b3a
correct test_cache
auguste-probabl Apr 22, 2025
fc280f7
remove useless variables
auguste-probabl Apr 22, 2025
594bdc7
Move staticmethod to function
auguste-probabl Apr 22, 2025
11854fb
Stop casting dict keys to strings, raise instead
auguste-probabl Apr 22, 2025
7de215d
move _combine_* functions to their own utils module
auguste-probabl Apr 22, 2025
610c4dc
lint
auguste-probabl Apr 22, 2025
8232d08
fix sphinx reference
auguste-probabl Apr 22, 2025
6d10e64
complete docstring
auguste-probabl Apr 22, 2025
9d9b906
add docstring
auguste-probabl Apr 22, 2025
8194931
Add examples to combine_* functions
auguste-probabl Apr 22, 2025
70c0683
add docstring
auguste-probabl Apr 22, 2025
59ecfac
remove comments
auguste-probabl Apr 22, 2025
30ca76a
Rename argument
auguste-probabl Apr 23, 2025
08c77c0
rename function
auguste-probabl Apr 23, 2025
0270707
Remove intermediate variables
auguste-probabl Apr 23, 2025
617b680
apply suggestion
auguste-probabl Apr 23, 2025
8553574
remove copy
auguste-probabl Apr 23, 2025
0a2259f
refactor logic to function
auguste-probabl Apr 23, 2025
98f755b
Use CategoricalDType to get metric order
auguste-probabl Apr 23, 2025
3d8b772
lint
auguste-probabl Apr 23, 2025
9ce3298
Make ComparisonReport work for X_y
auguste-probabl Apr 23, 2025
4f8e229
refactor
auguste-probabl Apr 23, 2025
8532e06
move test utils to their own module
auguste-probabl Apr 22, 2025
aee5fd9
fix test_cache
auguste-probabl Apr 22, 2025
c24d953
fix mypy
auguste-probabl Apr 23, 2025
56b81fc
refactor common tests
auguste-probabl Apr 23, 2025
b58e2a8
move cross-validation tests
auguste-probabl Apr 23, 2025
5ad3ec6
fix typing for python 3.9
auguste-probabl Apr 23, 2025
8170b02
merge main
auguste-probabl Apr 29, 2025
19b6926
reapply diff from main
auguste-probabl Apr 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 200 additions & 75 deletions skore/src/skore/sklearn/_comparison/metrics_accessor.py

Large diffs are not rendered by default.

258 changes: 200 additions & 58 deletions skore/src/skore/sklearn/_comparison/report.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
from __future__ import annotations

import time
from collections import Counter
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast

import joblib
import numpy as np
from numpy.typing import ArrayLike

from skore.externals._pandas_accessors import DirNamesMixin
from skore.sklearn._base import _BaseReport
from skore.sklearn._cross_validation.report import CrossValidationReport
from skore.sklearn._estimator.report import EstimatorReport
from skore.utils._progress_bar import progress_decorator

if TYPE_CHECKING:
from skore.sklearn._estimator.metrics_accessor import _MetricsAccessor

ReportType = Literal["EstimatorReport", "CrossValidationReport"]


class ComparisonReport(_BaseReport, DirNamesMixin):
"""Report for comparison of instances of :class:`skore.EstimatorReport`.
"""Report for comparing reports.

This object can be used to compare several :class:`skore.EstimatorReport`s, or
several :class:`~skore.CrossValidationReport`s.

.. caution:: Reports passed to `ComparisonReport` are not copied. If you pass
a report to `ComparisonReport`, and then modify the report outside later, it
Expand All @@ -28,13 +35,9 @@ class ComparisonReport(_BaseReport, DirNamesMixin):

Parameters
----------
reports : list of :class:`~skore.EstimatorReport` instances or dict
Estimator reports to compare.

* If `reports` is a list, the class name of each estimator is used.
* If `reports` is a dict, it is expected to have estimator names as keys
and :class:`~skore.EstimatorReport` instances as values.
If the keys are not strings, they will be converted to strings.
reports : list of reports or dict
Reports to compare. If a dict, keys will be used to label the estimators;
if a list, the labels are computed from the estimator class names.

n_jobs : int, default=None
Number of jobs to run in parallel. Training the estimators and computing
Expand All @@ -46,11 +49,14 @@ class ComparisonReport(_BaseReport, DirNamesMixin):

Attributes
----------
estimator_reports_ : list of :class:`~skore.EstimatorReport`
The compared estimator reports.
reports_ : list of :class:`~skore.EstimatorReport` or list of
:class:`~skore.CrossValidationReport`
The compared reports.

report_names_ : list of str
The names of the compared estimator reports.
The names of the compared estimators. If the names are not customized (i.e. the
class names are used), a de-duplication process is used to make sure that the
names are distinct.

See Also
--------
Expand Down Expand Up @@ -85,80 +91,168 @@ class ComparisonReport(_BaseReport, DirNamesMixin):
... y_test=y_test
... )
>>> report = ComparisonReport([estimator_report_1, estimator_report_2])
>>> report.report_names_
['LogisticRegression_1', 'LogisticRegression_2']
>>> report = ComparisonReport(
... {"model1": estimator_report_1, "model2": estimator_report_2}
... )
>>> report.report_names_
['model1', 'model2']

>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from skore import ComparisonReport, CrossValidationReport
>>> X, y = make_classification(random_state=42)
>>> estimator_1 = LogisticRegression()
>>> estimator_2 = LogisticRegression(C=2) # Different regularization
>>> report_1 = CrossValidationReport(estimator_1, X, y)
>>> report_2 = CrossValidationReport(estimator_2, X, y)
>>> report = ComparisonReport([report_1, report_2])
>>> report = ComparisonReport({"model1": report_1, "model2": report_2})
Comment on lines +101 to +111
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only docstring where we showcase ComparisonReport[CVReport]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As docstring it is fine. I will check in which example it is natural to use it.

"""

_ACCESSOR_CONFIG: dict[str, dict[str, str]] = {
"metrics": {"name": "metrics"},
}
metrics: _MetricsAccessor

def __init__(
self,
reports: Union[list[EstimatorReport], dict[str, EstimatorReport]],
*,
n_jobs: Optional[int] = None,
) -> None:
"""
ComparisonReport instance initializer.
_reports_type: ReportType

@staticmethod
def _validate_reports(
reports: Union[
list[EstimatorReport],
dict[str, EstimatorReport],
list[CrossValidationReport],
dict[str, CrossValidationReport],
],
) -> tuple[
Union[list[EstimatorReport], list[CrossValidationReport]],
list[str],
ReportType,
]:
"""Validate that reports are in the right format for comparison.

Notes
-----
We check that the estimator reports can be compared:
- all reports are estimator reports,
- all estimators are in the same ML use case,
- all estimators have non-empty X_test and y_test,
- all estimators have the same X_test and y_test.
Parameters
----------
reports : list of reports or dict
The reports to be validated.

Returns
-------
list of EstimatorReport or list of CrossValidationReport
The validated reports.
list of str
The report names, either taken from dict keys or computed from the estimator
class names.
{"EstimatorReport", "CrossValidationReport"}
The inferred type of the reports that will be compared.
"""
if not isinstance(reports, Iterable):
raise TypeError(f"Expected reports to be an iterable; got {type(reports)}")
raise TypeError(
f"Expected reports to be a list or dict; got {type(reports)}"
)

if len(reports) < 2:
raise ValueError("At least 2 instances of EstimatorReport are needed")
raise ValueError(
f"Expected at least 2 reports to compare; got {len(reports)}"
)

report_names = (
list(map(str, reports.keys())) if isinstance(reports, dict) else None
)
reports = list(reports.values()) if isinstance(reports, dict) else reports
if isinstance(reports, list):
report_names = None
reports_list = reports
else: # dict
report_names = list(reports.keys())
for key in report_names:
if not isinstance(key, str):
raise TypeError(
f"Expected all report names to be strings; got {type(key)}"
)
reports_list = cast(
Union[list[EstimatorReport], list[CrossValidationReport]],
list(reports.values()),
)

if not all(isinstance(report, EstimatorReport) for report in reports):
raise TypeError("Expected instances of EstimatorReport")
reports_type: ReportType
if all(isinstance(report, EstimatorReport) for report in reports_list):
reports_list = cast(list[EstimatorReport], reports_list)
reports_type = "EstimatorReport"

# FIXME: We should only check y_test since it is all we need to tell us
# whether we have a distinct ML task at hand.
test_dataset_hashes = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a

FIXME: We should only check y_test, since it is the variable that tells us whether
we have a distinct ML task at hand.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same, this constraint of "same test data" is not completely necessary to me

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you want to just do the change?

Suggested change
test_dataset_hashes = {
test_dataset_hashes = {
joblib.hash(report.y_test) for report in reports_list
if report.y_test is not None
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added comment

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes exactly. However, I would potentially make it in a subsequent PR for sure.

joblib.hash((report.X_test, report.y_test))
for report in reports_list
if not ((report.X_test is None) and (report.y_test is None))
}
if len(test_dataset_hashes) > 1:
raise ValueError(
"Expected all estimators to have the same testing data."
)

elif all(isinstance(report, CrossValidationReport) for report in reports_list):
reports_list = cast(list[CrossValidationReport], reports_list)
reports_type = "CrossValidationReport"
else:
raise TypeError(
f"Expected list or dict of {EstimatorReport.__name__} "
f"or list or dict of {CrossValidationReport.__name__}"
)

test_dataset_hashes = {
joblib.hash((report.X_test, report.y_test))
for report in reports
if not ((report.X_test is None) and (report.y_test is None))
}
if len(test_dataset_hashes) > 1:
raise ValueError("Expected all estimators to have the same testing data.")
if len(set(id(report) for report in reports_list)) < len(reports_list):
raise ValueError("Expected reports to be distinct objects")
Comment on lines +202 to +203
Copy link
Contributor Author

@auguste-probabl auguste-probabl Apr 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new constraint results from #1536; indeed, if the same report is passed twice, when we reset the progress bar at the end of the first CVReport computation (see below), when the second CVReport computation starts, the progress object is set to None, whereas it should be set to the ComparisonReport's progress object.

self_obj._parent_progress = None
self_obj._progress_info = None

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that right now this constraint is only really useful when comparing CVReports, not when comparing EstimatorReports. I didn't put it in the if-block because I can imagine that one day EstimatorReports might have more progress bars of their own.


ml_tasks = {report: report._ml_task for report in reports}
ml_tasks = {report: report._ml_task for report in reports_list}
if len(set(ml_tasks.values())) > 1:
raise ValueError(
f"Expected all estimators to have the same ML usecase; got {ml_tasks}"
)

if report_names is None:
self.report_names_ = [report.estimator_name_ for report in reports]
deduped_report_names = _deduplicate_report_names(
[report.estimator_name_ for report in reports_list]
)
else:
self.report_names_ = report_names
deduped_report_names = report_names

return reports_list, deduped_report_names, reports_type

self.estimator_reports_ = reports
def __init__(
self,
reports: Union[
list[EstimatorReport],
dict[str, EstimatorReport],
list[CrossValidationReport],
dict[str, CrossValidationReport],
],
*,
n_jobs: Optional[int] = None,
) -> None:
"""
ComparisonReport instance initializer.

Notes
-----
We check that the estimator reports can be compared:
- all reports are estimator reports,
- all estimators are in the same ML use case,
- all estimators have non-empty X_test and y_test,
- all estimators have the same X_test and y_test.
"""
self.reports_, self.report_names_, self._reports_type = (
ComparisonReport._validate_reports(reports)
)

# used to know if a parent launches a progress bar manager
self._progress_info: Optional[dict[str, Any]] = None
self._parent_progress = None

# NEEDED FOR METRICS ACCESSOR
self.n_jobs = n_jobs
self._rng = np.random.default_rng(time.time_ns())
self._hash = self._rng.integers(
low=np.iinfo(np.int64).min, high=np.iinfo(np.int64).max
)
self._cache: dict[tuple[Any, ...], Any] = {}
self._ml_task = self.estimator_reports_[0]._ml_task
self._ml_task = self.reports_[0]._ml_task

def clear_cache(self) -> None:
"""Clear the cache.
Expand Down Expand Up @@ -193,7 +287,7 @@ def clear_cache(self) -> None:
>>> report._cache
{}
"""
for report in self.estimator_reports_:
for report in self.reports_:
report.clear_cache()
self._cache = {}

Expand Down Expand Up @@ -222,7 +316,7 @@ def cache_predictions(
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import train_test_split
>>> from skore import ComparisonReport
>>> from skore import ComparisonReport, EstimatorReport
>>> X, y = make_classification(random_state=42)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
>>> estimator_1 = LogisticRegression()
Expand Down Expand Up @@ -255,22 +349,21 @@ def cache_predictions(
progress = self._progress_info["current_progress"]
main_task = self._progress_info["current_task"]

total_estimators = len(self.estimator_reports_)
total_estimators = len(self.reports_)
progress.update(main_task, total=total_estimators)

for estimator_report in self.estimator_reports_:
for report in self.reports_:
# Pass the progress manager to child tasks
estimator_report._parent_progress = progress
estimator_report.cache_predictions(
response_methods=response_methods, n_jobs=n_jobs
)
report._parent_progress = progress
report.cache_predictions(response_methods=response_methods, n_jobs=n_jobs)
progress.update(main_task, advance=1, refresh=True)

def get_predictions(
self,
*,
data_source: Literal["train", "test", "X_y"],
response_method: Literal["predict", "predict_proba", "decision_function"],
X: Optional[ArrayLike] = None,
pos_label: Optional[Any] = None,
) -> ArrayLike:
"""Get estimator's predictions.
Expand All @@ -290,6 +383,10 @@ def get_predictions(
response_method : {"predict", "predict_proba", "decision_function"}
The response method to use.

X : array-like of shape (n_samples, n_features), optional
When `data_source` is "X_y", the input features on which to compute the
response method.

pos_label : int, float, bool or str, default=None
The positive class when it comes to binary classification. When
`response_method="predict_proba"`, it will select the column corresponding
Expand Down Expand Up @@ -343,9 +440,10 @@ def get_predictions(
report.get_predictions(
data_source=data_source,
response_method=response_method,
X=X,
pos_label=pos_label,
)
for report in self.estimator_reports_
for report in self.reports_
]

####################################################################################
Expand All @@ -363,3 +461,47 @@ def _get_help_legend(self) -> str:
def __repr__(self) -> str:
"""Return a string representation."""
return f"{self.__class__.__name__}(...)"


def _deduplicate_report_names(report_names: list[str]) -> list[str]:
"""De-duplicate report names that appear several times.

Leave the other report names alone.

Parameters
----------
report_names : list of str
The list of report names to be checked.

Returns
-------
list of str
The de-duplicated list of report names.

Examples
--------
>>> _deduplicate_report_names(['a', 'b'])
['a', 'b']
>>> _deduplicate_report_names(['a', 'a'])
['a_1', 'a_2']
>>> _deduplicate_report_names(['a', 'b', 'a'])
['a_1', 'b', 'a_2']
>>> _deduplicate_report_names(['a', 'b', 'a', 'b'])
['a_1', 'b_1', 'a_2', 'b_2']
>>> _deduplicate_report_names([])
[]
>>> _deduplicate_report_names(['a'])
['a']
"""
counts = Counter(report_names)
if len(report_names) == len(counts):
return report_names

names = report_names.copy()
seen: Counter = Counter()
for i in range(len(names)):
name = names[i]
seen[name] += 1
if counts[name] > 1:
names[i] = f"{name}_{seen[name]}"
return names
Loading