@@ -36,7 +36,7 @@ class Benchmarking:
3636 Attributes
3737 ----------
3838 results : pd.DataFrame
39- DataFrame produced by :meth:`run`.
39+ Summary DataFrame produced by :meth:`run`.
4040
4141 - Index: pandas.MultiIndex with two levels (names shown in parentheses)
4242 - level 0 "estimator": estimator name
@@ -46,6 +46,18 @@ class Benchmarking:
4646 - "train" = mean of cross_validate(...)[f"train_{metric}"]
4747 - "test" = mean of cross_validate(...)[f"test_{metric}"]
4848
49+ raw_results_ : pd.DataFrame or None
50+ Per-fold scores populated by every call to :meth:`run` (``None`` until the first run); additionally returned by :meth:`run` when ``return_raw=True``.
51+
52+ - Index: pandas.MultiIndex with three levels
53+ - level 0 "estimator": estimator name
54+ - level 1 "metric": evaluator name
55+ - level 2 "fold": fold index (0-based)
56+ - Columns: ["train", "test"] (both floats)
57+ - Cell values: raw per-fold scores, directly compatible with
58+ ``sktime``'s ``Evaluator`` for Friedman tests and Critical
59+ Difference diagrams.
60+
4961 Example
5062 -------
5163 >>> import numpy as np
@@ -81,6 +93,7 @@ def __init__(self, estimators, metrics, X, y, cv=None, labels=None):
8193 self .cv = cv
8294 self .labels = labels
8395 self .results = None
96+ self .raw_results_ = None
8497
8598 def _to_scorers (self , metrics ):
8699 """Convert metric callables to a dict of scorers."""
@@ -97,7 +110,7 @@ def _to_scorers(self, metrics):
97110 return scorers
98111
99112 def _to_df (self , results ):
100- """Convert nested results to a unified DataFrame."""
113+ """Convert nested mean results to a summary DataFrame."""
101114 records = []
102115 index = []
103116
@@ -109,25 +122,60 @@ def _to_df(self, results):
109122 index = pd .MultiIndex .from_tuples (index , names = ["estimator" , "metric" ])
110123 return pd .DataFrame (records , index = index , columns = ["train" , "test" ])
111124
112- def run (self ):
125+ def _to_raw_df (self , raw_results ):
126+ """Convert nested per-fold results to a raw DataFrame.
127+
128+ The resulting DataFrame is directly compatible with ``sktime``'s
129+ ``Evaluator`` class for Friedman tests and Critical Difference diagrams.
130+ """
131+ records = []
132+ index = []
133+
134+ for est_name , est_scores in raw_results .items ():
135+ for metric_name , fold_scores in est_scores .items ():
136+ for fold_idx , (train_score , test_score ) in enumerate (
137+ zip (fold_scores ["train" ], fold_scores ["test" ])
138+ ):
139+ records .append ({"train" : train_score , "test" : test_score })
140+ index .append ((est_name , metric_name , fold_idx ))
141+
142+ index = pd .MultiIndex .from_tuples (
143+ index , names = ["estimator" , "metric" , "fold" ]
144+ )
145+ return pd .DataFrame (records , index = index , columns = ["train" , "test" ])
146+
147+ def run (self , return_raw = False ):
113148 """
114149 Train each estimator and evaluate with cross-validation.
115150
151+ Parameters
152+ ----------
153+ return_raw : bool, default=False
154+ If ``False`` (default), returns only a summary DataFrame with
155+ mean scores across folds.
156+
157+ If ``True``, returns a tuple ``(summary, raw)`` where ``raw`` is
158+ a per-fold DataFrame with a three-level MultiIndex
159+ ``(estimator, metric, fold)``. The ``raw`` DataFrame is directly
160+ compatible with ``sktime``'s ``Evaluator`` class for Friedman
161+ tests and Critical Difference diagrams.
162+
116163 Returns
117164 -------
118165 results : pd.DataFrame
166+ Summary DataFrame with mean scores.
119167
120- - Index: pandas.MultiIndex with two levels (names shown in parentheses)
121- - level 0 "estimator": estimator name
122- - level 1 "metric": evaluator name
123- - Columns: ["train", "test"] (both floats)
124- - Cell values: mean scores (float) computed across CV folds:
125- - "train" = mean of cross_validate(...)[f"train_{metric}"]
126- - "test" = mean of cross_validate(...)[f"test_{metric}"]
168+ - Index: pandas.MultiIndex ``(estimator, metric)``
169+ - Columns: ["train", "test"] (floats)
127170
171+ (results, raw_results) : tuple[pd.DataFrame, pd.DataFrame]
172+ Returned only when ``return_raw=True``. ``raw_results`` has a
173+ three-level MultiIndex ``(estimator, metric, fold)`` and contains
174+ the raw per-fold scores.
128175 """
129176 self .scorers_ = self ._to_scorers (self .metrics )
130177 results = {}
178+ raw_results = {}
131179
132180 if self .labels is not None :
133181 if len (self .labels ) != len (self .estimators ):
@@ -138,7 +186,7 @@ def run(self):
138186 for est in self .estimators :
139187 name = est .__class__ .__name__
140188 counts [name ] = counts .get (name , 0 ) + 1
141-
189+
142190 names = []
143191 seen = {}
144192 for est in self .estimators :
@@ -160,15 +208,28 @@ def run(self):
160208 return_train_score = True ,
161209 )
162210
163- # average across folds
211+ # mean scores across folds (summary)
164212 est_scores = {}
213+ # raw per-fold scores (for sktime Evaluator compatibility)
214+ est_raw_scores = {}
165215 for metric in self .scorers_ .keys ():
216+ train_folds = cv_results [f"train_{ metric } " ]
217+ test_folds = cv_results [f"test_{ metric } " ]
166218 est_scores [metric ] = {
167- "train" : float (np .mean (cv_results [f"train_{ metric } " ])),
168- "test" : float (np .mean (cv_results [f"test_{ metric } " ])),
219+ "train" : float (np .mean (train_folds )),
220+ "test" : float (np .mean (test_folds )),
221+ }
222+ est_raw_scores [metric ] = {
223+ "train" : train_folds .tolist (),
224+ "test" : test_folds .tolist (),
169225 }
170226
171227 results [est_name ] = est_scores
228+ raw_results [est_name ] = est_raw_scores
172229
173230 self .results = self ._to_df (results )
231+ self .raw_results_ = self ._to_raw_df (raw_results )
232+
233+ if return_raw :
234+ return self .results , self .raw_results_
174235 return self .results
0 commit comments