Skip to content

Commit e4b7644

Browse files
Pranav ChoudharyPranav Choudhary
authored and committed
[ENH] Add return_raw parameter to Benchmarking.run() for sktime Evaluator compatibility (#125)
1 parent 34ebd1d commit e4b7644

1 file changed

Lines changed: 75 additions & 14 deletions

File tree

pyaptamer/benchmarking/_base.py

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class Benchmarking:
3636
Attributes
3737
----------
3838
results : pd.DataFrame
39-
DataFrame produced by :meth:`run`.
39+
Summary DataFrame produced by :meth:`run`.
4040
4141
- Index: pandas.MultiIndex with two levels (names shown in parentheses)
4242
- level 0 "estimator": estimator name
@@ -46,6 +46,18 @@ class Benchmarking:
4646
- "train" = mean of cross_validate(...)[f"train_{metric}"]
4747
- "test" = mean of cross_validate(...)[f"test_{metric}"]
4848
49+
raw_results_ : pd.DataFrame or None
50+
Per-fold scores populated by every call to :meth:`run` (``None`` until then); also returned by :meth:`run` when ``return_raw=True``.
51+
52+
- Index: pandas.MultiIndex with three levels
53+
- level 0 "estimator": estimator name
54+
- level 1 "metric": evaluator name
55+
- level 2 "fold": fold index (0-based)
56+
- Columns: ["train", "test"] (both floats)
57+
- Cell values: raw per-fold scores, directly compatible with
58+
``sktime``'s ``Evaluator`` for Friedman tests and Critical
59+
Difference diagrams.
60+
4961
Example
5062
-------
5163
>>> import numpy as np
@@ -81,6 +93,7 @@ def __init__(self, estimators, metrics, X, y, cv=None, labels=None):
8193
self.cv = cv
8294
self.labels = labels
8395
self.results = None
96+
self.raw_results_ = None
8497

8598
def _to_scorers(self, metrics):
8699
"""Convert metric callables to a dict of scorers."""
@@ -97,7 +110,7 @@ def _to_scorers(self, metrics):
97110
return scorers
98111

99112
def _to_df(self, results):
100-
"""Convert nested results to a unified DataFrame."""
113+
"""Convert nested mean results to a summary DataFrame."""
101114
records = []
102115
index = []
103116

@@ -109,25 +122,60 @@ def _to_df(self, results):
109122
index = pd.MultiIndex.from_tuples(index, names=["estimator", "metric"])
110123
return pd.DataFrame(records, index=index, columns=["train", "test"])
111124

112-
def run(self):
125+
def _to_raw_df(self, raw_results):
126+
"""Convert nested per-fold results to a raw DataFrame.
127+
128+
The resulting DataFrame is directly compatible with ``sktime``'s
129+
``Evaluator`` class for Friedman tests and Critical Difference diagrams.
130+
"""
131+
records = []
132+
index = []
133+
134+
for est_name, est_scores in raw_results.items():
135+
for metric_name, fold_scores in est_scores.items():
136+
for fold_idx, (train_score, test_score) in enumerate(
137+
zip(fold_scores["train"], fold_scores["test"])
138+
):
139+
records.append({"train": train_score, "test": test_score})
140+
index.append((est_name, metric_name, fold_idx))
141+
142+
index = pd.MultiIndex.from_tuples(
143+
index, names=["estimator", "metric", "fold"]
144+
)
145+
return pd.DataFrame(records, index=index, columns=["train", "test"])
146+
147+
def run(self, return_raw=False):
113148
"""
114149
Train each estimator and evaluate with cross-validation.
115150
151+
Parameters
152+
----------
153+
return_raw : bool, default=False
154+
If ``False`` (default), returns only a summary DataFrame with
155+
mean scores across folds.
156+
157+
If ``True``, returns a tuple ``(summary, raw)`` where ``raw`` is
158+
a per-fold DataFrame with a three-level MultiIndex
159+
``(estimator, metric, fold)``. The ``raw`` DataFrame is directly
160+
compatible with ``sktime``'s ``Evaluator`` class for Friedman
161+
tests and Critical Difference diagrams.
162+
116163
Returns
117164
-------
118165
results : pd.DataFrame
166+
Summary DataFrame with mean scores across CV folds; returned on its own when ``return_raw=False``.
119167
120-
- Index: pandas.MultiIndex with two levels (names shown in parentheses)
121-
- level 0 "estimator": estimator name
122-
- level 1 "metric": evaluator name
123-
- Columns: ["train", "test"] (both floats)
124-
- Cell values: mean scores (float) computed across CV folds:
125-
- "train" = mean of cross_validate(...)[f"train_{metric}"]
126-
- "test" = mean of cross_validate(...)[f"test_{metric}"]
168+
- Index: pandas.MultiIndex ``(estimator, metric)``
169+
- Columns: ["train", "test"] (floats)
127170
171+
(results, raw_results) : tuple[pd.DataFrame, pd.DataFrame]
172+
Returned only when ``return_raw=True``. ``raw_results`` has a
173+
three-level MultiIndex ``(estimator, metric, fold)`` and contains
174+
the raw per-fold scores.
128175
"""
129176
self.scorers_ = self._to_scorers(self.metrics)
130177
results = {}
178+
raw_results = {}
131179

132180
if self.labels is not None:
133181
if len(self.labels) != len(self.estimators):
@@ -138,7 +186,7 @@ def run(self):
138186
for est in self.estimators:
139187
name = est.__class__.__name__
140188
counts[name] = counts.get(name, 0) + 1
141-
189+
142190
names = []
143191
seen = {}
144192
for est in self.estimators:
@@ -160,15 +208,28 @@ def run(self):
160208
return_train_score=True,
161209
)
162210

163-
# average across folds
211+
# mean scores across folds (summary)
164212
est_scores = {}
213+
# raw per-fold scores (for sktime Evaluator compatibility)
214+
est_raw_scores = {}
165215
for metric in self.scorers_.keys():
216+
train_folds = cv_results[f"train_{metric}"]
217+
test_folds = cv_results[f"test_{metric}"]
166218
est_scores[metric] = {
167-
"train": float(np.mean(cv_results[f"train_{metric}"])),
168-
"test": float(np.mean(cv_results[f"test_{metric}"])),
219+
"train": float(np.mean(train_folds)),
220+
"test": float(np.mean(test_folds)),
221+
}
222+
est_raw_scores[metric] = {
223+
"train": train_folds.tolist(),
224+
"test": test_folds.tolist(),
169225
}
170226

171227
results[est_name] = est_scores
228+
raw_results[est_name] = est_raw_scores
172229

173230
self.results = self._to_df(results)
231+
self.raw_results_ = self._to_raw_df(raw_results)
232+
233+
if return_raw:
234+
return self.results, self.raw_results_
174235
return self.results

0 commit comments

Comments
 (0)