skore-hub-project/src/skore_hub_project/artefact/upload.py (3 additions, 3 deletions)

@@ -18,10 +18,10 @@

 import httpx

-from .project import Project
+from ..project.project import Project


-Progress = partial(
+SkinnedProgress = partial(
     Progress,
     TextColumn("[bold cyan blink]Uploading..."),
     BarColumn(
@@ -154,7 +154,7 @@ def upload(project: Project, o: Any, type: str) -> str:
             task_to_chunk_id[task] = chunk_id

     try:
-        with Progress() as progress:
+        with SkinnedProgress() as progress:
             tasks = as_completed(task_to_chunk_id)
             total = len(task_to_chunk_id)
             etags = dict(
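The substance of this rename: the module previously rebound the imported `Progress` class to a pre-configured `partial`, shadowing it for the rest of the module. A minimal sketch of the fixed pattern, assuming the columns come from `rich.progress` as the styling arguments suggest (the `BarColumn` options are elided here):

```python
from functools import partial

from rich.progress import BarColumn, Progress, TextColumn

# Before the change this read `Progress = partial(Progress, ...)`: it rebound
# the imported name, so the un-skinned class became unreachable and every
# later `Progress()` silently meant the configured variant.
SkinnedProgress = partial(
    Progress,
    TextColumn("[bold cyan blink]Uploading..."),
    BarColumn(),
)

with SkinnedProgress() as progress:  # still a plain rich.progress.Progress
    task = progress.add_task("upload", total=3)
    for _ in range(3):
        progress.advance(task)
```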
skore-hub-project/src/skore_hub_project/metric/timing.py (1 addition, 1 deletion)

@@ -102,7 +102,6 @@ class PredictTimeAggregate(Metric):  # noqa: D101
     report: CrossValidationReport = Field(repr=False, exclude=True)
     aggregate: ClassVar[Literal["mean", "std"]]
     greater_is_better: bool = False
-    position: int = 2

     @computed_field  # type: ignore[prop-decorator]
     @cached_property
@@ -121,6 +120,7 @@ class PredictTimeMean(PredictTimeAggregate):  # noqa: D101
     aggregate: ClassVar[Literal["mean"]] = "mean"
     name: str = "predict_time_mean"
     verbose_name: str = "Predict time (s) - MEAN"
+    position: int = 2


 class PredictTimeTrainMean(PredictTimeMean):  # noqa: D101
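The move means `position` is now declared only on the mean variant, so sibling subclasses of `PredictTimeAggregate` (e.g. a std variant) no longer inherit it. A toy sketch of the pydantic behaviour being relied on, with hypothetical model names:

```python
from pydantic import BaseModel


class Aggregate(BaseModel):
    greater_is_better: bool = False


class Mean(Aggregate):
    # Declared only here: siblings of Mean never see this field.
    position: int = 2


print(Aggregate().model_dump())  # {'greater_is_better': False}
print(Mean().model_dump())       # {'greater_is_better': False, 'position': 2}
```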
skore-hub-project/src/skore_hub_project/project/project.py (4 additions, 4 deletions)

@@ -126,7 +126,7 @@ def run_id(self) -> int:
         return run["id"]

     @ensure_project_is_created
-    def put(self, key: str, report: EstimatorReport):
+    def put(self, key: str, report: EstimatorReport | CrossValidationReport):
         """
         Put a key-report pair to the hub project.

@@ -137,7 +137,7 @@ def put(self, key: str, report: EstimatorReport):
         ----------
         key : str
             The key to associate with ``report`` in the hub project.
-        report : skore.EstimatorReport
+        report : skore.EstimatorReport | skore.CrossValidationReport
             The report to associate with ``key`` in the hub project.

         Raises
@@ -158,8 +158,8 @@ def put(self, key: str, report: EstimatorReport):
             url = f"projects/{self.tenant}/{self.name}/cross-validation-reports"
         else:
             raise TypeError(
-                f"Report must be a `skore.EstimatorReport` or `skore.CrossValidationRep"
-                f"ort` (found '{type(report)}')"
+                f"Report must be a `skore.EstimatorReport` or "
+                f"`skore.CrossValidationReport` (found '{type(report)}')"
            )

         payload = Payload(project=self, key=key, report=report)
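For reference, a hedged usage sketch of the widened `put` (placeholder tenant/name, toy data; assumes a reachable hub, since `put` is guarded by `ensure_project_is_created`):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from skore import CrossValidationReport
from skore_hub_project import Project

X, y = make_classification(random_state=42)
report = CrossValidationReport(LogisticRegression(), X, y)

project = Project("<tenant>", "<name>")

# EstimatorReport and CrossValidationReport are both accepted now;
# each is routed to its own hub endpoint.
project.put("my-cv-report", report)

# Anything else fails fast:
# TypeError: Report must be a `skore.EstimatorReport` or
# `skore.CrossValidationReport` (found ...)
```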
(unnamed file: the CrossValidationReport payload model)

@@ -2,8 +2,9 @@

 from collections import defaultdict
 from functools import cached_property
-from typing import ClassVar, Literal, cast
+from typing import ClassVar, cast

+import numpy as np
 from pydantic import Field, computed_field
 from sklearn.model_selection import BaseCrossValidator
 from sklearn.model_selection._split import _CVIterableWrapper
@@ -131,19 +132,25 @@ def model_post_init(self, context):  # noqa: D102
         if "classification" in self.ml_task:
             class_to_class_indice = defaultdict(lambda: len(class_to_class_indice))

-            self.__sample_to_class_indice = [
+            self.__sample_to_class_index = [
                 class_to_class_indice[sample] for sample in self.report.y
             ]

-            assert len(self.__sample_to_class_indice) == len(self.report.X)
+            assert len(self.__sample_to_class_index) == len(self.report.X)

             self.__classes = [str(class_) for class_ in class_to_class_indice]

-            assert max(self.__sample_to_class_indice) == (len(self.__classes) - 1)
+            assert max(self.__sample_to_class_index) == (len(self.__classes) - 1)
         else:
-            self.__sample_to_class_indice = None
+            self.__sample_to_class_index = None
             self.__classes = None

+    @computed_field  # type: ignore[prop-decorator]
+    @cached_property
+    def dataset_size(self) -> int:
+        """Size of the dataset."""
+        return len(self.report.X)
+
     @computed_field  # type: ignore[prop-decorator]
     @cached_property
     def splitting_strategy_name(self) -> str:
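One line above is worth pausing on: `defaultdict(lambda: len(class_to_class_indice))` numbers classes in order of first appearance, because the factory closes over the dict itself and its length grows by one per unseen key. In isolation:

```python
from collections import defaultdict

class_to_index = defaultdict(lambda: len(class_to_index))

# Looking a label up assigns it the next free index on first sight.
labels = ["cat", "dog", "cat", "bird"]
print([class_to_index[label] for label in labels])  # [0, 1, 0, 2]
print(dict(class_to_index))                         # {'cat': 0, 'dog': 1, 'bird': 2}
```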
@@ -158,25 +165,29 @@ def splitting_strategy_name(self) -> str:

     @computed_field  # type: ignore[prop-decorator]
     @cached_property
-    def splits(self) -> list[list[Literal[0, 1]]]:
+    def splits(self) -> list[list[float]]:
         """
-        The dataset splits used by the report.
+        Distribution between train and test by split.

         Notes
         -----
-        For each split and for each sample in the dataset:
-        - 0 if the sample is in the train-set,
-        - 1 if the sample is in the test-set.
+        The distribution of each split is computed by dividing the split into a maximum
+        of 200 buckets, and averaging the number of samples belonging to the test-set in
+        each of these buckets.
         """
-        splits = [
-            [0] * len(self.report.X) for i in range(len(self.report.split_indices))
-        ]
-
-        for i, (_, test_indices) in enumerate(self.report.split_indices):
-            for test_indice in test_indices:
-                splits[i][test_indice] = 1
-
-        return cast(list[list[Literal[0, 1]]], splits)
+        distributions = []
+        buckets_number = min(len(self.report.X), 200)
+
+        for _, test_indices in self.report.split_indices:
+            split = np.zeros(len(self.report.X), dtype=int)
+            split[test_indices] = 1
+
+            distributions.append(
+                [
+                    float(np.mean(bucket))
+                    for bucket in np.array_split(split, buckets_number)
+                ]
+            )
+
+        return distributions

     groups: list[int] | None = None
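What the new `splits` buys: instead of one 0/1 flag per sample (a payload that grows with the dataset), each split is summarised by at most 200 bucket means, each a test-set density in [0, 1]. The same numpy recipe in isolation, with a made-up test mask:

```python
import numpy as np

n_samples = 10_000
test_indices = np.arange(0, n_samples, 5)  # hypothetical: every 5th sample in test

split = np.zeros(n_samples, dtype=int)
split[test_indices] = 1

buckets_number = min(n_samples, 200)  # small datasets keep one bucket per sample
density = [float(np.mean(b)) for b in np.array_split(split, buckets_number)]

print(len(density))                # 200, whatever the dataset size
print(min(density), max(density))  # 0.2 0.2 here; always within [0.0, 1.0]
```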

@@ -189,8 +200,19 @@ def class_names(self) -> list[str] | None:

     @computed_field  # type: ignore[prop-decorator]
     @property
     def classes(self) -> list[int] | None:
-        """In classification, the class index of each sample used in the report."""
-        return self.__sample_to_class_indice
+        """
+        In classification, the distribution of the classes in the dataset.
+
+        The distribution is computed by dividing the dataset into a maximum of 200
+        buckets, and noting the dominant class in each of these buckets.
+        """
+        if self.__sample_to_class_index is None:
+            return None
+
+        buckets_number = min(len(self.__sample_to_class_index), 200)
+        buckets = np.array_split(self.__sample_to_class_index, buckets_number)
+
+        return [int(np.bincount(bucket).argmax()) for bucket in buckets]

     @computed_field  # type: ignore[prop-decorator]
     @cached_property
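`classes` gets the same capping treatment: per-sample class indices collapse into at most 200 buckets, each summarised by its majority class through `np.bincount(...).argmax()`. In isolation, with made-up labels:

```python
import numpy as np

# Hypothetical per-sample class indices: 1_000 samples, imbalanced binary.
rng = np.random.default_rng(0)
sample_to_class_index = (rng.random(1_000) < 0.3).astype(int)

buckets_number = min(len(sample_to_class_index), 200)
buckets = np.array_split(sample_to_class_index, buckets_number)

# bincount tallies each class within the bucket; argmax picks the dominant
# one (ties resolve to the lower class index).
dominant = [int(np.bincount(b).argmax()) for b in buckets]

print(len(dominant))          # 200
print(sorted(set(dominant)))  # subset of [0, 1]
```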
(unnamed file: the CrossValidationReport payload tests)

@@ -1,9 +1,13 @@
 from json import loads
 from urllib.parse import urljoin

+import numpy as np
 from httpx import Client, Response
 from pydantic import ValidationError
 from pytest import fixture, mark, raises
+from sklearn.datasets import make_classification, make_regression
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.model_selection import ShuffleSplit
 from skore import CrossValidationReport
 from skore_hub_project import Project
 from skore_hub_project.artefact.serializer import Serializer
@@ -65,7 +69,7 @@ def monkeypatch_client(monkeypatch):
     monkeypatch.setattr("skore_hub_project.artefact.upload.HUBClient", FakeClient)


-def serialize(object: CrossValidationReport) -> tuple[bytes, str, int]:
+def serialize(object: CrossValidationReport) -> tuple[bytes, str]:
     reports = [object] + object.estimator_reports_
     caches = []
@@ -106,19 +110,63 @@ def monkeypatch_routes(respx_mock):


 class TestCrossValidationReportPayload:
+    def test_dataset_size(self, payload):
+        assert payload.dataset_size == 10
+
     def test_splitting_strategy_name(self, payload):
         assert payload.splitting_strategy_name == "StratifiedKFold"

-    def test_splits(self, payload):
+    def test_splits_test_samples_density(self, payload):
         assert payload.splits == [
             [1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
             [0, 0, 0, 0, 1, 0, 1, 1, 1, 1],
         ]

+    def test_splits_test_samples_density_many_rows(self):
+        X, y = make_regression(random_state=42, n_samples=10_000)
+        cvr = CrossValidationReport(
+            LinearRegression(),
+            X,
+            y,
+            splitter=ShuffleSplit(random_state=42, n_splits=7),
+        )
+        payload = CrossValidationReportPayload(
+            project=Project("<tenant>", "<name>"),
+            report=cvr,
+            key="<key>",
+        )
+        splits = payload.splits
+        assert len(splits) == 7
+        assert all(len(s) == 200 for s in splits)
+        for s in splits:
+            assert all(0 <= bucket <= 1 for bucket in s)
+
     def test_class_names(self, payload):
         assert payload.class_names == ["1", "0"]

     def test_classes(self, payload):
         assert payload.classes == [0, 0, 1, 1, 1, 0, 0, 1, 0, 1]

+    def test_classes_many_rows(self):
+        X, y = make_classification(
+            random_state=42,
+            n_samples=10_000,
+            n_classes=2,
+        )
+        cvr = CrossValidationReport(
+            LogisticRegression(),
+            X,
+            y,
+            splitter=ShuffleSplit(random_state=42, n_splits=7),
+        )
+        payload = CrossValidationReportPayload(
+            project=Project("<tenant>", "<name>"),
+            report=cvr,
+            key="<key>",
+        )
+        classes = payload.classes
+        assert len(classes) == 200
+        assert np.unique(classes).tolist() == [0, 1]
+        assert np.sum(classes) == 93
+
     def test_estimators(self, payload, respx_mock):
@@ -218,11 +266,9 @@ def test_model_dump(self, small_cv_binary_classification, payload):
             "ml_task": "binary-classification",
             "groups": None,
             "parameters": {"checksum": checksum},
+            "dataset_size": 10,
             "splitting_strategy_name": "StratifiedKFold",
-            "splits": [
-                [1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
-                [0, 0, 0, 0, 1, 0, 1, 1, 1, 1],
-            ],
+            "splits": [[1, 1, 1, 1, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 1, 1, 1, 1]],
             "class_names": ["1", "0"],
             "classes": [0, 0, 1, 1, 1, 0, 0, 1, 0, 1],
         }