79 changes: 39 additions & 40 deletions skore/src/skore/_sklearn/_estimator/feature_importance_accessor.py
@@ -490,46 +490,46 @@ def _feature_permutation(
data_source=data_source, X=X, y=y
)

# If seed is None, then we do not do any caching
if seed is None:
cache_key = None

elif isinstance(seed, int):
# build the cache key components to finally create a tuple that will be used
# to check if the metric has already been computed
cache_key_parts: list[Any] = [
self._parent._hash,
"permutation_importance",
data_source,
]

if data_source_hash is not None:
cache_key_parts.append(data_source_hash)

if callable(scoring) or isinstance(scoring, list | dict):
cache_key_parts.append(joblib.hash(scoring))
else:
cache_key_parts.append(scoring)

# aggregate is not included in the cache
# in order to trade off computation for storage

# order arguments by key to ensure cache works
# n_jobs variable should not be in the cache
kwargs = {
"n_repeats": n_repeats,
"max_samples": max_samples,
"seed": seed,
}
for _, v in sorted(kwargs.items()):
cache_key_parts.append(v)

cache_key = tuple(cache_key_parts)

else:
# NOTE: to temporarily improve the `project.put` UX, we always store the
# permutation importance in the cache dictionary, even when seed is None.
# Be aware that if seed is None, we still trigger the computation in all cases.
# We only store the result so that, when serializing to send to the hub, we
# fetch it from the cache store instead of recomputing it, which is expensive.
# FIXME: the workaround above should be removed once we are able to trigger
# the computation on the server side of skore-hub.

if seed is not None and not isinstance(seed, int):
raise ValueError(f"seed must be an integer or None; got {type(seed)}")

if cache_key in self._parent._cache:
# build the cache key components and assemble them into a tuple used to
# check whether the metric has already been computed
cache_key_parts: list[Any] = [
self._parent._hash,
"permutation_importance",
data_source,
]
cache_key_parts.append(data_source_hash)

if callable(scoring) or isinstance(scoring, list | dict):
cache_key_parts.append(joblib.hash(scoring))
else:
cache_key_parts.append(scoring)

# aggregate is not included in the cache in order to trade off computation for
# storage
# order arguments by key to ensure the cache works; the n_jobs variable should
# not be in the cache
kwargs = {"n_repeats": n_repeats, "max_samples": max_samples, "seed": seed}
for _, v in sorted(kwargs.items()):
cache_key_parts.append(v)

cache_key = tuple(cache_key_parts)

if cache_key in self._parent._cache and seed is not None:
# NOTE: avoid fetching from the cache if the seed is None because we want
# to trigger the computation in this case. We only keep the permutation
# importance stored as a workaround for skore-hub serialization, as
# explained earlier.
score = self._parent._cache[cache_key]
else:
sklearn_score = permutation_importance(
@@ -593,8 +593,7 @@ def _feature_permutation(
score = pd.DataFrame(data=data, index=index, columns=columns)

if cache_key is not None:
# Unless seed is an int (i.e. the call is deterministic),
# we do not cache
# NOTE: for the moment, we will always store the permutation importance
self._parent._cache[cache_key] = score

if aggregate:
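The cache key assembled above combines the report hash, the data source, the scoring specification, and the keyword arguments sorted by name. As a rough standalone sketch of that strategy (the helper name and its signature below are illustrative, not part of skore's API), callable or list/dict scorers are first reduced to a stable digest with joblib.hash so they can sit inside a hashable tuple, while n_jobs is deliberately excluded because it only affects parallelism, not the result:

from typing import Any

import joblib


def build_cache_key(
    report_hash: str,
    data_source: str,
    scoring: Any,
    *,
    n_repeats: int,
    max_samples: float,
    seed: int | None,
    data_source_hash: str | None = None,
) -> tuple:
    """Illustrative re-creation of the cache-key logic shown in the diff."""
    parts: list[Any] = [report_hash, "permutation_importance", data_source]
    if data_source_hash is not None:
        parts.append(data_source_hash)
    # Callables and list/dict scorers are not hashable, so reduce them to a
    # stable joblib digest before placing them in the tuple.
    if callable(scoring) or isinstance(scoring, (list, dict)):
        parts.append(joblib.hash(scoring))
    else:
        parts.append(scoring)
    # Sort the remaining arguments by name so the key does not depend on the
    # order in which they were passed; n_jobs is left out on purpose.
    kwargs = {"n_repeats": n_repeats, "max_samples": max_samples, "seed": seed}
    for _, value in sorted(kwargs.items()):
        parts.append(value)
    return tuple(parts)


# The same arguments always produce the same hashable key.
print(build_cache_key("abc123", "train", "r2", n_repeats=5, max_samples=1.0, seed=42))

Because the seed is part of the key, two calls that differ only in their integer seed get distinct cache entries.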
@@ -1,3 +1,4 @@
import contextlib
import copy

import numpy as np
@@ -207,42 +208,65 @@ def test_cache_n_jobs(regression_data):
pd.testing.assert_frame_equal(cached_result, result)


def test_cache_seed(regression_data):
"""If `seed` is not an int (the default is None)
then the result is not cached.
def test_cache_seed_error(regression_data):
"""Check that we only accept int and None as value for `seed`."""

`seed` must be an int or None.
X, y = regression_data
report = EstimatorReport(LinearRegression(), X_train=X, y_train=y)
assert report._cache == {}

err_msg = "seed must be an integer or None"
with pytest.raises(ValueError, match=err_msg):
report.feature_importance.permutation(
data_source="train", seed=np.random.RandomState(42)
)


def test_cache_seed_none(regression_data):
"""Check the strategy on how we use the cache when `seed` is None.

In this case, we store the result in the cache for sending to the hub but we
always retrigger the computation.
"""

X, y = regression_data
report = EstimatorReport(LinearRegression(), X_train=X, y_train=y)
assert report._cache == {}

# seed is None so no cache
with check_cache_unchanged(report._cache):
report.feature_importance.permutation(data_source="train")
importance_first_call = report.feature_importance.permutation(data_source="train")
assert report._cache != {}
importance_second_call = report.feature_importance.permutation(data_source="train")
# the dataframes are expected to differ (equality is tolerated, not asserted)
with contextlib.suppress(AssertionError):
pd.testing.assert_frame_equal(importance_first_call, importance_second_call)
# the cache should contain the last result
assert len(report._cache) == 1
key = list(report._cache.keys())[0]
pd.testing.assert_frame_equal(report._cache[key], importance_second_call)

# seed is a RandomState so no cache
with check_cache_unchanged(report._cache):
err_msg = (
"seed must be an integer or None; "
"got <class 'numpy.random.mtrand.RandomState'>"
)
with pytest.raises(ValueError, match=err_msg):
report.feature_importance.permutation(
data_source="train",
seed=np.random.RandomState(42),
)

# seed is an int so the result is cached
with check_cache_changed(report._cache):
result = report.feature_importance.permutation(data_source="train", seed=42)
def test_cache_seed_int(regression_data):
"""Check the strategy on how we use the cache when `seed` is an int.

with check_cache_unchanged(report._cache):
cached_result = report.feature_importance.permutation(
data_source="train", seed=42
)
In this case, we store the result in the cache and reload it from there.
"""
X, y = regression_data
report = EstimatorReport(LinearRegression(), X_train=X, y_train=y)
assert report._cache == {}

pd.testing.assert_frame_equal(cached_result, result)
importance_first_call = report.feature_importance.permutation(
data_source="train", seed=42
)
assert report._cache != {}
importance_second_call = report.feature_importance.permutation(
data_source="train", seed=42
)
# the dataframes should be the same
pd.testing.assert_frame_equal(importance_first_call, importance_second_call)
# the cache should contain the last result
assert len(report._cache) == 1
key = list(report._cache.keys())[0]
pd.testing.assert_frame_equal(report._cache[key], importance_second_call)
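A subtlety in test_cache_seed_none above: wrapping pd.testing.assert_frame_equal in contextlib.suppress(AssertionError) merely tolerates the two frames being different rather than asserting it, so the test keeps running whether or not the two results coincide. A minimal, self-contained illustration of that stdlib pattern (the frames here are invented for the example):

import contextlib

import pandas as pd

df_first = pd.DataFrame({"importance": [0.10, 0.25]})
df_second = pd.DataFrame({"importance": [0.12, 0.21]})

# assert_frame_equal raises AssertionError because the values differ, but the
# suppress block swallows it, so execution continues either way.
with contextlib.suppress(AssertionError):
    pd.testing.assert_frame_equal(df_first, df_second)
print("still running")

If the intent were to require the frames to differ, pytest.raises(AssertionError) would be the stricter choice.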


def test_cache_scoring(regression_data):
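Putting the two sides of the change together, the behaviour exercised by these tests boils down to a short usage sketch (assuming the skore EstimatorReport and feature_importance.permutation API exercised above; the dataset and estimator are arbitrary):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

from skore import EstimatorReport

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
report = EstimatorReport(LinearRegression(), X_train=X, y_train=y)

# An integer seed makes the call deterministic: the second call is served
# straight from report._cache.
first = report.feature_importance.permutation(data_source="train", seed=42)
second = report.feature_importance.permutation(data_source="train", seed=42)

# seed=None still stores the latest result (so it can be serialized for
# skore-hub), but every call recomputes the permutation importance.
latest = report.feature_importance.permutation(data_source="train")

# Anything else, e.g. a RandomState instance, is rejected with a ValueError.
try:
    report.feature_importance.permutation(
        data_source="train", seed=np.random.RandomState(42)
    )
except ValueError as exc:
    print(exc)  # seed must be an integer or None; got <class '...RandomState'>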