Merged
54 commits
bf0d3ff  feat: Show permutation at different stages of a pipeline (auguste-probabl, Aug 28, 2025)
22864ae  add docs (auguste-probabl, Aug 28, 2025)
fc9742e  sphinx (auguste-probabl, Aug 28, 2025)
5696a4b  rename stage to at_step (auguste-probabl, Sep 1, 2025)
c4e9f63  Make at_step 0 or -1 (auguste-probabl, Sep 1, 2025)
e8f928c  Remove default argument from private method (auguste-probabl, Sep 1, 2025)
f594a2e  Clarify if-block (auguste-probabl, Sep 1, 2025)
b0d1cae  generalize `at_step` to a step index (auguste-probabl, Sep 1, 2025)
7421b69  use n_features_in_ (auguste-probabl, Sep 1, 2025)
da25d06  remove redundant feature_names_source (auguste-probabl, Sep 1, 2025)
9eaba4f  add constraint on at_step (auguste-probabl, Sep 1, 2025)
3fd7573  refine test descriptions (auguste-probabl, Sep 1, 2025)
25067c9  remove example (auguste-probabl, Sep 1, 2025)
d944a18  generalize at_step to a step name (auguste-probabl, Sep 1, 2025)
983e421  factorize tests with fixture (auguste-probabl, Sep 1, 2025)
7b2c63c  clean (auguste-probabl, Sep 1, 2025)
030ebe9  refuse at_step too large in the negatives (auguste-probabl, Oct 15, 2025)
42e592f  refuse wrong type (auguste-probabl, Oct 15, 2025)
3b1ceca  fuse error tests (auguste-probabl, Oct 15, 2025)
da29974  remove PipelineStep type (auguste-probabl, Oct 15, 2025)
dd528e5  fix typo (auguste-probabl, Oct 15, 2025)
0ce5733  Move `at_steps` tests to a class (auguste-probabl, Oct 16, 2025)
e169be7  add non-regression test for sparse arrays (auguste-probabl, Oct 16, 2025)
4f6a881  fix bug (auguste-probabl, Oct 16, 2025)
7b17498  docs: Make plot more readable (auguste-probabl, Oct 17, 2025)
b477a0c  add non-regression test (auguste-probabl, Oct 17, 2025)
ff93405  refactor (auguste-probabl, Oct 17, 2025)
3812fbe  compute feature_names differently in the Pipeline case (auguste-probabl, Oct 17, 2025)
8b80ee8  rephrase example (auguste-probabl, Oct 17, 2025)
884c970  fix (auguste-probabl, Oct 17, 2025)
e86e88c  fix plot legend (auguste-probabl, Oct 17, 2025)
f4a3d97  refactor plot_permutation_train_test and add `at_step=-1` example (auguste-probabl, Oct 17, 2025)
d66de80  fix doctest (auguste-probabl, Oct 20, 2025)
0cd3b0c  fix (auguste-probabl, Oct 20, 2025)
f77f456  change explanations (auguste-probabl, Oct 27, 2025)
c0c765e  use seaborn (auguste-probabl, Oct 27, 2025)
d44e3da  add failing test (auguste-probabl, Oct 28, 2025)
e5bf6d2  fix bug (auguste-probabl, Oct 28, 2025)
37ff18e  don't use n_features_in_ (auguste-probabl, Oct 28, 2025)
e894c53  Add note about densifying sparse arrays (auguste-probabl, Oct 28, 2025)
55f7d1f  fix (auguste-probabl, Oct 29, 2025)
75fd69f  remove dead code (auguste-probabl, Oct 29, 2025)
da3c847  refactor: Put data in fixture (auguste-probabl, Oct 29, 2025)
1dbbc94  test dataframes as well (auguste-probabl, Oct 29, 2025)
a0c1f5e  clean (auguste-probabl, Oct 29, 2025)
d852a81  mypy (auguste-probabl, Oct 29, 2025)
228971f  Use non-sklearn regressor (auguste-probabl, Oct 30, 2025)
085a41c  also test at_step=0 (auguste-probabl, Oct 30, 2025)
8444567  use skore train_test_split (auguste-probabl, Oct 30, 2025)
612dee8  reuse fixtures (auguste-probabl, Oct 30, 2025)
cb9ae1e  wrap (auguste-probabl, Oct 30, 2025)
09d575e  change error type (auguste-probabl, Oct 30, 2025)
ef9fbf7  refactor to _get_feature_names (auguste-probabl, Oct 30, 2025)
2cbc1f1  refactor (auguste-probabl, Oct 31, 2025)
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
        name: mypy skore/
        files: ^skore/
        args: [--config-file=skore/pyproject.toml, skore]
-       additional_dependencies: [IPython, matplotlib, numpy, polars, rich]
+       additional_dependencies: [IPython, matplotlib, numpy, polars, rich, scipy-stubs]
      - id: mypy
        pass_filenames: false
        alias: mypy-skore-hub-project
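Note on this config change: `scipy-stubs` provides type stubs so that mypy can check code that uses scipy, which this PR touches through its sparse-array handling. A minimal sketch of the kind of annotation that becomes checkable (illustrative code, not from this PR):

```python
# Illustrative sketch, not from this PR: with scipy-stubs installed, mypy can
# verify annotations on scipy.sparse objects, such as this densifying helper.
import numpy as np
from scipy import sparse


def densify(matrix: sparse.csr_matrix) -> np.ndarray:
    # toarray() converts the sparse matrix to a dense ndarray; the stubs let
    # mypy check this call and the return type.
    return matrix.toarray()


dense = densify(sparse.csr_matrix(np.eye(3)))
```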
133 changes: 82 additions & 51 deletions examples/use_cases/plot_feature_importance.py
@@ -448,23 +448,23 @@ def unscale_coefficients(df, feature_mean, feature_std):
# Let us display the 15 largest absolute coefficients:

# %%
-engineered_rigde_report_feature_importance = (
+engineered_ridge_report_feature_importance = (
    engineered_ridge_report.feature_importance.coefficients()
    .frame()
    .sort_values(by="Coefficient", key=abs, ascending=True)
    .tail(15)
)

-engineered_rigde_report_feature_importance.index = (
-    engineered_rigde_report_feature_importance.index.str.replace("remainder__", "")
+engineered_ridge_report_feature_importance.index = (
+    engineered_ridge_report_feature_importance.index.str.replace("remainder__", "")
)
-engineered_rigde_report_feature_importance.index = (
-    engineered_rigde_report_feature_importance.index.str.replace(
+engineered_ridge_report_feature_importance.index = (
+    engineered_ridge_report_feature_importance.index.str.replace(
        "kmeans__", "geospatial__"
    )
)

-engineered_rigde_report_feature_importance.plot.barh(
+engineered_ridge_report_feature_importance.plot.barh(
    title="Model weights",
    xlabel="Coefficient",
    ylabel="Feature",
@@ -572,7 +572,7 @@ def add_y_true_pred(model_report, split):
# We visualize the distributions of the prediction errors on both train and test sets:

# %%
-sns.histplot(data=X_y_plot, x="squared_error", hue="split", bins=30)
+sns.histplot(data=X_y_plot, x="squared_error", hue="split", multiple="dodge", bins=30)
plt.title("Train and test sets")
plt.show()

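For readers unfamiliar with the seaborn option introduced here: `multiple="dodge"` draws the per-hue bars side by side instead of layering them on top of each other (the default, `multiple="layer"`), which keeps overlapping train/test histograms legible. A self-contained sketch on synthetic data (all names here are illustrative):

```python
# Self-contained sketch: layered vs dodged histograms in seaborn.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.default_rng(0)
toy = pd.DataFrame(
    {
        "squared_error": np.concatenate([rng.normal(0, 1, 500), rng.normal(1, 1, 500)]),
        "split": ["train"] * 500 + ["test"] * 500,
    }
)

_, (ax_layer, ax_dodge) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(data=toy, x="squared_error", hue="split", bins=30, ax=ax_layer)
sns.histplot(data=toy, x="squared_error", hue="split", multiple="dodge", bins=30, ax=ax_dodge)
ax_layer.set_title('multiple="layer" (default)')
ax_dodge.set_title('multiple="dodge"')
plt.show()
```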
@@ -948,7 +948,7 @@ def add_y_true_pred(model_report, split):
# models and the MDI that is specific to tree-based models.
# In this section, we look into the
# `permutation importance <https://scikit-learn.org/stable/modules/permutation_importance.html>`_
-# which is model agnostic, meaning that it can be applied to any fitted estimator.
+# which is model-agnostic, meaning that it can be applied to any fitted estimator.
# In particular, it works for linear models and tree-based ones.

# %%
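The paragraph above describes the mechanism: permute one feature's values, re-score the model, and treat the score drop as that feature's importance. skore exposes this through `feature_importance.permutation()`; the same idea in plain scikit-learn looks roughly like this (toy data, illustrative only):

```python
# Rough sketch of the underlying idea, using scikit-learn directly.
from sklearn.datasets import make_regression
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
model = Ridge().fit(X, y)

# Each feature is shuffled n_repeats times; the drop in R^2 is recorded.
result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
print(result.importances_mean)  # one mean importance per feature
```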
@@ -991,62 +991,59 @@ def add_y_true_pred(model_report, split):
ridge_report.feature_importance.permutation(seed=0)

# %%
-# The permutation importance is often calculated several times, each time
-# with different permutations of the feature.
-# Hence, we can have measure its variance (or standard deviation).
+# Since the permutation importance involves permuting values of a feature at random,
+# by default it is computed several times, each time with different permutations of
+# the feature. For this reason, if `seed` is not passed, skore does not cache the
+# permutation importance results.
+
+# %%
# Now, we plot the permutation feature importance on the train and test sets using boxplots:


-def plot_permutation_train_test(est_report):
+def plot_permutation_train_test(importances):
    _, ax = plt.subplots(figsize=(8, 6))

-    train_color = "blue"
-    test_color = "orange"
-
-    est_report.feature_importance.permutation(data_source="train", seed=0).T.boxplot(
+    # create a long format required by seaborn
+    importances = importances.stack().reset_index()
+    importances.columns = ["Dataset", "Feature", "Repeats", "Importance"]
+
+    sns.boxplot(
+        data=importances,
+        x="Importance",
+        y="Feature",
+        hue="Dataset",
+        orient="h",
+        order=importances["Feature"].unique()[::-1],
        ax=ax,
-        vert=False,
-        widths=0.35,
-        patch_artist=True,
-        boxprops={"facecolor": train_color, "alpha": 0.7},
-        medianprops={"color": "black"},
-        positions=[x + 0.4 for x in range(len(est_report.X_train.columns))],
    )
-    est_report.feature_importance.permutation(data_source="test", seed=0).T.boxplot(
-        ax=ax,
-        vert=False,
-        widths=0.35,
-        patch_artist=True,
-        boxprops={"facecolor": test_color, "alpha": 0.7},
-        medianprops={"color": "black"},
-        positions=range(len(est_report.X_test.columns)),
-    )
-
-    ax.legend(
-        handles=[
-            plt.Line2D([0], [0], color=train_color, lw=5, label="Train"),
-            plt.Line2D([0], [0], color=test_color, lw=5, label="Test"),
-        ],
-        loc="best",
-        title="Dataset",
-    )
-
-    ax.set_title(
-        f"Permutation feature importance of {est_report.estimator_name_} (Train vs Test)"
-    )
-    ax.set_xlabel("$R^2$")
-    ax.set_yticks([x + 0.2 for x in range(len(est_report.X_train.columns))])
-    ax.set_yticklabels(est_report.X_train.columns)
+    ax.set_xlabel("Decrease of $R^2$ score")
+    ax.set_title("Permutation feature importance (Train vs Test)")

    plt.tight_layout()
    plt.show()


# %%
-plot_permutation_train_test(ridge_report)
+
+
+def compute_permutation_importances(report, at_step=0):
+    train_importance = report.feature_importance.permutation(
+        data_source="train", seed=0, at_step=at_step
+    )
+    test_importance = report.feature_importance.permutation(
+        data_source="test", seed=0, at_step=at_step
+    )
+    print(train_importance)
+
+    return pd.concat(
+        {"train": train_importance, "test": test_importance},
+        axis=0,
+        names=["Dataset"],
+    ).droplevel(level=1, axis=0)
+
+
+# %%
+plot_permutation_train_test(compute_permutation_importances(ridge_report))

# %%
# The standard deviation seems quite low.
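To make the reshaping inside the new `plot_permutation_train_test` concrete: it receives a frame indexed by (Dataset, Feature) with one column per permutation repeat, and `stack()` pivots those columns into rows, producing the long format that `sns.boxplot` expects. A toy illustration with made-up values:

```python
# Toy illustration (made-up values) of the wide-to-long reshape used in
# plot_permutation_train_test above.
import pandas as pd

wide = pd.DataFrame(
    [[0.10, 0.12], [0.01, 0.02], [0.09, 0.11], [0.02, 0.01]],
    index=pd.MultiIndex.from_product(
        [["train", "test"], ["MedInc", "HouseAge"]], names=["Dataset", "Feature"]
    ),
    columns=pd.Index(["Repeat #0", "Repeat #1"], name="Repeats"),
)

long = wide.stack().reset_index()
long.columns = ["Dataset", "Feature", "Repeats", "Importance"]
print(long)  # one row per (Dataset, Feature, Repeat), ready for sns.boxplot
```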
@@ -1062,19 +1059,53 @@ def plot_permutation_train_test(est_report):
# pipeline (with regards to the original features):

# %%
-plot_permutation_train_test(selectk_ridge_report)
+plot_permutation_train_test(compute_permutation_importances(selectk_ridge_report))

# %%
+# Since this estimator involves complex feature engineering, it is interesting to look
+# at the impact of the *engineered* features rather than the original input features.
+# For instance, we can check whether features with a low importance rating have indeed
+# been selected out of the engineered features.
+
+# %%
+importances = compute_permutation_importances(selectk_ridge_report, at_step=-1)
+
+# Rename some features for clarity
+importances = importances.reset_index()
+importances["Feature"] = (
+    importances["Feature"]
+    .str.replace("remainder__", "")
+    .str.replace("kmeans__", "geospatial__")
+)
+
+# Take only the 15 most important test features
+importances = importances.set_index(["Dataset", "Feature"])
+best_15_features = (
+    importances.loc[("test", slice(None))]
+    .aggregate("mean", axis=1)
+    .sort_values(key=abs)
+    .tail(15)
+    .index
+)
+importances = importances[importances.index.isin(best_15_features, level=1)]
+
+plot_permutation_train_test(importances)
+
+# %%
+# Hence, contrary to coefficients, although we have created many features in our
+# preprocessing, the interpretability is easier.
+# We notice that, due to our preprocessing using a clustering on the geospatial data,
+# these features are of great importance to our model.
+#
+# Also, Average Bedrooms and Average Rooms appear often in the plot, whereas they were
+# not considered as important when looking at the coefficients. It means that once
+# combined with other features, they become more relevant.
+
+# %%
# For our decision tree, here is our permutation importance on the train and test sets:

# %%
-plot_permutation_train_test(tree_report)
+plot_permutation_train_test(compute_permutation_importances(tree_report))

# %%
# The result of the inspection is the same as with the MDI:
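A closing usage note on the parameter this PR introduces, based on the calls shown in the diff: `at_step=0` measures the importance of the pipeline's original input features, while `at_step=-1` permutes the features as they enter the last step, that is, after all preprocessing has run. A sketch, assuming the `selectk_ridge_report` built earlier in the example:

```python
# Usage sketch, assuming the selectk_ridge_report from the example above.
# Importance of the original input features (the default shown in the diff):
selectk_ridge_report.feature_importance.permutation(
    data_source="test", seed=0, at_step=0
)

# Importance of the engineered features fed to the final regressor:
selectk_ridge_report.feature_importance.permutation(
    data_source="test", seed=0, at_step=-1
)
```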