From 9f1f0723609c8a7cfae9b9aa3a711f0cded6d84a Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 15:37:39 +0100
Subject: [PATCH 01/15] skrub ready course v1

---
 ...categorical_pipeline_column_transformer.py | 78 ++++++++++---------
 python_scripts/datasets_ames_housing.py       | 65 ++++------------
 python_scripts/datasets_bike_rides.py         | 21 ++---
 python_scripts/datasets_blood_transfusion.py  | 17 ++--
 python_scripts/datasets_california_housing.py | 22 +++---
 5 files changed, 78 insertions(+), 125 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 921950cf6..05b31676b 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -62,27 +62,42 @@
 # In the previous sections, we saw that we need to treat data differently
 # depending on their nature (i.e. numerical or categorical).
 #
-# Scikit-learn provides a `ColumnTransformer` class which sends specific
-# columns to a specific transformer, making it easy to fit a single predictive
-# model on a dataset that combines both kinds of variables together
-# (heterogeneously typed tabular data).
+# Skrub is a data preprocessing library built to work seamlessly with
+# scikit-learn. It provides a convenient transformer called `TableVectorizer`
+# that can handle both numerical and categorical variables in a single
+# transformer. It makes the column selection automatically by using a column's
+# `dtype`.
 #
-# We first define the columns depending on their data type:
+# It separates the columns into four groups:
+# * **low cardinality categorical columns** (categorical columns with a limited
+#   number of unique values, one hot encoded by default);
+# * **high cardinality categorical columns** (categorical columns with a large
+#   number of unique values, string encoded by default);
+# * **numerical columns** (untouched by default).
+# * **time columns** (columns that encode time information, as present in time
+#   series for instance, converted to numerical features that can be used by
+#   learners; for more information, see the
+#   [documentation](https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html)).
 #
-# * **one-hot encoding** is applied to categorical columns. Besides, we use
-#   `handle_unknown="ignore"` to solve the potential issues due to rare
-#   categories.
+# The threshold to determine whether a categorical column is of low or high
+# cardinality can be set using the `cardinality_threshold` parameter. We will see
+# its impact later on.
+#
+# We apply the following transformations:
+#
+# * **one-hot encoding** is applied to the low cardinality categorical columns.
+#   Besides, we use `handle_unknown="ignore"` to solve the potential issues due
+#   to rare categories.
 # * **numerical scaling** numerical features which will be standardized.
 #
-# Now, we create our `ColumnTransfomer` using the helper function
-# `make_column_transformer`. We specify two values: the transformer, and the
-# columns. First, let's create the preprocessors for the numerical and
-# categorical parts.
+# Now, we create our transformer using the helper function `TableVectorizer`. We
+# specify the transformers. First, let's create the preprocessors for the
+# numerical and low cardinality categorical parts.
 
 # %%
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 
-categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
+categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
 numerical_preprocessor = StandardScaler()
 
 # %% [markdown]
@@ -90,30 +105,26 @@
 # their respective columns.
 
 # %%
-from sklearn.compose import make_column_transformer
+from skrub import TableVectorizer
 
-preprocessor = make_column_transformer(
-    (categorical_preprocessor, categorical_columns),
-    (numerical_preprocessor, numerical_columns),
-)
+vectorizer = TableVectorizer(low_cardinality = categorical_preprocessor, numeric = numerical_preprocessor)
 
 # %% [markdown]
 # We can take a minute to represent graphically the structure of a
-# `ColumnTransformer`:
+# `TableVectorizer`:
 #
 # ![columntransformer diagram](../figures/api_diagram-columntransformer.svg)
 #
-# A `ColumnTransformer` does the following:
+# A `TableVectorizer` does the following:
 #
-# * It **splits the columns** of the original dataset based on the column names
-#   or indices provided. We obtain as many subsets as the number of transformers
-#   passed into the `ColumnTransformer`.
+# * It **splits the columns** of the original dataset based on the data type and
+#   cardinality of unique values.
 # * It **transforms each subsets**. A specific transformer is applied to each
 #   subset: it internally calls `fit_transform` or `transform`. The output of
 #   this step is a set of transformed datasets.
 # * It then **concatenates the transformed datasets** into a single dataset.
 
-# The important thing is that `ColumnTransformer` is like any other scikit-learn
+# The important thing is that `TableVectorizer` is like any other scikit-learn
 # transformer. In particular it can be combined with a classifier in a
 # `Pipeline`:
 
@@ -121,7 +132,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import make_pipeline
 
-model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
+model = make_pipeline(vectorizer, LogisticRegression(max_iter=500))
 model
 
 # %% [markdown]
@@ -227,16 +238,11 @@
 
 # %%
 from sklearn.ensemble import HistGradientBoostingClassifier
-from sklearn.preprocessing import OrdinalEncoder
+from skrub import ToCategorical
 
-categorical_preprocessor = OrdinalEncoder(
-    handle_unknown="use_encoded_value", unknown_value=-1
-)
+categorical_preprocessor = ToCategorical()
 
-preprocessor = make_column_transformer(
-    (categorical_preprocessor, categorical_columns),
-    remainder="passthrough",
-)
+preprocessor = TableVectorizer(low_cardinality=categorical_preprocessor)
 
 model = make_pipeline(preprocessor, HistGradientBoostingClassifier())
 
@@ -262,8 +268,10 @@
 # %% [markdown]
 # In this notebook we:
 #
-# * used a `ColumnTransformer` to apply different preprocessing for categorical
+# * used a `TableVectorizer` to apply different preprocessing for categorical
 #   and numerical variables;
-# * used a pipeline to chain the `ColumnTransformer` preprocessing and logistic
+# * used a pipeline to chain the `TableVectorizer` preprocessing and logistic
 #   regression fitting;
 # * saw that **gradient boosting methods** can outperform **linear models**.
+
+# %%
diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py
index c69c236b1..e6e7de4a3 100644
--- a/python_scripts/datasets_ames_housing.py
+++ b/python_scripts/datasets_ames_housing.py
@@ -49,20 +49,15 @@
 # Let's have a quick look at the target before to focus on the data.
 
 # %%
-target.head()
+from skrub import TableReport
 
-# %% [markdown]
-# We see that the target contains continuous value. It corresponds to the price
-# of a house in $. We can have a look at the target distribution.
-
-# %%
-import matplotlib.pyplot as plt
-
-target.plot.hist(bins=20, edgecolor="black")
-plt.xlabel("House price in $")
-_ = plt.title("Distribution of the house price \nin Ames")
+TableReport(target)
 
-# %% [markdown]
+# %% [markdown] 
+# We see that the target contains continuous value. It corresponds to the price
+# of a house in $. We can have a look at the target distribution in the
+# "Distributions" tab.
+#
 # We see that the distribution has a long tail. It means that most of the house
 # are normally distributed but a couple of houses have a higher than normal
 # value. It could be critical to take this peculiarity into account when
@@ -72,7 +67,7 @@
 # house prices.
 
 # %%
-data.info()
+TableReport(data)
 
 # %% [markdown]
 # Looking at the dataframe general information, we can see that 79 features are
@@ -84,24 +79,17 @@
 
 # %%
 numerical_data = data.select_dtypes("number")
-numerical_data.info()
+TableReport(numerical_data, max_plot_columns=40)
 
 # %% [markdown]
 # We see that the data are mainly represented with integer number. Let's have a
-# look at the histogram for all these features.
-
-# %%
-numerical_data.hist(
-    bins=20, figsize=(12, 22), edgecolor="black", layout=(9, 4)
-)
-plt.subplots_adjust(hspace=0.8, wspace=0.8)
+# look at the histogram for all these features in the "Distributions" tab.
 
-# %% [markdown]
 # We see that some features have high picks for 0. It could be linked that this
 # value was assigned when the criterion did not apply, for instance the area of
 # the swimming pool when no swimming pools are available.
 #
-# We also have some feature encoding some date (for instance year).
+# We also have some features encoding a date (for instance year).
 #
 # These information are useful and should also be considered when designing a
 # predictive model.
@@ -110,34 +98,13 @@
 
 # %%
 string_data = data.select_dtypes(object)
-string_data.info()
+TableReport(string_data, max_plot_columns=45)
 
 # %% [markdown]
-# These features are categorical. We can make some bar plot to see categories
-# count for each feature.
-
-# %%
-from math import ceil
-from itertools import zip_longest
-
-n_string_features = string_data.shape[1]
-nrows, ncols = ceil(n_string_features / 4), 4
-
-fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 80))
-
-for feature_name, ax in zip_longest(string_data, axs.ravel()):
-    if feature_name is None:
-        # do not show the axis
-        ax.axis("off")
-        continue
-
-    string_data[feature_name].value_counts().plot.barh(ax=ax)
-    ax.set_title(feature_name)
-
-plt.subplots_adjust(hspace=0.2, wspace=0.8)
-
-# %% [markdown]
-# Plotting this information allows us to answer to two questions:
+# These features are categorical. We can analyze the bar plots in the
+# "Distributions" tab to see the category counts for each feature.
+#
+# This allows us to answer two questions:
 #
 # * Is there few or many categories for a given features?
 # * Is there rare categories for some features?
diff --git a/python_scripts/datasets_bike_rides.py b/python_scripts/datasets_bike_rides.py
index 9cb2ec77a..a4dfa1f6a 100644
--- a/python_scripts/datasets_bike_rides.py
+++ b/python_scripts/datasets_bike_rides.py
@@ -94,10 +94,8 @@
 # We can have a first look at the target distribution.
 
 # %%
-import matplotlib.pyplot as plt
-
-target.plot.hist(bins=50, edgecolor="black")
-plt.xlabel("Power (W)")
+from skrub import TableReport
+TableReport(target)
 
 # %% [markdown]
 # We see a pick at 0 Watts, it corresponds to whenever our cyclist does not
@@ -144,6 +142,8 @@
 data_ride, target_ride = data.loc[date_first_ride], target.loc[date_first_ride]
 
 # %%
+import matplotlib.pyplot as plt
+
 data_ride.plot()
 plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
 _ = plt.title("Sensor values for different cyclist measurements")
@@ -163,18 +163,7 @@
 # We can check the range of the different features:
 
 # %%
-axs = data_ride.hist(figsize=(10, 12), bins=50, edgecolor="black", grid=False)
-# add the units to the plots
-units = [
-    "beats per minute",
-    "rotations per minute",
-    "meters per second",
-    "meters per second squared",
-    "%",
-]
-for unit, ax in zip(units, axs.ravel()):
-    ax.set_xlabel(unit)
-plt.subplots_adjust(hspace=0.6)
+TableReport(data_ride)
 
 # %% [markdown]
 # From these plots, we can see some interesting information: a cyclist is
diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py
index 1042f16f1..10ed30d60 100644
--- a/python_scripts/datasets_blood_transfusion.py
+++ b/python_scripts/datasets_blood_transfusion.py
@@ -54,17 +54,14 @@
 # columns and if any missing values are present in our dataset.
 
 # %%
-data.info()
+from skrub import TableReport
+TableReport(data)
 
 # %% [markdown]
 # Our dataset is made of 748 samples. All features are represented with integer
 # numbers and there is no missing values. We can have a look at each feature
-# distributions.
-
-# %%
-_ = data.hist(figsize=(12, 10), bins=30, edgecolor="black")
-
-# %% [markdown]
+# distributions in the "Distributions" tab.
+#
 # There is nothing shocking regarding the distributions. We only observe a high
 # value range for the features `"Recency"`, `"Frequency"`, and `"Monetary"`. It
 # means that we have a few extreme high values for these features.
@@ -76,11 +73,7 @@
 target.head()
 
 # %%
-import matplotlib.pyplot as plt
-
-target.value_counts(normalize=True).plot.barh()
-plt.xlabel("Number of samples")
-_ = plt.title("Class distribution")
+TableReport(target)
 
 # %% [markdown]
 # We see that the target is discrete and contains two categories: whether a
diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py
index 16869021a..a37515153 100644
--- a/python_scripts/datasets_california_housing.py
+++ b/python_scripts/datasets_california_housing.py
@@ -67,14 +67,13 @@
 # * all features are numerical features encoded as floating number;
 # * there is no missing values.
 #
-# Let's have a quick look at the distribution of these features by plotting
-# their histograms.
+# Let's have a quick look at the distribution of these features with the 
+# TableReport from the skrub package.
 
 # %%
-import matplotlib.pyplot as plt
+from skrub import TableReport
 
-california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black")
-plt.subplots_adjust(hspace=0.7, wspace=0.4)
+TableReport(california_housing.frame)
 
 
 # %% [markdown]
@@ -95,14 +94,10 @@
 # population, the range of the data is large with unnoticeable bin for the
 # largest values. It means that there are very high and few values (maybe they
 # could be considered as outliers?). We can see this specificity looking at the
-# statistics for these features:
-
-# %%
-features_of_interest = ["AveRooms", "AveBedrms", "AveOccup", "Population"]
-california_housing.frame[features_of_interest].describe()
-
-# %% [markdown]
-# For each of these features, comparing the `max` and `75%` values, we can see a
+# statistics for these features by clicking on the corresponding columns in the
+# table.
+#
+# For each of these features, comparing the `max` and `Median ± IQR` values, we can see a
 # huge difference. It confirms the intuitions that there are a couple of extreme
 # values.
 #
@@ -115,6 +110,7 @@
 
 # %%
 import seaborn as sns
+import matplotlib.pyplot as plt
 
 sns.scatterplot(
     data=california_housing.frame,

From 58d8fc46d31298ba6fb2d39b1cd947e0ed214c49 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 15:47:36 +0100
Subject: [PATCH 02/15] test fix

---
 python_scripts/datasets_ames_housing.py       | 2 +-
 python_scripts/datasets_california_housing.py | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py
index e6e7de4a3..353a1fb52 100644
--- a/python_scripts/datasets_ames_housing.py
+++ b/python_scripts/datasets_ames_housing.py
@@ -84,7 +84,7 @@
 # %% [markdown]
 # We see that the data are mainly represented with integer number. Let's have a
 # look at the histogram for all these features in the "Distributions" tab.
-
+#
 # We see that some features have high picks for 0. It could be linked that this
 # value was assigned when the criterion did not apply, for instance the area of
 # the swimming pool when no swimming pools are available.
diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py
index a37515153..cb4030ccb 100644
--- a/python_scripts/datasets_california_housing.py
+++ b/python_scripts/datasets_california_housing.py
@@ -18,7 +18,7 @@
 california_housing = fetch_california_housing(as_frame=True)
 
 # %% [markdown]
-# We can have a first look at the available description
+# We can have a first look at the available description.
 
 # %%
 print(california_housing.DESCR)
@@ -75,7 +75,6 @@
 
 TableReport(california_housing.frame)
 
-
 # %% [markdown]
 # We can first focus on features for which their distributions would be more or
 # less expected.
@@ -97,9 +96,9 @@
 # statistics for these features by clicking on the corresponding columns in the
 # table.
 #
-# For each of these features, comparing the `max` and `Median ± IQR` values, we can see a
-# huge difference. It confirms the intuitions that there are a couple of extreme
-# values.
+# For each of these features, comparing the `max` and `Median ± IQR` values, we
+# can see a huge difference. It confirms the intuitions that there are a couple
+# of extreme values.
 #
 # Up to now, we discarded the longitude and latitude that carry geographical
 # information. In short, the combination of these features could help us decide

From 35e8c26f032e83259eb3ff6b01d4fcefb4e1add1 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 15:50:19 +0100
Subject: [PATCH 03/15] fixed thanks to arturo

---
 python_scripts/datasets_ames_housing.py       | 10 +++-------
 python_scripts/datasets_california_housing.py |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py
index 353a1fb52..b3b973a27 100644
--- a/python_scripts/datasets_ames_housing.py
+++ b/python_scripts/datasets_ames_housing.py
@@ -53,7 +53,7 @@
 
 TableReport(target)
 
-# %% [markdown] 
+# %% [markdown]
 # We see that the target contains continuous value. It corresponds to the price
 # of a house in $. We can have a look at the target distribution in the
 # "Distributions" tab.
@@ -119,9 +119,7 @@
 # ```
 
 # %%
-ames_housing_no_missing = pd.read_csv(
-    "../datasets/ames_housing_no_missing.csv"
-)
+ames_housing_no_missing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
 ames_housing_no_missing.head()
 
 # %% [markdown]
@@ -178,7 +176,5 @@
     columns=categorical_features.tolist() + numerical_features,
 )
 ames_housing_preprocessed = ames_housing_preprocessed[ames_housing.columns]
-ames_housing_preprocessed = ames_housing_preprocessed.astype(
-    ames_housing.dtypes
-)
+ames_housing_preprocessed = ames_housing_preprocessed.astype(ames_housing.dtypes)
 (ames_housing_no_missing == ames_housing_preprocessed).all()
diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py
index cb4030ccb..b53bc70be 100644
--- a/python_scripts/datasets_california_housing.py
+++ b/python_scripts/datasets_california_housing.py
@@ -67,7 +67,7 @@
 # * all features are numerical features encoded as floating number;
 # * there is no missing values.
 #
-# Let's have a quick look at the distribution of these features with the 
+# Let's have a quick look at the distribution of these features with the
 # TableReport from the skrub package.
 
 # %%

From 489b391d2ff0fed07319263d7b8489491f23377b Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 15:53:11 +0100
Subject: [PATCH 04/15] fixed (?)

---
 python_scripts/03_categorical_pipeline_column_transformer.py | 4 +++-
 python_scripts/datasets_bike_rides.py                        | 1 +
 python_scripts/datasets_blood_transfusion.py                 | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 05b31676b..f9ed7cb59 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -107,7 +107,9 @@
 # %%
 from skrub import TableVectorizer
 
-vectorizer = TableVectorizer(low_cardinality = categorical_preprocessor, numeric = numerical_preprocessor)
+vectorizer = TableVectorizer(
+    low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor
+)
 
 # %% [markdown]
 # We can take a minute to represent graphically the structure of a
diff --git a/python_scripts/datasets_bike_rides.py b/python_scripts/datasets_bike_rides.py
index a4dfa1f6a..fa1ce8019 100644
--- a/python_scripts/datasets_bike_rides.py
+++ b/python_scripts/datasets_bike_rides.py
@@ -95,6 +95,7 @@
 
 # %%
 from skrub import TableReport
+
 TableReport(target)
 
 # %% [markdown]
diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py
index 10ed30d60..2f32ed005 100644
--- a/python_scripts/datasets_blood_transfusion.py
+++ b/python_scripts/datasets_blood_transfusion.py
@@ -55,6 +55,7 @@
 
 # %%
 from skrub import TableReport
+
 TableReport(data)
 
 # %% [markdown]

From 388a652a91dd1fd018912ec1a47bb469e068da6b Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 15:58:10 +0100
Subject: [PATCH 05/15] ames

---
 python_scripts/datasets_ames_housing.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py
index b3b973a27..86c4b51d2 100644
--- a/python_scripts/datasets_ames_housing.py
+++ b/python_scripts/datasets_ames_housing.py
@@ -119,7 +119,9 @@
 # ```
 
 # %%
-ames_housing_no_missing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
+ames_housing_no_missing = pd.read_csv(
+    "../datasets/ames_housing_no_missing.csv"
+)
 ames_housing_no_missing.head()
 
 # %% [markdown]
@@ -176,5 +178,7 @@
     columns=categorical_features.tolist() + numerical_features,
 )
 ames_housing_preprocessed = ames_housing_preprocessed[ames_housing.columns]
-ames_housing_preprocessed = ames_housing_preprocessed.astype(ames_housing.dtypes)
+ames_housing_preprocessed = ames_housing_preprocessed.astype(
+    ames_housing.dtypes
+)
 (ames_housing_no_missing == ames_housing_preprocessed).all()

From d632d756d84cfe3f063c27dee83c98472cb37b6e Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 16:00:08 +0100
Subject: [PATCH 06/15] fixed categorica; pipeline

---
 python_scripts/03_categorical_pipeline_column_transformer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index f9ed7cb59..31f935b0c 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -275,5 +275,3 @@
 # * used a pipeline to chain the `TableVectorizer` preprocessing and logistic
 #   regression fitting;
 # * saw that **gradient boosting methods** can outperform **linear models**.
-
-# %%

From ceb92c777de2de85e25ad936b1130e20ea50835e Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 16:04:11 +0100
Subject: [PATCH 07/15] test

---
 python_scripts/03_categorical_pipeline_column_transformer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 31f935b0c..fa4fd5886 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -251,7 +251,6 @@
 # %% [markdown]
 # Now that we created our model, we can check its generalization performance.
 
-# %%
 # %%time
 _ = model.fit(data_train, target_train)
 

From 520051c124d0f9f6bd5badd05b3913169f03faaa Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 16:05:53 +0100
Subject: [PATCH 08/15] test

---
 python_scripts/03_categorical_pipeline_column_transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index fa4fd5886..468b2edbc 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -117,7 +117,7 @@
 #
 # ![columntransformer diagram](../figures/api_diagram-columntransformer.svg)
 #
-# A `TableVectorizer` does the following:
+# `TableVectorizer` does the following:
 #
 # * It **splits the columns** of the original dataset based on the data type and
 #   cardinality of unique values.
@@ -125,7 +125,7 @@
 #   subset: it internally calls `fit_transform` or `transform`. The output of
 #   this step is a set of transformed datasets.
 # * It then **concatenates the transformed datasets** into a single dataset.
-
+#
 # The important thing is that `TableVectorizer` is like any other scikit-learn
 # transformer. In particular it can be combined with a classifier in a
 # `Pipeline`:

From 9bb831494205450d9dd52b93a5f29c11d60d8e2a Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 19 Nov 2025 16:10:09 +0100
Subject: [PATCH 09/15] finito?

---
 python_scripts/03_categorical_pipeline_column_transformer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 468b2edbc..7f72e3790 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -97,7 +97,9 @@
 # %%
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 
-categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
+categorical_preprocessor = OneHotEncoder(
+    handle_unknown="ignore", sparse_output=False
+)
 numerical_preprocessor = StandardScaler()
 
 # %% [markdown]

From 053d2df8e697acec3f2a187653e1022175769725 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 10 Dec 2025 15:17:37 +0100
Subject: [PATCH 10/15] added pandas notebook + corrections

---
 python_scripts/01_tabular_data_exploration.py |  68 ++--
 .../01_tabular_data_exploration_pandas.py     | 371 ++++++++++++++++++
 ...categorical_pipeline_column_transformer.py | 144 +++++--
 3 files changed, 515 insertions(+), 68 deletions(-)
 create mode 100644 python_scripts/01_tabular_data_exploration_pandas.py

diff --git a/python_scripts/01_tabular_data_exploration.py b/python_scripts/01_tabular_data_exploration.py
index 4ecb8ebf8..43a547079 100644
--- a/python_scripts/01_tabular_data_exploration.py
+++ b/python_scripts/01_tabular_data_exploration.py
@@ -162,45 +162,42 @@
 
 # %% [markdown]
 # Let's look at the distribution of individual features, to get some insights
-# about the data. We can start by plotting histograms, note that this only works
-# for features containing numerical values:
+# about the data. We will use `skrub`'s `TableReport` class to generate an
+# overview of the dataset.
 
 # %%
-_ = adult_census.hist(figsize=(20, 14))
+
+from skrub import TableReport
+
+report = TableReport(adult_census)
+report
+# _ = adult_census.hist(figsize=(20, 14))
 
 # %% [markdown]
-# ```{tip}
-# In the previous cell, we used the following pattern: `_ = func()`. We do this
-# to avoid showing the output of `func()` which in this case is not that
-# useful. We actually assign the output of `func()` into the variable `_`
-# (called underscore). By convention, in Python the underscore variable is used
-# as a "garbage" variable to store results that we are not interested in.
-# ```
+# The report shows many useful statistics about each variable. On the first tab
+# "Table", we have a representation of the dataframe. Clicking on each column
+# name shows a statistical summary of the variable. For a better view of the
+# distribution of each variable, we can click on the "Distributions" tab.
 #
-# We can already make a few comments about some of the variables:
+# Numerical features' distributions are displayed as histograms, while
+# categorical values are shown as bar plots. We can already make a few comments
+# about some of the variables:
 #
 # * `"age"`: there are not that many points for `age > 70`. The dataset
 #   description does indicate that retired people have been filtered out
 #   (`hours-per-week > 0`);
 # * `"education-num"`: peak at 10 and 13, hard to tell what it corresponds to
 #   without looking much further. We'll do that later in this notebook;
+# * most values of `"capital-gain"` and `"capital-loss"` are close to zero;
 # * `"hours-per-week"` peaks at 40, this was very likely the standard number of
 #   working hours at the time of the data collection;
-# * most values of `"capital-gain"` and `"capital-loss"` are close to zero.
-
-# %% [markdown]
-# For categorical variables, we can look at the distribution of values:
-
-# %%
-adult_census["sex"].value_counts()
-
-# %% [markdown]
-# Note that the data collection process resulted in an important imbalance
-# between the number of male/female samples.
+# * `"sex"`: the data collection process resulted in an important imbalance
+#   between the number of male/female samples.
 #
-# Be aware that training a model with such data imbalance can cause
-# disproportioned prediction errors for the under-represented groups. This is a
-# typical cause of
+# About the last observation, be aware that training a model with such data
+# imbalance can cause disproportioned prediction errors for the
+# under-represented sensitive groups (based on gender or ethnicity for
+# instance). This is a typical cause of
 # [fairness](https://docs.microsoft.com/en-us/azure/machine-learning/concept-fairness-ml#what-is-machine-learning-fairness)
 # problems if used naively when deploying a machine learning based system in a
 # real life setting.
@@ -210,35 +207,40 @@
 # related to the deployment of automated decision making systems that rely on
 # machine learning components.
 #
-# Studying why the data collection process of this dataset lead to such an
+# Studying why the data collection process of this dataset led to such an
 # unexpected gender imbalance is beyond the scope of this MOOC but we should
 # keep in mind that this dataset is not representative of the US population
 # before drawing any conclusions based on its statistics or the predictions of
 # models trained on it.
 
-# %%
-adult_census["education"].value_counts()
-
 # %% [markdown]
 # As noted above, `"education-num"` distribution has two clear peaks around 10
 # and 13. It would be reasonable to expect that `"education-num"` is the number
 # of years of education.
 #
-# Let's look at the relationship between `"education"` and `"education-num"`.
+# Let's look at the relationship between `"education"` and `"education-num"` by
+# going to the "Associations" tab of the report. This tab shows the statistical
+# relationship between each pair of variables in the dataset, using [Cramér's
+# V](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V) and [Pearson's
+# Correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).
+# We can see that `"education"` and `"education-num"` are very strongly related.
+
 # %%
 pd.crosstab(
     index=adult_census["education"], columns=adult_census["education-num"]
 )
 
 # %% [markdown]
-# For every entry in `\"education\"`, there is only one single corresponding
-# value in `\"education-num\"`. This shows that `"education"` and
+# For every entry in `"education"`, there is only one single corresponding
+# value in `"education-num"`. This shows that `"education"` and
 # `"education-num"` give you the same information. For example,
 # `"education-num"=2` is equivalent to `"education"="1st-4th"`. In practice that
 # means we can remove `"education-num"` without losing information. Note that
 # having redundant (or highly correlated) columns can be a problem for machine
 # learning algorithms.
-
+#
+# All of this data exploration can be done manually with `pandas`. See [this
+# notebook](lol) for more information.
 # %% [markdown]
 # ```{note}
 # In the upcoming notebooks, we will only keep the `"education"` variable,
diff --git a/python_scripts/01_tabular_data_exploration_pandas.py b/python_scripts/01_tabular_data_exploration_pandas.py
new file mode 100644
index 000000000..39b9a7cd4
--- /dev/null
+++ b/python_scripts/01_tabular_data_exploration_pandas.py
@@ -0,0 +1,371 @@
+# ---
+# jupyter:
+#   kernelspec:
+#     display_name: Python 3
+#     name: python3
+# ---
+
+# %% [markdown]
+# # First look at our dataset
+#
+# In this notebook, we look at the necessary steps required before any machine
+# learning takes place. It involves:
+#
+# * loading the data;
+# * looking at the variables in the dataset, in particular, differentiate
+#   between numerical and categorical variables, which need different
+#   preprocessing in most machine learning workflows;
+# * visualizing the distribution of the variables to gain some insights into the
+#   dataset.
+
+# %% [markdown]
+# ## Loading the adult census dataset
+#
+# We use data from the 1994 US census that we downloaded from
+# [OpenML](http://openml.org/).
+#
+# You can look at the OpenML webpage to learn more about this dataset:
+# <http://www.openml.org/d/1590>
+#
+# The dataset is available as a CSV (Comma-Separated Values) file and we use
+# `pandas` to read it.
+#
+# ```{note}
+# [Pandas](https://pandas.pydata.org/) is a Python library used for
+# manipulating 1 and 2 dimensional structured data. If you have never used
+# pandas, we recommend you look at this
+# [tutorial](https://pandas.pydata.org/docs/user_guide/10min.html).
+# ```
+
+# %%
+import pandas as pd
+
+adult_census = pd.read_csv("../datasets/adult-census.csv")
+
+# %% [markdown]
+# The goal with this data is to predict whether a person earns over 50K a year
+# from heterogeneous data such as age, employment, education, family
+# information, etc.
+
+# %% [markdown]
+# ## The variables (columns) in the dataset
+#
+# The data are stored in a `pandas` dataframe. A dataframe is a type of
+# structured data composed of 2 dimensions. This type of data is also referred
+# to as tabular data.
+#
+# Each row represents a "sample". In the field of machine learning or
+# descriptive statistics, commonly used equivalent terms are "record",
+# "instance", or "observation".
+#
+# Each column represents a type of information that has been collected and is
+# called a "feature". In the field of machine learning and descriptive
+# statistics, commonly used equivalent terms are "variable", "attribute", or
+# "covariate".
+
+# %% [markdown]
+# A quick way to inspect the dataframe is to show the first few lines with the
+# `head` method:
+
+# %%
+adult_census.head()
+
+# %% [markdown]
+# An alternative is to omit the `head` method. This would output the initial and
+# final rows and columns, but everything in between is not shown by default. It
+# also provides the dataframe's dimensions at the bottom in the format `n_rows`
+# x `n_columns`.
+
+# %%
+adult_census
+
+# %% [markdown]
+# The column named **class** is our target variable (i.e., the variable which we
+# want to predict). The two possible classes are `<=50K` (low-revenue) and
+# `>50K` (high-revenue). The resulting prediction problem is therefore a binary
+# classification problem as `class` has only two possible values. We use the
+# left-over columns (any column other than `class`) as input variables for our
+# model.
+
+# %%
+target_column = "class"
+adult_census[target_column].value_counts()
+
+# %% [markdown]
+# ```{note}
+# Here, classes are slightly imbalanced, meaning there are more samples of one
+# or more classes compared to others. In this case, we have many more samples
+# with `" <=50K"` than with `" >50K"`. Class imbalance happens often in practice
+# and may need special techniques when building a predictive model.
+#
+# For example in a medical setting, if we are trying to predict whether subjects
+# may develop a rare disease, there would be a lot more healthy subjects than
+# ill subjects in the dataset.
+# ```
+
+# %% [markdown]
+# The dataset contains both numerical and categorical data. Numerical values
+# take continuous values, for example `"age"`. Categorical values can have a
+# finite number of values, for example `"native-country"`.
+
+# %%
+numerical_columns = [
+    "age",
+    "education-num",
+    "capital-gain",
+    "capital-loss",
+    "hours-per-week",
+]
+categorical_columns = [
+    "workclass",
+    "education",
+    "marital-status",
+    "occupation",
+    "relationship",
+    "race",
+    "sex",
+    "native-country",
+]
+all_columns = numerical_columns + categorical_columns + [target_column]
+
+adult_census = adult_census[all_columns]
+
+# %% [markdown]
+# We can check the number of samples and the number of columns available in the
+# dataset:
+
+# %%
+print(
+    f"The dataset contains {adult_census.shape[0]} samples and "
+    f"{adult_census.shape[1]} columns"
+)
+
+# %% [markdown]
+# We can compute the number of features by counting the number of columns and
+# subtract 1, since one of the columns is the target.
+
+# %%
+print(f"The dataset contains {adult_census.shape[1] - 1} features.")
+
+# %% [markdown]
+# ## Visual inspection of the data
+# Before building a predictive model, it is a good idea to look at the data:
+#
+# * maybe the task you are trying to achieve can be solved without machine
+#   learning;
+# * you need to check that the information you need for your task is actually
+#   present in the dataset;
+# * inspecting the data is a good way to find peculiarities. These can arise
+#   during data collection (for example, malfunctioning sensor or missing
+#   values), or from the way the data is processed afterwards (for example
+#   capped values).
+
+# %% [markdown]
+# Let's look at the distribution of individual features, to get some insights
+# about the data. We can start by plotting histograms. Note that this only works
+# for features containing numerical values:
+
+# %%
+_ = adult_census.hist(figsize=(20, 14))
+
+# %% [markdown]
+# ```{tip}
+# In the previous cell, we used the following pattern: `_ = func()`. We do this
+# to avoid showing the output of `func()` which in this case is not that
+# useful. We actually assign the output of `func()` into the variable `_`
+# (called underscore). By convention, in Python the underscore variable is used
+# as a "garbage" variable to store results that we are not interested in.
+# ```
+#
+# We can already make a few comments about some of the variables:
+#
+# * `"age"`: there are not that many points for `age > 70`. The dataset
+#   description does indicate that retired people have been filtered out
+#   (`hours-per-week > 0`);
+# * `"education-num"`: peak at 10 and 13, hard to tell what it corresponds to
+#   without looking much further. We'll do that later in this notebook;
+# * `"hours-per-week"` peaks at 40, this was very likely the standard number of
+#   working hours at the time of the data collection;
+# * most values of `"capital-gain"` and `"capital-loss"` are close to zero.
+
+# %% [markdown]
+# For categorical variables, we can look at the distribution of values:
+
+# %%
+adult_census["sex"].value_counts()
+
+# %% [markdown]
+# Note that the data collection process resulted in an important imbalance
+# between the number of male/female samples.
+#
+# Be aware that training a model with such data imbalance can cause
+# disproportioned prediction errors for the under-represented groups. This is a
+# typical cause of
+# [fairness](https://docs.microsoft.com/en-us/azure/machine-learning/concept-fairness-ml#what-is-machine-learning-fairness)
+# problems if used naively when deploying a machine learning based system in a
+# real life setting.
+#
+# We recommend our readers to refer to [fairlearn.org](https://fairlearn.org)
+# for resources on how to quantify and potentially mitigate fairness issues
+# related to the deployment of automated decision making systems that rely on
+# machine learning components.
+#
+# Studying why the data collection process of this dataset led to such an
+# unexpected gender imbalance is beyond the scope of this MOOC but we should
+# keep in mind that this dataset is not representative of the US population
+# before drawing any conclusions based on its statistics or the predictions of
+# models trained on it.
+
+# %%
+adult_census["education"].value_counts()
+
+# %% [markdown]
+# As noted above, `"education-num"` distribution has two clear peaks around 10
+# and 13. It would be reasonable to expect that `"education-num"` is the number
+# of years of education.
+#
+# Let's look at the relationship between `"education"` and `"education-num"`.
+# %%
+pd.crosstab(
+    index=adult_census["education"], columns=adult_census["education-num"]
+)
+
+# %% [markdown]
+# For every entry in `"education"`, there is only one single corresponding
+# value in `"education-num"`. This shows that `"education"` and
+# `"education-num"` give you the same information. For example,
+# `"education-num"=2` is equivalent to `"education"="1st-4th"`. In practice that
+# means we can remove `"education-num"` without losing information. Note that
+# having redundant (or highly correlated) columns can be a problem for machine
+# learning algorithms.
+
+# %% [markdown]
+# ```{note}
+# In the upcoming notebooks, we will only keep the `"education"` variable,
+# excluding the `"education-num"` variable since the latter is redundant with
+# the former.
+# ```
+
+# %% [markdown]
+# Another way to inspect the data is to do a `pairplot` and show how each
+# variable differs according to our target, i.e. `"class"`. Plots along the
+# diagonal show the distribution of individual variables for each `"class"`. The
+# plots on the off-diagonal can reveal interesting interactions between
+# variables.
+
+# %%
+import seaborn as sns
+
+# We plot a subset of the data to keep the plot readable and make the plotting
+# faster
+n_samples_to_plot = 5000
+columns = ["age", "education-num", "hours-per-week"]
+_ = sns.pairplot(
+    data=adult_census[:n_samples_to_plot],
+    vars=columns,
+    hue=target_column,
+    plot_kws={"alpha": 0.2},
+    height=3,
+    diag_kind="hist",
+    diag_kws={"bins": 30},
+)
+
+# %% [markdown]
+# ## Creating decision rules by hand
+#
+# By looking at the previous plots, we could create some hand-written rules that
+# predict whether someone has a high- or low-income. For instance, we could
+# focus on the combination of the `"hours-per-week"` and `"age"` features.
+
+# %%
+_ = sns.scatterplot(
+    x="age",
+    y="hours-per-week",
+    data=adult_census[:n_samples_to_plot],
+    hue=target_column,
+    alpha=0.5,
+)
+
+# %% [markdown]
+# The data points (circles) show the distribution of `"hours-per-week"` and
+# `"age"` in the dataset. Blue points mean low-income and orange points mean
+# high-income. This part of the plot is the same as the bottom-left plot in the
+# pairplot above.
+#
+# In this plot, we can try to find regions that mainly contain a single class
+# such that we can easily decide what class one should predict. We could come up
+# with hand-written rules as shown in this plot:
+
+# %%
+import matplotlib.pyplot as plt
+
+ax = sns.scatterplot(
+    x="age",
+    y="hours-per-week",
+    data=adult_census[:n_samples_to_plot],
+    hue=target_column,
+    alpha=0.5,
+)
+
+age_limit = 27
+plt.axvline(x=age_limit, ymin=0, ymax=1, color="black", linestyle="--")
+
+hours_per_week_limit = 40
+plt.axhline(
+    y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--"
+)
+
+plt.annotate("<=50K", (17, 25), rotation=90, fontsize=35)
+plt.annotate("<=50K", (35, 20), fontsize=35)
+_ = plt.annotate("???", (45, 60), fontsize=35)
+
+# %% [markdown]
+# * In the region `age < 27` (left region) the prediction is low-income. Indeed,
+#   there are many blue points and we cannot see any orange points.
+# * In the region `age > 27 AND hours-per-week < 40` (bottom-right region), the
+#   prediction is low-income. Indeed, there are many blue points and only a few
+#   orange points.
+# * In the region `age > 27 AND hours-per-week > 40` (top-right region), we see
+#   a mix of blue points and orange points. It seems complicated to choose which
+#   class we should predict in this region.
+#
+# It is interesting to note that some machine learning models work similarly to
+# what we did: they are known as decision tree models. The two thresholds that
+# we chose (27 years and 40 hours) are somewhat arbitrary, i.e. we chose them by
+# only looking at the pairplot. In contrast, a decision tree chooses the "best"
+# splits based on data without human intervention or inspection. Decision trees
+# will be covered more in detail in a future module.
+#
+# Note that machine learning is often used when creating rules by hand is not
+# straightforward. For example because we are in high dimension (many features
+# in a table) or because there are no simple and obvious rules that separate the
+# two classes as in the top-right region of the previous plot.
+#
+# To sum up, the important thing to remember is that in a machine-learning
+# setting, a model automatically creates the "rules" from the existing data in
+# order to make predictions on new unseen data.
+
+# %% [markdown]
+# ## Notebook Recap
+#
+# In this notebook we:
+#
+# * loaded the data from a CSV file using `pandas`;
+# * looked at the different kinds of variables to differentiate between
+#   categorical and numerical variables;
+# * inspected the data with `pandas` and `seaborn`. Data inspection can allow
+#   you to decide whether using machine learning is appropriate for your data
+#   and to highlight potential peculiarities in your data.
+#
+# We made important observations (which will be discussed later in more detail):
+#
+# * if your target variable is imbalanced (e.g., you have more samples from one
+#   target category than another), you may need to be careful when interpreting
+#   the values of performance metrics;
+# * columns can be redundant (or highly correlated), which is not necessarily a
+#   problem, but may require special treatment as we will cover in future
+#   notebooks;
+# * decision trees create prediction rules by comparing each feature to a
+#   threshold value, resulting in decision boundaries that are always parallel
+#   to the axes. In 2D, this means the boundaries are vertical or horizontal
+#   line segments at the feature threshold values.
\ No newline at end of file
diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 7f72e3790..9c04078dc 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -28,23 +28,19 @@
 data = adult_census.drop(columns=[target_name])
 
 # %% [markdown]
-# ## Selection based on data types
+# ## Dispatch columns to a specific processor
+#
+# In the previous sections, we saw that we need to treat data differently
+# depending on their nature (i.e. numerical or categorical).
+#
+# Skrub is a data preprocessing library built to work seamlessly with
+# scikit-learn. It provides a convenient transformer called `TableVectorizer`
+# that can handle both numerical and categorical variables in a single
+# transformer. It makes the column selection automatically by using a column's
+# `dtype`. This is equivalent to using a
+# [`sklearn.compose.make_column_selector`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html)
+# and selecting or excluding `object` dtypes.
 #
-# We separate categorical and numerical variables using their data types to
-# identify them, as we saw previously that `object` corresponds to categorical
-# columns (strings). We make use of `make_column_selector` helper to select the
-# corresponding columns.
-
-# %%
-from sklearn.compose import make_column_selector as selector
-
-numerical_columns_selector = selector(dtype_exclude=object)
-categorical_columns_selector = selector(dtype_include=object)
-
-numerical_columns = numerical_columns_selector(data)
-categorical_columns = categorical_columns_selector(data)
-
-# %% [markdown]
 # ```{caution}
 # Here, we know that `object` data type is used to represent strings and thus
 # categorical features. Be aware that this is not always the case. Sometimes
@@ -55,20 +51,8 @@
 # In a more general scenario you should manually introspect the content of your
 # dataframe not to wrongly use `make_column_selector`.
 # ```
-
-# %% [markdown]
-# ## Dispatch columns to a specific processor
-#
-# In the previous sections, we saw that we need to treat data differently
-# depending on their nature (i.e. numerical or categorical).
-#
-# Skrub is a data preprocessing library built to work seamlessly with
-# scikit-learn. It provides a convenient transformer called `TableVectorizer`
-# that can handle both numerical and categorical variables in a single
-# transformer. It makes the column selection automatically by using a column's
-# `dtype`.
 #
-# It separates the columns into four groups:
+# `TableVectorizer` separates the columns into four groups:
 # * **low cardinality categorical columns** (categorical columns with a limited
 #   number of unique values, one hot encoded by default);
 # * **high cardinality categorical columns** (categorical columns with a large
@@ -80,10 +64,57 @@
 #   [documentation](https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html)).
 #
 # The threshold to determine whether a categorical column is of low or high
-# cardinality can be set using the `cardinality_threshold` parameter. We will see
-# its impact later on.
+# cardinality can be set using the `cardinality_threshold` parameter.
+
+# %% [markdown]
+# ## Effect of the cardinality threshold
+#
+# As previously stated, `TableVectorizer` separates categorical columns into two
+# groups: low cardinality and high cardinality. By default, the threshold is set
+# to 40 unique values. However, this value can be changed using the `cardinality_threshold`
+# parameter of `TableVectorizer`.
+# Let's vizualize its effect on the `"native-country"` column of the dataset. This column
+# corresponds to the country of origin of each individual. Let's check how many unique 
+# values it contains.
+
+# %%
+data["native-country"].nunique()
+
+#%% [markdown]
+# In the setup we used so far, this column is considered as a high cardinality categorical column.
+# Let us compare both encodings.
+
+# %%
+from skrub import TableVectorizer
+
+native_country_data = data[["native-country"]]
+
+high_thresh_vectorizer = TableVectorizer(cardinality_threshold=50)
+high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data)
+
+high_thresh_vectorizer
+
+# %%
+low_thresh_vectorizer = TableVectorizer()
+low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data)
+
+
+low_thresh_vectorizer
+
+# %% [markdown]
+# On the encoder or pipeline HTML diagrams, we can see that the "native-country"
+# column has been passed as a high cardinality categorical column in the first
+# case, and as a low cardinality categorical column in the second case by
+# clicking the on the `low_cardinality` and `high_cardinality` boxes.
+#
+# We set the `cardinality_threshold` parameter to ensure that all the categorical columns
+# are considered as low cardinality. This way, all categorical columns are encoded in the same
+# manner.
+
+# %% [markdown]
+# ## Preprocessing and modeling pipeline
 #
-# We apply the following transformations:
+# For the rest of the notebook we apply the following transformations to the whole dataset:
 #
 # * **one-hot encoding** is applied to the low cardinality categorical columns.
 #   Besides, we use `handle_unknown="ignore"` to solve the potential issues due
@@ -107,10 +138,8 @@
 # their respective columns.
 
 # %%
-from skrub import TableVectorizer
-
 vectorizer = TableVectorizer(
-    low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor
+    low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor, cardinality_threshold=50
 )
 
 # %% [markdown]
@@ -194,6 +223,51 @@
 # %%
 model.score(data_test, target_test)
 
+# %% [markdown]
+# ## Effect of the cardinality threshold
+#
+# As previously stated, `TableVectorizer` separates categorical columns into two
+# groups: low cardinality and high cardinality. By default, the threshold is set
+# to 40 unique values. However, this value can be changed using the `cardinality_threshold`
+# parameter of `TableVectorizer`.
+# Let's vizualize its effect on the `"native-country"` column of the dataset. This column
+# corresponds to the country of origin of each individual. Let's check how many unique 
+# values it contains.
+
+# %%
+data["native-country"].nunique()
+
+#%% [markdown]
+# In the setup we used so far, this column is considered as a high cardinality categorical column.
+# Let us compare both encodings.
+
+# %%
+native_country_data = data[["native-country"]]
+
+high_thresh_vectorizer = TableVectorizer(
+    low_cardinality=OneHotEncoder(sparse_output=False), cardinality_threshold=50)
+high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data)
+
+high_thresh_vectorizer
+
+# %%
+low_thresh_vectorizer = TableVectorizer(
+    low_cardinality=OneHotEncoder(sparse_output=False))
+low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data)
+
+
+low_thresh_vectorizer
+
+# %% [markdown]
+# On the encoder or pipeline HTML diagrams, we can see that the "native-country"
+# column has been passed as a high cardinality categorical column in the first
+# case, and as a low cardinality categorical column in the second case by
+# clicking the on the `low_cardinality` and `high_cardinality` boxes.
+#
+# We set the `cardinality_threshold` parameter to ensure that all the categorical columns
+# are considered as low cardinality. This way, all categorical columns are encoded in the same
+# manner.
+
 # %% [markdown]
 # ## Evaluation of the model with cross-validation
 #

From 12fa9eec77d9f663c6ad5f0587f0969d71cdc88e Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 10 Dec 2025 15:20:30 +0100
Subject: [PATCH 11/15] correction whitespace

---
 ...categorical_pipeline_column_transformer.py | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 9c04078dc..d16d48285 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -71,18 +71,18 @@
 #
 # As previously stated, `TableVectorizer` separates categorical columns into two
 # groups: low cardinality and high cardinality. By default, the threshold is set
-# to 40 unique values. However, this value can be changed using the `cardinality_threshold`
-# parameter of `TableVectorizer`.
-# Let's vizualize its effect on the `"native-country"` column of the dataset. This column
-# corresponds to the country of origin of each individual. Let's check how many unique 
-# values it contains.
+# to 40 unique values. However, this value can be changed using the
+# `cardinality_threshold` parameter of `TableVectorizer`. Let's visualize its
+# effect on the `"native-country"` column of the dataset. This column
+# corresponds to the country of origin of each individual. Let's check how many
+# unique values it contains.
 
 # %%
 data["native-country"].nunique()
 
 #%% [markdown]
-# In the setup we used so far, this column is considered as a high cardinality categorical column.
-# Let us compare both encodings.
+# In the setup we used so far, this column is considered as a high cardinality
+# categorical column. Let us compare both encodings.
 
 # %%
 from skrub import TableVectorizer
@@ -98,7 +98,6 @@
 low_thresh_vectorizer = TableVectorizer()
 low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data)
 
-
 low_thresh_vectorizer
 
 # %% [markdown]
@@ -107,14 +106,15 @@
 # case, and as a low cardinality categorical column in the second case by
 # clicking the on the `low_cardinality` and `high_cardinality` boxes.
 #
-# We set the `cardinality_threshold` parameter to ensure that all the categorical columns
-# are considered as low cardinality. This way, all categorical columns are encoded in the same
-# manner.
+# We set the `cardinality_threshold` parameter to ensure that all the
+# categorical columns are considered as low cardinality. This way, all
+# categorical columns are encoded in the same manner.
 
 # %% [markdown]
 # ## Preprocessing and modeling pipeline
 #
-# For the rest of the notebook we apply the following transformations to the whole dataset:
+# For the rest of the notebook we apply the following transformations to the
+# whole dataset:
 #
 # * **one-hot encoding** is applied to the low cardinality categorical columns.
 #   Besides, we use `handle_unknown="ignore"` to solve the potential issues due
@@ -228,18 +228,18 @@
 #
 # As previously stated, `TableVectorizer` separates categorical columns into two
 # groups: low cardinality and high cardinality. By default, the threshold is set
-# to 40 unique values. However, this value can be changed using the `cardinality_threshold`
-# parameter of `TableVectorizer`.
-# Let's vizualize its effect on the `"native-country"` column of the dataset. This column
-# corresponds to the country of origin of each individual. Let's check how many unique 
-# values it contains.
+# to 40 unique values. However, this value can be changed using the
+# `cardinality_threshold` parameter of `TableVectorizer`. Let's visualize its
+# effect on the `"native-country"` column of the dataset. This column
+# corresponds to the country of origin of each individual. Let's check how many
+# unique values it contains.
 
 # %%
 data["native-country"].nunique()
 
 #%% [markdown]
-# In the setup we used so far, this column is considered as a high cardinality categorical column.
-# Let us compare both encodings.
+# In the setup we used so far, this column is considered as a high cardinality
+# categorical column. Let us compare both encodings.
 
 # %%
 native_country_data = data[["native-country"]]
@@ -264,9 +264,9 @@
 # case, and as a low cardinality categorical column in the second case by
 # clicking the on the `low_cardinality` and `high_cardinality` boxes.
 #
-# We set the `cardinality_threshold` parameter to ensure that all the categorical columns
-# are considered as low cardinality. This way, all categorical columns are encoded in the same
-# manner.
+# We set the `cardinality_threshold` parameter to ensure that all the
+# categorical columns are considered as low cardinality. This way, all
+# categorical columns are encoded in the same manner.
 
 # %% [markdown]
 # ## Evaluation of the model with cross-validation

From d53b09b443ab210d75a59492c784b384c613213e Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 10 Dec 2025 15:26:13 +0100
Subject: [PATCH 12/15] pre-commit

---
 python_scripts/01_tabular_data_exploration_pandas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python_scripts/01_tabular_data_exploration_pandas.py b/python_scripts/01_tabular_data_exploration_pandas.py
index 39b9a7cd4..4ecb8ebf8 100644
--- a/python_scripts/01_tabular_data_exploration_pandas.py
+++ b/python_scripts/01_tabular_data_exploration_pandas.py
@@ -368,4 +368,4 @@
 # * decision trees create prediction rules by comparing each feature to a
 #   threshold value, resulting in decision boundaries that are always parallel
 #   to the axes. In 2D, this means the boundaries are vertical or horizontal
-#   line segments at the feature threshold values.
\ No newline at end of file
+#   line segments at the feature threshold values.

From ebef1a3c44455eafe9bffea6ff5c75c08d1d302f Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 10 Dec 2025 15:32:30 +0100
Subject: [PATCH 13/15] fix

---
 .../03_categorical_pipeline_column_transformer.py      | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index d16d48285..cb64f1561 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -100,11 +100,11 @@
 
 low_thresh_vectorizer
 
-# %% [markdown]
-# On the encoder or pipeline HTML diagrams, we can see that the "native-country"
-# column has been passed as a high cardinality categorical column in the first
-# case, and as a low cardinality categorical column in the second case by
-# clicking the on the `low_cardinality` and `high_cardinality` boxes.
+# %% [markdown]
+# On the encoder or pipeline HTML diagrams, click on the `low_cardinality` and
+# `high_cardinality` boxes: the "native-country" column is treated as a low
+# cardinality categorical column in the first case (threshold of 50) and as a
+# high cardinality categorical column in the second case (default threshold).
 #
 # We set the `cardinality_threshold` parameter to ensure that all the
 # categorical columns are considered as low cardinality. This way, all

From 5ddc1440cd7a4b916dd4a704dda39f165849b5b1 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 10 Dec 2025 15:35:54 +0100
Subject: [PATCH 14/15] fix

---
 .../03_categorical_pipeline_column_transformer.py        | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index cb64f1561..d0a77a5e8 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -139,7 +139,9 @@
 
 # %%
 vectorizer = TableVectorizer(
-    low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor, cardinality_threshold=50
+    low_cardinality=categorical_preprocessor,
+    numeric=numerical_preprocessor,
+    cardinality_threshold=50
 )
 
 # %% [markdown]
@@ -245,7 +247,9 @@
 native_country_data = data[["native-country"]]
 
 high_thresh_vectorizer = TableVectorizer(
-    low_cardinality=OneHotEncoder(sparse_output=False), cardinality_threshold=50)
+    low_cardinality=OneHotEncoder(sparse_output=False),
+    cardinality_threshold=50
+)
 high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data)
 
 high_thresh_vectorizer
@@ -255,7 +259,6 @@
     low_cardinality=OneHotEncoder(sparse_output=False))
 low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data)
 
-
 low_thresh_vectorizer
 
 # %% [markdown]

From ab6ae1d18dad1084bfee6d0118b079834dad69e8 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 14 Jan 2026 11:05:21 +0100
Subject: [PATCH 15/15] pre-commit fixes

---
 .../03_categorical_pipeline_column_transformer.py     | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index d0a77a5e8..8d27c05bb 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -80,7 +80,7 @@
 # %%
 data["native-country"].nunique()
 
-#%% [markdown]
+# %% [markdown]
 # In the setup we used so far, this column is considered as a high cardinality
 # categorical column. Let us compare both encodings.
 
@@ -141,7 +141,7 @@
 vectorizer = TableVectorizer(
     low_cardinality=categorical_preprocessor,
     numeric=numerical_preprocessor,
-    cardinality_threshold=50
+    cardinality_threshold=50,
 )
 
 # %% [markdown]
@@ -239,7 +239,7 @@
 # %%
 data["native-country"].nunique()
 
-#%% [markdown]
+# %% [markdown]
 # In the setup we used so far, this column is considered as a high cardinality
 # categorical column. Let us compare both encodings.
 
@@ -248,7 +248,7 @@
 
 high_thresh_vectorizer = TableVectorizer(
     low_cardinality=OneHotEncoder(sparse_output=False),
-    cardinality_threshold=50
+    cardinality_threshold=50,
 )
 high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data)
 
@@ -256,7 +256,8 @@
 
 # %%
 low_thresh_vectorizer = TableVectorizer(
-    low_cardinality=OneHotEncoder(sparse_output=False))
+    low_cardinality=OneHotEncoder(sparse_output=False)
+)
 low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data)
 
 low_thresh_vectorizer