From 9f1f0723609c8a7cfae9b9aa3a711f0cded6d84a Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 15:37:39 +0100 Subject: [PATCH 01/15] skrub ready course v1 --- ...categorical_pipeline_column_transformer.py | 78 ++++++++++--------- python_scripts/datasets_ames_housing.py | 65 ++++------------ python_scripts/datasets_bike_rides.py | 21 ++--- python_scripts/datasets_blood_transfusion.py | 17 ++-- python_scripts/datasets_california_housing.py | 22 +++--- 5 files changed, 78 insertions(+), 125 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 921950cf6..05b31676b 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -62,27 +62,42 @@ # In the previous sections, we saw that we need to treat data differently # depending on their nature (i.e. numerical or categorical). # -# Scikit-learn provides a `ColumnTransformer` class which sends specific -# columns to a specific transformer, making it easy to fit a single predictive -# model on a dataset that combines both kinds of variables together -# (heterogeneously typed tabular data). +# Skrub is a data preprocessing library built to work seamlessly with +# scikit-learn. It provides a convenient transformer called `TableVectorizer` +# that can handle both numerical and categorical variables in a single +# transformer. It makes the column selection automatically by using a column's +# `dtype`. # -# We first define the columns depending on their data type: +# It separates the columns into four groups: +# * **low cardinality categorical columns** (categorical columns with a limited +# number of unique values, one hot encoded by default); +# * **high cardinality categorical columns** (categorical columns with a large +# number of unique values, string encoded by default); +# * **numerical columns** (untouched by default). 
+# * **time columns** (columns that encode time information, as present in time +# series for instance, converted to numerical features that can be used by +# learners; for more information, see the +# [documentation](https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html)). # -# * **one-hot encoding** is applied to categorical columns. Besides, we use -# `handle_unknown="ignore"` to solve the potential issues due to rare -# categories. +# The threshold to determine whether a categorical column is of low or high +# cardinality can be set using the `cardinality_threshold` parameter. We will see +# its impact later on. +# +# We apply the following transformations: +# +# * **one-hot encoding** is applied to the low cardinality categorical columns. +# Besides, we use `handle_unknown="ignore"` to solve the potential issues due +# to rare categories. # * **numerical scaling** numerical features which will be standardized. # -# Now, we create our `ColumnTransfomer` using the helper function -# `make_column_transformer`. We specify two values: the transformer, and the -# columns. First, let's create the preprocessors for the numerical and -# categorical parts. +# Now, we create our transformer using the helper function `TableVectorizer`. We +# specify the transformers. First, let's create the preprocessors for the +# numerical and low cardinality categorical parts. # %% from sklearn.preprocessing import OneHotEncoder, StandardScaler -categorical_preprocessor = OneHotEncoder(handle_unknown="ignore") +categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False) numerical_preprocessor = StandardScaler() # %% [markdown] @@ -90,30 +105,26 @@ # their respective columns. 
# %% -from sklearn.compose import make_column_transformer +from skrub import TableVectorizer -preprocessor = make_column_transformer( - (categorical_preprocessor, categorical_columns), - (numerical_preprocessor, numerical_columns), -) +vectorizer = TableVectorizer(low_cardinality = categorical_preprocessor, numeric = numerical_preprocessor) # %% [markdown] # We can take a minute to represent graphically the structure of a -# `ColumnTransformer`: +# `TableVectorizer`: # # ![columntransformer diagram](../figures/api_diagram-columntransformer.svg) # -# A `ColumnTransformer` does the following: +# A `TableVectorizer` does the following: # -# * It **splits the columns** of the original dataset based on the column names -# or indices provided. We obtain as many subsets as the number of transformers -# passed into the `ColumnTransformer`. +# * It **splits the columns** of the original dataset based on the data type and +# cardinality of unique values. # * It **transforms each subsets**. A specific transformer is applied to each # subset: it internally calls `fit_transform` or `transform`. The output of # this step is a set of transformed datasets. # * It then **concatenates the transformed datasets** into a single dataset. -# The important thing is that `ColumnTransformer` is like any other scikit-learn +# The important thing is that `TableVectorizer` is like any other scikit-learn # transformer. 
In particular it can be combined with a classifier in a # `Pipeline`: @@ -121,7 +132,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.pipeline import make_pipeline -model = make_pipeline(preprocessor, LogisticRegression(max_iter=500)) +model = make_pipeline(vectorizer, LogisticRegression(max_iter=500)) model # %% [markdown] @@ -227,16 +238,11 @@ # %% from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.preprocessing import OrdinalEncoder +from skrub import ToCategorical -categorical_preprocessor = OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1 -) +categorical_preprocessor = ToCategorical() -preprocessor = make_column_transformer( - (categorical_preprocessor, categorical_columns), - remainder="passthrough", -) +preprocessor = TableVectorizer(low_cardinality=categorical_preprocessor) model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) @@ -262,8 +268,10 @@ # %% [markdown] # In this notebook we: # -# * used a `ColumnTransformer` to apply different preprocessing for categorical +# * used a `TableVectorizer` to apply different preprocessing for categorical # and numerical variables; -# * used a pipeline to chain the `ColumnTransformer` preprocessing and logistic +# * used a pipeline to chain the `TableVectorizer` preprocessing and logistic # regression fitting; # * saw that **gradient boosting methods** can outperform **linear models**. + +# %% diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py index c69c236b1..e6e7de4a3 100644 --- a/python_scripts/datasets_ames_housing.py +++ b/python_scripts/datasets_ames_housing.py @@ -49,20 +49,15 @@ # Let's have a quick look at the target before to focus on the data. # %% -target.head() +from skrub import TableReport -# %% [markdown] -# We see that the target contains continuous value. It corresponds to the price -# of a house in $. We can have a look at the target distribution. 
- -# %% -import matplotlib.pyplot as plt - -target.plot.hist(bins=20, edgecolor="black") -plt.xlabel("House price in $") -_ = plt.title("Distribution of the house price \nin Ames") +TableReport(target) -# %% [markdown] +# %% [markdown] +# We see that the target contains continuous value. It corresponds to the price +# of a house in $. We can have a look at the target distribution in the +# "Distributions" tab. +# # We see that the distribution has a long tail. It means that most of the house # are normally distributed but a couple of houses have a higher than normal # value. It could be critical to take this peculiarity into account when @@ -72,7 +67,7 @@ # house prices. # %% -data.info() +TableReport(data) # %% [markdown] # Looking at the dataframe general information, we can see that 79 features are @@ -84,24 +79,17 @@ # %% numerical_data = data.select_dtypes("number") -numerical_data.info() +TableReport(numerical_data, max_plot_columns=40) # %% [markdown] # We see that the data are mainly represented with integer number. Let's have a -# look at the histogram for all these features. - -# %% -numerical_data.hist( - bins=20, figsize=(12, 22), edgecolor="black", layout=(9, 4) -) -plt.subplots_adjust(hspace=0.8, wspace=0.8) +# look at the histogram for all these features in the "Distributions" tab. -# %% [markdown] # We see that some features have high picks for 0. It could be linked that this # value was assigned when the criterion did not apply, for instance the area of # the swimming pool when no swimming pools are available. # -# We also have some feature encoding some date (for instance year). +# We also have some features encoding a date (for instance year). # # These information are useful and should also be considered when designing a # predictive model. @@ -110,34 +98,13 @@ # %% string_data = data.select_dtypes(object) -string_data.info() +TableReport(string_data, max_plot_columns=45) # %% [markdown] -# These features are categorical. 
We can make some bar plot to see categories -# count for each feature. - -# %% -from math import ceil -from itertools import zip_longest - -n_string_features = string_data.shape[1] -nrows, ncols = ceil(n_string_features / 4), 4 - -fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 80)) - -for feature_name, ax in zip_longest(string_data, axs.ravel()): - if feature_name is None: - # do not show the axis - ax.axis("off") - continue - - string_data[feature_name].value_counts().plot.barh(ax=ax) - ax.set_title(feature_name) - -plt.subplots_adjust(hspace=0.2, wspace=0.8) - -# %% [markdown] -# Plotting this information allows us to answer to two questions: +# These features are categorical. We can analyze the bar plots in the +# "Distributions" tab to see categories count for each feature. +# +# This allows us to answer to two questions: # # * Is there few or many categories for a given features? # * Is there rare categories for some features? diff --git a/python_scripts/datasets_bike_rides.py b/python_scripts/datasets_bike_rides.py index 9cb2ec77a..a4dfa1f6a 100644 --- a/python_scripts/datasets_bike_rides.py +++ b/python_scripts/datasets_bike_rides.py @@ -94,10 +94,8 @@ # We can have a first look at the target distribution. 
# %% -import matplotlib.pyplot as plt - -target.plot.hist(bins=50, edgecolor="black") -plt.xlabel("Power (W)") +from skrub import TableReport +TableReport(target) # %% [markdown] # We see a pick at 0 Watts, it corresponds to whenever our cyclist does not @@ -144,6 +142,8 @@ data_ride, target_ride = data.loc[date_first_ride], target.loc[date_first_ride] # %% +import matplotlib.pyplot as plt + data_ride.plot() plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") _ = plt.title("Sensor values for different cyclist measurements") @@ -163,18 +163,7 @@ # We can check the range of the different features: # %% -axs = data_ride.hist(figsize=(10, 12), bins=50, edgecolor="black", grid=False) -# add the units to the plots -units = [ - "beats per minute", - "rotations per minute", - "meters per second", - "meters per second squared", - "%", -] -for unit, ax in zip(units, axs.ravel()): - ax.set_xlabel(unit) -plt.subplots_adjust(hspace=0.6) +TableReport(data_ride) # %% [markdown] # From these plots, we can see some interesting information: a cyclist is diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py index 1042f16f1..10ed30d60 100644 --- a/python_scripts/datasets_blood_transfusion.py +++ b/python_scripts/datasets_blood_transfusion.py @@ -54,17 +54,14 @@ # columns and if any missing values are present in our dataset. # %% -data.info() +from skrub import TableReport +TableReport(data) # %% [markdown] # Our dataset is made of 748 samples. All features are represented with integer # numbers and there is no missing values. We can have a look at each feature -# distributions. - -# %% -_ = data.hist(figsize=(12, 10), bins=30, edgecolor="black") - -# %% [markdown] +# distributions in the "Distributions" tab. +# # There is nothing shocking regarding the distributions. We only observe a high # value range for the features `"Recency"`, `"Frequency"`, and `"Monetary"`. 
It # means that we have a few extreme high values for these features. @@ -76,11 +73,7 @@ target.head() # %% -import matplotlib.pyplot as plt - -target.value_counts(normalize=True).plot.barh() -plt.xlabel("Number of samples") -_ = plt.title("Class distribution") +TableReport(target) # %% [markdown] # We see that the target is discrete and contains two categories: whether a diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py index 16869021a..a37515153 100644 --- a/python_scripts/datasets_california_housing.py +++ b/python_scripts/datasets_california_housing.py @@ -67,14 +67,13 @@ # * all features are numerical features encoded as floating number; # * there is no missing values. # -# Let's have a quick look at the distribution of these features by plotting -# their histograms. +# Let's have a quick look at the distribution of these features with the +# TableReport from the skrub package. # %% -import matplotlib.pyplot as plt +from skrub import TableReport -california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black") -plt.subplots_adjust(hspace=0.7, wspace=0.4) +TableReport(california_housing.frame) # %% [markdown] @@ -95,14 +94,10 @@ # population, the range of the data is large with unnoticeable bin for the # largest values. It means that there are very high and few values (maybe they # could be considered as outliers?). We can see this specificity looking at the -# statistics for these features: - -# %% -features_of_interest = ["AveRooms", "AveBedrms", "AveOccup", "Population"] -california_housing.frame[features_of_interest].describe() - -# %% [markdown] -# For each of these features, comparing the `max` and `75%` values, we can see a +# statistics for these features by clicking on the corresponding columns in the +# table. +# +# For each of these features, comparing the `max` and `Median ± IQR` values, we can see a # huge difference. 
It confirms the intuitions that there are a couple of extreme # values. # @@ -115,6 +110,7 @@ # %% import seaborn as sns +import matplotlib.pyplot as plt sns.scatterplot( data=california_housing.frame, From 58d8fc46d31298ba6fb2d39b1cd947e0ed214c49 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 15:47:36 +0100 Subject: [PATCH 02/15] test fix --- python_scripts/datasets_ames_housing.py | 2 +- python_scripts/datasets_california_housing.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py index e6e7de4a3..353a1fb52 100644 --- a/python_scripts/datasets_ames_housing.py +++ b/python_scripts/datasets_ames_housing.py @@ -84,7 +84,7 @@ # %% [markdown] # We see that the data are mainly represented with integer number. Let's have a # look at the histogram for all these features in the "Distributions" tab. - +# # We see that some features have high picks for 0. It could be linked that this # value was assigned when the criterion did not apply, for instance the area of # the swimming pool when no swimming pools are available. diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py index a37515153..cb4030ccb 100644 --- a/python_scripts/datasets_california_housing.py +++ b/python_scripts/datasets_california_housing.py @@ -18,7 +18,7 @@ california_housing = fetch_california_housing(as_frame=True) # %% [markdown] -# We can have a first look at the available description +# We can have a first look at the available description. # %% print(california_housing.DESCR) @@ -75,7 +75,6 @@ TableReport(california_housing.frame) - # %% [markdown] # We can first focus on features for which their distributions would be more or # less expected. @@ -97,9 +96,9 @@ # statistics for these features by clicking on the corresponding columns in the # table. 
# -# For each of these features, comparing the `max` and `Median ± IQR` values, we can see a -# huge difference. It confirms the intuitions that there are a couple of extreme -# values. +# For each of these features, comparing the `max` and `Median ± IQR` values, we +# can see a huge difference. It confirms the intuitions that there are a couple +# of extreme values. # # Up to now, we discarded the longitude and latitude that carry geographical # information. In short, the combination of these features could help us decide From 35e8c26f032e83259eb3ff6b01d4fcefb4e1add1 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 15:50:19 +0100 Subject: [PATCH 03/15] fixed thanks to arturo --- python_scripts/datasets_ames_housing.py | 10 +++------- python_scripts/datasets_california_housing.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py index 353a1fb52..b3b973a27 100644 --- a/python_scripts/datasets_ames_housing.py +++ b/python_scripts/datasets_ames_housing.py @@ -53,7 +53,7 @@ TableReport(target) -# %% [markdown] +# %% [markdown] # We see that the target contains continuous value. It corresponds to the price # of a house in $. We can have a look at the target distribution in the # "Distributions" tab. 
@@ -119,9 +119,7 @@ # ``` # %% -ames_housing_no_missing = pd.read_csv( - "../datasets/ames_housing_no_missing.csv" -) +ames_housing_no_missing = pd.read_csv("../datasets/ames_housing_no_missing.csv") ames_housing_no_missing.head() # %% [markdown] @@ -178,7 +176,5 @@ columns=categorical_features.tolist() + numerical_features, ) ames_housing_preprocessed = ames_housing_preprocessed[ames_housing.columns] -ames_housing_preprocessed = ames_housing_preprocessed.astype( - ames_housing.dtypes -) +ames_housing_preprocessed = ames_housing_preprocessed.astype(ames_housing.dtypes) (ames_housing_no_missing == ames_housing_preprocessed).all() diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py index cb4030ccb..b53bc70be 100644 --- a/python_scripts/datasets_california_housing.py +++ b/python_scripts/datasets_california_housing.py @@ -67,7 +67,7 @@ # * all features are numerical features encoded as floating number; # * there is no missing values. # -# Let's have a quick look at the distribution of these features with the +# Let's have a quick look at the distribution of these features with the # TableReport from the skrub package. # %% From 489b391d2ff0fed07319263d7b8489491f23377b Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 15:53:11 +0100 Subject: [PATCH 04/15] fixed (?) 
--- python_scripts/03_categorical_pipeline_column_transformer.py | 4 +++- python_scripts/datasets_bike_rides.py | 1 + python_scripts/datasets_blood_transfusion.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 05b31676b..f9ed7cb59 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -107,7 +107,9 @@ # %% from skrub import TableVectorizer -vectorizer = TableVectorizer(low_cardinality = categorical_preprocessor, numeric = numerical_preprocessor) +vectorizer = TableVectorizer( + low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor +) # %% [markdown] # We can take a minute to represent graphically the structure of a diff --git a/python_scripts/datasets_bike_rides.py b/python_scripts/datasets_bike_rides.py index a4dfa1f6a..fa1ce8019 100644 --- a/python_scripts/datasets_bike_rides.py +++ b/python_scripts/datasets_bike_rides.py @@ -95,6 +95,7 @@ # %% from skrub import TableReport + TableReport(target) # %% [markdown] diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py index 10ed30d60..2f32ed005 100644 --- a/python_scripts/datasets_blood_transfusion.py +++ b/python_scripts/datasets_blood_transfusion.py @@ -55,6 +55,7 @@ # %% from skrub import TableReport + TableReport(data) # %% [markdown] From 388a652a91dd1fd018912ec1a47bb469e068da6b Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 15:58:10 +0100 Subject: [PATCH 05/15] ames --- python_scripts/datasets_ames_housing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py index b3b973a27..86c4b51d2 100644 --- a/python_scripts/datasets_ames_housing.py +++ b/python_scripts/datasets_ames_housing.py 
@@ -119,7 +119,9 @@ # ``` # %% -ames_housing_no_missing = pd.read_csv("../datasets/ames_housing_no_missing.csv") +ames_housing_no_missing = pd.read_csv( + "../datasets/ames_housing_no_missing.csv" +) ames_housing_no_missing.head() # %% [markdown] @@ -176,5 +178,7 @@ columns=categorical_features.tolist() + numerical_features, ) ames_housing_preprocessed = ames_housing_preprocessed[ames_housing.columns] -ames_housing_preprocessed = ames_housing_preprocessed.astype(ames_housing.dtypes) +ames_housing_preprocessed = ames_housing_preprocessed.astype( + ames_housing.dtypes +) (ames_housing_no_missing == ames_housing_preprocessed).all() From d632d756d84cfe3f063c27dee83c98472cb37b6e Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 16:00:08 +0100 Subject: [PATCH 06/15] fixed categorica; pipeline --- python_scripts/03_categorical_pipeline_column_transformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index f9ed7cb59..31f935b0c 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -275,5 +275,3 @@ # * used a pipeline to chain the `TableVectorizer` preprocessing and logistic # regression fitting; # * saw that **gradient boosting methods** can outperform **linear models**. 
- -# %% From ceb92c777de2de85e25ad936b1130e20ea50835e Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 16:04:11 +0100 Subject: [PATCH 07/15] test --- python_scripts/03_categorical_pipeline_column_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 31f935b0c..fa4fd5886 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -251,7 +251,6 @@ # %% [markdown] # Now that we created our model, we can check its generalization performance. -# %% # %%time _ = model.fit(data_train, target_train) From 520051c124d0f9f6bd5badd05b3913169f03faaa Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 16:05:53 +0100 Subject: [PATCH 08/15] test --- python_scripts/03_categorical_pipeline_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index fa4fd5886..468b2edbc 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -117,7 +117,7 @@ # # ![columntransformer diagram](../figures/api_diagram-columntransformer.svg) # -# A `TableVectorizer` does the following: +# `TableVectorizer` does the following: # # * It **splits the columns** of the original dataset based on the data type and # cardinality of unique values. @@ -125,7 +125,7 @@ # subset: it internally calls `fit_transform` or `transform`. The output of # this step is a set of transformed datasets. # * It then **concatenates the transformed datasets** into a single dataset. - +# # The important thing is that `TableVectorizer` is like any other scikit-learn # transformer. 
In particular it can be combined with a classifier in a # `Pipeline`: From 9bb831494205450d9dd52b93a5f29c11d60d8e2a Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 19 Nov 2025 16:10:09 +0100 Subject: [PATCH 09/15] finito? --- python_scripts/03_categorical_pipeline_column_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 468b2edbc..7f72e3790 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -97,7 +97,9 @@ # %% from sklearn.preprocessing import OneHotEncoder, StandardScaler -categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False) +categorical_preprocessor = OneHotEncoder( + handle_unknown="ignore", sparse_output=False +) numerical_preprocessor = StandardScaler() # %% [markdown] From 053d2df8e697acec3f2a187653e1022175769725 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 10 Dec 2025 15:17:37 +0100 Subject: [PATCH 10/15] added pandas notebook + corrections --- python_scripts/01_tabular_data_exploration.py | 68 ++-- .../01_tabular_data_exploration_pandas.py | 371 ++++++++++++++++++ ...categorical_pipeline_column_transformer.py | 144 +++++-- 3 files changed, 515 insertions(+), 68 deletions(-) create mode 100644 python_scripts/01_tabular_data_exploration_pandas.py diff --git a/python_scripts/01_tabular_data_exploration.py b/python_scripts/01_tabular_data_exploration.py index 4ecb8ebf8..43a547079 100644 --- a/python_scripts/01_tabular_data_exploration.py +++ b/python_scripts/01_tabular_data_exploration.py @@ -162,45 +162,42 @@ # %% [markdown] # Let's look at the distribution of individual features, to get some insights -# about the data. We can start by plotting histograms, note that this only works -# for features containing numerical values: +# about the data. 
We will use `skrub`'s `TableReport` class to generate an +# overview of the dataset. # %% -_ = adult_census.hist(figsize=(20, 14)) + +from skrub import TableReport + +report = TableReport(adult_census) +report +# _ = adult_census.hist(figsize=(20, 14)) # %% [markdown] -# ```{tip} -# In the previous cell, we used the following pattern: `_ = func()`. We do this -# to avoid showing the output of `func()` which in this case is not that -# useful. We actually assign the output of `func()` into the variable `_` -# (called underscore). By convention, in Python the underscore variable is used -# as a "garbage" variable to store results that we are not interested in. -# ``` +# The report shows many useful statistics about each variable. On the first tab +# "Table", we have a representation of the dataframe. Clicking on each column +# name shows a statistical summary of the variable. For a better view of the +# distribution of each variable, we can click on the "Distributions" tab. # -# We can already make a few comments about some of the variables: +# Numerical features' distributions are displayed as histograms, while +# categorical values are shown as bar plots. We can already make a few comments +# about some of the variables: # # * `"age"`: there are not that many points for `age > 70`. The dataset # description does indicate that retired people have been filtered out # (`hours-per-week > 0`); # * `"education-num"`: peak at 10 and 13, hard to tell what it corresponds to # without looking much further. We'll do that later in this notebook; +# * most values of `"capital-gain"` and `"capital-loss"` are close to zero; # * `"hours-per-week"` peaks at 40, this was very likely the standard number of # working hours at the time of the data collection; -# * most values of `"capital-gain"` and `"capital-loss"` are close to zero. 
- -# %% [markdown] -# For categorical variables, we can look at the distribution of values: - -# %% -adult_census["sex"].value_counts() - -# %% [markdown] -# Note that the data collection process resulted in an important imbalance -# between the number of male/female samples. +# * `"sex"`: the data collection process resulted in an important imbalance +# between the number of male/female samples. # -# Be aware that training a model with such data imbalance can cause -# disproportioned prediction errors for the under-represented groups. This is a -# typical cause of +# About the last observation, be aware that training a model with such data +# imbalance can cause disproportioned prediction errors for the +# under-represented sensitive groups (based on gender or ethnicity for +# instance). This is a typical cause of # [fairness](https://docs.microsoft.com/en-us/azure/machine-learning/concept-fairness-ml#what-is-machine-learning-fairness) # problems if used naively when deploying a machine learning based system in a # real life setting. @@ -210,35 +207,40 @@ # related to the deployment of automated decision making systems that rely on # machine learning components. # -# Studying why the data collection process of this dataset lead to such an +# Studying why the data collection process of this dataset led to such an # unexpected gender imbalance is beyond the scope of this MOOC but we should # keep in mind that this dataset is not representative of the US population # before drawing any conclusions based on its statistics or the predictions of # models trained on it. -# %% -adult_census["education"].value_counts() - # %% [markdown] # As noted above, `"education-num"` distribution has two clear peaks around 10 # and 13. It would be reasonable to expect that `"education-num"` is the number # of years of education. # -# Let's look at the relationship between `"education"` and `"education-num"`. 
+# Let's look at the relationship between `"education"` and `"education-num"` by +# going to the "Associations" tab of the report. This tab shows the statistical +# relationship between each pair of variables in the dataset, using [Cramér's +# V](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V) and [Pearson's +# Correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). +# We can see that `"education"` and `"education-num"` are very strongly related. + # %% pd.crosstab( index=adult_census["education"], columns=adult_census["education-num"] ) # %% [markdown] -# For every entry in `\"education\"`, there is only one single corresponding -# value in `\"education-num\"`. This shows that `"education"` and +# For every entry in `"education"`, there is only one single corresponding +# value in `"education-num"`. This shows that `"education"` and # `"education-num"` give you the same information. For example, # `"education-num"=2` is equivalent to `"education"="1st-4th"`. In practice that # means we can remove `"education-num"` without losing information. Note that # having redundant (or highly correlated) columns can be a problem for machine # learning algorithms. - +# +# All of this data exploration can be done manually with `pandas`. See [this +# notebook](lol) for more information. # %% [markdown] # ```{note} # In the upcoming notebooks, we will only keep the `"education"` variable, diff --git a/python_scripts/01_tabular_data_exploration_pandas.py b/python_scripts/01_tabular_data_exploration_pandas.py new file mode 100644 index 000000000..39b9a7cd4 --- /dev/null +++ b/python_scripts/01_tabular_data_exploration_pandas.py @@ -0,0 +1,371 @@ +# --- +# jupyter: +# kernelspec: +# display_name: Python 3 +# name: python3 +# --- + +# %% [markdown] +# # First look at our dataset +# +# In this notebook, we look at the necessary steps required before any machine +# learning takes place. 
It involves: +# +# * loading the data; +# * looking at the variables in the dataset, in particular, differentiate +# between numerical and categorical variables, which need different +# preprocessing in most machine learning workflows; +# * visualizing the distribution of the variables to gain some insights into the +# dataset. + +# %% [markdown] +# ## Loading the adult census dataset +# +# We use data from the 1994 US census that we downloaded from +# [OpenML](http://openml.org/). +# +# You can look at the OpenML webpage to learn more about this dataset: +# +# +# The dataset is available as a CSV (Comma-Separated Values) file and we use +# `pandas` to read it. +# +# ```{note} +# [Pandas](https://pandas.pydata.org/) is a Python library used for +# manipulating 1 and 2 dimensional structured data. If you have never used +# pandas, we recommend you look at this +# [tutorial](https://pandas.pydata.org/docs/user_guide/10min.html). +# ``` + +# %% +import pandas as pd + +adult_census = pd.read_csv("../datasets/adult-census.csv") + +# %% [markdown] +# The goal with this data is to predict whether a person earns over 50K a year +# from heterogeneous data such as age, employment, education, family +# information, etc. + +# %% [markdown] +# ## The variables (columns) in the dataset +# +# The data are stored in a `pandas` dataframe. A dataframe is a type of +# structured data composed of 2 dimensions. This type of data is also referred +# as tabular data. +# +# Each row represents a "sample". In the field of machine learning or +# descriptive statistics, commonly used equivalent terms are "record", +# "instance", or "observation". +# +# Each column represents a type of information that has been collected and is +# called a "feature". In the field of machine learning and descriptive +# statistics, commonly used equivalent terms are "variable", "attribute", or +# "covariate". 
+ +# %% [markdown] +# A quick way to inspect the dataframe is to show the first few lines with the +# `head` method: + +# %% +adult_census.head() + +# %% [markdown] +# An alternative is to omit the `head` method. This would output the initial and +# final rows and columns, but everything in between is not shown by default. It +# also provides the dataframe's dimensions at the bottom in the format `n_rows` +# x `n_columns`. + +# %% +adult_census + +# %% [markdown] +# The column named **class** is our target variable (i.e., the variable which we +# want to predict). The two possible classes are `<=50K` (low-revenue) and +# `>50K` (high-revenue). The resulting prediction problem is therefore a binary +# classification problem as `class` has only two possible values. We use the +# left-over columns (any column other than `class`) as input variables for our +# model. + +# %% +target_column = "class" +adult_census[target_column].value_counts() + +# %% [markdown] +# ```{note} +# Here, classes are slightly imbalanced, meaning there are more samples of one +# or more classes compared to others. In this case, we have many more samples +# with `" <=50K"` than with `" >50K"`. Class imbalance happens often in practice +# and may need special techniques when building a predictive model. +# +# For example in a medical setting, if we are trying to predict whether subjects +# may develop a rare disease, there would be a lot more healthy subjects than +# ill subjects in the dataset. +# ``` + +# %% [markdown] +# The dataset contains both numerical and categorical data. Numerical values +# take continuous values, for example `"age"`. Categorical values can have a +# finite number of values, for example `"native-country"`. 
+ +# %% +numerical_columns = [ + "age", + "education-num", + "capital-gain", + "capital-loss", + "hours-per-week", +] +categorical_columns = [ + "workclass", + "education", + "marital-status", + "occupation", + "relationship", + "race", + "sex", + "native-country", +] +all_columns = numerical_columns + categorical_columns + [target_column] + +adult_census = adult_census[all_columns] + +# %% [markdown] +# We can check the number of samples and the number of columns available in the +# dataset: + +# %% +print( + f"The dataset contains {adult_census.shape[0]} samples and " + f"{adult_census.shape[1]} columns" +) + +# %% [markdown] +# We can compute the number of features by counting the number of columns and +# subtract 1, since one of the columns is the target. + +# %% +print(f"The dataset contains {adult_census.shape[1] - 1} features.") + +# %% [markdown] +# ## Visual inspection of the data +# Before building a predictive model, it is a good idea to look at the data: +# +# * maybe the task you are trying to achieve can be solved without machine +# learning; +# * you need to check that the information you need for your task is actually +# present in the dataset; +# * inspecting the data is a good way to find peculiarities. These can arise +# during data collection (for example, malfunctioning sensor or missing +# values), or from the way the data is processed afterwards (for example +# capped values). + +# %% [markdown] +# Let's look at the distribution of individual features, to get some insights +# about the data. We can start by plotting histograms, note that this only works +# for features containing numerical values: + +# %% +_ = adult_census.hist(figsize=(20, 14)) + +# %% [markdown] +# ```{tip} +# In the previous cell, we used the following pattern: `_ = func()`. We do this +# to avoid showing the output of `func()` which in this case is not that +# useful. We actually assign the output of `func()` into the variable `_` +# (called underscore). 
By convention, in Python the underscore variable is used +# as a "garbage" variable to store results that we are not interested in. +# ``` +# +# We can already make a few comments about some of the variables: +# +# * `"age"`: there are not that many points for `age > 70`. The dataset +# description does indicate that retired people have been filtered out +# (`hours-per-week > 0`); +# * `"education-num"`: peak at 10 and 13, hard to tell what it corresponds to +# without looking much further. We'll do that later in this notebook; +# * `"hours-per-week"` peaks at 40, this was very likely the standard number of +# working hours at the time of the data collection; +# * most values of `"capital-gain"` and `"capital-loss"` are close to zero. + +# %% [markdown] +# For categorical variables, we can look at the distribution of values: + +# %% +adult_census["sex"].value_counts() + +# %% [markdown] +# Note that the data collection process resulted in an important imbalance +# between the number of male/female samples. +# +# Be aware that training a model with such data imbalance can cause +# disproportioned prediction errors for the under-represented groups. This is a +# typical cause of +# [fairness](https://docs.microsoft.com/en-us/azure/machine-learning/concept-fairness-ml#what-is-machine-learning-fairness) +# problems if used naively when deploying a machine learning based system in a +# real life setting. +# +# We recommend our readers to refer to [fairlearn.org](https://fairlearn.org) +# for resources on how to quantify and potentially mitigate fairness issues +# related to the deployment of automated decision making systems that rely on +# machine learning components. 
+# +# Studying why the data collection process of this dataset led to such an +# unexpected gender imbalance is beyond the scope of this MOOC but we should +# keep in mind that this dataset is not representative of the US population +# before drawing any conclusions based on its statistics or the predictions of +# models trained on it. + +# %% +adult_census["education"].value_counts() + +# %% [markdown] +# As noted above, `"education-num"` distribution has two clear peaks around 10 +# and 13. It would be reasonable to expect that `"education-num"` is the number +# of years of education. +# +# Let's look at the relationship between `"education"` and `"education-num"`. +# %% +pd.crosstab( + index=adult_census["education"], columns=adult_census["education-num"] +) + +# %% [markdown] +# For every entry in `"education"`, there is only one single corresponding +# value in `"education-num"`. This shows that `"education"` and +# `"education-num"` give you the same information. For example, +# `"education-num"=2` is equivalent to `"education"="1st-4th"`. In practice that +# means we can remove `"education-num"` without losing information. Note that +# having redundant (or highly correlated) columns can be a problem for machine +# learning algorithms. + +# %% [markdown] +# ```{note} +# In the upcoming notebooks, we will only keep the `"education"` variable, +# excluding the `"education-num"` variable since the latter is redundant with +# the former. +# ``` + +# %% [markdown] +# Another way to inspect the data is to do a `pairplot` and show how each +# variable differs according to our target, i.e. `"class"`. Plots along the +# diagonal show the distribution of individual variables for each `"class"`. The +# plots on the off-diagonal can reveal interesting interactions between +# variables.
+ +# %% +import seaborn as sns + +# We plot a subset of the data to keep the plot readable and make the plotting +# faster +n_samples_to_plot = 5000 +columns = ["age", "education-num", "hours-per-week"] +_ = sns.pairplot( + data=adult_census[:n_samples_to_plot], + vars=columns, + hue=target_column, + plot_kws={"alpha": 0.2}, + height=3, + diag_kind="hist", + diag_kws={"bins": 30}, +) + +# %% [markdown] +# ## Creating decision rules by hand +# +# By looking at the previous plots, we could create some hand-written rules that +# predict whether someone has a high- or low-income. For instance, we could +# focus on the combination of the `"hours-per-week"` and `"age"` features. + +# %% +_ = sns.scatterplot( + x="age", + y="hours-per-week", + data=adult_census[:n_samples_to_plot], + hue=target_column, + alpha=0.5, +) + +# %% [markdown] +# The data points (circles) show the distribution of `"hours-per-week"` and +# `"age"` in the dataset. Blue points mean low-income and orange points mean +# high-income. This part of the plot is the same as the bottom-left plot in the +# pairplot above. +# +# In this plot, we can try to find regions that mainly contains a single class +# such that we can easily decide what class one should predict. We could come up +# with hand-written rules as shown in this plot: + +# %% +import matplotlib.pyplot as plt + +ax = sns.scatterplot( + x="age", + y="hours-per-week", + data=adult_census[:n_samples_to_plot], + hue=target_column, + alpha=0.5, +) + +age_limit = 27 +plt.axvline(x=age_limit, ymin=0, ymax=1, color="black", linestyle="--") + +hours_per_week_limit = 40 +plt.axhline( + y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--" +) + +plt.annotate("<=50K", (17, 25), rotation=90, fontsize=35) +plt.annotate("<=50K", (35, 20), fontsize=35) +_ = plt.annotate("???", (45, 60), fontsize=35) + +# %% [markdown] +# * In the region `age < 27` (left region) the prediction is low-income. 
Indeed, +# there are many blue points and we cannot see any orange points. +# * In the region `age > 27 AND hours-per-week < 40` (bottom-right region), the +# prediction is low-income. Indeed, there are many blue points and only a few +# orange points. +# * In the region `age > 27 AND hours-per-week > 40` (top-right region), we see +# a mix of blue points and orange points. It seems complicated to choose which +# class we should predict in this region. +# +# It is interesting to note that some machine learning models work similarly to +# what we did: they are known as decision tree models. The two thresholds that +# we chose (27 years and 40 hours) are somewhat arbitrary, i.e. we chose them by +# only looking at the pairplot. In contrast, a decision tree chooses the "best" +# splits based on data without human intervention or inspection. Decision trees +# will be covered more in detail in a future module. +# +# Note that machine learning is often used when creating rules by hand is not +# straightforward. For example because we are in high dimension (many features +# in a table) or because there are no simple and obvious rules that separate the +# two classes as in the top-right region of the previous plot. +# +# To sum up, the important thing to remember is that in a machine-learning +# setting, a model automatically creates the "rules" from the existing data in +# order to make predictions on new unseen data. + +# %% [markdown] +# ## Notebook Recap +# +# In this notebook we: +# +# * loaded the data from a CSV file using `pandas`; +# * looked at the different kind of variables to differentiate between +# categorical and numerical variables; +# * inspected the data with `pandas` and `seaborn`. Data inspection can allow +# you to decide whether using machine learning is appropriate for your data +# and to highlight potential peculiarities in your data. 
+# +# We made important observations (which will be discussed later in more detail): +# +# * if your target variable is imbalanced (e.g., you have more samples from one +# target category than another), you may need to be careful when interpreting +# the values of performance metrics; +# * columns can be redundant (or highly correlated), which is not necessarily a +# problem, but may require special treatment as we will cover in future +# notebooks; +# * decision trees create prediction rules by comparing each feature to a +# threshold value, resulting in decision boundaries that are always parallel +# to the axes. In 2D, this means the boundaries are vertical or horizontal +# line segments at the feature threshold values. \ No newline at end of file diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 7f72e3790..9c04078dc 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -28,23 +28,19 @@ data = adult_census.drop(columns=[target_name]) # %% [markdown] -# ## Selection based on data types +# ## Dispatch columns to a specific processor +# +# In the previous sections, we saw that we need to treat data differently +# depending on their nature (i.e. numerical or categorical). +# +# Skrub is a data preprocessing library built to work seamlessly with +# scikit-learn. It provides a convenient transformer called `TableVectorizer` +# that can handle both numerical and categorical variables in a single +# transformer. It makes the column selection automatically by using a column's +# `dtype`. This is equivalent to using a +# [`sklearn.compose.make_column_selector`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html) +# and selecting or excluding `object` dtypes. 
# -# We separate categorical and numerical variables using their data types to -# identify them, as we saw previously that `object` corresponds to categorical -# columns (strings). We make use of `make_column_selector` helper to select the -# corresponding columns. - -# %% -from sklearn.compose import make_column_selector as selector - -numerical_columns_selector = selector(dtype_exclude=object) -categorical_columns_selector = selector(dtype_include=object) - -numerical_columns = numerical_columns_selector(data) -categorical_columns = categorical_columns_selector(data) - -# %% [markdown] # ```{caution} # Here, we know that `object` data type is used to represent strings and thus # categorical features. Be aware that this is not always the case. Sometimes @@ -55,20 +51,8 @@ # In a more general scenario you should manually introspect the content of your # dataframe not to wrongly use `make_column_selector`. # ``` - -# %% [markdown] -# ## Dispatch columns to a specific processor -# -# In the previous sections, we saw that we need to treat data differently -# depending on their nature (i.e. numerical or categorical). -# -# Skrub is a data preprocessing library built to work seamlessly with -# scikit-learn. It provides a convenient transformer called `TableVectorizer` -# that can handle both numerical and categorical variables in a single -# transformer. It makes the column selection automatically by using a column's -# `dtype`. # -# It separates the columns into four groups: +# `TableVectorizer` separates the columns into four groups: # * **low cardinality categorical columns** (categorical columns with a limited # number of unique values, one hot encoded by default); # * **high cardinality categorical columns** (categorical columns with a large @@ -80,10 +64,57 @@ # [documentation](https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html)). 
# # The threshold to determine whether a categorical column is of low or high -# cardinality can be set using the `cardinality_threshold` parameter. We will see -# its impact later on. +# cardinality can be set using the `cardinality_threshold` parameter. + +# %% [markdown] +# ## Effect of the cardinality threshold +# +# As previously stated, `TableVectorizer` separates categorical columns into two +# groups: low cardinality and high cardinality. By default, the threshold is set +# to 40 unique values. However, this value can be changed using the `cardinality_threshold` +# parameter of `TableVectorizer`. +# Let's vizualize its effect on the `"native-country"` column of the dataset. This column +# corresponds to the country of origin of each individual. Let's check how many unique +# values it contains. + +# %% +data["native-country"].nunique() + +#%% [markdown] +# In the setup we used so far, this column is considered as a high cardinality categorical column. +# Let us compare both encodings. + +# %% +from skrub import TableVectorizer + +native_country_data = data[["native-country"]] + +high_thresh_vectorizer = TableVectorizer(cardinality_threshold=50) +high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data) + +high_thresh_vectorizer + +# %% +low_thresh_vectorizer = TableVectorizer() +low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data) + + +low_thresh_vectorizer + +# %% [markdown] +# On the encoder or pipeline HTML diagrams, we can see that the "native-country" +# column has been passed as a high cardinality categorical column in the first +# case, and as a low cardinality categorical column in the second case by +# clicking the on the `low_cardinality` and `high_cardinality` boxes. +# +# We set the `cardinality_threshold` parameter to ensure that all the categorical columns +# are considered as low cardinality. This way, all categorical columns are encoded in the same +# manner. 
+ +# %% [markdown] +# ## Preprocessing and modeling pipeline # -# We apply the following transformations: +# For the rest of the notebook we apply the following transformations to the whole dataset: # # * **one-hot encoding** is applied to the low cardinality categorical columns. # Besides, we use `handle_unknown="ignore"` to solve the potential issues due @@ -107,10 +138,8 @@ # their respective columns. # %% -from skrub import TableVectorizer - vectorizer = TableVectorizer( - low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor + low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor, cardinality_threshold=50 ) # %% [markdown] @@ -194,6 +223,51 @@ # %% model.score(data_test, target_test) +# %% [markdown] +# ## Effect of the cardinality threshold +# +# As previously stated, `TableVectorizer` separates categorical columns into two +# groups: low cardinality and high cardinality. By default, the threshold is set +# to 40 unique values. However, this value can be changed using the `cardinality_threshold` +# parameter of `TableVectorizer`. +# Let's vizualize its effect on the `"native-country"` column of the dataset. This column +# corresponds to the country of origin of each individual. Let's check how many unique +# values it contains. + +# %% +data["native-country"].nunique() + +#%% [markdown] +# In the setup we used so far, this column is considered as a high cardinality categorical column. +# Let us compare both encodings. 
+ +# %% +native_country_data = data[["native-country"]] + +high_thresh_vectorizer = TableVectorizer( + low_cardinality=OneHotEncoder(sparse_output=False), cardinality_threshold=50) +high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data) + +high_thresh_vectorizer + +# %% +low_thresh_vectorizer = TableVectorizer( + low_cardinality=OneHotEncoder(sparse_output=False)) +low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data) + + +low_thresh_vectorizer + +# %% [markdown] +# On the encoder or pipeline HTML diagrams, we can see that the "native-country" +# column has been passed as a high cardinality categorical column in the first +# case, and as a low cardinality categorical column in the second case by +# clicking the on the `low_cardinality` and `high_cardinality` boxes. +# +# We set the `cardinality_threshold` parameter to ensure that all the categorical columns +# are considered as low cardinality. This way, all categorical columns are encoded in the same +# manner. + # %% [markdown] # ## Evaluation of the model with cross-validation # From 12fa9eec77d9f663c6ad5f0587f0969d71cdc88e Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 10 Dec 2025 15:20:30 +0100 Subject: [PATCH 11/15] correction whitespace --- ...categorical_pipeline_column_transformer.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 9c04078dc..d16d48285 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -71,18 +71,18 @@ # # As previously stated, `TableVectorizer` separates categorical columns into two # groups: low cardinality and high cardinality. By default, the threshold is set -# to 40 unique values. 
However, this value can be changed using the `cardinality_threshold` -# parameter of `TableVectorizer`. -# Let's vizualize its effect on the `"native-country"` column of the dataset. This column -# corresponds to the country of origin of each individual. Let's check how many unique -# values it contains. +# to 40 unique values. However, this value can be changed using the +# `cardinality_threshold` parameter of `TableVectorizer`. Let's vizualize its +# effect on the `"native-country"` column of the dataset. This column +# corresponds to the country of origin of each individual. Let's check how many +# unique values it contains. # %% data["native-country"].nunique() #%% [markdown] -# In the setup we used so far, this column is considered as a high cardinality categorical column. -# Let us compare both encodings. +# In the setup we used so far, this column is considered as a high cardinality +# categorical column. Let us compare both encodings. # %% from skrub import TableVectorizer @@ -98,7 +98,6 @@ low_thresh_vectorizer = TableVectorizer() low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data) - low_thresh_vectorizer # %% [markdown] @@ -107,14 +106,15 @@ # case, and as a low cardinality categorical column in the second case by # clicking the on the `low_cardinality` and `high_cardinality` boxes. # -# We set the `cardinality_threshold` parameter to ensure that all the categorical columns -# are considered as low cardinality. This way, all categorical columns are encoded in the same -# manner. +# We set the `cardinality_threshold` parameter to ensure that all the +# categorical columns are considered as low cardinality. This way, all +# categorical columns are encoded in the same manner. 
# %% [markdown] # ## Preprocessing and modeling pipeline # -# For the rest of the notebook we apply the following transformations to the whole dataset: +# For the rest of the notebook we apply the following transformations to the +# whole dataset: # # * **one-hot encoding** is applied to the low cardinality categorical columns. # Besides, we use `handle_unknown="ignore"` to solve the potential issues due @@ -228,18 +228,18 @@ # # As previously stated, `TableVectorizer` separates categorical columns into two # groups: low cardinality and high cardinality. By default, the threshold is set -# to 40 unique values. However, this value can be changed using the `cardinality_threshold` -# parameter of `TableVectorizer`. -# Let's vizualize its effect on the `"native-country"` column of the dataset. This column -# corresponds to the country of origin of each individual. Let's check how many unique -# values it contains. +# to 40 unique values. However, this value can be changed using the +# `cardinality_threshold` parameter of `TableVectorizer`. Let's vizualize its +# effect on the `"native-country"` column of the dataset. This column +# corresponds to the country of origin of each individual. Let's check how many +# unique values it contains. # %% data["native-country"].nunique() #%% [markdown] -# In the setup we used so far, this column is considered as a high cardinality categorical column. -# Let us compare both encodings. +# In the setup we used so far, this column is considered as a high cardinality +# categorical column. Let us compare both encodings. # %% native_country_data = data[["native-country"]] @@ -264,9 +264,9 @@ # case, and as a low cardinality categorical column in the second case by # clicking the on the `low_cardinality` and `high_cardinality` boxes. # -# We set the `cardinality_threshold` parameter to ensure that all the categorical columns -# are considered as low cardinality. This way, all categorical columns are encoded in the same -# manner. 
+# We set the `cardinality_threshold` parameter to ensure that all the +# categorical columns are considered as low cardinality. This way, all +# categorical columns are encoded in the same manner. # %% [markdown] # ## Evaluation of the model with cross-validation From d53b09b443ab210d75a59492c784b384c613213e Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 10 Dec 2025 15:26:13 +0100 Subject: [PATCH 12/15] pre-commit --- python_scripts/01_tabular_data_exploration_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_scripts/01_tabular_data_exploration_pandas.py b/python_scripts/01_tabular_data_exploration_pandas.py index 39b9a7cd4..4ecb8ebf8 100644 --- a/python_scripts/01_tabular_data_exploration_pandas.py +++ b/python_scripts/01_tabular_data_exploration_pandas.py @@ -368,4 +368,4 @@ # * decision trees create prediction rules by comparing each feature to a # threshold value, resulting in decision boundaries that are always parallel # to the axes. In 2D, this means the boundaries are vertical or horizontal -# line segments at the feature threshold values. \ No newline at end of file +# line segments at the feature threshold values. 
From ebef1a3c44455eafe9bffea6ff5c75c08d1d302f Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 10 Dec 2025 15:32:30 +0100 Subject: [PATCH 13/15] fix --- .../03_categorical_pipeline_column_transformer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index d16d48285..cb64f1561 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -100,11 +100,11 @@ low_thresh_vectorizer -# %% [markdown] -# On the encoder or pipeline HTML diagrams, we can see that the "native-country" -# column has been passed as a high cardinality categorical column in the first -# case, and as a low cardinality categorical column in the second case by -# clicking the on the `low_cardinality` and `high_cardinality` boxes. +# %% [markdown] On the encoder or pipeline HTML diagrams, we can see that the +# "native-country" column has been passed as a high cardinality categorical +# column in the first case, and as a low cardinality categorical column in the +# second case by clicking the on the `low_cardinality` and `high_cardinality` +# boxes. # # We set the `cardinality_threshold` parameter to ensure that all the # categorical columns are considered as low cardinality. 
This way, all From 5ddc1440cd7a4b916dd4a704dda39f165849b5b1 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 10 Dec 2025 15:35:54 +0100 Subject: [PATCH 14/15] fix --- .../03_categorical_pipeline_column_transformer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index cb64f1561..d0a77a5e8 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -139,7 +139,9 @@ # %% vectorizer = TableVectorizer( - low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor, cardinality_threshold=50 + low_cardinality=categorical_preprocessor, + numeric=numerical_preprocessor, + cardinality_threshold=50 ) # %% [markdown] @@ -245,7 +247,9 @@ native_country_data = data[["native-country"]] high_thresh_vectorizer = TableVectorizer( - low_cardinality=OneHotEncoder(sparse_output=False), cardinality_threshold=50) + low_cardinality=OneHotEncoder(sparse_output=False), + cardinality_threshold=50 +) high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data) high_thresh_vectorizer @@ -255,7 +259,6 @@ low_cardinality=OneHotEncoder(sparse_output=False)) low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data) - low_thresh_vectorizer # %% [markdown] From ab6ae1d18dad1084bfee6d0118b079834dad69e8 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 14 Jan 2026 11:05:21 +0100 Subject: [PATCH 15/15] pre-commit fixes --- .../03_categorical_pipeline_column_transformer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index d0a77a5e8..8d27c05bb 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ 
b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -80,7 +80,7 @@ # %% data["native-country"].nunique() -#%% [markdown] +# %% [markdown] # In the setup we used so far, this column is considered as a high cardinality # categorical column. Let us compare both encodings. @@ -141,7 +141,7 @@ vectorizer = TableVectorizer( low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor, - cardinality_threshold=50 + cardinality_threshold=50, ) # %% [markdown] @@ -239,7 +239,7 @@ # %% data["native-country"].nunique() -#%% [markdown] +# %% [markdown] # In the setup we used so far, this column is considered as a high cardinality # categorical column. Let us compare both encodings. @@ -248,7 +248,7 @@ high_thresh_vectorizer = TableVectorizer( low_cardinality=OneHotEncoder(sparse_output=False), - cardinality_threshold=50 + cardinality_threshold=50, ) high_card_encoded = high_thresh_vectorizer.fit_transform(native_country_data) @@ -256,7 +256,8 @@ # %% low_thresh_vectorizer = TableVectorizer( - low_cardinality=OneHotEncoder(sparse_output=False)) + low_cardinality=OneHotEncoder(sparse_output=False) +) low_card_encoded = low_thresh_vectorizer.fit_transform(native_country_data) low_thresh_vectorizer