@@ -59,21 +59,22 @@
 true_coef = rng.normal(size=n_features)
 X = rng.normal(size=(n_samples, n_features))
 z = X @ true_coef
-intercept = -4
-y = rng.binomial(n=1, p=expit(z + intercept))
+true_intercept = -4
+y = rng.binomial(n=1, p=expit(z + true_intercept))
 
-# create pandas data structures for convenience
+# Wrap as pandas data structures for convenience.
 X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(n_features)])
 y = pd.Series(y, name="target")
 
 # %% [markdown]
 #
-# Let's recall what the expit function also called the sigmoid function is.
+# Recall that the `expit` function, also known as the logistic sigmoid function,
+# is defined as `expit(x) = 1 / (1 + np.exp(-x))` and looks as follows:
 
 # %%
 _, ax = plt.subplots()
-x = np.linspace(-10, 10, 100)
-ax.plot(x, expit(x))
+z = np.linspace(-10, 10, 100)
+ax.plot(z, expit(z))
 _ = ax.set(
     title="Sigmoid/Expit function",
     xlabel="Linear predictor",
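The identity stated in the new markdown cell is easy to check numerically. A minimal sketch, assuming only `numpy` and `scipy.special.expit` (both already imported by the notebook):

import numpy as np
from scipy.special import expit

z = np.linspace(-10, 10, 100)
# expit should agree with the explicit sigmoid formula 1 / (1 + exp(-z)).
assert np.allclose(expit(z), 1.0 / (1.0 + np.exp(-z)))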
@@ -131,7 +132,7 @@
 # %%
 comparison_coef = pd.DataFrame(
     {
-        "Data generating model": np.hstack((intercept, true_coef)),
+        "Data generating model": np.hstack((true_intercept, true_coef)),
         "Unpenalized logistic regression": np.hstack(
             (model.intercept_, model.coef_.flatten())
         ),
@@ -172,28 +173,29 @@
 
 
 # %%
-def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
+def generate_imbalanced_dataset(true_coef, true_intercept, n_samples=10_000, seed=0):
     rng = np.random.default_rng(seed)
 
-    # we can sample a new design matrix but we need to keep the same true coefficients
+    # We can sample a new design matrix but we need to keep the same true coefficients.
     X = rng.normal(size=(n_samples, true_coef.shape[0]))
     z = X @ true_coef
-    intercept = -4
-    y = rng.binomial(n=1, p=expit(z + intercept))
+    y = rng.binomial(n=1, p=expit(z + true_intercept))
 
-    # create pandas data structures for convenience
+    # Wrap as pandas data structures for convenience.
     X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
     y = pd.Series(y, name="target")
 
     return X, y
 
 
-X_exercise, y_exercise = generate_imbalanced_dataset(true_coef, n_samples=10_000)
+X_exercise, y_exercise = generate_imbalanced_dataset(
+    true_coef, true_intercept, n_samples=10_000, seed=1
+)
 model_exercise = LogisticRegression(penalty=None).fit(X_exercise, y_exercise)
 
 comparison_coef_exercise = pd.DataFrame(
     {
-        "Data generating model": np.hstack((intercept, true_coef)),
+        "Data generating model": np.hstack((true_intercept, true_coef)),
         "Unpenalized logistic regression": np.hstack(
             (model_exercise.intercept_, model_exercise.coef_.flatten())
         ),
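Threading `true_intercept` and `seed` through `generate_imbalanced_dataset` is what makes the variance discussed in the next hunk observable: refitting on independently sampled training sets yields noticeably different coefficients. A minimal sketch, assuming the function and the `true_coef`/`true_intercept` defined earlier are in scope:

# Refit the same unpenalized model on differently seeded training sets and
# compare the learned parameters across runs.
for seed in range(3):
    X_s, y_s = generate_imbalanced_dataset(
        true_coef, true_intercept, n_samples=10_000, seed=seed
    )
    model_s = LogisticRegression(penalty=None).fit(X_s, y_s)
    print(seed, model_s.intercept_, model_s.coef_.ravel())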
@@ -213,8 +215,12 @@ def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
 # %% [markdown]
 #
 # We observe that we have a larger difference between the coefficients of the true
-# generative process and the learned model. The reason is that the coefficients of the
-# generative process can only be recovered if the following assumptions are met:
+# generative process and the learned model, and furthermore that the learned model has a
+# larger variance (different coefficients when we vary the seed used to sample the
+# training set).
+#
+# The reason is that the coefficients of the generative process can only be recovered if
+# the following assumptions are met:
 #
 # - We have access to an unlimited number of labeled training data points. As the sample
 #   size increases, the coefficients of the predictive model will get closer to the true
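The first assumption can be illustrated directly: as `n_samples` grows, the fitted coefficients drift toward the true ones. A rough sketch under the same setup (the notebook itself does not show this convergence check):

import numpy as np

for n in [1_000, 10_000, 100_000]:
    X_n, y_n = generate_imbalanced_dataset(true_coef, true_intercept, n_samples=n)
    model_n = LogisticRegression(penalty=None).fit(X_n, y_n)
    # The worst-case coefficient error should shrink as the sample size grows.
    print(n, np.abs(model_n.coef_.ravel() - true_coef).max())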
@@ -413,7 +419,7 @@ def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
 _ = display.ax_.set_title("Confusion matrix of the under-sampled logistic regression")
 
 # %%
-print(classification_report(y, model.predict(X)))
+print(classification_report(y, undersampling_model.predict(X)))
 
 # %% [markdown]
 #
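`undersampling_model` is defined outside this hunk; a plausible reconstruction (an assumption, not necessarily the notebook's exact code) pairs imbalanced-learn's `RandomUnderSampler` with the same unpenalized logistic regression, which is consistent with the `undersampling_model[-1]` indexing in the next hunk:

# Hypothetical reconstruction of the pipeline used above: under-sample the
# majority class to a balanced ratio, then fit the classifier.
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression

undersampling_model = make_pipeline(
    RandomUnderSampler(random_state=0),
    LogisticRegression(penalty=None),
).fit(X, y)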
@@ -461,7 +467,7 @@ def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
 # %%
 comparison_coef = pd.DataFrame(
     {
-        "Data generating model": np.hstack((intercept, true_coef)),
+        "Data generating model": np.hstack((true_intercept, true_coef)),
         "Model trained on under-sampled data": np.hstack(
             (
                 undersampling_model[-1].intercept_,