Commit 2e14bc9 ("More fixes")
Parent: a4d7a13

content/python_files/imbalanced_classification.py (24 additions, 18 deletions)
@@ -59,21 +59,22 @@
 true_coef = rng.normal(size=n_features)
 X = rng.normal(size=(n_samples, n_features))
 z = X @ true_coef
-intercept = -4
-y = rng.binomial(n=1, p=expit(z + intercept))
+true_intercept = -4
+y = rng.binomial(n=1, p=expit(z + true_intercept))
 
-# create pandas data structures for convenience
+# Wrap as pandas data structures for convenience.
 X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(n_features)])
 y = pd.Series(y, name="target")
 
 # %% [markdown]
 #
-# Let's recall what the expit function also called the sigmoid function is.
+# Recall that the `expit` function, also known as the logistic sigmoid function,
+# is defined as `expit(x) = 1 / (1 + np.exp(-x))` and looks as follows:
 
 # %%
 _, ax = plt.subplots()
-x = np.linspace(-10, 10, 100)
-ax.plot(x, expit(x))
+z = np.linspace(-10, 10, 100)
+ax.plot(z, expit(z))
 _ = ax.set(
     title="Sigmoid/Expit function",
     xlabel="Linear predictor",
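
The new markdown cell states the identity `expit(x) = 1 / (1 + np.exp(-x))`. A quick sanity check of that claim (a minimal sketch, not part of the commit, assuming the same NumPy/SciPy imports as the tutorial):

    import numpy as np
    from scipy.special import expit

    # expit should match the logistic sigmoid 1 / (1 + exp(-x)) everywhere.
    x = np.linspace(-10, 10, 100)
    assert np.allclose(expit(x), 1.0 / (1.0 + np.exp(-x)))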
@@ -131,7 +132,7 @@
 # %%
 comparison_coef = pd.DataFrame(
     {
-        "Data generating model": np.hstack((intercept, true_coef)),
+        "Data generating model": np.hstack((true_intercept, true_coef)),
         "Unpenalized logistic regression": np.hstack(
             (model.intercept_, model.coef_.flatten())
         ),
@@ -172,28 +173,29 @@
 
 
 # %%
-def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
+def generate_imbalanced_dataset(true_coef, true_intercept, n_samples=10_000, seed=0):
     rng = np.random.default_rng(seed)
 
-    # we can sample a new design matrix but we need to keep the same true coefficients
+    # We can sample a new design matrix but we need to keep the same true coefficients.
     X = rng.normal(size=(n_samples, true_coef.shape[0]))
     z = X @ true_coef
-    intercept = -4
-    y = rng.binomial(n=1, p=expit(z + intercept))
+    y = rng.binomial(n=1, p=expit(z + true_intercept))
 
-    # create pandas data structures for convenience
+    # Wrap as pandas data structures for convenience.
     X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
     y = pd.Series(y, name="target")
 
     return X, y
 
 
-X_exercise, y_exercise = generate_imbalanced_dataset(true_coef, n_samples=10_000)
+X_exercise, y_exercise = generate_imbalanced_dataset(
+    true_coef, true_intercept, n_samples=10_000, seed=1
+)
 model_exercise = LogisticRegression(penalty=None).fit(X_exercise, y_exercise)
 
 comparison_coef_exercise = pd.DataFrame(
     {
-        "Data generating model": np.hstack((intercept, true_coef)),
+        "Data generating model": np.hstack((true_intercept, true_coef)),
         "Unpenalized logistic regression": np.hstack(
             (model_exercise.intercept_, model_exercise.coef_.flatten())
         ),
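
The prose added in the next hunk claims that the learned coefficients vary with the seed used to sample the training set. A minimal sketch of that check, reusing `generate_imbalanced_dataset`, `true_coef`, and `true_intercept` from above (the loop bounds are illustrative, not from the commit):

    # Refit on several independently resampled training sets and inspect
    # the spread of the learned parameters across seeds.
    coefs_per_seed = []
    for seed in range(5):
        X_s, y_s = generate_imbalanced_dataset(
            true_coef, true_intercept, n_samples=10_000, seed=seed
        )
        model_s = LogisticRegression(penalty=None).fit(X_s, y_s)
        coefs_per_seed.append(
            np.hstack((model_s.intercept_, model_s.coef_.flatten()))
        )

    # A larger standard deviation across seeds means a higher-variance estimate.
    print(np.asarray(coefs_per_seed).std(axis=0))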
@@ -213,8 +215,12 @@ def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
 # %% [markdown]
 #
 # We observe that we have a larger difference between the coefficients of the true
-# generative process and the learned model. The reason is that the coefficients of the
-# generative process can only be recovered if the following assumptions are met:
+# generative process and the learned model and that, furthermore, the learned model has
+# a larger variance (different coefficients when we vary the seed used to sample the
+# training set).
+#
+# The reason is that the coefficients of the generative process can only be recovered if
+# the following assumptions are met:
 #
 # - We have access to an unlimited number of labeled training data points. As the sample
 #   size increases, the coefficients of the predictive model will get closer to the true
@@ -413,7 +419,7 @@ def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
 _ = display.ax_.set_title("Confusion matrix of the under-sampled logistic regression")
 
 # %%
-print(classification_report(y, model.predict(X)))
+print(classification_report(y, undersampling_model.predict(X)))
 
 # %% [markdown]
 #
@@ -461,7 +467,7 @@ def generate_imbalanced_dataset(true_coef, n_samples=10_000, seed=0):
 # %%
 comparison_coef = pd.DataFrame(
     {
-        "Data generating model": np.hstack((intercept, true_coef)),
+        "Data generating model": np.hstack((true_intercept, true_coef)),
         "Model trained on under-sampled data": np.hstack(
             (
                 undersampling_model[-1].intercept_,
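
The `undersampling_model[-1]` indexing above implies a pipeline whose final step is the logistic regression. The commit does not show how that pipeline is built; a plausible sketch, assuming imbalanced-learn's `RandomUnderSampler` (an assumption, not confirmed by this diff):

    from imblearn.pipeline import make_pipeline
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.linear_model import LogisticRegression

    # Hypothetical reconstruction: under-sample the majority class during fit,
    # then train an unpenalized logistic regression; [-1] selects the final step.
    undersampling_model = make_pipeline(
        RandomUnderSampler(random_state=0),
        LogisticRegression(penalty=None),
    ).fit(X, y)
    print(undersampling_model[-1].intercept_, undersampling_model[-1].coef_)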
