@@ -3115,11 +3115,327 @@ \subsection{Exercise 09}
\item example with Fourier series, analogous to polynomial regression
\end{itemize}

- no slides so far
+ \end{frame}


+ \begin{frame}[t]{Ex09: Bias-Variance Trade-Off vs. Model Complexity}
+ %
+ \vspace{-1em}
+ %
+ total error = (model bias$^2$) + (model variance) + (data noise variance)
+ %
+ \begin{table}[]
+ \begin{tabular}{|l|l|l|}
+ \hline
+ true model & lowest bias$^2$ & lowest variance\\\hline
+ low model complexity & high bias$^2$ & low variance\\\hline
+ high model complexity & low bias$^2$ & high variance\\\hline
+ optimum model complexity & \multicolumn{2}{l|}{optimum bias$^2$ + variance}\\\hline
+ \end{tabular}
+ \end{table}
+ %
+ \begin{center}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=12cm,
+ height=6cm,
+ legend style={at={(0.015,0.65)}, anchor=north west},
+ xtick={-6,0,6},
+ xticklabels={(too) low, optimum, (too) high},
+ xlabel = {model complexity / \# of non-zero model parameters},
+ ytick={0,1},
+ yticklabels={low, high},
+ ylabel = {bias$^2$ / variance},
+ ]
+ \addplot [domain=-6:6, C0, ultra thick, samples=32] {1-1/(1+exp(-x))};
+ \addplot [domain=-6:6, C1, ultra thick, samples=32] {1/(1+exp(-x))};
+ \addlegendentry{bias$^2$}
+ \addlegendentry{variance}
+ \end{axis}
+ \end{tikzpicture}
+ \end{center}
+ %
\end{frame}
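+
+ % compact reasoning sketch for the decomposition quoted above; assumes a fresh
+ % measurement whose zero-mean noise is independent of the fitted prediction
+ \begin{frame}[t]{Ex09: Why the Decomposition Holds (Sketch)}
+ For a fresh measurement $y = t + n$ with $\mathrm{E}[n] = 0$, $\mathrm{Var}[n] = \sigma^2$, noise independent of the fitted prediction $\hat{y}$, and mean prediction $\tilde{y} = \mathrm{E}[\hat{y}]$:
+ \begin{align*}
+ \mathrm{E}\big[(y - \hat{y})^2\big]
+ &= \mathrm{E}\big[(t - \hat{y})^2\big] + \sigma^2\\
+ &= \underbrace{(t - \tilde{y})^2}_{\text{bias}^2}
+  + \underbrace{\mathrm{E}\big[(\hat{y} - \tilde{y})^2\big]}_{\text{variance}}
+  + \underbrace{\sigma^2}_{\text{noise variance}}
+ \end{align*}
+ Both cross terms vanish: $\mathrm{E}[n] = 0$ in the first step, $\mathrm{E}[\hat{y} - \tilde{y}] = 0$ in the second.
+ \end{frame}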

+
+
+ \begin{frame}[t]{Bias-Variance Trade-Off vs. Regularisation}
+ %
+ \vspace{-1em}
+ %
+ total error = (model bias$^2$) + (model variance) + (data noise variance)
+ %
+ \begin{table}[]
+ \begin{tabular}{|l|l|l|}
+ \hline
+ true model & lowest bias$^2$ & lowest variance\\\hline
+ high regularisation & high bias$^2$ & low variance\\\hline
+ low regularisation & low bias$^2$ & high variance\\\hline
+ optimum regularisation & \multicolumn{2}{l|}{optimum bias$^2$ + variance}\\\hline
+ \end{tabular}
+ \end{table}
+ %
+ \vspace{-0.5em}
+ %
+ \begin{center}
+ \includegraphics[width=0.8\textwidth]{../bias_variance_plots/bias_var_l2_regularisation.png}
+ \end{center}
+ %
+ \end{frame}
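+
+ % self-contained NumPy sketch of the effect summarised above; numpy and all
+ % names/values below are illustrative assumptions, not the course code
+ \begin{frame}[fragile]{Bias-Variance Trade-Off vs. Regularisation: Sketch}
+ A possible numerical check of the table above: larger $\lambda$ (stronger L2 penalty) raises bias$^2$ and lowers variance.
+ \begin{verbatim}
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ M, L, sigma = 50, 500, 1.0
+ x = 2 * np.pi / M * np.arange(M)
+ t = 3 + 2 * np.cos(x) + np.sin(2 * x)           # ground truth
+ X = np.column_stack([np.ones(M)]                # deliberately flexible model
+                     + [np.cos(k * x) for k in range(1, 8)]
+                     + [np.sin(k * x) for k in range(1, 8)])
+
+ for lam in [0.0, 0.1, 1.0, 10.0, 100.0]:
+     P = []
+     for _ in range(L):                          # refit on L noisy data sets
+         y = t + rng.normal(0.0, sigma, M)
+         b = np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)
+         P.append(X @ b)
+     P = np.array(P)
+     y_tilde = P.mean(axis=0)
+     bias2 = np.mean((t - y_tilde) ** 2)
+     variance = np.mean((P - y_tilde) ** 2)
+     print(f"lambda={lam:6.1f}  bias^2={bias2:.4f}  variance={variance:.4f}")
+ \end{verbatim}
+ \end{frame}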
+
+
+
+ \begin{frame}[t]{Bias-Variance Trade-Off: Intro Example}
+ %
+ \vspace{-1em}
+ %
+ $\cdot$ ground truth model ($N = 1+4 = 5$ features) with a full column rank $M \times N$ design matrix, $M > N$
+ $$\bm{x}_1 = \frac{2\pi}{M} \cdot 0,\quad \bm{x}_2 = \frac{2\pi}{M} \cdot 1,\quad \dots,\quad \bm{x}_M = \frac{2\pi}{M} \cdot (M-1)$$
+ $$
+ \bm{X}_t =
+ \begin{bmatrix}
+ 1 & \cos(\bm{x}_1) & \sin(2\bm{x}_1) & \cos(5\bm{x}_1) & \cos(6\bm{x}_1) \\
+ 1 & \cos(\bm{x}_2) & \sin(2\bm{x}_2) & \cos(5\bm{x}_2) & \cos(6\bm{x}_2) \\
+ \vdots & \vdots & \vdots & \vdots & \vdots \\
+ 1 & \cos(\bm{x}_M) & \sin(2\bm{x}_M) & \cos(5\bm{x}_M) & \cos(6\bm{x}_M) \\
+ \end{bmatrix}\qquad
+ \bm{\beta}_t =
+ \begin{bmatrix}
+ 3 \\ 2 \\ 1 \\ \nicefrac{1}{2} \\ \nicefrac{1}{4}
+ \end{bmatrix}
+ \qquad
+ \bm{t} = \bm{X}_t \bm{\beta}_t
+ $$
+
+ $\cdot$ zero-mean, fixed-variance noise $\bm{n}$ $\rightarrow$ $L$ noisy measurement sets $\bm{y}^{(l)}$, $1 \leq l \leq L$
+ $$\bm{y}^{(l)} = \bm{t} + \bm{n}^{(l)}$$
+
+ $\cdot$ OLS with a model design matrix $\bm{X}$ and the $l$-th data set $\bm{y}^{(l)}$
+ \begin{align*}
+ &\hat{\bm{\beta}}^{(l)} = \quad \,\,\,\, (\bm{X}^\mathrm{T} \bm{X})^{-1} \bm{X}^\mathrm{T} \bm{y}^{(l)}\\
+ \hat{\bm{y}}^{(l)} = \bm{X}\cdot &\hat{\bm{\beta}}^{(l)} = \bm{X}\cdot (\bm{X}^\mathrm{T} \bm{X})^{-1} \bm{X}^\mathrm{T} \bm{y}^{(l)}
+ \end{align*}
+
+ $\cdot$ measurement: $\bm{y}_m^{(l)}$ is the $m$-th entry of vector $\bm{y}^{(l)}$,\quad prediction: $\hat{\bm{y}}_m^{(l)}$ is the $m$-th entry of vector $\hat{\bm{y}}^{(l)}$
+
+ \end{frame}
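+
+ % minimal NumPy sketch of the construction above; numpy and the variable
+ % names below are illustrative assumptions, not the exercise code
+ \begin{frame}[fragile]{Bias-Variance Trade-Off: Intro Example as a NumPy Sketch}
+ One way the ground truth, the $L$ noisy data sets and the per-set OLS fits could be set up:
+ \begin{verbatim}
+ import numpy as np
+
+ M, L = 200, 100                        # samples per set, number of data sets
+ x = 2 * np.pi / M * np.arange(M)       # x_1 ... x_M
+ X_t = np.column_stack([np.ones(M), np.cos(x), np.sin(2*x),
+                        np.cos(5*x), np.cos(6*x)])
+ beta_t = np.array([3, 2, 1, 1/2, 1/4])
+ t = X_t @ beta_t                       # noise-free ground truth
+
+ rng = np.random.default_rng(0)
+ Y = t + rng.normal(0.0, 1.0, size=(L, M))    # row l is the data set y^(l)
+
+ X = X_t                                # model design matrix (here: true model)
+ B_hat = np.linalg.solve(X.T @ X, X.T @ Y.T)  # OLS for all L sets at once
+ Y_hat = (X @ B_hat).T                  # row l is the prediction for set l
+ \end{verbatim}
+ \end{frame}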
+
+ \begin{frame}[t]{Bias-Variance Trade-Off: Math}
+ %
+ \vspace{-1em}
+ %
+ $\cdot$ mean of all predictions $\rightarrow$ 1st order raw moment
+ $$
+ \begin{bmatrix}
+ |\\ \tilde{\bm{y}}\\ |
+ \end{bmatrix}
+ =
+ \frac{1}{L}
+ \left(
+ \begin{bmatrix}
+ |\\ \hat{\bm{y}}^{(1)}\\ |
+ \end{bmatrix}
+ +
+ \begin{bmatrix}
+ |\\ \hat{\bm{y}}^{(2)}\\ |
+ \end{bmatrix}
+ +
+ \dots
+ +
+ \begin{bmatrix}
+ |\\ \hat{\bm{y}}^{(L)}\\ |
+ \end{bmatrix}
+ \right)
+ $$
+ %
+ $\cdot$ \underline{bias$^2$}: how much the mean of all predictions deviates from the true data $\rightarrow$ 2nd order moment
+ $$
+ \begin{bmatrix}
+ |\\ \bm{e}_b\\ |
+ \end{bmatrix} =
+ \begin{bmatrix}
+ |\\ \bm{t}\\ |
+ \end{bmatrix} -
+ \begin{bmatrix}
+ |\\ \tilde{\bm{y}}\\ |
+ \end{bmatrix}
+ \qquad
+ \text{bias}^2 = \frac{1}{M}\bm{e}_b^\mathrm{T} \bm{e}_b = \frac{1}{M} \sum\limits_{m=1}^{M} (\bm{t}_m - \tilde{\bm{y}}_m)^2
+ $$
+ %
+ $\cdot$ mean of the squared deviations of the predictions from their mean $\tilde{\bm{y}}$ $\rightarrow$ 2nd order centralized moment
+ $$
+ \begin{bmatrix}
+ |\\ \bm{v}\\ |
+ \end{bmatrix}
+ =
+ \frac{1}{L}
+ \left(
+ \begin{bmatrix}
+ |\\ (\hat{\bm{y}}^{(1)}-\tilde{\bm{y}})^2 \\ |
+ \end{bmatrix}
+ +
+ \begin{bmatrix}
+ |\\ (\hat{\bm{y}}^{(2)}-\tilde{\bm{y}})^2 \\ |
+ \end{bmatrix}
+ +
+ \dots
+ +
+ \begin{bmatrix}
+ |\\ (\hat{\bm{y}}^{(L)}-\tilde{\bm{y}})^2 \\ |
+ \end{bmatrix}
+ \right)
+ $$
+ %
+ $\cdot$ \underline{variance}: we want a single number summarising $\bm{v}$ $\rightarrow$ 1st order raw moment (= mean)
+ $$
+ \text{variance} = \frac{1}{M} \sum\limits_{m=1}^{M} \bm{v}_m
+ $$
+
+ \end{frame}
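+
+ % continuation of the NumPy sketch (assumes t and Y_hat from the intro-example
+ % sketch are still in scope); computes bias^2 and variance exactly as defined
+ \begin{frame}[fragile]{Bias-Variance Trade-Off: Math as a NumPy Sketch}
+ The three moments above, written out for the $L \times M$ prediction array:
+ \begin{verbatim}
+ y_tilde = Y_hat.mean(axis=0)                 # mean of all predictions
+ bias2 = np.mean((t - y_tilde) ** 2)          # (1/M) sum_m (t_m - y~_m)^2
+ v = np.mean((Y_hat - y_tilde) ** 2, axis=0)  # per-sample spread of predictions
+ variance = v.mean()                          # single-number summary
+ \end{verbatim}
+ \end{frame}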
+
+
+ \begin{frame}[t]{Bias-Variance Trade-Off: Essence of Example}
+ %
+ \vspace{-1em}
+ %
+ \begin{center}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=12cm,
+ height=6cm,
+ legend style={at={(0.015,0.65)}, anchor=north west},
+ xtick={-6,0,6},
+ xticklabels={too simple, robust, too complex},
+ xlabel = {model complexity / \# of non-zero model parameters},
+ ytick={0,1},
+ yticklabels={low, high},
+ ylabel = {bias$^2$ / variance},
+ ]
+ \addplot [domain=-6:6, C0, ultra thick, samples=32] {1-1/(1+exp(-x))};
+ \addplot [domain=-6:6, C1, ultra thick, samples=32] {1/(1+exp(-x))};
+ \addlegendentry{bias$^2$}
+ \addlegendentry{variance}
+ \end{axis}
+ \end{tikzpicture}
+ \end{center}
+ %
+ \begin{align*}
+ \bm{X} =
+ \begin{bmatrix}
+ 1 & \bm{x}_1\\
+ 1 & \bm{x}_2\\
+ \vdots & \vdots \\
+ 1 & \bm{x}_M
+ \end{bmatrix}
+ %
+ \qquad\qquad
+ \bm{X} =
+ \begin{bmatrix}
+ 1 & \cos(\bm{x}_1) & \sin(2\bm{x}_1)\\
+ 1 & \cos(\bm{x}_2) & \sin(2\bm{x}_2)\\
+ \vdots & \vdots & \vdots \\
+ 1 & \cos(\bm{x}_M) & \sin(2\bm{x}_M)
+ \end{bmatrix}
+ %
+ \qquad\qquad
+ \bm{X} = ?
+ \end{align*}
+
+ \end{frame}
+
+
+ \begin{frame}[t]{Example: True Data}
+ \centering
+ \includegraphics[width=0.8\textwidth]{../bias_variance_plots/true_data.png}
+ \end{frame}
+
+ \begin{frame}[t]{Example: True Model}
+ \centering
+ \includegraphics[width=1\textwidth]{../bias_variance_plots/true_model.png}
+ \end{frame}
+
+ \begin{frame}[t]{Example: Model Too Simple}
+ \centering
+ \includegraphics[width=1\textwidth]{../bias_variance_plots/too_simple_model.png}
+ \end{frame}
+
+ \begin{frame}[t]{Example: Model Too Complex}
+ \centering
+ \includegraphics[width=1\textwidth]{../bias_variance_plots/too_complex_model.png}
+ \end{frame}
+
+ \begin{frame}[t]{Example: Robust Model}
+ \centering
+ \includegraphics[width=1\textwidth]{../bias_variance_plots/robust_model.png}
+ \end{frame}
+
+
+ \begin{frame}[t]{Empirical Correlation Coefficient $R^2$ Between $\mathbf{y}$ and $\hat{\mathbf{y}}$}
+ \vspace{-1em}
+ $\cdot$ measured $\bm{y}^{(l)}$, predicted $\hat{\bm{y}}^{(l)}$
+
+ $\cdot$ all quantities below are computed for the $l$-th data set; the index $l$ is omitted:
+
+ - Sum of Squares \textbf{Error} (SS\textbf{E})
+ $$\mathrm{SSE} = \sum_{m=1}^{M} (\bm{y}_m - \hat{\bm{y}}_m)^2 = (\bm{y} - \bm{X}\hat{\bm{\beta}})^\mathrm{T} (\bm{y} - \bm{X}\hat{\bm{\beta}})$$
+
+ - mean of the measured data
+ $$\bar{y} = \frac{1}{M} \sum_{m=1}^{M} \bm{y}_m$$
+
+ - Sum of Squares \textbf{Total} (SS\textbf{T})
+ $$\mathrm{SST} = \sum_{m=1}^{M} (\bm{y}_m - \bar{y})^2$$
+
+ - Sum of Squares (due to) \textbf{Regression} (SS\textbf{R})
+ $$\mathrm{SSR} = \sum_{m=1}^{M} (\hat{\bm{y}}_m - \bar{y})^2$$
+
+ $$\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE} \qquad \text{(for OLS models with an intercept)}$$
+
+ \end{frame}
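+
+ % short reasoning sketch for the identity above; assumes an OLS fit whose
+ % design matrix contains an intercept (all-ones) column
+ \begin{frame}[t]{Why $\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE}$ (Sketch)}
+ Split each deviation as $\bm{y}_m - \bar{y} = (\bm{y}_m - \hat{\bm{y}}_m) + (\hat{\bm{y}}_m - \bar{y})$ and sum the squares:
+ $$\mathrm{SST} = \mathrm{SSE} + \mathrm{SSR} + 2 \sum_{m=1}^{M} (\bm{y}_m - \hat{\bm{y}}_m)(\hat{\bm{y}}_m - \bar{y})$$
+ For OLS the residual $\bm{y} - \bm{X}\hat{\bm{\beta}}$ is orthogonal to every column of $\bm{X}$, hence to $\hat{\bm{y}} = \bm{X}\hat{\bm{\beta}}$ and, if $\bm{X}$ contains an intercept column, also to the constant vector $\bar{y}\cdot\bm{1}$. The cross term therefore vanishes.
+ \end{frame}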
+
+ \begin{frame}[t]{Empirical Correlation Coefficient $R^2$ Between $\mathbf{y}$ and $\hat{\mathbf{y}}$}
+ \vspace{-1em}
+ $$\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE}$$
+
+ $\cdot$ empirical correlation coefficient or coefficient of determination $0 \leq R^2 \leq 1$
+
+ $$R^2 = \frac{\mathrm{SSR}}{\mathrm{SST}} = \frac{\mathrm{SST}-\mathrm{SSE}}{\mathrm{SST}} = 1 - \frac{\mathrm{SSE}}{\mathrm{SST}}$$
+
+ $\cdot$ adjust for the number of data samples $M$ and the number of features $N$
+ $$R_\text{adjusted}^2 = 1 - \frac{\frac{\mathrm{SSE}}{M-N}}{\frac{\mathrm{SST}}{M-1}}$$
+
+ $\cdot$ this form of $R_\text{adjusted}^2$ holds only for models with an intercept!
+
+ \vspace{1em}
+
+ $\cdot$ hence: measured $\bm{y}^{(l)}$, model design matrix $\bm{X}$, fitted $\hat{\bm{\beta}}^{(l)}$, predicted $\hat{\bm{y}}^{(l)}$ $\rightarrow$ $R_\text{adjusted}^{2,(l)}$
+
+ \end{frame}
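+
+ % small Python sketch of the bookkeeping above (numpy assumed; the function
+ % name and signature are illustrative, not part of the exercise code)
+ \begin{frame}[fragile]{$R^2$ and $R_\text{adjusted}^2$: Sketch}
+ A possible helper computing both scores for one data set:
+ \begin{verbatim}
+ import numpy as np
+
+ def r2_scores(y, y_hat, N):
+     """y: measured, y_hat: predicted, N: number of model columns (incl. 1s)."""
+     M = len(y)
+     sse = np.sum((y - y_hat) ** 2)
+     sst = np.sum((y - y.mean()) ** 2)
+     ssr = np.sum((y_hat - y.mean()) ** 2)   # sse + ssr == sst (with intercept)
+     r2 = 1 - sse / sst
+     r2_adj = 1 - (sse / (M - N)) / (sst / (M - 1))
+     return r2, r2_adj
+ \end{verbatim}
+ \end{frame}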
+
+
\subsection{Exercise 10}
\begin{frame}{Ex 10: Gradient Descent}