
Commit 5d239a8

Update ddasp_exercise_slides.tex
add bias/var slides

1 parent dc0c035

File tree

1 file changed: +317 -1 lines changed

slides/ddasp_exercise_slides.tex

Lines changed: 317 additions & 1 deletion
@@ -3115,11 +3115,327 @@ \subsection{Exercise 09}
\item example with Fourier series, i.e. polynomial regression
\end{itemize}

-no slides so far
+\end{frame}

\begin{frame}[t]{Ex09: Bias Variance Trade-Off vs. Model Complexity}
%
\vspace{-1em}
%
total variance = (model bias$^2$) + (model variance) + (data noise variance)
%
\begin{table}
\begin{tabular}{|l|l|l|}
\hline
true model & lowest bias$^2$ & lowest variance\\\hline
low model complexity & high bias$^2$ & low variance\\\hline
high model complexity & low bias$^2$ & high variance\\\hline
optimum model complexity & \multicolumn{2}{l|}{optimum bias$^2$+variance}\\\hline
\end{tabular}
\end{table}
%
\begin{center}
\begin{tikzpicture}
\begin{axis}[
width=12cm,
height=6cm,
legend style={at={(0.015,0.65)}, anchor=north west},
xtick={-6,0,6},
xticklabels={(too) low, optimum, (too) high},
xlabel = {model complexity / \# of non-zero model parameters},
ytick={0,1},
yticklabels={low, high},
ylabel = {bias$^2$ / variance},
]
\addplot[domain=-6:6, C0, ultra thick, samples=32] {1-1/(1+exp(-x))};
\addplot[domain=-6:6, C1, ultra thick, samples=32] {1/(1+exp(-x))};
\addlegendentry{bias$^2$}
\addlegendentry{variance}
\end{axis}
\end{tikzpicture}
\end{center}
%
\end{frame}
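As a backup for the "total variance" line above: written out per sample $m$, this is the standard expected-squared-error decomposition (a sketch; the expectation $\mathbb{E}$ over data sets/noise realisations and the noise-variance symbol $\sigma_n^2$ are my notation, not from the slides, and assume amssymb is loaded):

% standard bias-variance decomposition of the expected squared prediction error (sketch)
$$
\mathbb{E}\!\left[(\bm{y}_m - \hat{\bm{y}}_m)^2\right]
= \underbrace{\big(\bm{t}_m - \mathbb{E}[\hat{\bm{y}}_m]\big)^2}_{\text{model bias}^2}
+ \underbrace{\mathbb{E}\!\left[\big(\hat{\bm{y}}_m - \mathbb{E}[\hat{\bm{y}}_m]\big)^2\right]}_{\text{model variance}}
+ \underbrace{\sigma_n^2}_{\text{data noise variance}}
$$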
\begin{frame}[t]{Bias Variance Trade-Off vs. Regularisation}
%
\vspace{-1em}
%
total variance = (model bias$^2$) + (model variance) + (data noise variance)
%
\begin{table}
\begin{tabular}{|l|l|l|}
\hline
true model & lowest bias$^2$ & lowest variance\\\hline
high regularisation & high bias$^2$ & low variance\\\hline
low regularisation & low bias$^2$ & high variance\\\hline
optimum regularisation & \multicolumn{2}{l|}{optimum bias$^2$+variance}\\\hline
\end{tabular}
\end{table}
%
\vspace{-0.5em}
%
\begin{center}
\includegraphics[width=0.8\textwidth]{../bias_variance_plots/bias_var_l2_regularisation.png}
\end{center}
%
\end{frame}
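A minimal NumPy sketch of the kind of experiment behind the plot (bias_var_l2_regularisation.png): an L2-regularised (ridge) least-squares estimator $\hat{\bm{\beta}} = (\bm{X}^\mathrm{T}\bm{X} + \lambda\bm{I})^{-1}\bm{X}^\mathrm{T}\bm{y}$ swept over $\lambda$, reusing the ground-truth model from the "Intro Example" slide below; the $\lambda$ grid, noise level, number of data sets L, and the choice to regularise the intercept as well are my assumptions, not the repository's plotting script:

import numpy as np

rng = np.random.default_rng(0)

M, N, L = 64, 5, 200                  # samples per set, features, number of noisy data sets
x = 2 * np.pi * np.arange(M) / M
X = np.column_stack([np.ones(M), np.cos(x), np.sin(2 * x),
                     np.cos(5 * x), np.cos(6 * x)])   # here: fit with the true features
t = X @ np.array([3, 2, 1, 0.5, 0.25])                # noise-free true data

for lam in [1e-3, 1e-1, 1e1, 1e3]:                    # regularisation sweep
    Y_hat = np.empty((L, M))
    for l in range(L):
        y = t + rng.normal(scale=1.0, size=M)         # l-th noisy measurement set
        beta_hat = np.linalg.solve(X.T @ X + lam * np.eye(N), X.T @ y)  # ridge solution
        Y_hat[l] = X @ beta_hat
    y_mean = Y_hat.mean(axis=0)                       # mean prediction over all data sets
    bias2 = np.mean((t - y_mean) ** 2)                # squared bias
    var = np.mean((Y_hat - y_mean) ** 2)              # model variance
    print(f"lambda={lam:8.3g}  bias^2={bias2:.4f}  variance={var:.4f}")

Large $\lambda$ pushes bias$^2$ up and variance down, small $\lambda$ the other way round, matching the table above.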
\begin{frame}[t]{Bias Variance Trade-Off: Intro Example}
%
\vspace{-1em}
%
$\cdot$ ground truth model ($N=1+4=5$ features) with full column rank $M \times N$ matrix, $M>N$
$$\bm{x}_1 = \frac{2\pi}{M} \cdot 0,\quad\bm{x}_2 = \frac{2\pi}{M} \cdot 1,\quad\dots,\quad\bm{x}_M = \frac{2\pi}{M} \cdot (M-1)$$
$$
\bm{X}_t =
\begin{bmatrix}
1 & \cos(\bm{x}_1) & \sin(2\bm{x}_1) & \cos(5\bm{x}_1) & \cos(6\bm{x}_1) \\
1 & \cos(\bm{x}_2) & \sin(2\bm{x}_2) & \cos(5\bm{x}_2) & \cos(6\bm{x}_2)\\
\vdots & \vdots & \vdots & \vdots & \vdots\\
1 & \cos(\bm{x}_M) & \sin(2\bm{x}_M) & \cos(5\bm{x}_M) & \cos(6\bm{x}_M)\\
\end{bmatrix}\qquad
\bm{\beta}_t =
\begin{bmatrix}
3\\2\\1\\\nicefrac{1}{2}\\\nicefrac{1}{4}
\end{bmatrix}
\qquad
\bm{t} = \bm{X}_t \bm{\beta}_t
$$

$\cdot$ zero-mean, fixed-variance noise $\bm{n}$ $\rightarrow$ $L$ measurement data sets $\bm{y}^{(l)}$, $1 \leq l \leq L$
$$\bm{y}^{(l)} = \bm{t} + \bm{n}^{(l)}$$

$\cdot$ OLS with a model design matrix $\bm{X}$ and the $l$-th data set $\bm{y}^{(l)}$
\begin{align*}
&\hat{\bm{\beta}}^{(l)} = \quad\,\,\,\,(\bm{X}^\mathrm{T} \bm{X})^{-1} \bm{X}^\mathrm{T} \bm{y}^{(l)}\\
\hat{\bm{y}}^{(l)} = \bm{X}\cdot &\hat{\bm{\beta}}^{(l)} = \bm{X}\cdot (\bm{X}^\mathrm{T} \bm{X})^{-1} \bm{X}^\mathrm{T} \bm{y}^{(l)}
\end{align*}

$\cdot$ measurement: $\bm{y}_m^{(l)}$ is the $m$-th entry of vector $\bm{y}^{(l)}$,\quad prediction: $\hat{\bm{y}}_m^{(l)}$ is the $m$-th entry of vector $\hat{\bm{y}}^{(l)}$

\end{frame}
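A minimal NumPy sketch of this setup (variable names, the noise standard deviation of 1, and $L=200$ are my choices, not given on the slide; the fitting matrix $\bm{X}$ here uses the three features of the "robust" candidate shown on the later "Essence of Example" slide):

import numpy as np

rng = np.random.default_rng(0)

M, L = 64, 200                        # data samples per set, number of measurement sets
x = 2 * np.pi * np.arange(M) / M      # x_1 ... x_M

# ground truth: full column rank M x 5 matrix X_t and beta_t
X_t = np.column_stack([np.ones(M), np.cos(x), np.sin(2 * x),
                       np.cos(5 * x), np.cos(6 * x)])
beta_t = np.array([3, 2, 1, 1 / 2, 1 / 4])
t = X_t @ beta_t                      # true data t = X_t beta_t

# model design matrix used for the OLS fits (3-feature "robust" candidate)
X = np.column_stack([np.ones(M), np.cos(x), np.sin(2 * x)])

Y_hat = np.empty((L, M))              # predictions yhat^(l), stacked row-wise
for l in range(L):
    y_l = t + rng.normal(scale=1.0, size=M)          # y^(l) = t + n^(l), zero-mean noise
    beta_hat = np.linalg.solve(X.T @ X, X.T @ y_l)   # OLS: (X^T X)^{-1} X^T y^(l)
    Y_hat[l] = X @ beta_hat                          # prediction yhat^(l) = X beta_hat^(l)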
\begin{frame}[t]{Bias Variance Trade-Off: Math}
%
\vspace{-1em}
%
$\cdot$ mean of all predictions $\rightarrow$ 1st order raw moment
$$
\begin{bmatrix}
|\\\tilde{\bm{y}}\\|
\end{bmatrix}
=
\frac{1}{L}
\left(
\begin{bmatrix}
|\\\hat{\bm{y}}^{(1)}\\|
\end{bmatrix}
+\begin{bmatrix}
|\\\hat{\bm{y}}^{(2)}\\|
\end{bmatrix}
+
\dots
+
\begin{bmatrix}
|\\\hat{\bm{y}}^{(L)}\\|
\end{bmatrix}
\right)
$$
%
$\cdot$ \underline{bias$^2$}: how much the mean of all predictions deviates from the true data $\rightarrow$ 2nd order moment
$$
\begin{bmatrix}
|\\\bm{e}_b\\|
\end{bmatrix}=
\begin{bmatrix}
|\\\bm{t}\\|
\end{bmatrix}-
\begin{bmatrix}
|\\\tilde{\bm{y}}\\|
\end{bmatrix}
\qquad
\text{bias}^2 = \frac{1}{M}\bm{e}_b^\mathrm{T} \bm{e}_b = \frac{1}{M} \sum\limits_{m=1}^{M} (\bm{t}_m - \tilde{\bm{y}}_m)^2
$$
%
$\cdot$ mean of the squared deviations of the predictions from their mean $\tilde{\bm{y}}$ $\rightarrow$ 2nd order centralized moment
$$
\begin{bmatrix}
|\\\bm{v}\\|
\end{bmatrix}
=
\frac{1}{L}
\left(
\begin{bmatrix}
|\\(\hat{\bm{y}}^{(1)}-\tilde{\bm{y}})^2\\|
\end{bmatrix}
+\begin{bmatrix}
|\\(\hat{\bm{y}}^{(2)}-\tilde{\bm{y}})^2\\|
\end{bmatrix}
+
\dots
+
\begin{bmatrix}
|\\(\hat{\bm{y}}^{(L)}-\tilde{\bm{y}})^2\\|
\end{bmatrix}
\right)
$$
%
$\cdot$ \underline{variance}: we want a single number summarising $\bm{v}$ $\rightarrow$ 1st order raw moment (=mean)
$$
\text{variance} = \frac{1}{M} \sum\limits_{m=1}^{M} \bm{v}_m
$$

\end{frame}
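Continuing the sketch from the intro-example slide (Y_hat stacking the $L$ predictions row-wise, t the true data), the three quantities above can be evaluated as follows; the function name and its arguments are illustrative:

import numpy as np

def bias2_and_variance(Y_hat, t):
    """Y_hat: (L, M) array of predictions yhat^(l); t: (M,) true data."""
    y_tilde = Y_hat.mean(axis=0)                  # mean of all predictions (1st raw moment)
    e_b = t - y_tilde                             # bias error vector e_b = t - y_tilde
    bias2 = np.mean(e_b ** 2)                     # bias^2 = 1/M * e_b^T e_b
    v = np.mean((Y_hat - y_tilde) ** 2, axis=0)   # per-sample 2nd order centralized moment
    variance = np.mean(v)                         # single number: mean of v
    return bias2, variance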
\begin{frame}[t]{Bias Variance Trade-Off: Essence of Example}
%
\vspace{-1em}
%
\begin{center}
\begin{tikzpicture}
\begin{axis}[
width=12cm,
height=6cm,
legend style={at={(0.015,0.65)}, anchor=north west},
xtick={-6,0,6},
xticklabels={too simple, robust, too complex},
xlabel = {model complexity / \# of non-zero model parameters},
ytick={0,1},
yticklabels={low, high},
ylabel = {bias$^2$ / variance},
]
\addplot[domain=-6:6, C0, ultra thick, samples=32] {1-1/(1+exp(-x))};
\addplot[domain=-6:6, C1, ultra thick, samples=32] {1/(1+exp(-x))};
\addlegendentry{bias$^2$}
\addlegendentry{variance}
\end{axis}
\end{tikzpicture}
\end{center}
%
\begin{align*}
\bm{X} =
\begin{bmatrix}
1 & \bm{x}_1\\
1 & \bm{x}_2\\
\vdots & \vdots\\
1 & \bm{x}_M
\end{bmatrix}
%
\qquad\qquad
\bm{X} =
\begin{bmatrix}
1 & \cos(\bm{x}_1) & \sin(2\bm{x}_1)\\
1 & \cos(\bm{x}_2) & \sin(2\bm{x}_2)\\
\vdots & \vdots & \vdots\\
1 & \cos(\bm{x}_M) & \sin(2\bm{x}_M)
\end{bmatrix}
%
\qquad\qquad
\bm{X}=?
\end{align*}

\end{frame}
\begin{frame}[t]{Example: True Data}
\centering
\includegraphics[width=0.8\textwidth]{../bias_variance_plots/true_data.png}
\end{frame}

\begin{frame}[t]{Example: True Model}
\centering
\includegraphics[width=1\textwidth]{../bias_variance_plots/true_model.png}
\end{frame}

\begin{frame}[t]{Example: Model Too Simple}
\centering
\includegraphics[width=1\textwidth]{../bias_variance_plots/too_simple_model.png}
\end{frame}

\begin{frame}[t]{Example: Model Too Complex}
\centering
\includegraphics[width=1\textwidth]{../bias_variance_plots/too_complex_model.png}
\end{frame}

\begin{frame}[t]{Example: Robust Model}
\centering
\includegraphics[width=1\textwidth]{../bias_variance_plots/robust_model.png}
\end{frame}

\begin{frame}[t]{Empirical Correlation Coefficient $R^2$ Between $\mathbf{y}$ and $\hat{\mathbf{y}}$}
\vspace{-1em}
$\cdot$ measured $\bm{y}^{(l)}$, predicted $\hat{\bm{y}}^{(l)}$

$\cdot$ we calculate everything for the $l$-th data set but omit the index $l$:

- Sum of Squares \textbf{Error} (SS\textbf{E})
$$\mathrm{SSE} = \sum_{m=1}^{M} (\bm{y}_m - \hat{\bm{y}}_m)^2 = (\bm{y} - \bm{X}\hat{\bm{\beta}})^\mathrm{T} (\bm{y} - \bm{X}\hat{\bm{\beta}})$$

- mean of measured data
$$\bar{{y}} = \frac{1}{M} \sum_{m=1}^{M} \bm{y}_m$$

- Sum of Squares \textbf{Total} (SS\textbf{T})
$$\mathrm{SST} = \sum_{m=1}^{M} (\bm{y}_m - \bar{{y}})^2$$

- Sum of Squares (due to) \textbf{Regression} (SS\textbf{R})
$$\mathrm{SSR} = \sum_{m=1}^{M} (\hat{\bm{y}}_m - \bar{{y}})^2$$

$$\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE}$$

\end{frame}
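A small sketch of these sums of squares (assuming an OLS fit with intercept, for which the $\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE}$ identity above holds; names are illustrative):

import numpy as np

def sums_of_squares(y, y_hat):
    """Return SSE, SST, SSR for measured y and predicted y_hat, both of shape (M,)."""
    y_bar = np.mean(y)                    # mean of the measured data
    sse = np.sum((y - y_hat) ** 2)        # Sum of Squares Error
    sst = np.sum((y - y_bar) ** 2)        # Sum of Squares Total
    ssr = np.sum((y_hat - y_bar) ** 2)    # Sum of Squares due to Regression
    return sse, sst, ssr

# for an OLS prediction with intercept, SST = SSR + SSE up to round-off:
# sse, sst, ssr = sums_of_squares(y_l, X @ beta_hat)
# assert np.isclose(sst, ssr + sse)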
\begin{frame}[t]{Empirical Correlation Coefficient $R^2$ Between $\mathbf{y}$ and $\hat{\mathbf{y}}$}
\vspace{-1em}
$$\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE}$$

$\cdot$ squared empirical correlation coefficient, also known as coefficient of determination, $0 \leq R^2 \leq 1$

$$R^2 = \frac{\mathrm{SSR}}{\mathrm{SST}} = \frac{\mathrm{SST}-\mathrm{SSE}}{\mathrm{SST}} = 1 - \frac{\mathrm{SSE}}{\mathrm{SST}}$$

$\cdot$ normalise to account for the number of data samples $M$ and the number of features $N$
$$R_\text{adjusted}^2 = 1 - \frac{\frac{\mathrm{SSE}}{M-N}}{\frac{\mathrm{SST}}{M-1}}$$

$\cdot$ $R_\text{adjusted}^2$ holds for models with intercept!

\vspace{1em}

$\cdot$ hence: measured $\bm{y}^{(l)}$, model design matrix $\bm{X}$, fitted $\hat{\bm{\beta}}^{(l)}$, predicted $\hat{\bm{y}}^{(l)}$ $\rightarrow$ $R_\text{adjusted}^{2,(l)}$

\end{frame}
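The two coefficients then follow directly from SSE and SST; a short sketch with $M$ data samples and $N$ features (intercept column included), mirroring the formulas above:

def r_squared(sse, sst):
    """Coefficient of determination R^2 = 1 - SSE/SST."""
    return 1.0 - sse / sst

def r_squared_adjusted(sse, sst, M, N):
    """Adjusted R^2, penalising the number of features N relative to M samples."""
    return 1.0 - (sse / (M - N)) / (sst / (M - 1))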
\subsection{Exercise 10}
\begin{frame}{Ex 10: Gradient Descent}