diff --git a/chapter_attention-mechanisms-and-transformers/attention-pooling.md b/chapter_attention-mechanisms-and-transformers/attention-pooling.md index c2ba4dbb7e..0949b1f6c6 100644 --- a/chapter_attention-mechanisms-and-transformers/attention-pooling.md +++ b/chapter_attention-mechanisms-and-transformers/attention-pooling.md @@ -8,7 +8,7 @@ At their core, Nadaraya--Watson estimators rely on some similarity kernel $\alph $$\begin{aligned} \alpha(\mathbf{q}, \mathbf{k}) & = \exp\left(-\frac{1}{2} \|\mathbf{q} - \mathbf{k}\|^2 \right) && \textrm{Gaussian;} \\ \alpha(\mathbf{q}, \mathbf{k}) & = 1 \textrm{ if } \|\mathbf{q} - \mathbf{k}\| \leq 1 && \textrm{Boxcar;} \\ -\alpha(\mathbf{q}, \mathbf{k}) & = \mathop{\mathrm{max}}\left(0, 1 - \|\mathbf{q} - \mathbf{k}\|\right) && \textrm{Epanechikov.} +\alpha(\mathbf{q}, \mathbf{k}) & = \mathop{\mathrm{max}}\left(0, 1 - \|\mathbf{q} - \mathbf{k}\|\right) && \textrm{Triangular.} \end{aligned} $$ @@ -77,16 +77,16 @@ def constant(x): return 1.0 + 0 * x if tab.selected('pytorch'): - def epanechikov(x): + def triangular(x): return torch.max(1 - d2l.abs(x), torch.zeros_like(x)) if tab.selected('mxnet'): - def epanechikov(x): + def triangular(x): return np.maximum(1 - d2l.abs(x), 0) if tab.selected('tensorflow'): - def epanechikov(x): + def triangular(x): return tf.maximum(1 - d2l.abs(x), 0) if tab.selected('jax'): - def epanechikov(x): + def triangular(x): return jnp.maximum(1 - d2l.abs(x), 0) ``` @@ -94,8 +94,8 @@ if tab.selected('jax'): %%tab all fig, axes = d2l.plt.subplots(1, 4, sharey=True, figsize=(12, 3)) -kernels = (gaussian, boxcar, constant, epanechikov) -names = ('Gaussian', 'Boxcar', 'Constant', 'Epanechikov') +kernels = (gaussian, boxcar, constant, triangular) +names = ('Gaussian', 'Boxcar', 'Constant', 'Triangular') x = d2l.arange(-2.5, 2.5, 0.1) for kernel, name, ax in zip(kernels, names, axes): if tab.selected('pytorch', 'mxnet', 'tensorflow'):