From a86bdbf2a56a20eb594e975af5ff3469e0c7f948 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 29 Mar 2026 20:25:19 -0400
Subject: [PATCH 01/34] Standardize W1D1-W1D4 tutorial notebooks: dependency
 checks and imports

- Add version check cell to W1D1-W1D4 tutorials (prints installed package
  versions, points local users to requirements_tutorials.txt)
- Consolidate all imports to top-level Setup section; remove scattered
  imports and duplicate pip installs throughout each notebook
- Update W1D1 Meet Our Lecturers: flat alphabetical list of all content
  creators with websites, annotated by source; add Past contributors section

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_Tutorial1.ipynb                      | 123 ++++++++++++------
 .../W1D2_Tutorial1.ipynb                      |  32 ++++-
 .../W1D2_Tutorial2.ipynb                      |  34 ++++-
 .../W1D2_Tutorial3.ipynb                      |  40 ++++--
 .../W1D3_Tutorial1.ipynb                      |  30 ++++-
 .../W1D3_Tutorial2.ipynb                      |  30 ++++-
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    |  26 +++-
 7 files changed, 250 insertions(+), 65 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 4750dadda..0b0e6ff90 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -100,7 +100,26 @@
    "outputs": [],
    "source": [
     "# @title Install dependencies\n",
-    "!pip install pandas --quiet"
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import subprocess, sys, importlib\n",
+    "\n",
+    "_to_install = {'pandas': 'pandas', 'imageio': 'imageio',\n",
+    "               'altair': 'altair', 'vega_datasets': 'vega_datasets'}\n",
+    "for _pkg, _pip in _to_install.items():\n",
+    "    try:\n",
+    "        importlib.import_module(_pkg)\n",
+    "    except ImportError:\n",
+    "        subprocess.check_call([sys.executable, '-m', 'pip', 'install', _pip, '-q'])\n",
+    "\n",
+    "# Print versions for reproducibility / bug reports\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'pandas', 'matplotlib', 'torch', 'torchvision', 'sklearn', 'imageio', 'altair']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')\n"
    ]
   },
   {
@@ -142,16 +161,27 @@
     "# Imports\n",
     "import time\n",
     "import random\n",
+    "from pathlib import Path\n",
+    "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
+    "import imageio.v2 as imageio\n",
+    "import altair as alt\n",
     "\n",
-    "# PyTorch libraries\n",
+    "# PyTorch\n",
     "import torch\n",
     "from torch import nn\n",
     "from torchvision import datasets\n",
+    "from torchvision.transforms import Compose, Grayscale, ToTensor\n",
     "from torch.utils.data import DataLoader\n",
-    "from torchvision.transforms import ToTensor"
+    "\n",
+    "# Scikit-learn\n",
+    "from sklearn.datasets import make_moons\n",
+    "\n",
+    "# IPython display utilities\n",
+    "from IPython.core.interactiveshell import InteractiveShell\n",
+    "from IPython.display import Image, display\n"
    ]
   },
   {
@@ -2679,11 +2709,7 @@
    },
    "outputs": [],
    "source": [
-    "# Import dataset and dataloaders related packages\n",
-    "from torchvision import datasets\n",
-    "from torchvision.transforms import ToTensor\n",
-    "from torch.utils.data import DataLoader\n",
-    "from torchvision.transforms import Compose, Grayscale"
+    "# Imports moved to Setup section above\n"
    ]
   },
   {
@@ -3307,7 +3333,6 @@
    "source": [
     "# @title Generate sample data\n",
     "# @markdown we used `scikit-learn` module\n",
-    "from sklearn.datasets import make_moons\n",
     "\n",
     "# Create a dataset of 256 points with a little noise\n",
     "X, y = make_moons(256, noise=0.1)\n",
@@ -3821,7 +3846,6 @@
     "\n",
     "# Code adapted from this notebook: https://jonchar.net/notebooks/Artificial-Neural-Network-with-Keras/\n",
     "\n",
-    "from pathlib import Path\n",
     "\n",
     "def plot_decision_boundary(model, X, y, device):\n",
     "  \"\"\"\n",
@@ -3974,13 +3998,7 @@
    "source": [
     "# @title Visualize the training process\n",
     "# @markdown Execute this cell!\n",
-    "!pip install imageio --quiet\n",
-    "!pip install pathlib --quiet\n",
     "\n",
-    "import imageio.v2 as imageio\n",
-    "from IPython.core.interactiveshell import InteractiveShell\n",
-    "from IPython.display import Image, display\n",
-    "from pathlib import Path\n",
     "\n",
     "InteractiveShell.ast_node_interactivity = \"all\"\n",
     "\n",
@@ -4502,27 +4520,55 @@
     "execution": {}
    },
    "source": [
-    "## Meet our lecturers\n",
+    "## Meet our content creators\n",
     "\n",
-    "### Week 1: the building blocks\n",
-    "* [Konrad Kording](https://kordinglab.com)\n",
-    "* [Andrew Saxe](https://www.saxelab.org/)\n",
-    "* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n",
+    "* [Alish Dipani](https://alishdipani.github.io/) \n",
+    "* [Alexander Ecker](https://eckerlab.org/) \n",
+    "* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/) \n",
+    "* [Andrew Saxe](https://www.saxelab.org/) \n",
+    "* Arash Ash\n",
+    "* [Arna Ghosh](https://arnaghosh.github.io/) \n",
+    "* Bikram Khastgir\n",
+    "* [Binxu Wang](https://animadversio.github.io/) \n",
+    "* [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
+    "* [Colleen Gillon](https://colleenjg.github.io/)\n",
+    "* Dawn Estes McKnight\n",
+    "* [Egor Zverev](https://egozverev.github.io/)\n",
+    "* [He He](https://hhexiy.github.io/) \n",
     "* [Ioannis Mitliagkas](http://mitliagkas.github.io/)\n",
-    "* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/)\n",
-    "\n",
-    "### Week 2: making things work\n",
-    "* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/)\n",
-    "* [Alexander Ecker](https://eckerlab.org/)\n",
-    "* [James Evans](https://sociology.uchicago.edu/directory/james-evans)\n",
-    "* [He He](https://hhexiy.github.io/)\n",
-    "* [Vikash Gilja](https://tnel.ucsd.edu/bio) and [Akash Srivastava](https://akashgit.github.io/)\n",
+    "* [Jose Gallego-Posada](https://gallego-posada.github.io/) \n",
+    "* [Jordan Matelsky](https://jordan.matelsky.com/) \n",
+    "* Kevin Machado Gamboa\n",
+    "* [Konrad Kording](https://kordinglab.com) *\n",
+    "* Kushaan Gupta\n",
+    "* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/) \n",
+    "* [Mandana Samiei](https://mandanasmi.github.io/) \n",
+    "* Matthew Sargent\n",
+    "* Mohitrajhu Lingan Kumaraian\n",
+    "* [Pablo Samuel Castro](https://psc-g.github.io/)\n",
+    "* Rajaswa Patil\n",
+    "* Ravi Teja Konkimalla\n",
+    "* [Raymond Chua](https://raymondchua.github.io/)\n",
+    "* [Richard Gerum](https://rgerum.github.io/)\n",
+    "* [Rohan Saha](https://www.rohansaha.in/) \n",
+    "* [Saeed Salehi](https://saeedsalehi.com/) \n",
+    "* Saeed Najafi\n",
+    "* [Shaonan Wang](https://wangshaonan.github.io/)\n",
+    "* Shubh Pachchigar\n",
+    "* [Spiros Chavlis](https://spiroschv.github.io/)\n",
+    "* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n",
+    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php)\n",
+    "* [Timo Lüddecke](https://timojl.github.io/) \n",
+    "* [Vikash Gilja](https://tnel.ucsd.edu/bio) \n",
+    "* [Vladimir Haltakov](https://haltakov.net/) \n",
     "\n",
-    "### Week 3: more magic\n",
-    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
-    "* [Jane Wang](http://www.janexwang.com/) and [Feryal Behbahani](https://feryal.github.io/)\n",
-    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
-    "* [Josh Vogelstein](https://jovo.me/) and [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/)"
+    "### Past contributors\n",
+    "* [Akash Srivastava](https://akashgit.github.io/) \n",
+    "* [Feryal Behbahani](https://feryal.github.io/) \n",
+    "* [James Evans](https://sociology.uchicago.edu/directory/james-evans) \n",
+    "* [Jane Wang](http://www.janexwang.com/) \n",
+    "* [Josh Vogelstein](https://jovo.me/) \n",
+    "* [Vincenzo Lomonaco](https://www.vincenzolomonaco.com/) \n"
    ]
   },
   {
@@ -4559,10 +4605,8 @@
    },
    "outputs": [],
    "source": [
-    "# @title Install and Import `altair` and `vega_datasets`.\n",
-    "!pip install altair vega_datasets --quiet\n",
+    "# @title Import `altair` and `vega_datasets`\n",
     "\n",
-    "import altair as alt  # altair is defining data visualizations\n",
     "\n",
     "# Source data files\n",
     "# Position data file maps ID to x,y positions\n",
@@ -4831,7 +4875,8 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "nma-dl-jax",
+   "language": "python",
    "name": "python3"
   },
   "language_info": {
@@ -4844,7 +4889,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.11"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index 327ef5ce4..5e7ad563a 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -105,6 +105,29 @@
     "feedback_prefix = \"W1D2_T1\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -118,7 +141,10 @@
     "import numpy as np\n",
     "from torch import nn\n",
     "from math import pi\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt",
+    "\n",
+    "import ipywidgets as widgets\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
    ]
   },
   {
@@ -134,7 +160,6 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
-    "import ipywidgets as widgets  # Interactive display\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"
    ]
@@ -150,7 +175,6 @@
    "source": [
     "# @title Plotting functions\n",
     "\n",
-    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
     "\n",
     "def ex3_plot(model, x, y, ep, lss):\n",
     "  \"\"\"\n",
@@ -1985,4 +2009,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
index 3cd755448..38d04704a 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
@@ -104,6 +104,29 @@
     "feedback_prefix = \"W1D2_T2\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -116,7 +139,11 @@
     "import time\n",
     "import numpy as np\n",
     "import matplotlib\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt",
+    "\n",
+    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
+    "from ipywidgets import HBox, interactive_output, ToggleButton, Layout\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
    ]
   },
   {
@@ -132,9 +159,6 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
-    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
-    "from ipywidgets import HBox, interactive_output, ToggleButton, Layout\n",
-    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
     "\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"
@@ -2610,4 +2634,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
index 73c23c9fb..731188102 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
@@ -103,6 +103,29 @@
     "feedback_prefix = \"W1D2_T3\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -120,7 +143,13 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "import torch.nn as nn\n",
-    "import torch.optim as optim"
+    "import torch.optim as optim",
+    "\n",
+    "import warnings\n",
+    "from matplotlib import gridspec\n",
+    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
+    "from ipywidgets import FloatLogSlider, Layout, VBox, interactive_output\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
    ]
   },
   {
@@ -136,14 +165,7 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
-    "from matplotlib import gridspec\n",
-    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
-    "from ipywidgets import FloatLogSlider, Layout, VBox\n",
-    "from ipywidgets import interactive_output\n",
-    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
     "\n",
-    "import warnings\n",
-    "warnings.filterwarnings(\"ignore\")\n",
     "\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"
@@ -3175,4 +3197,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
index 0dac98a50..058f68f95 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
@@ -105,6 +105,29 @@
     "feedback_prefix = \"W1D3_T1\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -124,7 +147,9 @@
     "import torch.optim as optim\n",
     "from tqdm.auto import tqdm\n",
     "from IPython.display import display\n",
-    "from torch.utils.data import DataLoader, TensorDataset"
+    "from torch.utils.data import DataLoader, TensorDataset",
+    "\n",
+    "import ipywidgets as widgets\n"
    ]
   },
   {
@@ -140,7 +165,6 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
-    "import ipywidgets as widgets  # Interactive display\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"
    ]
@@ -2330,4 +2354,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
index af3d14444..00ff3efc4 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
@@ -103,6 +103,29 @@
     "feedback_prefix = \"W1D3_T2\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -127,7 +150,9 @@
     "from torch.utils.data import DataLoader, TensorDataset\n",
     "\n",
     "from tqdm.auto import tqdm\n",
-    "from IPython.display import display"
+    "from IPython.display import display",
+    "\n",
+    "import ipywidgets as widgets\n"
    ]
   },
   {
@@ -143,7 +168,6 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
-    "import ipywidgets as widgets  # Interactive display\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n",
     "my_layout = widgets.Layout()"
@@ -1979,4 +2003,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index 4294dfa55..e3004c61a 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -105,6 +105,29 @@
     "feedback_prefix = \"W1D4_T1\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -143,7 +166,6 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
-    "import ipywidgets as widgets  # interactive display\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n",
     "plt.rc('axes', unicode_minus=False)"
@@ -3804,4 +3826,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file

From 72217d34f78614b5c2345f5b0bcaf9b3c7e50ede Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 29 Mar 2026 20:17:21 -0400
Subject: [PATCH 02/34] add tutorials/requirements_tutorials.txt file for
 running tutorials locally

---
 tutorials/requirements_tutorials.txt | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 tutorials/requirements_tutorials.txt

diff --git a/tutorials/requirements_tutorials.txt b/tutorials/requirements_tutorials.txt
new file mode 100644
index 000000000..b5eac4f7c
--- /dev/null
+++ b/tutorials/requirements_tutorials.txt
@@ -0,0 +1,23 @@
+# Requirements for Neuromatch Academy Deep Learning tutorials
+# These packages are pre-installed on Google Colab/Kaggle.
+# For local setup: pip install -r requirements_tutorials.txt
+#
+# Python >= 3.10 required
+# Tutorial-specific packages (e.g. transformers, diffusers, altair) are
+# installed at the top of the relevant tutorial notebooks.
+
+numpy>=2.0
+pandas>=2.2
+matplotlib>=3.10
+torch>=2.0
+torchvision>=0.15
+scikit-learn>=1.3
+scipy>=1.13
+Pillow>=10.0
+imageio>=2.30
+seaborn>=0.13
+nltk>=3.9
+tensorboard>=2.19
+ipywidgets>=8.0
+tqdm>=4.0
+requests>=2.31

From 6550969a0ff17daa17c0e9bc4df9cad86558eadc Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 29 Mar 2026 21:41:17 -0400
Subject: [PATCH 03/34] W1D1: extract bonus section, fix imports, add GPU tips
 and appendix links

- Extract altair/paper visualization bonus section into W1D1_BonusLecture.ipynb
  with its own install, feedback, and imports cells; remove altair/vega_datasets
  from main tutorial install and imports
- Add Colab GPU conservation tips callout before Section 2.4; highlight that
  setup cells must be rerun after runtime restart
- Add bold Appendix references with Colab jump links in cells referencing
  tensor methods, .view() docs, and GPU usage policy
- Fix commented-out imports in Section 2.5 with explanatory note

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_BonusLecture.ipynb                   | 355 ++++++++++++++++++
 .../W1D1_Tutorial1.ipynb                      | 302 +--------------
 2 files changed, 373 insertions(+), 284 deletions(-)
 create mode 100644 tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
new file mode 100644
index 000000000..6655dc73e
--- /dev/null
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
@@ -0,0 +1,355 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "",
+   "metadata": {},
+   "source": "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install dependencies\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import subprocess, sys, importlib\n",
+    "\n",
+    "for _pkg, _pip in {'altair': 'altair', 'vega_datasets': 'vega_datasets'}.items():\n",
+    "    try:\n",
+    "        importlib.import_module(_pkg)\n",
+    "    except ImportError:\n",
+    "        subprocess.check_call([sys.executable, '-m', 'pip', 'install', _pip, '-q'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and import feedback gadget\n",
+    "!pip3 install vibecheck datatops --quiet\n",
+    "\n",
+    "from vibecheck import DatatopsContentReviewContainer\n",
+    "def content_review(notebook_section: str):\n",
+    "    return DatatopsContentReviewContainer(\n",
+    "        '',\n",
+    "        notebook_section,\n",
+    "        {\n",
+    "            'url': 'https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab',\n",
+    "            'name': 'neuromatch_dl',\n",
+    "            'user_key': 'f379rz8y',\n",
+    "        },\n",
+    "    ).render()\n",
+    "\n",
+    "feedback_prefix = 'W1D1_BonusLecture'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import pandas as pd\n",
+    "import altair as alt\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Bonus - 60 years of Machine Learning Research in one Plot\n",
+    "\n",
+    "By [Hendrik Strobelt](http://hendrik.strobelt.com) (MIT-IBM Watson AI Lab) with support from Benjamin Hoover.\n",
+    "\n",
+    "In this notebook we visualize a subset* of 3,300 articles retrieved from the AllenAI [S2ORC dataset](https://github.com/allenai/s2orc). We represent each paper by a position that is the output of a dimensionality reduction method applied to a vector representation of each paper. The vector representation is the output of a neural network.\n",
+    "\n",
+    "**Note:** The selection is very biased on the keywords and methodology we used to filter. Please see the details section to learn about what we did."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Load the paper positions and metadata\n",
+    "\n",
+    "\n",
+    "# Source data files\n",
+    "# Position data file maps ID to x,y positions\n",
+    "# original link: http://gltr.io/temp/ml_regexv1_cs_ma_citation+_99perc.pos_umap_cosine_100_d0.1.json\n",
+    "POS_FILE = 'https://osf.io/qyrfn/download'\n",
+    "# original link: http://gltr.io/temp/ml_regexv1_cs_ma_citation+_99perc_clean.csv\n",
+    "# Metadata file maps ID to title, abstract, author,....\n",
+    "META_FILE = 'https://osf.io/vfdu6/download'\n",
+    "\n",
+    "# data loading and wrangling\n",
+    "def load_data():\n",
+    "  \"\"\"\n",
+    "  Loading the data\n",
+    "\n",
+    "  Args:\n",
+    "    None\n",
+    "\n",
+    "  Returns:\n",
+    "    Merged read dataFrame combining id and paper_id;\n",
+    "  \"\"\"\n",
+    "  positions = pd.read_json(POS_FILE)\n",
+    "  positions[['x', 'y']] = positions['pos'].to_list()\n",
+    "  meta = pd.read_csv(META_FILE)\n",
+    "  return positions.merge(meta, left_on='id', right_on='paper_id')\n",
+    "\n",
+    "\n",
+    "# load data\n",
+    "data = load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Define Visualization using Altair\n",
+    "YEAR_PERIOD = \"quinquennial\"  # @param\n",
+    "selection = alt.selection_multi(fields=[YEAR_PERIOD], bind='legend')  # legend entries select a 5-year bin\n",
+    "data[YEAR_PERIOD] = (data[\"year\"] // 5.0) * 5  # floor each year to the start of its 5-year bin (pandas floor division; numpy is not imported in this notebook)\n",
+    "chart = alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\"]], width=800,\n",
+    "                  height=800).mark_circle(radius=2, opacity=0.2).encode(\n",
+    "    alt.Color(YEAR_PERIOD+':O',\n",
+    "              scale=alt.Scale(scheme='viridis', reverse=False, clamp=True, domain=list(range(1955,2020,5))),\n",
+    "              # legend=alt.Legend(title='Total Records')\n",
+    "              ),\n",
+    "    alt.Size('citation_count',\n",
+    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])\n",
+    "              ),\n",
+    "       alt.X('x:Q',\n",
+    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
+    "    ),\n",
+    "       alt.Y('y:Q',\n",
+    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
+    "    ),\n",
+    "    tooltip=['title', 'authors'],\n",
+    "    # size='citation_count',\n",
+    "    # color=\"decade:O\",\n",
+    "    opacity=alt.condition(selection, alt.value(.8), alt.value(0.2)),  # selected bins at 0.8 opacity, others at 0.2\n",
+    "\n",
+    ").add_selection(\n",
+    "    selection\n",
+    ").interactive()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Let's look at the visualization. Each dot represents one paper. Dots that are close together represent papers that are more closely related than distant ones. The color indicates the 5-year period in which the paper was published. The dot size indicates the citation count (within the S2ORC corpus) as of July 2020.\n",
+    "\n",
+    "The view is **interactive** and allows for three main interactions. Try them and play around:\n",
+    "1. Hover over a dot to see a tooltip (title, author)\n",
+    "2. Select a year in the legend (right) to filter dots\n",
+    "3. Zoom in/out with scroll -- double click resets view"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "chart"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Questions\n",
+    "\n",
+    "By playing around, can you find some answers to the following questions?\n",
+    "\n",
+    "1. Can you find topical clusters? What cluster might occur because of a filtering error?\n",
+    "2. Can you see a temporal trend in the data and clusters?\n",
+    "3. Can you determine when deep learning methods started booming?\n",
+    "4. Can you find the key papers that were written before the DL \"winter\" that define milestones for a cluster? (tip: look for large dots of different color)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "1. As specified below, the data is already filtered for topics such as Computer Science/Mathematics.\n",
+    "Filtering errors could occur if keywords in a paper are incorrectly tagged or if cases don't match etc.\n",
+    "\n",
+    "2. To look for temporal trends in the data/clusters, observe the color transitions in the above\n",
+    "visualization. We see that a lot more papers were published in diversified topics,\n",
+    "as we transitioned out of AI Winters.\n",
+    "\n",
+    "3. Based on the color of the clusters, we can infer that deep learning methods\n",
+    "boomed between the 2010 and 2015 period.\n",
+    "\n",
+    "4. After filtering around the mid 1900's, hovering on the larger dots show\n",
+    "the key papers before the DL winters.\n",
+    "For instance, \"Neural networks and physical systems with emergent\n",
+    "collective computational abilities\" by John J Hopfield (1980's)\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Bonus_Section_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Methods\n",
+    "\n",
+    "Here is what we did:\n",
+    "1. Filter for all papers that fulfilled the following criteria:\n",
+    "  - are categorized as `Computer Science` or `Mathematics`\n",
+    "  - one of the following keywords appearing in title or abstract: `\"machine learning|artificial intelligence|neural network|(machine|computer) vision|perceptron|network architecture| RNN | CNN | LSTM | BLEU | MNIST | CIFAR |reinforcement learning|gradient descent| Imagenet \"`\n",
+    "2. Per year, remove all papers that are below the 99 percentile of citation count in that year\n",
+    "3. Embed each paper by using abstract + title in SPECTER model\n",
+    "4. Project based on embedding using UMAP\n",
+    "5. Visualize using Altair"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Find Authors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Edit the `AUTHOR_FILTER` variable to full text search for authors.\n",
+    "\n",
+    "AUTHOR_FILTER = \"Rush \"  # @param space at the end means \"word border\"\n",
+    "\n",
+    "### Don't ignore case when searching...\n",
+    "FLAGS = 0\n",
+    "### uncomment to ignore case\n",
+    "# FLAGS = re.IGNORECASE\n",
+    "\n",
+    "## --- FILTER CODE.. make it your own ---\n",
+    "data['issel'] = data['authors'].str.contains(AUTHOR_FILTER, na=False, flags=FLAGS, )\n",
+    "if data['issel'].mean()<0.0000000001:\n",
+    "  print('No match found')\n",
+    "\n",
+    "## --- FROM HERE ON VIS CODE ---\n",
+    "alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\", \"issel\"]], width=800,\n",
+    "                  height=800) \\\n",
+    "    .mark_circle(stroke=\"black\", strokeOpacity=1).encode(\n",
+    "    alt.Color(YEAR_PERIOD+':O',\n",
+    "              scale=alt.Scale(scheme='viridis', reverse=False),\n",
+    "              # legend=alt.Legend(title='Total Records')\n",
+    "              ),\n",
+    "    alt.Size('citation_count',\n",
+    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])\n",
+    "              ),\n",
+    "    alt.StrokeWidth('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[0, 2]), legend=None),\n",
+    "\n",
+    "    alt.Opacity('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[.2, 1]), legend=None),\n",
+    "    alt.X('x:Q',\n",
+    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
+    "    ),\n",
+    "    alt.Y('y:Q',\n",
+    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
+    "    ),\n",
+    "    tooltip=['title', 'authors'],\n",
+    ").interactive()"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "include_colab_link": true,
+   "name": "W1D1_Tutorial1",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernel": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "kernelspec": {
+   "display_name": "nma-dl-jax",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 0b0e6ff90..7c0fcfafc 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -104,8 +104,7 @@
     "# Running locally? See tutorials/requirements_tutorials.txt\n",
     "import subprocess, sys, importlib\n",
     "\n",
-    "_to_install = {'pandas': 'pandas', 'imageio': 'imageio',\n",
-    "               'altair': 'altair', 'vega_datasets': 'vega_datasets'}\n",
+    "_to_install = {'pandas': 'pandas', 'imageio': 'imageio'}\n",
     "for _pkg, _pip in _to_install.items():\n",
     "    try:\n",
     "        importlib.import_module(_pkg)\n",
@@ -114,7 +113,7 @@
     "\n",
     "# Print versions for reproducibility / bug reports\n",
     "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'pandas', 'matplotlib', 'torch', 'torchvision', 'sklearn', 'imageio', 'altair']:\n",
+    "for _pkg in ['numpy', 'pandas', 'matplotlib', 'torch', 'torchvision', 'sklearn', 'imageio']:\n",
     "    try:\n",
     "        _mod = importlib.import_module(_pkg)\n",
     "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
@@ -167,7 +166,6 @@
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "import imageio.v2 as imageio\n",
-    "import altair as alt\n",
     "\n",
     "# PyTorch\n",
     "import torch\n",
@@ -1142,11 +1140,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the appendix (there are a lot!)\n",
-    "\n",
-    "All of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
-   ]
+   "source": "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods (there are a lot!) can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)).\n\nAll of these operations should have similar syntax to their NumPy equivalents (feel free to skip if you already know this!)."
   },
   {
    "cell_type": "code",
@@ -1592,9 +1586,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the Appendix."
-   ]
+   "source": "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
   },
   {
    "cell_type": "markdown",
@@ -2236,15 +2228,13 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n",
-    "\n",
-    "By following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n",
-    "\n",
-    "Once you have done this your runtime will restart and you will need to rerun the first setup cell to reimport PyTorch. Then proceed to the next cell.\n",
-    "\n",
-    "For more information on the GPU usage policy you can view in the Appendix."
-   ]
+   "source": "When using Colab notebooks, you will not have access to a GPU by default. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n\nBy following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n\nOnce you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n\nFor more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+  },
+  {
+   "cell_type": "markdown",
+   "id": "",
+   "metadata": {},
+   "source": "> **Colab GPU tips:**\n> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
   },
   {
    "cell_type": "markdown",
@@ -2709,7 +2699,10 @@
    },
    "outputs": [],
    "source": [
-    "# Imports moved to Setup section above\n"
+    "# Imports moved to Setup section above; kept here for reference\n",
+    "# from torchvision import datasets\n",
+    "# from torchvision.transforms import ToTensor, Compose, Grayscale\n",
+    "# from torch.utils.data import DataLoader\n"
    ]
   },
   {
@@ -4571,266 +4564,6 @@
     "* [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/) \n"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Now, go to the [visualization of ICLR papers](https://iclr.cc/virtual/2021/paper_vis.html). Read a few abstracts. Look at the various clusters. Where do you see yourself in this map?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Bonus - 60 years of Machine Learning Research in one Plot\n",
-    "\n",
-    "By [Hendrik Strobelt](http://hendrik.strobelt.com) (MIT-IBM Watson AI Lab) with support from Benjamin Hoover.\n",
-    "\n",
-    "In this notebook we visualize a subset* of 3,300 articles retreived from the AllenAI [S2ORC dataset](https://github.com/allenai/s2orc). We represent each paper by a position that is output of a dimensionality reduction method applied to a vector representation of each paper. The vector representation is the output of a neural network.\n",
-    "\n",
-    "**Note:** The selection is very biased on the keywords and methodology we used to filter. Please see the details section to learn about what we did."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Import `altair` and `vega_datasets`\n",
-    "\n",
-    "\n",
-    "# Source data files\n",
-    "# Position data file maps ID to x,y positions\n",
-    "# original link: http://gltr.io/temp/ml_regexv1_cs_ma_citation+_99perc.pos_umap_cosine_100_d0.1.json\n",
-    "POS_FILE = 'https://osf.io/qyrfn/download'\n",
-    "# original link: http://gltr.io/temp/ml_regexv1_cs_ma_citation+_99perc_clean.csv\n",
-    "# Metadata file maps ID to title, abstract, author,....\n",
-    "META_FILE = 'https://osf.io/vfdu6/download'\n",
-    "\n",
-    "# data loading and wrangling\n",
-    "def load_data():\n",
-    "  \"\"\"\n",
-    "  Loading the data\n",
-    "\n",
-    "  Args:\n",
-    "    None\n",
-    "\n",
-    "  Returns:\n",
-    "    Merged read dataFrame combining id and paper_id;\n",
-    "  \"\"\"\n",
-    "  positions = pd.read_json(POS_FILE)\n",
-    "  positions[['x', 'y']] = positions['pos'].to_list()\n",
-    "  meta = pd.read_csv(META_FILE)\n",
-    "  return positions.merge(meta, left_on='id', right_on='paper_id')\n",
-    "\n",
-    "\n",
-    "# load data\n",
-    "data = load_data()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Define Visualization using ALtair\n",
-    "YEAR_PERIOD = \"quinquennial\"  # @param\n",
-    "selection = alt.selection_multi(fields=[YEAR_PERIOD], bind='legend')\n",
-    "data[YEAR_PERIOD] = (data[\"year\"] / 5.0).apply(np.floor) * 5\n",
-    "chart = alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\"]], width=800,\n",
-    "                  height=800).mark_circle(radius=2, opacity=0.2).encode(\n",
-    "    alt.Color(YEAR_PERIOD+':O',\n",
-    "              scale=alt.Scale(scheme='viridis', reverse=False, clamp=True, domain=list(range(1955,2020,5))),\n",
-    "              # legend=alt.Legend(title='Total Records')\n",
-    "              ),\n",
-    "    alt.Size('citation_count',\n",
-    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])\n",
-    "              ),\n",
-    "       alt.X('x:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "       alt.Y('y:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "    tooltip=['title', 'authors'],\n",
-    "    # size='citation_count',\n",
-    "    # color=\"decade:O\",\n",
-    "    opacity=alt.condition(selection, alt.value(.8), alt.value(0.2)),\n",
-    "\n",
-    ").add_selection(\n",
-    "    selection\n",
-    ").interactive()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Lets look at the Visualization. Each dot represents one paper. Close dots mean that the respective papers are closer related than distant ones. The color indicates the 5-year period of when the paper was published. The dot size indicates the citation count (within S2ORC corpus) as of July 2020.\n",
-    "\n",
-    "The view is **interactive** and allows for three main interactions. Try them and play around:\n",
-    "1. Hover over a dot to see a tooltip (title, author)\n",
-    "2. Select a year in the legend (right) to filter dots\n",
-    "3. Zoom in/out with scroll -- double click resets view"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "chart"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Questions\n",
-    "\n",
-    "By playing around, can you find some answers to the following questions?\n",
-    "\n",
-    "1. Can you find topical clusters? What cluster might occur because of a filtering error?\n",
-    "2. Can you see a temporal trend in the data and clusters?\n",
-    "3. Can you determine when deep learning methods started booming ?\n",
-    "4. Can you find the key papers that where written before the DL \"winter\" that define milestones for a cluster? (tip: look for large dots of different color)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "1. As specified below, the data is already filtered for topics such as Computer Science/Mathematics.\n",
-    "Filtering errors could occur if keywords in a paper are incorrectly tagged or if cases don't match etc.\n",
-    "\n",
-    "2. To look for temporal trends in the data/clusters, observe the color transitions in the above\n",
-    "visualization. We see that a lot more papers were published in diversified topics,\n",
-    "as we transitioned out of AI Winters.\n",
-    "\n",
-    "3. Based on the color of the clusters, we can infer that deep learning methods\n",
-    "boomed between the 2010 and 2015 period.\n",
-    "\n",
-    "4. After filtering around the mid 1900's, hovering on the larger dots show\n",
-    "the key papers before the DL winters.\n",
-    "For instance, \"Neural networks and physical systems with emergent\n",
-    "collective computational abilities\" by John J Hopfield (1980's)\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Bonus_Section_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Methods\n",
-    "\n",
-    "Here is what we did:\n",
-    "1. Filtering of all papers who fullfilled the criterria:\n",
-    "  - are categorized as `Computer Science` or `Mathematics`\n",
-    "  - one of the following keywords appearing in title or abstract: `\"machine learning|artificial intelligence|neural network|(machine|computer) vision|perceptron|network architecture| RNN | CNN | LSTM | BLEU | MNIST | CIFAR |reinforcement learning|gradient descent| Imagenet \"`\n",
-    "2. Per year, remove all papers that are below the 99 percentile of citation count in that year\n",
-    "3. Embed each paper by using abstract + title in SPECTER model\n",
-    "4. Project based on embedding using UMAP\n",
-    "5. Visualize using Altair"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Find Authors"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Edit the `AUTHOR_FILTER` variable to full text search for authors.\n",
-    "\n",
-    "AUTHOR_FILTER = \"Rush \"  # @param space at the end means \"word border\"\n",
-    "\n",
-    "### Don't ignore case when searching...\n",
-    "FLAGS = 0\n",
-    "### uncomment do ignore case\n",
-    "# FLAGS = re.IGNORECASE\n",
-    "\n",
-    "## --- FILTER CODE.. make it your own ---\n",
-    "data['issel'] = data['authors'].str.contains(AUTHOR_FILTER, na=False, flags=FLAGS, )\n",
-    "if data['issel'].mean()<0.0000000001:\n",
-    "  print('No match found')\n",
-    "\n",
-    "## --- FROM HERE ON VIS CODE ---\n",
-    "alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\", \"issel\"]], width=800,\n",
-    "                  height=800) \\\n",
-    "    .mark_circle(stroke=\"black\", strokeOpacity=1).encode(\n",
-    "    alt.Color(YEAR_PERIOD+':O',\n",
-    "              scale=alt.Scale(scheme='viridis', reverse=False),\n",
-    "              # legend=alt.Legend(title='Total Records')\n",
-    "              ),\n",
-    "    alt.Size('citation_count',\n",
-    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])\n",
-    "              ),\n",
-    "    alt.StrokeWidth('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[0, 2]), legend=None),\n",
-    "\n",
-    "    alt.Opacity('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[.2, 1]), legend=None),\n",
-    "    alt.X('x:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "    alt.Y('y:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "    tooltip=['title', 'authors'],\n",
-    ").interactive()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -4857,7 +4590,8 @@
     "\n",
     "## Books for reference:\n",
     "- [https://www.deeplearningbook.org/](https://www.deeplearningbook.org/) (Deep Learning by Ian Goodfellow, Yoshua Bengio and Aaron Courville)"
-   ]
+   ],
+   "id": "appendix"
   }
  ],
  "metadata": {
@@ -4894,4 +4628,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file

From ec7028b06c179bc260dea389610be331bc789bc8 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 31 Mar 2026 21:00:30 -0400
Subject: [PATCH 04/34] W1D2: fix T2 import order (torch before matplotlib)

Moved `import torch` before numpy/matplotlib in T2 imports cell to match
course-wide convention; removed duplicate `import torch` from set_seed helper.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
index 38d04704a..0fff5f482 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
@@ -137,10 +137,10 @@
    "source": [
     "# Imports\n",
     "import time\n",
+    "import torch\n",
     "import numpy as np\n",
     "import matplotlib\n",
-    "import matplotlib.pyplot as plt",
-    "\n",
+    "import matplotlib.pyplot as plt\n",
     "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
     "from ipywidgets import HBox, interactive_output, ToggleButton, Layout\n",
     "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
@@ -864,7 +864,6 @@
     "\n",
     "# Call `set_seed` function in the exercises to ensure reproducibility.\n",
     "import random\n",
-    "import torch\n",
     "\n",
     "def set_seed(seed=None, seed_torch=True):\n",
     "  \"\"\"\n",

From 1229b648ad97fc21a1015f1ace6bdc7fce2b0a18 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Wed, 1 Apr 2026 17:11:45 -0400
Subject: [PATCH 05/34] W1D4: split Tutorial1, add DL Case Study as Tutorial2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Split W1D4_Tutorial1: sections 5–8 (non-convexity, mini-batches,
  adaptive methods, ethics) and bonus exercise moved to new
  W1D4_BonusLecture.ipynb with a self-contained setup block.
  Tutorial1 now covers sections 1–4 only.
- Moved W2D2_Tutorial2 (DL Thinking 1 — cost functions) to
  W1D4_Tutorial2; renamed all "DL Thinking" text to "DL Case Study".
- Updated curriculum review plan: W1D4 done items, W2D2 T2 status.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D4_Optimization/W1D4_BonusLecture.ipynb | 2097 +++++++++++++++++
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    | 1729 +-------------
 .../W1D4_Tutorial2.ipynb}                     |   10 +-
 3 files changed, 2108 insertions(+), 1728 deletions(-)
 create mode 100644 tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
 rename tutorials/{W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb => W1D4_Optimization/W1D4_Tutorial2.ipynb} (99%)

diff --git a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
new file mode 100644
index 000000000..8113b803b
--- /dev/null
+++ b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
@@ -0,0 +1,2097 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "execution": {},
+    "id": "view-in-github"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "# Bonus Lecture: Optimization techniques (continued)\n",
+    "\n",
+    "**Week 1, Day 4: Optimization**\n",
+    "\n",
+    "**By Neuromatch Academy**\n",
+    "\n",
+    "__Content creators:__ Jose Gallego-Posada, Ioannis Mitliagkas\n",
+    "\n",
+    "__Content reviewers:__ Piyush Chauhan, Vladimir Haltakov, Siwei Bai, Kelson Shilling-Scrivo\n",
+    "\n",
+    "__Content editors:__ Charles J Edelson, Gagana B, Spiros Chavlis\n",
+    "\n",
+    "__Production editors:__ Arush Tagade, R. Krishnakumaran, Gagana B, Spiros Chavlis\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Tutorial Objectives\n",
+    "\n",
+    "Objectives:\n",
+    "*   Optimization in non-convex loss landscapes\n",
+    "*   Mini-batch sampling and stochastic gradients\n",
+    "*   'Adaptive' hyperparameter tuning\n",
+    "*   Ethical concerns\n",
+    "*   Putting it all together: training your own model\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Tutorial slides\n",
+    "from IPython.display import IFrame\n",
+    "link_id = \"ft2sz\"\n",
+    "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n",
+    "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and import feedback gadget\n",
+    "\n",
+    "!pip3 install vibecheck datatops --quiet\n",
+    "\n",
+    "from vibecheck import DatatopsContentReviewContainer\n",
+    "def content_review(notebook_section: str):\n",
+    "    return DatatopsContentReviewContainer(\n",
+    "        \"\",  # No text prompt\n",
+    "        notebook_section,\n",
+    "        {\n",
+    "            \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
+    "            \"name\": \"neuromatch_dl\",\n",
+    "            \"user_key\": \"f379rz8y\",\n",
+    "        },\n",
+    "    ).render()\n",
+    "\n",
+    "\n",
+    "feedback_prefix = \"W1D4_T1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": ""
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import copy\n",
+    "\n",
+    "import ipywidgets as widgets\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "import time\n",
+    "import torch\n",
+    "import torchvision\n",
+    "import torchvision.datasets as datasets\n",
+    "import torch.nn.functional as F\n",
+    "import torch.nn as nn\n",
+    "import torch.optim as optim\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Figure settings\n",
+    "import logging\n",
+    "logging.getLogger('matplotlib.font_manager').disabled = True\n",
+    "\n",
+    "%config InlineBackend.figure_format = 'retina'\n",
+    "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n",
+    "plt.rc('axes', unicode_minus=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Helper functions\n",
+    "def print_params(model):\n",
+    "  \"\"\"\n",
+    "  Lists the name and current value of the model's\n",
+    "  named parameters\n",
+    "\n",
+    "  Args:\n",
+    "    model: an nn.Module inherited model\n",
+    "      Represents the ML/DL model\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  for name, param in model.named_parameters():\n",
+    "    if param.requires_grad:\n",
+    "      print(name, param.data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Set random seed\n",
+    "\n",
+    "# @markdown Executing `set_seed(seed=seed)` you are setting the seed\n",
+    "\n",
+    "# for DL it's critical to set the random seed so that students can have a\n",
+    "# baseline to compare their results to expected results.\n",
+    "# Read more here: https://pytorch.org/docs/stable/notes/randomness.html\n",
+    "\n",
+    "# Call the `set_seed` function in the exercises to ensure reproducibility.\n",
+    "import random\n",
+    "import torch\n",
+    "\n",
+    "def set_seed(seed=None, seed_torch=True):\n",
+    "  \"\"\"\n",
+    "  Handles variability by controlling sources of randomness\n",
+    "  through set seed values\n",
+    "\n",
+    "  Args:\n",
+    "    seed: Integer\n",
+    "      Set the seed value to given integer.\n",
+    "      If no seed, set seed value to random integer in the range 2^32\n",
+    "    seed_torch: Bool\n",
+    "      Seeds the random number generator for all devices to\n",
+    "      offer some guarantees on reproducibility\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  if seed is None:\n",
+    "    seed = np.random.choice(2 ** 32)\n",
+    "  random.seed(seed)\n",
+    "  np.random.seed(seed)\n",
+    "  if seed_torch:\n",
+    "    torch.manual_seed(seed)\n",
+    "    torch.cuda.manual_seed_all(seed)\n",
+    "    torch.cuda.manual_seed(seed)\n",
+    "    torch.backends.cudnn.benchmark = False\n",
+    "    torch.backends.cudnn.deterministic = True\n",
+    "  print(f'Random seed {seed} has been set.')\n",
+    "\n",
+    "\n",
+    "# In case that `DataLoader` is used\n",
+    "def seed_worker(worker_id):\n",
+    "  \"\"\"\n",
+    "  DataLoader will reseed workers following randomness in\n",
+    "  multi-process data loading algorithm.\n",
+    "\n",
+    "  Args:\n",
+    "    worker_id: integer\n",
+    "      ID of subprocess to seed. 0 means that\n",
+    "      the data will be loaded in the main process\n",
+    "      Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  worker_seed = torch.initial_seed() % 2**32\n",
+    "  np.random.seed(worker_seed)\n",
+    "  random.seed(worker_seed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Set device (GPU or CPU). Execute `set_device()`\n",
+    "# especially if torch modules are used.\n",
+    "\n",
+    "# inform the user if the notebook uses GPU or CPU.\n",
+    "\n",
+    "def set_device():\n",
+    "  \"\"\"\n",
+    "  Set the device. CUDA if available, CPU otherwise\n",
+    "\n",
+    "  Args:\n",
+    "    None\n",
+    "\n",
+    "  Returns:\n",
+    "    device: String\n",
+    "      \"cuda\" if CUDA is available, \"cpu\" otherwise\n",
+    "  \"\"\"\n",
+    "  device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "  if device != \"cuda\":\n",
+    "    print(\"WARNING: For this notebook to perform best, \"\n",
+    "        \"if possible, in the menu under `Runtime` -> \"\n",
+    "        \"`Change runtime type.`  select `GPU` \")\n",
+    "  else:\n",
+    "    print(\"GPU is enabled in this notebook.\")\n",
+    "\n",
+    "  return device"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "SEED = 2021\n",
+    "set_seed(seed=SEED)\n",
+    "DEVICE = set_device()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 5: Non-convexity\n",
+    "\n",
+    "*Time estimate: ~30 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "The introduction of even just 1 hidden layer in the neural network transforms the previous convex optimization problem into a non-convex one. And with great non-convexity, comes great responsibility... (Sorry, we couldn't help it!)\n",
+    "\n",
+    "**Note:** From this section onwards we will be dealing with non-convex optimization problems for the remainder of the tutorial."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 5: Overparameterization\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', '7vUpUEKKl5o'), ('Bilibili', 'BV16h41167Jr')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Overparameterization_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Take a couple of minutes to play with a more complex 3D visualization of the loss landscape of a neural network on a non-convex problem. Visit https://losslandscape.com/explorer.\n",
+    "\n",
+    "1. Explore the features on the bottom left corner. You can see an explanation for each icon by clicking on the ( i ) button located on the top right corner.\n",
+    "2. Use the 'gradient descent' feature to perform a thought experiment:\n",
+    "    -   Choose an initialization\n",
+    "    -   Choose the learning rate\n",
+    "    -   Mentally formulate your hypothesis about what kind of trajectory you expect to observe\n",
+    "3. Run the experiment and contrast your intuition with the observed behavior.\n",
+    "4. Repeat this experiment a handful of times for several initialization/learning rate configurations\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 5: Overparameterization to the rescue!\n",
+    "\n",
+    "As you may have seen, the non-convex nature of the surface can lead the optimization process to get stuck in undesirable local optima. There is ample empirical evidence supporting the claim that 'overparameterized' models are easier to train.\n",
+    "\n",
+    "We will explore this assertion in the context of our MLP training. For this, we initialize a fixed model and construct several models by small random perturbations to the original initialized weights. Now, we train each of these perturbed models and see how the loss evolves. If we were in the convex setting, we should reach very similar objective values upon convergence since all these models were very close at the beginning of training, and in convex problems, the local optimum is also the global optimum.\n",
+    "\n",
+    "Use the interactive plot below to visualize the loss progression for these perturbed models:\n",
+    "\n",
+    "1. Select different settings from the `hidden_dims` drop-down menu.\n",
+    "2. Explore the effect of the number of steps and learning rate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "\n",
+    "@widgets.interact_manual\n",
+    "def overparam(max_steps=widgets.IntSlider(150, 50, 500, 5),\n",
+    "              hidden_dims=widgets.Dropdown(options=[\"10\", \"20, 20\", \"100, 100\"],\n",
+    "                                           value=\"10\"),\n",
+    "              lr=widgets.FloatLogSlider(value=5e-2, min=-3, max=0, step=0.1),\n",
+    "              num_inits=widgets.IntSlider(7, 5, 10, 1)):\n",
+    "  \"\"\"\n",
+    "  Displays the overparameterization phenomenon as a widget\n",
+    "\n",
+    "  Args:\n",
+    "    max_steps: widget integer slider\n",
+    "      Maximum number of steps on the slider with default = 150\n",
+    "    hidden_dims: widget dropdown menu instance\n",
+    "      The number of hidden dimensions with default = 10\n",
+    "    lr: widget float slider\n",
+    "      Scalar specifying the learning rate or step-size for the update with default = 5e-2\n",
+    "    num_inits: widget integer slider\n",
+    "      Number of randomly perturbed initializations to train with default = 7\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "\n",
+    "  X, y = train_set.data[subset_index, :], train_set.targets[subset_index]\n",
+    "\n",
+    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
+    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, 1, figsize=(5, 4))\n",
+    "\n",
+    "  for _ in tqdm(range(num_inits)):\n",
+    "    model = copy.deepcopy(base_model)\n",
+    "    random_update(model, noise_scale=2e-1)\n",
+    "    loss_hist = np.zeros((max_steps, 2))\n",
+    "    for step in range(max_steps):\n",
+    "      loss = loss_fn(model(X), y)\n",
+    "      gradient_update(loss, list(model.parameters()), lr=lr)\n",
+    "      loss_hist[step] = np.array([step, loss.item()])\n",
+    "\n",
+    "    plt.plot(loss_hist[:, 0], loss_hist[:, 1])\n",
+    "\n",
+    "  plt.xlabel('Iteration')\n",
+    "  plt.ylabel('Loss')\n",
+    "  plt.ylim(0, 3)\n",
+    "  plt.show()\n",
+    "\n",
+    "  num_params = sum([np.prod(_.shape) for _ in model.parameters()])\n",
+    "  print('Number of parameters in model:  ' + str(num_params))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Overparameterization_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Think! 5.1: Width and depth of the network\n",
+    "\n",
+    "- We see that as we increase the width/depth of the network, training becomes faster and more consistent across different initializations. What might be the reasons for this behavior?\n",
+    "\n",
+    "- What are some potential downsides of this approach to dealing with non-convexity?\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "- The exact mechanism for this phenomenon is still under active research.\n",
+    "Existing evidence points to the following: in the overparameterized setting,\n",
+    "there are many more 'good configurations' (values of the model’s weights) that\n",
+    "lead to a low value of the objective. Furthermore, this large set of possible solutions\n",
+    "seems to be increasingly easy to find in the space of all possible\n",
+    "parameter configurations. As you increase the number of parameters, it becomes\n",
+    "more likely that your initialization will be close to one of these good parameter settings.\n",
+    "\n",
+    "- This approach will require more memory and computation. Furthermore, we need\n",
+    "to always be aware of the risk of overfitting: don’t forget to do cross-validation\n",
+    "in order to be able to detect overfitting.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Width_and_depth_of_the_network_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 6: Full gradients are expensive\n",
+    "\n",
+    "*Time estimate: ~25 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "So far we have used only a small (fixed) subset of 500 training examples to perform the updates on the model parameters in our quest to minimize the loss. But what if we decided to use the full training set? Does our current approach scale to datasets with tens of thousands, or millions, of datapoints?\n",
+    "\n",
+    "In this section we explore an efficient alternative to avoid having to perform computations on all the training examples before performing a parameter update."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 6: Mini-batches\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'hbqUxpNBUGk'), ('Bilibili', 'BV1ty4y1T7Uh')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Mini_batches_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 6.1: Cost of computation\n",
+    "\n",
+    "Evaluating a neural network is a relatively fast process. However, when repeated millions of times, the computational cost of performing forward and backward passes through the network starts to become significant.\n",
+    "\n",
+    "In the visualization below, we show the time (averaged over 5 runs) of computing a forward and backward pass with a changing number of input examples. Choose from the different options in the drop-down box and note how the vertical scale changes depending on the size of the network.\n",
+    "\n",
+    "**Remarks:** Note that the computational cost of a forward pass shows a clear linear relationship with the number of input examples, and the cost of the corresponding backward pass exhibits a similar computational complexity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "\n",
+    "def gradient_update(loss, params, lr=1e-3):\n",
+    "  \"\"\"\n",
+    "  Perform a gradient descent update on a given loss over a collection of parameters\n",
+    "\n",
+    "  Args:\n",
+    "    loss: Tensor\n",
+    "      A scalar tensor containing the loss through which the gradient will be computed\n",
+    "    params: List of iterables\n",
+    "      Collection of parameters with respect to which we compute gradients\n",
+    "    lr: Float\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
+    "  # successive backward calls\n",
+    "  zero_grad(params)\n",
+    "\n",
+    "  # Compute gradients on given objective\n",
+    "  loss.backward()\n",
+    "\n",
+    "  with torch.no_grad():\n",
+    "    for par in params:\n",
+    "       par.data -= lr * par.grad.data\n",
+    "\n",
+    "\n",
+    "def measure_update_time(model, num_points):\n",
+    "  \"\"\"\n",
+    "  Measuring the time for update\n",
+    "\n",
+    "  Args:\n",
+    "    model: an nn.Module inherited model\n",
+    "      Represents the ML/DL model\n",
+    "    num_points: integer\n",
+    "      The number of data points in the train_set\n",
+    "\n",
+    "  Returns:\n",
+    "    tuple of loss time and time for calculation of gradient\n",
+    "  \"\"\"\n",
+    "  X, y = train_set.data[:num_points], train_set.targets[:num_points]\n",
+    "  start_time = time.time()\n",
+    "  loss = loss_fn(model(X), y)\n",
+    "  loss_time = time.time()\n",
+    "  gradient_update(loss, list(model.parameters()), lr=0)\n",
+    "  gradient_time = time.time()\n",
+    "  return loss_time - start_time, gradient_time - loss_time\n",
+    "\n",
+    "\n",
+    "@widgets.interact\n",
+    "def computation_time(hidden_dims=widgets.Dropdown(options=[\"1\", \"100\", \"50, 50\"],\n",
+    "                                                  value=\"100\")):\n",
+    "  \"\"\"\n",
+    "  Demonstrating time taken for computation as a widget\n",
+    "\n",
+    "  Args:\n",
+    "    hidden_dims: widgets dropdown\n",
+    "      The number of hidden dimensions with default = 100\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
+    "  model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
+    "\n",
+    "  NUM_POINTS = [1, 5, 10, 100, 200, 500, 1000, 5000, 10000, 20000, 30000, 50000]\n",
+    "  times_list = []\n",
+    "  for _ in range(5):\n",
+    "    times_list.append(np.array([measure_update_time(model, _) for _ in NUM_POINTS]))\n",
+    "\n",
+    "  times = np.array(times_list).mean(axis=0)\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, 1, figsize=(5,4))\n",
+    "  plt.plot(NUM_POINTS, times[:, 0], label='Forward')\n",
+    "  plt.plot(NUM_POINTS, times[:, 1], label='Backward')\n",
+    "  plt.xlabel('Number of data points')\n",
+    "  plt.ylabel('Seconds')\n",
+    "  plt.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Cost_of_computation_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n\nComplete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "def sample_minibatch(input_data, target_data, num_points=100):\n",
+    "  \"\"\"\n",
+    "  Sample a minibatch of size num_points from the provided input-target data\n",
+    "\n",
+    "  Args:\n",
+    "    input_data: Tensor\n",
+    "      Multi-dimensional tensor containing the input data\n",
+    "    target_data: Tensor\n",
+    "      1D tensor containing the class labels\n",
+    "    num_points: Integer\n",
+    "      Number of elements to be included in minibatch with default=100\n",
+    "\n",
+    "  Returns:\n",
+    "    batch_inputs: Tensor\n",
+    "      Minibatch inputs\n",
+    "    batch_targets: Tensor\n",
+    "      Minibatch targets\n",
+    "  \"\"\"\n",
+    "  #################################################\n",
+    "  ## TODO for students: sample minibatch of data ##\n",
+    "  raise NotImplementedError(\"Student exercise: implement minibatch sampling\")\n",
+    "  #################################################\n",
+    "  # Sample a collection of IID indices from the existing data\n",
+    "  batch_indices = ...\n",
+    "  # Use batch_indices to extract entries from the input and target data tensors\n",
+    "  batch_inputs = input_data[...]\n",
+    "  batch_targets = target_data[...]\n",
+    "\n",
+    "  return batch_inputs, batch_targets\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Uncomment to test your function\n",
+    "# x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
+    "# print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "```\n",
+    "The input shape is torch.Size([100, 28, 28]) and the target shape is: torch.Size([100])\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove solution\n",
+    "def sample_minibatch(input_data, target_data, num_points=100):\n",
+    "  \"\"\"\n",
+    "  Sample a minibatch of size num_points from the provided input-target data\n",
+    "\n",
+    "  Args:\n",
+    "    input_data: Tensor\n",
+    "      Multi-dimensional tensor containing the input data\n",
+    "    target_data: Tensor\n",
+    "      1D tensor containing the class labels\n",
+    "    num_points: Integer\n",
+    "      Number of elements to be included in minibatch with default=100\n",
+    "\n",
+    "  Returns:\n",
+    "    batch_inputs: Tensor\n",
+    "      Minibatch inputs\n",
+    "    batch_targets: Tensor\n",
+    "      Minibatch targets\n",
+    "  \"\"\"\n",
+    "  # Sample a collection of IID indices from the existing data\n",
+    "  batch_indices = np.random.choice(len(input_data), num_points)\n",
+    "  # Use batch_indices to extract entries from the input and target data tensors\n",
+    "  batch_inputs = input_data[batch_indices, :]\n",
+    "  batch_targets = target_data[batch_indices]\n",
+    "\n",
+    "  return batch_inputs, batch_targets\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Uncomment to test your function\n",
+    "x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
+    "print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Implement_mini_batch_sampling_Exercise\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 6.2: *Compare* different minibatch sizes\n",
+    "\n",
+    "What are the trade-offs induced by the choice of minibatch size? The interactive plot below shows the training evolution of a 2-hidden layer MLP with 100 hidden units in each hidden layer. Different plots correspond to a different choice of minibatch size. We have a fixed time budget for all the cases, reflected in the horizontal axes of these plots."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "\n",
+    "@widgets.interact_manual\n",
+    "def minibatch_experiment(batch_sizes='20, 250, 1000',\n",
+    "                         lrs='5e-3, 5e-3, 5e-3',\n",
+    "                         time_budget=widgets.Dropdown(options=[\"2.5\", \"5\", \"10\"],\n",
+    "                                                      value=\"2.5\")):\n",
+    "  \"\"\"\n",
+    "  Demonstration of minibatch experiment\n",
+    "\n",
+    "  Args:\n",
+    "    batch_sizes: String\n",
+    "      Size of minibatches\n",
+    "    lrs: String\n",
+    "      Different learning rates\n",
+    "    time_budget: widget dropdown instance\n",
+    "      Different time budgets with default=2.5s\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  batch_sizes = [int(s) for s in batch_sizes.split(',')]\n",
+    "  lrs = [float(s) for s in lrs.split(',')]\n",
+    "\n",
+    "  LOSS_HIST = {_:[] for _ in batch_sizes}\n",
+    "\n",
+    "  X, y = train_set.data, train_set.targets\n",
+    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
+    "\n",
+    "  for id, batch_size in enumerate(tqdm(batch_sizes)):\n",
+    "    start_time = time.time()\n",
+    "    # Create a new copy of the model for each batch size\n",
+    "    model = copy.deepcopy(base_model)\n",
+    "    params = list(model.parameters())\n",
+    "    lr = lrs[id]\n",
+    "    # Fixed budget per choice of batch size\n",
+    "    while (time.time() - start_time) < float(time_budget):\n",
+    "      data, labels = sample_minibatch(X, y, batch_size)\n",
+    "      loss = loss_fn(model(data), labels)\n",
+    "      gradient_update(loss, params, lr=lr)\n",
+    "      LOSS_HIST[batch_size].append([time.time() - start_time,\n",
+    "                                    loss.item()])\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, len(batch_sizes), figsize=(10, 3))\n",
+    "  for ax, batch_size in zip(axs, batch_sizes):\n",
+    "    plot_data = np.array(LOSS_HIST[batch_size])\n",
+    "    ax.plot(plot_data[:, 0], plot_data[:, 1], label=batch_size,\n",
+    "            alpha=0.8)\n",
+    "    ax.set_title('Batch size: ' + str(batch_size))\n",
+    "    ax.set_xlabel('Seconds')\n",
+    "    ax.set_ylabel('Loss')\n",
+    "  plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "**Remarks:** SGD works! We have an algorithm that can be applied (with due precautions) to learn datasets of arbitrary size.\n",
+    "\n",
+    "However, **note the difference in the vertical scale** across the plots above. When using a larger minibatch, we can perform fewer parameter updates as the forward and backward passes are more expensive.\n",
+    "\n",
+    "This highlights the interplay between the minibatch size and the learning rate: when our minibatch is larger, we have a more confident estimator of the direction to move, and thus can afford a larger learning rate. On the other hand, extremely small minibatches are very fast computationally but are not representative of the data distribution and yield estimations of the gradient with high variance.\n",
+    "\n",
+    "We encourage you to tune the value of the learning rate for each of the minibatch sizes in the previous demo, to achieve a training loss steadily below 0.5 within 5 seconds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Compare_different_minibatch_sizes_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 7: Adaptive methods\n",
+    "\n",
+    "*Time estimate: ~25 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "By now, you should be aware that there are many knobs to turn when working on a machine learning problem. Some of these relate to the optimization algorithm, the choice of model, or the objective to minimize. Here are some prototypical examples:\n",
+    "\n",
+    "- Problem: loss function, regularization coefficients (Week 1, Day 5)\n",
+    "- Model: architecture, activation functions\n",
+    "- Optimizer: learning rate, batch size, momentum coefficient\n",
+    "\n",
+    "We concentrate on the choices that are directly related to optimization. In particular, we will explore some _automatic_ methods for setting the learning rate in a way that fixes the poor-conditioning problem and is robust across different problems.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 7: Adaptive Methods\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'Zr6r2kfmQUM'), ('Bilibili', 'BV1eq4y1W7JG')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Adaptive_Methods_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": "## Coding Exercise 7 *(optional)*: Implement RMSprop\n\nIn this exercise you will implement the update of the RMSprop optimizer:\n\n\\begin{align}\nv_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\nw_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n\\end{align}\n\nwhere the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n\nHere, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
+    "  \"\"\"\n",
+    "  Perform an RMSprop update on a collection of parameters\n",
+    "\n",
+    "  Args:\n",
+    "    loss: Tensor\n",
+    "      A scalar tensor containing the loss whose gradient will be computed\n",
+    "    params: Iterable\n",
+    "      Collection of parameters with respect to which we compute gradients\n",
+    "    grad_sq: Iterable\n",
+    "      Moving average of squared gradients\n",
+    "    lr: Float\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "    alpha: Float\n",
+    "      Moving average parameter\n",
+    "    epsilon: Float\n",
+    "      Small constant added inside the square root for numerical stability\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
+    "  # successive backward calls\n",
+    "  zero_grad(params)\n",
+    "  # Compute gradients on given objective\n",
+    "  loss.backward()\n",
+    "\n",
+    "  with torch.no_grad():\n",
+    "    for (par, gsq) in zip(params, grad_sq):\n",
+    "      #################################################\n",
+    "      ## TODO for students: update the value of the parameter ##\n",
+    "      # Use gsq.data and par.grad\n",
+    "      raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
+    "      #################################################\n",
+    "      # Update estimate of gradient variance\n",
+    "      gsq.data = ...\n",
+    "      # Update parameters\n",
+    "      par.data -=  ...\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "set_seed(seed=SEED)\n",
+    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
+    "print('\\n The model3 parameters before the update are: \\n')\n",
+    "print_params(model3)\n",
+    "loss = loss_fn(model3(X), y)\n",
+    "# Initialize the moving average of squared gradients\n",
+    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Uncomment below to test your function\n",
+    "# rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
+    "# print('\\n The model3 parameters after the update are: \\n')\n",
+    "# print_params(model3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "```\n",
+    " The model3 parameters after the update are:\n",
+    "\n",
+    "main.0.weight tensor([[-0.0240,  0.0031,  0.0193,  ...,  0.0316,  0.0297, -0.0198],\n",
+    "        [-0.0063, -0.0318, -0.0109,  ..., -0.0093,  0.0232, -0.0255],\n",
+    "        [ 0.0218, -0.0253,  0.0320,  ...,  0.0102,  0.0248, -0.0203],\n",
+    "        ...,\n",
+    "        [-0.0027,  0.0136,  0.0089,  ...,  0.0123, -0.0324, -0.0166],\n",
+    "        [ 0.0159,  0.0281,  0.0233,  ..., -0.0133, -0.0197,  0.0182],\n",
+    "        [ 0.0186, -0.0376, -0.0205,  ..., -0.0293,  0.0077, -0.0019]])\n",
+    "main.0.bias tensor([-0.0313, -0.0011,  0.0122, -0.0342,  0.0045,  0.0199,  0.0329,  0.0265,\n",
+    "         0.0182, -0.0041])\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove solution\n",
+    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
+    "  \"\"\"\n",
+    "  Perform an RMSprop update on a collection of parameters\n",
+    "\n",
+    "  Args:\n",
+    "    loss: Tensor\n",
+    "      A scalar tensor containing the loss whose gradient will be computed\n",
+    "    params: Iterable\n",
+    "      Collection of parameters with respect to which we compute gradients\n",
+    "    grad_sq: Iterable\n",
+    "      Moving average of squared gradients\n",
+    "    lr: Float\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "    alpha: Float\n",
+    "      Moving average parameter\n",
+    "    epsilon: Float\n",
+    "      Small constant added inside the square root for numerical stability\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
+    "  # successive backward calls\n",
+    "  zero_grad(params)\n",
+    "  # Compute gradients on given objective\n",
+    "  loss.backward()\n",
+    "\n",
+    "  with torch.no_grad():\n",
+    "    for (par, gsq) in zip(params, grad_sq):\n",
+    "      # Update estimate of gradient variance\n",
+    "      gsq.data = alpha * gsq.data + (1 - alpha) * par.grad**2\n",
+    "      # Update parameters\n",
+    "      par.data -=  lr * (par.grad / (epsilon + gsq.data)**0.5)\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "set_seed(seed=SEED)\n",
+    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
+    "print('\\n The model3 parameters before the update are: \\n')\n",
+    "print_params(model3)\n",
+    "loss = loss_fn(model3(X), y)\n",
+    "# Initialize the moving average of squared gradients\n",
+    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
+    "\n",
+    "## Uncomment below to test your function\n",
+    "rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
+    "print('\\n The model3 parameters after the update are: \\n')\n",
+    "print_params(model3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Implement_RMSProp_Exercise\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 7: Compare optimizers\n",
+    "\n",
+    "Below, we compare your implementations of **SGD**, **Momentum**, and **RMSprop**. If you have successfully coded all the exercises so far: congrats!\n",
+    "\n",
+    "You are now *in the know* of some of the most commonly used and powerful optimization tools for deep learning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "X, y = train_set.data, train_set.targets\n",
+    "\n",
+    "@widgets.interact_manual\n",
+    "def compare_optimizers(\n",
+    "    batch_size=(25, 250, 5),\n",
+    "    lr=widgets.FloatLogSlider(value=2e-3, min=-5, max=0),\n",
+    "    max_steps=(50, 500, 5)):\n",
+    "  \"\"\"\n",
+    "  Demonstration to compare optimisers - stochastic gradient descent, momentum, RMSprop\n",
+    "\n",
+    "  Args:\n",
+    "    batch_size: Tuple\n",
+    "      Size of minibatches\n",
+    "    lr: Float log slider instance\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "    max_steps: Tuple\n",
+    "      Number of update steps to run for each optimizer\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  SGD_DICT = [gradient_update, 'SGD', 'black', '-', {'lr': lr}]\n",
+    "  MOM_DICT = [momentum_update, 'Momentum', 'red', '--', {'lr': lr, 'beta': 0.9}]\n",
+    "  RMS_DICT = [rmsprop_update, 'RMSprop', 'fuchsia', '-', {'lr': lr, 'alpha': 0.8}]\n",
+    "\n",
+    "  ALL_DICTS = [SGD_DICT, MOM_DICT, RMS_DICT]\n",
+    "\n",
+    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
+    "\n",
+    "  LOSS_HIST = {}\n",
+    "\n",
+    "  for opt_dict in tqdm(ALL_DICTS):\n",
+    "    update_fn, opt_name, color, lstyle, kwargs = opt_dict\n",
+    "    LOSS_HIST[opt_name] = []\n",
+    "\n",
+    "    model = copy.deepcopy(base_model)\n",
+    "    params = list(model.parameters())\n",
+    "\n",
+    "    if opt_name != 'SGD':\n",
+    "      aux_tensors = [torch.zeros_like(_) for _ in params]\n",
+    "\n",
+    "    for step in range(max_steps):\n",
+    "      data, labels = sample_minibatch(X, y, batch_size)\n",
+    "      loss = loss_fn(model(data), labels)\n",
+    "      if opt_name == 'SGD':\n",
+    "        update_fn(loss, params, **kwargs)\n",
+    "      else:\n",
+    "        update_fn(loss, params, aux_tensors, **kwargs)\n",
+    "      LOSS_HIST[opt_name].append(loss.item())\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, len(ALL_DICTS), figsize=(9, 3))\n",
+    "  for ax, optim_dict in zip(axs, ALL_DICTS):\n",
+    "    opt_name = optim_dict[1]\n",
+    "    ax.plot(range(max_steps), LOSS_HIST[opt_name], alpha=0.8)\n",
+    "    ax.set_title(opt_name)\n",
+    "    ax.set_xlabel('Iteration')\n",
+    "    ax.set_ylabel('Loss')\n",
+    "    ax.set_ylim(0, 2.5)\n",
+    "  plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Compare_optimizers_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 7.1: Compare optimizers\n",
+    "\n",
+    "Tune the three methods above - **SGD**, **Momentum**, and **RMSProp** - to make each excel and discuss your findings. How do the methods compare in terms of robustness to small changes of the hyperparameters? How easy was it to find a good hyperparameter configuration?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "Stochastic Gradient Descent (SGD): Performs updates one example at a time.\n",
+    "Momentum: Helps accelerate SGD in the relevant direction and dampens\n",
+    "oscillations, especially in ravines.\n",
+    "RMSProp: Allows each parameter to be updated at an 'appropriate' rate decided\n",
+    "based on magnitudes of past recent updates;\n",
+    "i.e., areas where the surface curves much more steeply in one dimension than\n",
+    "in another, which are common around local optima.\n",
+    "\n",
+    "Robustness: RMSProp > Momentum > SGD\n",
+    "Since each example affects SGD by directly updating the parameters, it's not\n",
+    "considered very robust.\n",
+    "Adagrad greatly improved the robustness of SGD and is used for training\n",
+    "large-scale neural nets.\n",
+    "Momentum is quite robust: the momentum term increases for dimensions whose\n",
+    "gradients point in the same directions\n",
+    "and reduces updates for dimensions whose gradients change directions.\n",
+    "RMSProp is very robust; it combines the idea of only using the sign of\n",
+    "the gradient with the idea of adapting the step size separately\n",
+    "for each weight in a mini-batch.\n",
+    "\n",
+    "Generally, non-adaptive methods consistently produce more robust models\n",
+    "than adaptive methods. Refer to https://arxiv.org/pdf/1911.03784.pdf for more details.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Compare_optimizers_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "**Remarks:** Note that RMSprop allows us to use a 'per-dimension' learning rate _without having to tune one learning rate for each dimension **ourselves**_. The method uses information collected about the variance of the gradients throughout training to **adapt** the step size for each of the parameters automatically. The savings in tuning efforts of RMSprop over SGD or 'plain' momentum are undisputed on this task.\n",
+    "\n",
+    "Moreover, adaptive optimization methods are currently a highly active research domain, with many related algorithms, such as Adam, AMSGrad, and Adagrad, being used in practical applications and investigated theoretically."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Locality of Gradients\n",
+    "\n",
+    "As we've seen throughout this tutorial, poor conditioning can be a significant burden on convergence to an optimum while using gradient-based optimization. Of the methods we've seen to deal with this issue, notice how both momentum and adaptive learning rates incorporate past gradient values into their update schemes. Why do we use past values of our loss function's gradient while updating our current MLP weights?\n",
+    "\n",
+    "Recall from *W1D2* that the gradient of a function, $\\nabla f(w_t)$, is a **local** property and computes the direction of maximum change of $f(w_t)$ at the point $w_t$. However, when we train our MLP model we are hoping to find the **global** optimum for our training loss. By incorporating past values of our function's gradient into our optimization schemes, we use more information about the overall shape of our function than just a single gradient alone can provide."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 7.2: Loss function and optimization\n",
+    "\n",
+    "Can you think of other ways we can incorporate more information about our loss function into our optimization schemes?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "We could consider incorporating the curvature of our function directly into our\n",
+    "optimization schemes. Methods that use this are often called Newton's methods\n",
+    "or Hessian based optimization methods.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Loss_function_and_optimization_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 8: Ethical concerns\n",
+    "\n",
+    "*Time estimate: ~15mins*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 8: Ethical concerns\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', '0EthSI0cknI'), ('Bilibili', 'BV1TU4y1G7Je')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Ethical_concerns_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Summary\n",
+    "\n",
+    "* Optimization is necessary to create Deep Learning models that are guaranteed to converge\n",
+    "* Stochastic Gradient Descent and Momentum are two commonly used optimization techniques\n",
+    "* RMSProp is a way of adaptive hyperparameter tuning which utilises a per-dimension learning rate\n",
+    "* Poor choice of optimization objectives can lead to unforeseen, undesirable consequences\n",
+    "\n",
+    "If you have time left, you can read the Bonus material, where we put it all together and we compare our model with a benchmark model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Bonus: Putting it all together\n",
+    "\n",
+    "*Time estimate: ~40 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "We have progressively built a sophisticated optimization algorithm, which is able to deal with a non-convex, poorly conditioned problem concerning tens of thousands of training examples. Now we present _you_ with a small challenge: beat us! :P\n",
+    "\n",
+    "Your mission is to train an MLP model that can compete with a benchmark model which we have pre-trained for you. In this section you will be able to use the full Pytorch power: loading the data, defining the model, sampling minibatches as well as Pytorch's **optimizer implementations**.\n",
+    "\n",
+    "There is a big engineering component behind the design of optimizers and their implementation can sometimes become tricky. So unless you are directly doing research in optimization, it's recommended to use an implementation provided by a widely reviewed open-source library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 9: Putting it all together\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'DP9c13vLiOM'), ('Bilibili', 'BV1MK4y1u7u2')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Putting_it_all_together_Bonus_Video\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Download parameters of the benchmark model\n",
+    "import requests\n",
+    "\n",
+    "fname = 'benchmark_model.pt'\n",
+    "url = \"https://osf.io/sj4e8/download\"\n",
+    "r = requests.get(url, allow_redirects=True)\n",
+    "with open(fname, 'wb') as fh:\n",
+    "  fh.write(r.content)\n",
+    "\n",
+    "# Load the benchmark model's parameters\n",
+    "DEVICE = set_device()\n",
+    "if DEVICE == \"cuda\":\n",
+    "  benchmark_state_dict = torch.load(fname)\n",
+    "else:\n",
+    "  benchmark_state_dict = torch.load(fname, map_location=torch.device('cpu'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# Create MLP object and update weights with those of saved model\n",
+    "benchmark_model = MLP(in_dim=784, out_dim=10,\n",
+    "                      hidden_dims=[200, 100, 50]).to(DEVICE)\n",
+    "benchmark_model.load_state_dict(benchmark_state_dict)\n",
+    "\n",
+    "\n",
+    "# Define helper function to evaluate models\n",
+    "def eval_model(model, data_loader, num_batches=np.inf, device='cpu'):\n",
+    "  \"\"\"\n",
+    "  To evaluate a given model\n",
+    "\n",
+    "  Args:\n",
+    "    model: nn.Module derived class\n",
+    "      The model which is to be evaluated\n",
+    "    data_loader: Iterable\n",
+    "      A configured dataloading utility\n",
+    "    num_batches: Integer\n",
+    "      Evaluation stops once the batch index exceeds this value (default: no limit)\n",
+    "    device: String\n",
+    "      Sets the device. CUDA if available, CPU otherwise\n",
+    "\n",
+    "  Returns:\n",
+    "    Mean of the per-batch losses and mean of the per-batch accuracies\n",
+    "  \"\"\"\n",
+    "\n",
+    "  loss_log, acc_log = [], []\n",
+    "  model.to(device=device)\n",
+    "\n",
+    "  # We are just evaluating the model, no need to compute gradients\n",
+    "  with torch.no_grad():\n",
+    "    for batch_id, batch in enumerate(data_loader):\n",
+    "      # If we only evaluate a number of batches, stop after we reach that number\n",
+    "      if batch_id > num_batches:\n",
+    "        break\n",
+    "      # Extract minibatch data\n",
+    "      data, labels = batch[0].to(device), batch[1].to(device)\n",
+    "      # Evaluate model and loss on minibatch\n",
+    "      preds = model(data)\n",
+    "      loss_log.append(loss_fn(preds, labels).item())\n",
+    "      acc_log.append(torch.mean(1. * (preds.argmax(dim=1) == labels)).item())\n",
+    "\n",
+    "  return np.mean(loss_log), np.mean(acc_log)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "We define an optimizer in the following steps:\n",
+    "\n",
+    "1. Load  the corresponding class that implements the parameter updates and other internal management activities, including:\n",
+    "    - create auxiliary variables,\n",
+    "    - update moving averages,\n",
+    "    - adjust the learning rate.\n",
+    "2. Pass the parameters of the Pytorch model that the optimizer has control over. Note that different optimizers can potentially control different parameter groups.\n",
+    "3. Specify hyperparameters, including learning rate, momentum, moving average factors, etc.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Exercise Bonus: Train your own model\n",
+    "\n",
+    "Now, train the model with your preferred optimizer and find a good combination of hyperparameter settings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "#################################################\n",
+    "## TODO for students: adjust training settings ##\n",
+    "\n",
+    "# The three parameters below are in your full control\n",
+    "MAX_EPOCHS = 2  # select number of epochs to train\n",
+    "LR = 1e-5  # choose the step size\n",
+    "BATCH_SIZE = 64  # number of examples per minibatch\n",
+    "\n",
+    "# Define the model and associated optimizer -- you may change its architecture!\n",
+    "my_model = MLP(in_dim=784, out_dim=10, hidden_dims=[200, 100, 50]).to(DEVICE)\n",
+    "\n",
+    "# You can take your pick from many different optimizers\n",
+    "# Check the optimizer documentation and hyperparameter meaning before using!\n",
+    "# More details on Pytorch optimizers: https://pytorch.org/docs/stable/optim.html\n",
+    "# optimizer = torch.optim.SGD(my_model.parameters(), lr=LR, momentum=0.9)\n",
+    "# optimizer = torch.optim.RMSprop(my_model.parameters(), lr=LR, alpha=0.99)\n",
+    "# optimizer = torch.optim.Adagrad(my_model.parameters(), lr=LR)\n",
+    "optimizer = torch.optim.Adam(my_model.parameters(), lr=LR)\n",
+    "#################################################"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "set_seed(seed=SEED)\n",
+    "# Print training stats every LOG_FREQ minibatches\n",
+    "LOG_FREQ = 200\n",
+    "# Frequency for evaluating the validation metrics\n",
+    "VAL_FREQ = 200\n",
+    "# Load data using a Pytorch Dataset\n",
+    "train_set_orig, test_set_orig = load_mnist_data(change_tensors=False)\n",
+    "\n",
+    "# We separate 10,000 training samples to create a validation set\n",
+    "train_set_orig, val_set_orig = torch.utils.data.random_split(train_set_orig, [50000, 10000])\n",
+    "\n",
+    "# Create the corresponding DataLoaders for training and test\n",
+    "g_seed = torch.Generator()\n",
+    "g_seed.manual_seed(SEED)\n",
+    "\n",
+    "train_loader = torch.utils.data.DataLoader(train_set_orig,\n",
+    "                                           shuffle=True,\n",
+    "                                           batch_size=BATCH_SIZE,\n",
+    "                                           num_workers=2,\n",
+    "                                           worker_init_fn=seed_worker,\n",
+    "                                           generator=g_seed)\n",
+    "val_loader = torch.utils.data.DataLoader(val_set_orig,\n",
+    "                                         shuffle=True,\n",
+    "                                         batch_size=256,\n",
+    "                                         num_workers=2,\n",
+    "                                         worker_init_fn=seed_worker,\n",
+    "                                         generator=g_seed)\n",
+    "test_loader = torch.utils.data.DataLoader(test_set_orig,\n",
+    "                                          batch_size=256,\n",
+    "                                          num_workers=2,\n",
+    "                                          worker_init_fn=seed_worker,\n",
+    "                                          generator=g_seed)\n",
+    "\n",
+    "# Run training\n",
+    "metrics = {'train_loss':[],\n",
+    "           'train_acc':[],\n",
+    "           'val_loss':[],\n",
+    "           'val_acc':[],\n",
+    "           'val_idx':[]}\n",
+    "\n",
+    "step_idx = 0\n",
+    "for epoch in tqdm(range(MAX_EPOCHS)):\n",
+    "\n",
+    "  running_loss, running_acc = 0., 0.\n",
+    "\n",
+    "  for batch_id, batch in enumerate(train_loader):\n",
+    "    step_idx += 1\n",
+    "    # Extract minibatch data and labels\n",
+    "    data, labels = batch[0].to(DEVICE), batch[1].to(DEVICE)\n",
+    "    # Just like before, refresh gradient accumulators.\n",
+    "    # Note that this is now a method of the optimizer.\n",
+    "    optimizer.zero_grad()\n",
+    "    # Evaluate model and loss on minibatch\n",
+    "    preds = my_model(data)\n",
+    "    loss = loss_fn(preds, labels)\n",
+    "    acc = torch.mean(1.0 * (preds.argmax(dim=1) == labels))\n",
+    "    # Compute gradients\n",
+    "    loss.backward()\n",
+    "    # Update parameters\n",
+    "    # Note how all the magic in the update of the parameters is encapsulated by\n",
+    "    # the optimizer class.\n",
+    "    optimizer.step()\n",
+    "    # Log metrics for plotting\n",
+    "    metrics['train_loss'].append(loss.cpu().item())\n",
+    "    metrics['train_acc'].append(acc.cpu().item())\n",
+    "\n",
+    "    if batch_id % VAL_FREQ == (VAL_FREQ - 1):\n",
+    "      # Get an estimate of the validation accuracy with 100 batches\n",
+    "      val_loss, val_acc = eval_model(my_model, val_loader,\n",
+    "                                     num_batches=100,\n",
+    "                                     device=DEVICE)\n",
+    "      metrics['val_idx'].append(step_idx)\n",
+    "      metrics['val_loss'].append(val_loss)\n",
+    "      metrics['val_acc'].append(val_acc)\n",
+    "\n",
+    "      print(f\"[VALID] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
+    "            f\"Loss: {val_loss:.3f} - Acc: {100*val_acc:.3f}%\")\n",
+    "\n",
+    "    # print statistics\n",
+    "    running_loss += loss.cpu().item()\n",
+    "    running_acc += acc.cpu().item()\n",
+    "    # Print every LOG_FREQ minibatches\n",
+    "    if batch_id % LOG_FREQ == (LOG_FREQ-1):\n",
+    "      print(f\"[TRAIN] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
+    "            f\"Loss: {running_loss / LOG_FREQ:.3f} - \"\n",
+    "            f\"Acc: {100 * running_acc / LOG_FREQ:.3f}%\")\n",
+    "\n",
+    "      running_loss, running_acc = 0., 0."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n",
+    "\n",
+    "ax[0].plot(range(len(metrics['train_loss'])), metrics['train_loss'],\n",
+    "           alpha=0.8, label='Train')\n",
+    "ax[0].plot(metrics['val_idx'], metrics['val_loss'], label='Valid')\n",
+    "ax[0].set_xlabel('Iteration')\n",
+    "ax[0].set_ylabel('Loss')\n",
+    "ax[0].legend()\n",
+    "\n",
+    "ax[1].plot(range(len(metrics['train_acc'])), metrics['train_acc'],\n",
+    "           alpha=0.8, label='Train')\n",
+    "ax[1].plot(metrics['val_idx'], metrics['val_acc'], label='Valid')\n",
+    "ax[1].set_xlabel('Iteration')\n",
+    "ax[1].set_ylabel('Accuracy')\n",
+    "ax[1].legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Train_your_own_model_Bonus_Exercise\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! Bonus: Metrics\n",
+    "\n",
+    "Which metric did you optimize when searching for the right configuration? The training set loss? Accuracy? Validation/test set metrics? Why? Discuss!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "  Remember the discussion in Section 1 about surrogate objectives.\n",
+    "Our optimization methods minimize the loss, but at the end of the day we care about test accuracy.\n",
+    "\n",
+    "  However, we can't directly optimize for test accuracy and the finite size of our\n",
+    "datasets leads us to (cross-)validation:\n",
+    "\n",
+    "  1. We minimize the loss (empirical risk minimization) on our *training set*.\n",
+    "  2. We choose models and hyperparameters on the *validation set*.\n",
+    "  3. We use the *test set* in order to report the final performance of our model on unseen data.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Metrics_Bonus_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Evaluation\n",
+    "\n",
+    "We _finally_ can evaluate and compare the performance of the models on previously unseen examples.\n",
+    "\n",
+    "Which model would you keep? (\\*drum roll\\*)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "print('Your model...')\n",
+    "train_loss, train_accuracy = eval_model(my_model, train_loader, device=DEVICE)\n",
+    "test_loss, test_accuracy = eval_model(my_model, test_loader, device=DEVICE)\n",
+    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
+    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')\n",
+    "\n",
+    "print('\\nBenchmark model')\n",
+    "train_loss, train_accuracy = eval_model(benchmark_model, train_loader, device=DEVICE)\n",
+    "test_loss, test_accuracy = eval_model(benchmark_model, test_loader, device=DEVICE)\n",
+    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
+    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "include_colab_link": true,
+   "name": "W1D4_Tutorial1",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernel": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.11"
+  },
+  "toc-autonumbering": true,
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index e3004c61a..edbf036e4 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -44,9 +44,6 @@
     "Objectives:\n",
     "*   Necessity and importance of optimization\n",
     "*   Introduction to commonly used optimization techniques\n",
-    "*   Optimization in non-convex loss landscapes\n",
-    "*   'Adaptive' hyperparameter tuning\n",
-    "*   Ethical concerns\n",
     "\n"
    ]
   },
@@ -1485,7 +1482,7 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 4: Implement momentum\n",
+    "## Coding Exercise 4 *(optional)*: Implement momentum\n",
     "\n",
     "In this exercise you will implement the momentum update given by:\n",
     "\n",
@@ -2032,1272 +2029,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 5: Non-convexity\n",
-    "\n",
-    "*Time estimate: ~30 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "The introduction of even just 1 hidden layer in the neural network transforms the previous convex optimization problem into a non-convex one. And with great non-convexity, comes great responsibility... (Sorry, we couldn't help it!)\n",
-    "\n",
-    "**Note:** From this section onwards we will be dealing with non-convex optimization problems for the remainder of the tutorial."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 5: Overparameterization\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', '7vUpUEKKl5o'), ('Bilibili', 'BV16h41167Jr')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Overparameterization_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Take a couple of minutes to play with a more complex 3D visualization of the loss landscape of a neural network on a non-convex problem. Visit https://losslandscape.com/explorer.\n",
-    "\n",
-    "1. Explore the features on the bottom left corner. You can see an explanation for each icon by clicking on the ( i ) button located on the top right corner.\n",
-    "2. Use the 'gradient descent' feature to perform a thought experiment:\n",
-    "    -   Choose an initialization\n",
-    "    -   Choose the learning rate\n",
-    "    -   Mentally formulate your hypothesis about what kind of trajectory you expect to observe\n",
-    "3. Run the experiment and contrast your intuition with the observed behavior.\n",
-    "4. Repeat this experiment a handful of times for several initialization/learning rate configurations\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 5: Overparameterization to the rescue!\n",
-    "\n",
-    "As you may have seen, the non-convex nature of the surface can lead the optimization process to get stuck in undesirable local-optima. There is ample empirical evidence supporting the claim that 'overparameterized' models are easier to train.\n",
-    "\n",
-    "We will explore this assertion in the context of our MLP training. For this, we initialize a fixed model and construct several models by small random perturbations to the original initialized weights. Now, we train each of these perturbed models and see how the loss evolves. If we were in the convex setting, we should reach very similar objective values upon convergence since all these models were very close at the beginning of training, and in convex problems, the local optimum is also the global optimum.\n",
-    "\n",
-    "Use the interactive plot below to visualize the loss progression for these perturbed models:\n",
-    "\n",
-    "1. Select different settings from the `hidden_dims` drop-down menu.\n",
-    "2. Explore the effect of the number of steps and learning rate."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "\n",
-    "@widgets.interact_manual\n",
-    "def overparam(max_steps=widgets.IntSlider(150, 50, 500, 5),\n",
-    "              hidden_dims=widgets.Dropdown(options=[\"10\", \"20, 20\", \"100, 100\"],\n",
-    "                                           value=\"10\"),\n",
-    "              lr=widgets.FloatLogSlider(value=5e-2, min=-3, max=0, step=0.1),\n",
-    "              num_inits=widgets.IntSlider(7, 5, 10, 1)):\n",
-    "  \"\"\"\n",
-    "  Displays the overparameterization phenomenon as a widget\n",
-    "\n",
-    "  Args:\n",
-    "    max_steps: widget integer slider\n",
-    "      Maximum number of steps on the slider with default = 150\n",
-    "    hidden_dims: widget dropdown menu instance\n",
-    "      The number of hidden dimensions with default = 10\n",
-    "    lr: widget float slider\n",
-    "      Scalar specifying the learning rate or step-size for the update with default = 5e-2\n",
-    "    num_inits: widget integer slider\n",
-    "      Scalar number of epochs\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "\n",
-    "  X, y = train_set.data[subset_index, :], train_set.targets[subset_index]\n",
-    "\n",
-    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
-    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, 1, figsize=(5, 4))\n",
-    "\n",
-    "  for _ in tqdm(range(num_inits)):\n",
-    "    model = copy.deepcopy(base_model)\n",
-    "    random_update(model, noise_scale=2e-1)\n",
-    "    loss_hist = np.zeros((max_steps, 2))\n",
-    "    for step in range(max_steps):\n",
-    "      loss = loss_fn(model(X), y)\n",
-    "      gradient_update(loss, list(model.parameters()), lr=lr)\n",
-    "      loss_hist[step] = np.array([step, loss.item()])\n",
-    "\n",
-    "    plt.plot(loss_hist[:, 0], loss_hist[:, 1])\n",
-    "\n",
-    "  plt.xlabel('Iteration')\n",
-    "  plt.ylabel('Loss')\n",
-    "  plt.ylim(0, 3)\n",
-    "  plt.show()\n",
-    "\n",
-    "  num_params = sum([np.prod(_.shape) for _ in model.parameters()])\n",
-    "  print('Number of parameters in model:  ' + str(num_params))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Overparameterization_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Think! 5.1: Width and depth of the network\n",
-    "\n",
-    "- We see that as we increase the width/depth of the network, training becomes faster and more consistent across different initializations. What might be the reasons for this behavior?\n",
-    "\n",
-    "- What are some potential downsides of this approach to dealing with non-convexity?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "- The exact mechanism for this phenomenon is still under active research.\n",
-    "Existing evidence points to the following: in the overparameterized setting,\n",
-    "there are many more 'good configurations' (values of the model’s weights) that\n",
-    "lead to a low value of the objective. Furthermore, this large set of possible solutions\n",
-    "seems to be increasingly easy to find in the space of all possible\n",
-    "parameter configurations. As you increase the number of parameters, it becomes\n",
-    "more likely that your initialization will be close to one of these good parameter settings.\n",
-    "\n",
-    "- This approach will require more memory and computation. Furthermore, we need\n",
-    "to always be aware of the risk of overfitting: don’t forget to do cross-validation\n",
-    "in order to be able to detect overfitting.\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Width_and_depth_of_the_network_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 6: Full gradients are expensive\n",
-    "\n",
-    "*Time estimate: ~25 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "So far we have used only a small (fixed) subset of 500 training examples to perform the updates on the model parameters in our quest to minimize the loss. But what if we decided to use the training set? Do our current approach scale to datasets with tens of thousands, or millions of datapoints?\n",
-    "\n",
-    "In this section we explore an efficient alternative to avoid having to perform computations on all the training examples before performing a parameter update."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 6: Mini-batches\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'hbqUxpNBUGk'), ('Bilibili', 'BV1ty4y1T7Uh')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Mini_batches_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 6.1: Cost of computation\n",
-    "\n",
-    "Evaluating a neural network is a relatively fast process. However, when repeated millions of times, the computational cost of performing forward and backward passes through the network starts to become significant.\n",
-    "\n",
-    "In the visualization below, we show the time (averaged over 5 runs) of computing a forward and backward pass with a changing number of input examples. Choose from the different options in the drop-down box and note how the vertical scale changes depending on the size of the network.\n",
-    "\n",
-    "**Remarks:** Note that the computational cost of a forward pass shows a clear linear relationship with the number of input examples, and the cost of the corresponding backward pass exhibits a similar computational complexity."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "\n",
-    "def gradient_update(loss, params, lr=1e-3):\n",
-    "  \"\"\"\n",
-    "  Perform a gradient descent update on a given loss over a collection of parameters\n",
-    "\n",
-    "  Args:\n",
-    "    loss: Tensor\n",
-    "      A scalar tensor containing the loss through which the gradient will be computed\n",
-    "    params: List of iterables\n",
-    "      Collection of parameters with respect to which we compute gradients\n",
-    "    lr: Float\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
-    "  # successive backward calls\n",
-    "  zero_grad(params)\n",
-    "\n",
-    "  # Compute gradients on given objective\n",
-    "  loss.backward()\n",
-    "\n",
-    "  with torch.no_grad():\n",
-    "    for par in params:\n",
-    "       par.data -= lr * par.grad.data\n",
-    "\n",
-    "\n",
-    "def measure_update_time(model, num_points):\n",
-    "  \"\"\"\n",
-    "  Measuring the time for update\n",
-    "\n",
-    "  Args:\n",
-    "    model: an nn.Module inherited model\n",
-    "      Represents the ML/DL model\n",
-    "    num_points: integer\n",
-    "      The number of data points in the train_set\n",
-    "\n",
-    "  Returns:\n",
-    "    tuple of loss time and time for calculation of gradient\n",
-    "  \"\"\"\n",
-    "  X, y = train_set.data[:num_points], train_set.targets[:num_points]\n",
-    "  start_time = time.time()\n",
-    "  loss = loss_fn(model(X), y)\n",
-    "  loss_time = time.time()\n",
-    "  gradient_update(loss, list(model.parameters()), lr=0)\n",
-    "  gradient_time = time.time()\n",
-    "  return loss_time - start_time, gradient_time - loss_time\n",
-    "\n",
-    "\n",
-    "@widgets.interact\n",
-    "def computation_time(hidden_dims=widgets.Dropdown(options=[\"1\", \"100\", \"50, 50\"],\n",
-    "                                                  value=\"100\")):\n",
-    "  \"\"\"\n",
-    "  Demonstrating time taken for computation as a widget\n",
-    "\n",
-    "  Args:\n",
-    "    hidden_dims: widgets dropdown\n",
-    "      The number of hidden dimensions with default = 100\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
-    "  model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
-    "\n",
-    "  NUM_POINTS = [1, 5, 10, 100, 200, 500, 1000, 5000, 10000, 20000, 30000, 50000]\n",
-    "  times_list = []\n",
-    "  for _ in range(5):\n",
-    "    times_list.append(np.array([measure_update_time(model, _) for _ in NUM_POINTS]))\n",
-    "\n",
-    "  times = np.array(times_list).mean(axis=0)\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, 1, figsize=(5,4))\n",
-    "  plt.plot(NUM_POINTS, times[:, 0], label='Forward')\n",
-    "  plt.plot(NUM_POINTS, times[:, 1], label='Backward')\n",
-    "  plt.xlabel('Number of data points')\n",
-    "  plt.ylabel('Seconds')\n",
-    "  plt.legend()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Cost_of_computation_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Coding Exercise 6: Implement minibatch sampling\n",
-    "\n",
-    "Complete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "def sample_minibatch(input_data, target_data, num_points=100):\n",
-    "  \"\"\"\n",
-    "  Sample a minibatch of size num_point from the provided input-target data\n",
-    "\n",
-    "  Args:\n",
-    "    input_data: Tensor\n",
-    "      Multi-dimensional tensor containing the input data\n",
-    "    target_data: Tensor\n",
-    "      1D tensor containing the class labels\n",
-    "    num_points: Integer\n",
-    "      Number of elements to be included in minibatch with default=100\n",
-    "\n",
-    "  Returns:\n",
-    "    batch_inputs: Tensor\n",
-    "      Minibatch inputs\n",
-    "    batch_targets: Tensor\n",
-    "      Minibatch targets\n",
-    "  \"\"\"\n",
-    "  #################################################\n",
-    "  ## TODO for students: sample minibatch of data ##\n",
-    "  raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
-    "  #################################################\n",
-    "  # Sample a collection of IID indices from the existing data\n",
-    "  batch_indices = ...\n",
-    "  # Use batch_indices to extract entries from the input and target data tensors\n",
-    "  batch_inputs = input_data[...]\n",
-    "  batch_targets = target_data[...]\n",
-    "\n",
-    "  return batch_inputs, batch_targets\n",
-    "\n",
-    "\n",
-    "\n",
-    "## Uncomment to test your function\n",
-    "# x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
-    "# print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "```\n",
-    "The input shape is torch.Size([100, 28, 28]) and the target shape is: torch.Size([100])\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove solution\n",
-    "def sample_minibatch(input_data, target_data, num_points=100):\n",
-    "  \"\"\"\n",
-    "  Sample a minibatch of size num_point from the provided input-target data\n",
-    "\n",
-    "  Args:\n",
-    "    input_data: Tensor\n",
-    "      Multi-dimensional tensor containing the input data\n",
-    "    target_data: Tensor\n",
-    "      1D tensor containing the class labels\n",
-    "    num_points: Integer\n",
-    "      Number of elements to be included in minibatch with default=100\n",
-    "\n",
-    "  Returns:\n",
-    "    batch_inputs: Tensor\n",
-    "      Minibatch inputs\n",
-    "    batch_targets: Tensor\n",
-    "      Minibatch targets\n",
-    "  \"\"\"\n",
-    "  # Sample a collection of IID indices from the existing data\n",
-    "  batch_indices = np.random.choice(len(input_data), num_points)\n",
-    "  # Use batch_indices to extract entries from the input and target data tensors\n",
-    "  batch_inputs = input_data[batch_indices, :]\n",
-    "  batch_targets = target_data[batch_indices]\n",
-    "\n",
-    "  return batch_inputs, batch_targets\n",
-    "\n",
-    "\n",
-    "\n",
-    "## Uncomment to test your function\n",
-    "x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
-    "print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Implement_mini_batch_sampling_Exercise\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 6.2: *Compare* different minibatch sizes\n",
-    "\n",
-    "What are the trade-offs induced by the choice of minibatch size? The interactive plot below shows the training evolution of a 2-hidden layer MLP with 100 hidden units in each hidden layer. Different plots correspond to a different choice of minibatch size. We have a fixed time budget for all the cases, reflected in the horizontal axes of these plots."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "\n",
-    "@widgets.interact_manual\n",
-    "def minibatch_experiment(batch_sizes='20, 250, 1000',\n",
-    "                         lrs='5e-3, 5e-3, 5e-3',\n",
-    "                         time_budget=widgets.Dropdown(options=[\"2.5\", \"5\", \"10\"],\n",
-    "                                                      value=\"2.5\")):\n",
-    "  \"\"\"\n",
-    "  Demonstration of minibatch experiment\n",
-    "\n",
-    "  Args:\n",
-    "    batch_sizes: String\n",
-    "      Size of minibatches\n",
-    "    lrs: String\n",
-    "      Different learning rates\n",
-    "    time_budget: widget dropdown instance\n",
-    "      Different time budgets with default=2.5s\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  batch_sizes = [int(s) for s in batch_sizes.split(',')]\n",
-    "  lrs = [float(s) for s in lrs.split(',')]\n",
-    "\n",
-    "  LOSS_HIST = {_:[] for _ in batch_sizes}\n",
-    "\n",
-    "  X, y = train_set.data, train_set.targets\n",
-    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
-    "\n",
-    "  for id, batch_size in enumerate(tqdm(batch_sizes)):\n",
-    "    start_time = time.time()\n",
-    "    # Create a new copy of the model for each batch size\n",
-    "    model = copy.deepcopy(base_model)\n",
-    "    params = list(model.parameters())\n",
-    "    lr = lrs[id]\n",
-    "    # Fixed budget per choice of batch size\n",
-    "    while (time.time() - start_time) < float(time_budget):\n",
-    "      data, labels = sample_minibatch(X, y, batch_size)\n",
-    "      loss = loss_fn(model(data), labels)\n",
-    "      gradient_update(loss, params, lr=lr)\n",
-    "      LOSS_HIST[batch_size].append([time.time() - start_time,\n",
-    "                                    loss.item()])\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, len(batch_sizes), figsize=(10, 3))\n",
-    "  for ax, batch_size in zip(axs, batch_sizes):\n",
-    "    plot_data = np.array(LOSS_HIST[batch_size])\n",
-    "    ax.plot(plot_data[:, 0], plot_data[:, 1], label=batch_size,\n",
-    "            alpha=0.8)\n",
-    "    ax.set_title('Batch size: ' + str(batch_size))\n",
-    "    ax.set_xlabel('Seconds')\n",
-    "    ax.set_ylabel('Loss')\n",
-    "  plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "**Remarks:** SGD works! We have an algorithm that can be applied (with due precautions) to learn datasets of arbitrary size.\n",
-    "\n",
-    "However, **note the difference in the vertical scale** across the plots above. When using a larger minibatch, we can perform fewer parameter updates as the forward and backward passes are more expensive.\n",
-    "\n",
-    "This highlights the interplay between the minibatch size and the learning rate: when our minibatch is larger, we have a more confident estimator of the direction to move, and thus can afford a larger learning rate. On the other hand, extremely small minibatches are very fast computationally but are not representative of the data distribution and yield estimations of the gradient with high variance.\n",
-    "\n",
-    "We encourage you to tune the value of the learning rate for each of the minibatch sizes in the previous demo, to achieve a training loss steadily below 0.5 within 5 seconds."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Compare_different_minibatch_sizes_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 7: Adaptive methods\n",
-    "\n",
-    "*Time estimate: ~25 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "As of now, you should be aware that there are many knobs to turn when working on a machine learning problem. Some of these relate to the optimization algorithm, the choice of model, or the objective to minimize. Here are some prototypical examples:\n",
-    "\n",
-    "- Problem: loss function, regularization coefficients (Week 1, Day 5)\n",
-    "- Model: architecture, activations function\n",
-    "- Optimizer: learning rate, batch size, momentum coefficient\n",
-    "\n",
-    "We concentrate on the choices that are directly related to optimization. In particular, we will explore some _automatic_ methods for setting the learning rate in a way that fixes the poor-conditioning problem and is robust across different problems.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 7: Adaptive Methods\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'Zr6r2kfmQUM'), ('Bilibili', 'BV1eq4y1W7JG')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Adaptive_Methods_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Coding Exercise 7: Implement RMSprop\n",
-    "\n",
-    "In this exercise you will implement the update of the RMSprop optimizer:\n",
-    "\n",
-    "\\begin{align}\n",
-    "v_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\n",
-    "w_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n",
-    "\\end{align}\n",
-    "\n",
-    "where the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n",
-    "\n",
-    "Here, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
-    "  \"\"\"\n",
-    "  Perform an RMSprop update on a collection of parameters\n",
-    "\n",
-    "  Args:\n",
-    "    loss: Tensor\n",
-    "      A scalar tensor containing the loss whose gradient will be computed\n",
-    "    params: Iterable\n",
-    "      Collection of parameters with respect to which we compute gradients\n",
-    "    grad_sq: Iterable\n",
-    "      Moving average of squared gradients\n",
-    "    lr: Float\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "    alpha: Float\n",
-    "      Moving average parameter\n",
-    "    epsilon: Float\n",
-    "      quotient for numerical stability\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
-    "  # successive backward calls\n",
-    "  zero_grad(params)\n",
-    "  # Compute gradients on given objective\n",
-    "  loss.backward()\n",
-    "\n",
-    "  with torch.no_grad():\n",
-    "    for (par, gsq) in zip(params, grad_sq):\n",
-    "      #################################################\n",
-    "      ## TODO for students: update the value of the parameter ##\n",
-    "      # Use gsq.data and par.grad\n",
-    "      raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
-    "      #################################################\n",
-    "      # Update estimate of gradient variance\n",
-    "      gsq.data = ...\n",
-    "      # Update parameters\n",
-    "      par.data -=  ...\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "set_seed(seed=SEED)\n",
-    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
-    "print('\\n The model3 parameters before the update are: \\n')\n",
-    "print_params(model3)\n",
-    "loss = loss_fn(model3(X), y)\n",
-    "# Initialize the moving average of squared gradients\n",
-    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
-    "\n",
-    "\n",
-    "\n",
-    "## Uncomment below to test your function\n",
-    "# rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
-    "# print('\\n The model3 parameters after the update are: \\n')\n",
-    "# print_params(model3)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "```\n",
-    " The model3 parameters after the update are:\n",
-    "\n",
-    "main.0.weight tensor([[-0.0240,  0.0031,  0.0193,  ...,  0.0316,  0.0297, -0.0198],\n",
-    "        [-0.0063, -0.0318, -0.0109,  ..., -0.0093,  0.0232, -0.0255],\n",
-    "        [ 0.0218, -0.0253,  0.0320,  ...,  0.0102,  0.0248, -0.0203],\n",
-    "        ...,\n",
-    "        [-0.0027,  0.0136,  0.0089,  ...,  0.0123, -0.0324, -0.0166],\n",
-    "        [ 0.0159,  0.0281,  0.0233,  ..., -0.0133, -0.0197,  0.0182],\n",
-    "        [ 0.0186, -0.0376, -0.0205,  ..., -0.0293,  0.0077, -0.0019]])\n",
-    "main.0.bias tensor([-0.0313, -0.0011,  0.0122, -0.0342,  0.0045,  0.0199,  0.0329,  0.0265,\n",
-    "         0.0182, -0.0041])\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove solution\n",
-    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
-    "  \"\"\"\n",
-    "  Perform an RMSprop update on a collection of parameters\n",
-    "\n",
-    "  Args:\n",
-    "    loss: Tensor\n",
-    "      A scalar tensor containing the loss whose gradient will be computed\n",
-    "    params: Iterable\n",
-    "      Collection of parameters with respect to which we compute gradients\n",
-    "    grad_sq: Iterable\n",
-    "      Moving average of squared gradients\n",
-    "    lr: Float\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "    alpha: Float\n",
-    "      Moving average parameter\n",
-    "    epsilon: Float\n",
-    "      quotient for numerical stability\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
-    "  # successive backward calls\n",
-    "  zero_grad(params)\n",
-    "  # Compute gradients on given objective\n",
-    "  loss.backward()\n",
-    "\n",
-    "  with torch.no_grad():\n",
-    "    for (par, gsq) in zip(params, grad_sq):\n",
-    "      # Update estimate of gradient variance\n",
-    "      gsq.data = alpha * gsq.data + (1 - alpha) * par.grad**2\n",
-    "      # Update parameters\n",
-    "      par.data -=  lr * (par.grad / (epsilon + gsq.data)**0.5)\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "set_seed(seed=SEED)\n",
-    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
-    "print('\\n The model3 parameters before the update are: \\n')\n",
-    "print_params(model3)\n",
-    "loss = loss_fn(model3(X), y)\n",
-    "# Initialize the moving average of squared gradients\n",
-    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
-    "\n",
-    "## Uncomment below to test your function\n",
-    "rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
-    "print('\\n The model3 parameters after the update are: \\n')\n",
-    "print_params(model3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Implement_RMSProp_Exercise\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 7: Compare optimizers\n",
-    "\n",
-    "Below, we compare your implementations of **SGD**, **Momentum**, and **RMSprop**. If you have successfully coded all the exercises so far: congrats!\n",
-    "\n",
-    "You are now *in the know* of some of the most commonly used and powerful optimization tools for deep learning."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "X, y = train_set.data, train_set.targets\n",
-    "\n",
-    "@widgets.interact_manual\n",
-    "def compare_optimizers(\n",
-    "    batch_size=(25, 250, 5),\n",
-    "    lr=widgets.FloatLogSlider(value=2e-3, min=-5, max=0),\n",
-    "    max_steps=(50, 500, 5)):\n",
-    "  \"\"\"\n",
-    "  Demonstration to compare optimisers - stochastic gradient descent, momentum, RMSprop\n",
-    "\n",
-    "  Args:\n",
-    "    batch_size: Tuple\n",
-    "      Size of minibatches\n",
-    "    lr: Float log slider instance\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "    max_steps: Tuple\n",
-    "      Max number of step sizes for incrementing\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  SGD_DICT = [gradient_update, 'SGD', 'black', '-', {'lr': lr}]\n",
-    "  MOM_DICT = [momentum_update, 'Momentum', 'red', '--', {'lr': lr, 'beta': 0.9}]\n",
-    "  RMS_DICT = [rmsprop_update, 'RMSprop', 'fuchsia', '-', {'lr': lr, 'alpha': 0.8}]\n",
-    "\n",
-    "  ALL_DICTS = [SGD_DICT, MOM_DICT, RMS_DICT]\n",
-    "\n",
-    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
-    "\n",
-    "  LOSS_HIST = {}\n",
-    "\n",
-    "  for opt_dict in tqdm(ALL_DICTS):\n",
-    "    update_fn, opt_name, color, lstyle, kwargs = opt_dict\n",
-    "    LOSS_HIST[opt_name] = []\n",
-    "\n",
-    "    model = copy.deepcopy(base_model)\n",
-    "    params = list(model.parameters())\n",
-    "\n",
-    "    if opt_name != 'SGD':\n",
-    "      aux_tensors = [torch.zeros_like(_) for _ in params]\n",
-    "\n",
-    "    for step in range(max_steps):\n",
-    "      data, labels = sample_minibatch(X, y, batch_size)\n",
-    "      loss = loss_fn(model(data), labels)\n",
-    "      if opt_name == 'SGD':\n",
-    "        update_fn(loss, params, **kwargs)\n",
-    "      else:\n",
-    "        update_fn(loss, params, aux_tensors, **kwargs)\n",
-    "      LOSS_HIST[opt_name].append(loss.item())\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, len(ALL_DICTS), figsize=(9, 3))\n",
-    "  for ax, optim_dict in zip(axs, ALL_DICTS):\n",
-    "    opt_name = optim_dict[1]\n",
-    "    ax.plot(range(max_steps), LOSS_HIST[opt_name], alpha=0.8)\n",
-    "    ax.set_title(opt_name)\n",
-    "    ax.set_xlabel('Iteration')\n",
-    "    ax.set_ylabel('Loss')\n",
-    "    ax.set_ylim(0, 2.5)\n",
-    "  plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Compare_optimizers_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think 7.1!: Compare optimizers\n",
-    "\n",
-    "Tune the three methods above - **SGD**, **Momentum**, and **RMSProp** - to make each excel and discuss your findings. How do the methods compare in terms of robustness to small changes of the hyperparameters? How easy was it to find a good hyperparameter configuration?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "Stochastic Gradient Descent (SGD): Performs updates one example at a time.\n",
-    "Momentum: Helps accelerate SGD in the relevant direction and dampens\n",
-    "oscillations specially ravines.\n",
-    "RMSProp: Allows each parameter to be updated at an 'appropriate' rate decided\n",
-    "based on magnitudes of past recent updates;\n",
-    "i.e., areas where the surface curves much more steeply in one dimension than\n",
-    "in another, which are common around local optima.\n",
-    "\n",
-    "Robustness: RMSProp > Momentum > SGD\n",
-    "Since, each example affects SGD by updating hyperparameters, it's not\n",
-    "considered very robust.\n",
-    "Adagrad greatly improved the robustness of SGD and is used for training\n",
-    "large-scale neural nets.\n",
-    "Momentum is quite robust: he momentum term increases for dimensions whose\n",
-    "gradients point in the same directions\n",
-    "and reduces updates for dimensions whose gradients change directions.\n",
-    "RMSProp is very robust; This combines the idea of only using the sign of\n",
-    "the gradient with the idea of adapting the step size separately\n",
-    "for each weight in a mini-batch.\n",
-    "\n",
-    "Generally, non-adaptive methods consistently produce more robust models\n",
-    "than adaptive methods. Refer https://arxiv.org/pdf/1911.03784.pdf - for more details\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Compare_optimizers_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "**Remarks:** Note that RMSprop allows us to use a 'per-dimension' learning rate _without having to tune one learning rate for each dimension **ourselves**_. The method uses information collected about the variance of the gradients throughout training to **adapt** the step size for each of the parameters automatically. The savings in tuning efforts of RMSprop over SGD or 'plain' momentum are undisputed on this task.\n",
-    "\n",
-    "Moreover, adaptive optimization methods are currently a highly active research domain, with many related algorithms like Adam, AMSgrad, Adagrad being used in practical application and theoretically investigated."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Locality of Gradients\n",
-    "\n",
-    "As we've seen throughout this tutorial, poor conditioning can be a significant burden on convergence to an optimum while using gradient-based optimization. Of the methods we've seen to deal with this issue, notice how both momentum and adaptive learning rates incorporate past gradient values into their update schemes. Why do we use past values of our loss function's gradient while updating our current MLP weights?\n",
-    "\n",
-    "Recall from *W1D2* that the gradient of a function, $\\nabla f(w_t)$, is a **local** property and computes the direction of maximum change of $f(w_t)$ at the point $w_t$. However, when we train our MLP model we are hoping to find the **global** optimum for our training loss. By incorporating past values of our function's gradient into our optimization schemes, we use more information about the overall shape of our function than just a single gradient alone can provide."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! 7.2: Loss function and optimization\n",
-    "\n",
-    "Can you think of other ways we can incorporate more information about our loss function into our optimization schemes?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "We could consider incorporating the curvature of our function directly into our\n",
-    "optimization schemes. Methods that use this are often called Newton's methods\n",
-    "or Hessian based optimization methods.\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Loss_function_and_optimization_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 8: Ethical concerns\n",
-    "\n",
-    "*Time estimate: ~15mins*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 8: Ethical concerns\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', '0EthSI0cknI'), ('Bilibili', 'BV1TU4y1G7Je')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Ethical_concerns_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
+   "id": "summary",
    "metadata": {
     "execution": {}
    },
@@ -3306,459 +2038,10 @@
     "# Summary\n",
     "\n",
     "* Optimization is necessary to create Deep Learning models that are guaranteed to converge\n",
-    "* Stochastic Gradient Descent and Momentum are two commonly used optimization techniques\n",
-    "* RMSProp is a way of adaptive hyperparameter tuning which utilises a per-dimension learning rate\n",
-    "* Poor choice of optimization objectives can lead to unforeseen, undesirable consequences\n",
-    "\n",
-    "If you have time left, you can read the Bonus material, where we put it all together and we compare our model with a benchmark model."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Bonus: Putting it all together\n",
-    "\n",
-    "*Time estimate: ~40 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "We have progressively built a sophisticated optimization algorithm, which is able to deal with a non-convex, poor-conditioned problem concerning tens of thousands of training examples. Now we present _you_ with a small challenge: beat us! :P\n",
-    "\n",
-    "Your mission is to train an MLP model that can compete with a benchmark model which we have pre-trained for you. In this section you will be able to use the full Pytorch power: loading the data, defining the model, sampling minibatches as well as Pytorch's **optimizer implementations**.\n",
-    "\n",
-    "There is a big engineering component behind the design of optimizers and their implementation can sometimes become tricky. So unless you are directly doing research in optimization, it's recommended to use an implementation provided by a widely reviewed open-source library."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 9: Putting it all together\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'DP9c13vLiOM'), ('Bilibili', 'BV1MK4y1u7u2')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Putting_it_all_together_Bonus_Video\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Download parameters of the benchmark model\n",
-    "import requests\n",
-    "\n",
-    "fname = 'benchmark_model.pt'\n",
-    "url = \"https://osf.io/sj4e8/download\"\n",
-    "r = requests.get(url, allow_redirects=True)\n",
-    "with open(fname, 'wb') as fh:\n",
-    "  fh.write(r.content)\n",
-    "\n",
-    "# Load the benchmark model's parameters\n",
-    "DEVICE = set_device()\n",
-    "if DEVICE == \"cuda\":\n",
-    "  benchmark_state_dict = torch.load(fname)\n",
-    "else:\n",
-    "  benchmark_state_dict = torch.load(fname, map_location=torch.device('cpu'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# Create MLP object and update weights with those of saved model\n",
-    "benchmark_model = MLP(in_dim=784, out_dim=10,\n",
-    "                      hidden_dims=[200, 100, 50]).to(DEVICE)\n",
-    "benchmark_model.load_state_dict(benchmark_state_dict)\n",
-    "\n",
-    "\n",
-    "# Define helper function to evaluate models\n",
-    "def eval_model(model, data_loader, num_batches=np.inf, device='cpu'):\n",
-    "  \"\"\"\n",
-    "  To evaluate a given model\n",
-    "\n",
-    "  Args:\n",
-    "    model: nn.Module derived class\n",
-    "      The model which is to be evaluated\n",
-    "    data_loader: Iterable\n",
-    "      A configured dataloading utility\n",
-    "    num_batches: Integer\n",
-    "      Size of minibatches\n",
-    "    device: String\n",
-    "      Sets the device. CUDA if available, CPU otherwise\n",
-    "\n",
-    "  Returns:\n",
-    "    mean of log loss and mean of log accuracy\n",
-    "  \"\"\"\n",
-    "\n",
-    "  loss_log, acc_log = [], []\n",
-    "  model.to(device=device)\n",
-    "\n",
-    "  # We are just evaluating the model, no need to compute gradients\n",
-    "  with torch.no_grad():\n",
-    "    for batch_id, batch in enumerate(data_loader):\n",
-    "      # If we only evaluate a number of batches, stop after we reach that number\n",
-    "      if batch_id > num_batches:\n",
-    "        break\n",
-    "      # Extract minibatch data\n",
-    "      data, labels = batch[0].to(device), batch[1].to(device)\n",
-    "      # Evaluate model and loss on minibatch\n",
-    "      preds = model(data)\n",
-    "      loss_log.append(loss_fn(preds, labels).item())\n",
-    "      acc_log.append(torch.mean(1. * (preds.argmax(dim=1) == labels)).item())\n",
-    "\n",
-    "  return np.mean(loss_log), np.mean(acc_log)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "We define an optimizer in the following steps:\n",
-    "\n",
-    "1. Load  the corresponding class that implements the parameter updates and other internal management activities, including:\n",
-    "    - create auxiliary variables,\n",
-    "    - update moving averages,\n",
-    "    - adjust the learning rate.\n",
-    "2. Pass the parameters of the Pytorch model that the optimizer has control over. Note that different optimizers can potentially control different parameter groups.\n",
-    "3. Specify hyperparameters, including learning rate, momentum, moving average factors, etc.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Exercise Bonus: Train your own model\n",
-    "\n",
-    "Now, train the model with your preferred optimizer and find a good combination of hyperparameter settings."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "#################################################\n",
-    "## TODO for students: adjust training settings ##\n",
-    "\n",
-    "# The three parameters below are in your full control\n",
-    "MAX_EPOCHS = 2  # select number of epochs to train\n",
-    "LR = 1e-5  # choose the step size\n",
-    "BATCH_SIZE = 64  # number of examples per minibatch\n",
-    "\n",
-    "# Define the model and associated optimizer -- you may change its architecture!\n",
-    "my_model = MLP(in_dim=784, out_dim=10, hidden_dims=[200, 100, 50]).to(DEVICE)\n",
-    "\n",
-    "# You can take your pick from many different optimizers\n",
-    "# Check the optimizer documentation and hyperparameter meaning before using!\n",
-    "# More details on Pytorch optimizers: https://pytorch.org/docs/stable/optim.html\n",
-    "# optimizer = torch.optim.SGD(my_model.parameters(), lr=LR, momentum=0.9)\n",
-    "# optimizer = torch.optim.RMSprop(my_model.parameters(), lr=LR, alpha=0.99)\n",
-    "# optimizer = torch.optim.Adagrad(my_model.parameters(), lr=LR)\n",
-    "optimizer = torch.optim.Adam(my_model.parameters(), lr=LR)\n",
-    "#################################################"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "set_seed(seed=SEED)\n",
-    "# Print training stats every LOG_FREQ minibatches\n",
-    "LOG_FREQ = 200\n",
-    "# Frequency for evaluating the validation metrics\n",
-    "VAL_FREQ = 200\n",
-    "# Load data using a Pytorch Dataset\n",
-    "train_set_orig, test_set_orig = load_mnist_data(change_tensors=False)\n",
-    "\n",
-    "# We separate 10,000 training samples to create a validation set\n",
-    "train_set_orig, val_set_orig = torch.utils.data.random_split(train_set_orig, [50000, 10000])\n",
-    "\n",
-    "# Create the corresponding DataLoaders for training and test\n",
-    "g_seed = torch.Generator()\n",
-    "g_seed.manual_seed(SEED)\n",
-    "\n",
-    "train_loader = torch.utils.data.DataLoader(train_set_orig,\n",
-    "                                           shuffle=True,\n",
-    "                                           batch_size=BATCH_SIZE,\n",
-    "                                           num_workers=2,\n",
-    "                                           worker_init_fn=seed_worker,\n",
-    "                                           generator=g_seed)\n",
-    "val_loader = torch.utils.data.DataLoader(val_set_orig,\n",
-    "                                         shuffle=True,\n",
-    "                                         batch_size=256,\n",
-    "                                         num_workers=2,\n",
-    "                                         worker_init_fn=seed_worker,\n",
-    "                                         generator=g_seed)\n",
-    "test_loader = torch.utils.data.DataLoader(test_set_orig,\n",
-    "                                          batch_size=256,\n",
-    "                                          num_workers=2,\n",
-    "                                          worker_init_fn=seed_worker,\n",
-    "                                          generator=g_seed)\n",
-    "\n",
-    "# Run training\n",
-    "metrics = {'train_loss':[],\n",
-    "           'train_acc':[],\n",
-    "           'val_loss':[],\n",
-    "           'val_acc':[],\n",
-    "           'val_idx':[]}\n",
-    "\n",
-    "step_idx = 0\n",
-    "for epoch in tqdm(range(MAX_EPOCHS)):\n",
-    "\n",
-    "  running_loss, running_acc = 0., 0.\n",
-    "\n",
-    "  for batch_id, batch in enumerate(train_loader):\n",
-    "    step_idx += 1\n",
-    "    # Extract minibatch data and labels\n",
-    "    data, labels = batch[0].to(DEVICE), batch[1].to(DEVICE)\n",
-    "    # Just like before, refresh gradient accumulators.\n",
-    "    # Note that this is now a method of the optimizer.\n",
-    "    optimizer.zero_grad()\n",
-    "    # Evaluate model and loss on minibatch\n",
-    "    preds = my_model(data)\n",
-    "    loss = loss_fn(preds, labels)\n",
-    "    acc = torch.mean(1.0 * (preds.argmax(dim=1) == labels))\n",
-    "    # Compute gradients\n",
-    "    loss.backward()\n",
-    "    # Update parameters\n",
-    "    # Note how all the magic in the update of the parameters is encapsulated by\n",
-    "    # the optimizer class.\n",
-    "    optimizer.step()\n",
-    "    # Log metrics for plotting\n",
-    "    metrics['train_loss'].append(loss.cpu().item())\n",
-    "    metrics['train_acc'].append(acc.cpu().item())\n",
-    "\n",
-    "    if batch_id % VAL_FREQ == (VAL_FREQ - 1):\n",
-    "      # Get an estimate of the validation accuracy with 100 batches\n",
-    "      val_loss, val_acc = eval_model(my_model, val_loader,\n",
-    "                                     num_batches=100,\n",
-    "                                     device=DEVICE)\n",
-    "      metrics['val_idx'].append(step_idx)\n",
-    "      metrics['val_loss'].append(val_loss)\n",
-    "      metrics['val_acc'].append(val_acc)\n",
-    "\n",
-    "      print(f\"[VALID] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
-    "            f\"Loss: {val_loss:.3f} - Acc: {100*val_acc:.3f}%\")\n",
-    "\n",
-    "    # print statistics\n",
-    "    running_loss += loss.cpu().item()\n",
-    "    running_acc += acc.cpu().item()\n",
-    "    # Print every LOG_FREQ minibatches\n",
-    "    if batch_id % LOG_FREQ == (LOG_FREQ-1):\n",
-    "      print(f\"[TRAIN] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
-    "            f\"Loss: {running_loss / LOG_FREQ:.3f} - \"\n",
-    "            f\"Acc: {100 * running_acc / LOG_FREQ:.3f}%\")\n",
-    "\n",
-    "      running_loss, running_acc = 0., 0."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n",
-    "\n",
-    "ax[0].plot(range(len(metrics['train_loss'])), metrics['train_loss'],\n",
-    "           alpha=0.8, label='Train')\n",
-    "ax[0].plot(metrics['val_idx'], metrics['val_loss'], label='Valid')\n",
-    "ax[0].set_xlabel('Iteration')\n",
-    "ax[0].set_ylabel('Loss')\n",
-    "ax[0].legend()\n",
-    "\n",
-    "ax[1].plot(range(len(metrics['train_acc'])), metrics['train_acc'],\n",
-    "           alpha=0.8, label='Train')\n",
-    "ax[1].plot(metrics['val_idx'], metrics['val_acc'], label='Valid')\n",
-    "ax[1].set_xlabel('Iteration')\n",
-    "ax[1].set_ylabel('Accuracy')\n",
-    "ax[1].legend()\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Train_your_own_model_Bonus_Exercise\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! Bonus: Metrics\n",
-    "\n",
-    "Which metric did you optimize when searching for the right configuration? The training set loss? Accuracy? Validation/test set metrics? Why? Discuss!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "  Remember the discussion in Section 1 about surrogate objectives.\n",
-    "Our optimization methods minimize the loss, but at the end of the day we care about test accuracy.\n",
-    "\n",
-    "  However, we can't directly optimize for test accuracy and the finite size of our\n",
-    "datasets lead us to (cross-)validation:\n",
-    "\n",
-    "  1. We minimize the loss (empirical risk minimization) on our *training set*.\n",
-    "  2. We choose models and hyperparameters on the *validation set*.\n",
-    "  3. We use the *test set* in order to report the final performance of our model on unseen data.\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Metrics_Bonus_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Evaluation\n",
-    "\n",
-    "We _finally_ can evaluate and compare the performance of the models on previously unseen examples.\n",
-    "\n",
-    "Which model would you keep? (\\*drum roll*)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "print('Your model...')\n",
-    "train_loss, train_accuracy = eval_model(my_model, train_loader, device=DEVICE)\n",
-    "test_loss, test_accuracy = eval_model(my_model, test_loader, device=DEVICE)\n",
-    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
-    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')\n",
+    "* Gradient descent leverages automatic differentiation to efficiently update model parameters\n",
+    "* Momentum helps overcome poor conditioning by accumulating gradient history across updates\n",
     "\n",
-    "print('\\nBenchmark model')\n",
-    "train_loss, train_accuracy = eval_model(benchmark_model, train_loader, device=DEVICE)\n",
-    "test_loss, test_accuracy = eval_model(benchmark_model, test_loader, device=DEVICE)\n",
-    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
-    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')"
+    "Continue to the Bonus Lecture for sections on non-convexity, mini-batches, adaptive methods, ethical concerns, and a hands-on training exercise.\n"
    ]
   }
  ],
@@ -3826,4 +2109,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
similarity index 99%
rename from tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb
rename to tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
index 7df248018..b695e38a3 100644
--- a/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
@@ -17,9 +17,9 @@
     "execution": {}
    },
    "source": [
-    "# Tutorial 2: Deep Learning Thinking 1: Cost Functions\n",
+    "# Tutorial 2: Deep Learning Case Study 1: Cost Functions\n",
     "\n",
-    "**Week 2, Day 2: Convnets and DL Thinking**\n",
+    "**Week 1, Day 4: Convnets and DL Case Study**\n",
     "\n",
     "**By Neuromatch Academy**\n",
     "\n",
@@ -113,7 +113,7 @@
    },
    "source": [
     "---\n",
-    "# Section 1: Intro to Deep Learning Thinking\n",
+    "# Section 1: Intro to Deep Learning Case Study\n",
     "\n",
     "\n",
     "\n",
@@ -132,7 +132,7 @@
    },
    "outputs": [],
    "source": [
-    "# @title Video 1: Intro to DL Thinking\n",
+    "# @title Video 1: Intro to DL Case Study\n",
     "from ipywidgets import widgets\n",
     "from IPython.display import YouTubeVideo\n",
     "from IPython.display import IFrame\n",
@@ -189,7 +189,7 @@
    "outputs": [],
    "source": [
     "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Intro_to_DL_Thinking_Video\")"
+    "content_review(f\"{feedback_prefix}_Intro_to_DL_Case_Study_Video\")"
    ]
   },
   {

From ad01ebabcc17f7d9770f07e2d35bf44326d66b01 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 5 Apr 2026 11:29:08 -0400
Subject: [PATCH 06/34] W1D4: update materials.yml and review plan for
 Tutorial2 + BonusLecture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- materials.yml: move Tutorial 2 slide entry from W2D2 to W1D4;
  update tutorial counts (W1D4: 1→2, W2D2: 2→1)
- curriculum_review_plan.md: add W1D5 flex day note to check
  project notebook day title consistency after restructuring

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tutorials/materials.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tutorials/materials.yml b/tutorials/materials.yml
index 17e2e5f3c..694b2f34f 100644
--- a/tutorials/materials.yml
+++ b/tutorials/materials.yml
@@ -44,7 +44,9 @@
   slides:
   - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/ft2sz/?direct%26mode=render%26action=download%26mode=render
     title: Tutorial 1
-  tutorials: 1
+  - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/szcjn/?direct%26mode=render%26action=download%26mode=render
+    title: Tutorial 2
+  tutorials: 2
 
 - day: W2D1
   category: Fine Tuning
@@ -66,11 +68,9 @@
   slides:
   - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/s8xz5/?direct%26mode=render%26action=download%26mode=render
     title: Tutorial 1
-  - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/szcjn/?direct%26mode=render%26action=download%26mode=render
-    title: Tutorial 2
   - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/r9pjc/?direct%26mode=render%26action=download%26mode=render
     title: Bonus Lecture
-  tutorials: 2
+  tutorials: 1
 
 - day: W2D3
   category: ConvNets and Generative Models

From 070b02cb2e5aee853649682808140344bf9d5ebd Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 5 Apr 2026 15:54:07 -0400
Subject: [PATCH 07/34] W2D2: rename folder to W2D2_Convnets, fix
 W1D4_Tutorial2 style
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rename W2D2_ConvnetsAndDlThinking → W2D2_Convnets to reflect that
  the DL Case Study tutorial has moved to W1D4
- materials.yml: update W2D2 day name to "Convnets"
- W1D4_Tutorial2: fix Colab/Kaggle badge paths, day label
  ("Convnets and DL Case Study" → "Optimization"), and
  feedback_prefix ("W2D2_T2" → "W1D4_T2")
- curriculum_review_plan.md: mark W2D2 rename as done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb    |  10 +++++-----
 .../README.md                                       |   0
 .../W2D2_BonusLecture.ipynb                         |   0
 .../W2D2_Tutorial1.ipynb                            |   0
 .../instructor/W2D2_BonusLecture.ipynb              |   0
 .../instructor/W2D2_Tutorial1.ipynb                 |   0
 .../instructor/W2D2_Tutorial2.ipynb                 |   0
 .../solutions/W2D2_Tutorial1_Solution_0adbc972.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_1279086f.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_168b8fcf.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_18b18cac.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_240aa557.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_309474b2.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_3ef24bd7.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_4f643447.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_78a81e50.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_7c652c63.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_7cc3340b.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_800ed014.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_82e644f4.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_ae125a93.py   |   0
 .../solutions/W2D2_Tutorial1_Solution_c295e530.py   |   0
 .../static/Backpropagation.gif                      | Bin
 .../static/PoolingConvolution.svg                   |   0
 .../static/W2D2_Tutorial1_Solution_0adbc972_3.png   | Bin
 .../static/W2D2_Tutorial1_Solution_1279086f_3.png   | Bin
 .../static/W2D2_Tutorial1_Solution_240aa557_3.png   | Bin
 .../static/W2D2_Tutorial1_Solution_78a81e50_2.png   | Bin
 .../static/W2D2_Tutorial1_Solution_78a81e50_3.png   | Bin
 .../static/W2D2_Tutorial1_Solution_78a81e50_4.png   | Bin
 .../static/chicago_skyline_shrunk_v2.bmp            | Bin
 .../static/correlation.svg                          |   0
 .../static/img_params.png                           | Bin
 .../static/interactive_demo2.2.html                 |   0
 .../static/interactive_demo2.html                   |   0
 .../static/interactive_demo3.3.html                 |   0
 .../static/interactive_demo3.html                   |   0
 .../static/relu.png                                 | Bin
 .../static/think0.png                               | Bin
 .../static/twain.txt                                |   0
 .../student/W2D2_BonusLecture.ipynb                 |   0
 .../student/W2D2_Tutorial1.ipynb                    |   0
 .../student/W2D2_Tutorial2.ipynb                    |   0
 tutorials/materials.yml                             |   2 +-
 45 files changed, 6 insertions(+), 6 deletions(-)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/README.md (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/W2D2_BonusLecture.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/W2D2_Tutorial1.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/instructor/W2D2_BonusLecture.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/instructor/W2D2_Tutorial1.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/instructor/W2D2_Tutorial2.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_0adbc972.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_1279086f.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_168b8fcf.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_18b18cac.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_240aa557.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_309474b2.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_4f643447.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_78a81e50.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_7c652c63.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_7cc3340b.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_800ed014.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_82e644f4.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_ae125a93.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/solutions/W2D2_Tutorial1_Solution_c295e530.py (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/Backpropagation.gif (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/PoolingConvolution.svg (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/W2D2_Tutorial1_Solution_0adbc972_3.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/W2D2_Tutorial1_Solution_1279086f_3.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/W2D2_Tutorial1_Solution_240aa557_3.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/W2D2_Tutorial1_Solution_78a81e50_2.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/W2D2_Tutorial1_Solution_78a81e50_3.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/W2D2_Tutorial1_Solution_78a81e50_4.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/chicago_skyline_shrunk_v2.bmp (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/correlation.svg (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/img_params.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/interactive_demo2.2.html (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/interactive_demo2.html (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/interactive_demo3.3.html (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/interactive_demo3.html (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/relu.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/think0.png (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/static/twain.txt (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/student/W2D2_BonusLecture.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/student/W2D2_Tutorial1.ipynb (100%)
 rename tutorials/{W2D2_ConvnetsAndDlThinking => W2D2_Convnets}/student/W2D2_Tutorial2.ipynb (100%)

diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
index b695e38a3..991c4e6ec 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
@@ -8,7 +8,7 @@
     "id": "view-in-github"
    },
    "source": [
-    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
+    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
    ]
   },
   {
@@ -19,7 +19,7 @@
    "source": [
     "# Tutorial 2: Deep Learning Case Study 1: Cost Functions\n",
     "\n",
-    "**Week 1, Day 4: Convnets and DL Case Study**\n",
+    "**Week 1, Day 4: Optimization**\n",
     "\n",
     "**By Neuromatch Academy**\n",
     "\n",
@@ -30,7 +30,7 @@
     "\n",
     "__Content editors:__ Kelson Shilling-Scrivo\n",
     "\n",
-    "__Production editors:__ Gagana B, Spiros Chavlis"
+    "__Production editors:__ Gagana B, Spiros Chavlis\n"
    ]
   },
   {
@@ -103,7 +103,7 @@
     "    ).render()\n",
     "\n",
     "\n",
-    "feedback_prefix = \"W2D2_T2\""
+    "feedback_prefix = \"W1D4_T2\""
    ]
   },
   {
@@ -1378,4 +1378,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/README.md b/tutorials/W2D2_Convnets/README.md
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/README.md
rename to tutorials/W2D2_Convnets/README.md
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_BonusLecture.ipynb b/tutorials/W2D2_Convnets/W2D2_BonusLecture.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/W2D2_BonusLecture.ipynb
rename to tutorials/W2D2_Convnets/W2D2_BonusLecture.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial1.ipynb b/tutorials/W2D2_Convnets/W2D2_Tutorial1.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial1.ipynb
rename to tutorials/W2D2_Convnets/W2D2_Tutorial1.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_BonusLecture.ipynb b/tutorials/W2D2_Convnets/instructor/W2D2_BonusLecture.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_BonusLecture.ipynb
rename to tutorials/W2D2_Convnets/instructor/W2D2_BonusLecture.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial1.ipynb b/tutorials/W2D2_Convnets/instructor/W2D2_Tutorial1.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial1.ipynb
rename to tutorials/W2D2_Convnets/instructor/W2D2_Tutorial1.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial2.ipynb b/tutorials/W2D2_Convnets/instructor/W2D2_Tutorial2.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial2.ipynb
rename to tutorials/W2D2_Convnets/instructor/W2D2_Tutorial2.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_0adbc972.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_0adbc972.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_0adbc972.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_0adbc972.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_1279086f.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_1279086f.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_1279086f.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_1279086f.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_168b8fcf.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_168b8fcf.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_168b8fcf.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_168b8fcf.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_18b18cac.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_18b18cac.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_18b18cac.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_18b18cac.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_240aa557.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_240aa557.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_240aa557.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_240aa557.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_309474b2.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_309474b2.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_309474b2.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_309474b2.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_4f643447.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_4f643447.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_4f643447.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_4f643447.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_78a81e50.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_78a81e50.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_78a81e50.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_78a81e50.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7c652c63.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7c652c63.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7c652c63.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7c652c63.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7cc3340b.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7cc3340b.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7cc3340b.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7cc3340b.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_800ed014.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_800ed014.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_800ed014.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_800ed014.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_82e644f4.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_82e644f4.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_82e644f4.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_82e644f4.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_ae125a93.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_ae125a93.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_ae125a93.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_ae125a93.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_c295e530.py b/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_c295e530.py
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_c295e530.py
rename to tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_c295e530.py
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/Backpropagation.gif b/tutorials/W2D2_Convnets/static/Backpropagation.gif
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/Backpropagation.gif
rename to tutorials/W2D2_Convnets/static/Backpropagation.gif
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/PoolingConvolution.svg b/tutorials/W2D2_Convnets/static/PoolingConvolution.svg
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/PoolingConvolution.svg
rename to tutorials/W2D2_Convnets/static/PoolingConvolution.svg
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_0adbc972_3.png b/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_0adbc972_3.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_0adbc972_3.png
rename to tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_0adbc972_3.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_1279086f_3.png b/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_1279086f_3.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_1279086f_3.png
rename to tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_1279086f_3.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_240aa557_3.png b/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_240aa557_3.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_240aa557_3.png
rename to tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_240aa557_3.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_2.png b/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_2.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_2.png
rename to tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_2.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_3.png b/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_3.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_3.png
rename to tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_3.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_4.png b/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_4.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_4.png
rename to tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_4.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/chicago_skyline_shrunk_v2.bmp b/tutorials/W2D2_Convnets/static/chicago_skyline_shrunk_v2.bmp
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/chicago_skyline_shrunk_v2.bmp
rename to tutorials/W2D2_Convnets/static/chicago_skyline_shrunk_v2.bmp
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/correlation.svg b/tutorials/W2D2_Convnets/static/correlation.svg
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/correlation.svg
rename to tutorials/W2D2_Convnets/static/correlation.svg
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/img_params.png b/tutorials/W2D2_Convnets/static/img_params.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/img_params.png
rename to tutorials/W2D2_Convnets/static/img_params.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.2.html b/tutorials/W2D2_Convnets/static/interactive_demo2.2.html
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.2.html
rename to tutorials/W2D2_Convnets/static/interactive_demo2.2.html
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.html b/tutorials/W2D2_Convnets/static/interactive_demo2.html
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.html
rename to tutorials/W2D2_Convnets/static/interactive_demo2.html
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.3.html b/tutorials/W2D2_Convnets/static/interactive_demo3.3.html
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.3.html
rename to tutorials/W2D2_Convnets/static/interactive_demo3.3.html
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.html b/tutorials/W2D2_Convnets/static/interactive_demo3.html
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.html
rename to tutorials/W2D2_Convnets/static/interactive_demo3.html
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/relu.png b/tutorials/W2D2_Convnets/static/relu.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/relu.png
rename to tutorials/W2D2_Convnets/static/relu.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/think0.png b/tutorials/W2D2_Convnets/static/think0.png
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/think0.png
rename to tutorials/W2D2_Convnets/static/think0.png
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/static/twain.txt b/tutorials/W2D2_Convnets/static/twain.txt
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/static/twain.txt
rename to tutorials/W2D2_Convnets/static/twain.txt
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_BonusLecture.ipynb b/tutorials/W2D2_Convnets/student/W2D2_BonusLecture.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_BonusLecture.ipynb
rename to tutorials/W2D2_Convnets/student/W2D2_BonusLecture.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial1.ipynb b/tutorials/W2D2_Convnets/student/W2D2_Tutorial1.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial1.ipynb
rename to tutorials/W2D2_Convnets/student/W2D2_Tutorial1.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial2.ipynb b/tutorials/W2D2_Convnets/student/W2D2_Tutorial2.ipynb
similarity index 100%
rename from tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial2.ipynb
rename to tutorials/W2D2_Convnets/student/W2D2_Tutorial2.ipynb
diff --git a/tutorials/materials.yml b/tutorials/materials.yml
index 694b2f34f..e54007169 100644
--- a/tutorials/materials.yml
+++ b/tutorials/materials.yml
@@ -62,7 +62,7 @@
 
 - day: W2D2
   category: ConvNets and Generative Models
-  name: Convnets And Dl Thinking
+  name: Convnets
   playlist: https://youtube.com/playlist?list=PLkBQOLLbi18MYWrGf6xW2d8CRyF83m592
 
   slides:

From e89abe308525c931f416a094069d429807bd6a95 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Mon, 6 Apr 2026 21:49:49 -0400
Subject: [PATCH 08/34] Delete the tutorial requirements file and update the
 main requirements file per Konstantine's suggestion

---
 requirements.txt                     | 40 ++++++++++++++++------------
 tutorials/requirements_tutorials.txt | 23 ----------------
 2 files changed, 23 insertions(+), 40 deletions(-)
 delete mode 100644 tutorials/requirements_tutorials.txt

diff --git a/requirements.txt b/requirements.txt
index 5989dab78..b5eac4f7c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,23 @@
-requests
-numpy==1.26.4
-scipy
-matplotlib
-scikit-learn
-torch==1.13.1
-torchvision==0.14.1
-ipywidgets
-tqdm
-torchvision
-pathlib
-xkcd
-decorator==4.0.2
-pyvirtualdisplay
-tensorboard
-moviepy==1.0.3
-imageio_ffmpeg
+# Requirements for Neuromatch Academy Deep Learning tutorials
+# These packages are pre-installed on Google Colab/Kaggle.
+# For local setup: pip install -r requirements.txt
+#
+# Python >= 3.10 required
+# Tutorial-specific packages (e.g. transformers, diffusers, altair) are
+# installed at the top of the relevant tutorial notebooks.
+
+numpy>=2.0
+pandas>=2.2
+matplotlib>=3.10
+torch>=2.0
+torchvision>=0.15
+scikit-learn>=1.3
+scipy>=1.13
+Pillow>=10.0
+imageio>=2.30
+seaborn>=0.13
+nltk>=3.9
+tensorboard>=2.19
+ipywidgets>=8.0
+tqdm>=4.0
+requests>=2.31
diff --git a/tutorials/requirements_tutorials.txt b/tutorials/requirements_tutorials.txt
deleted file mode 100644
index b5eac4f7c..000000000
--- a/tutorials/requirements_tutorials.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Requirements for Neuromatch Academy Deep Learning tutorials
-# These packages are pre-installed on Google Colab/Kaggle.
-# For local setup: pip install -r requirements_tutorials.txt
-#
-# Python >= 3.10 required
-# Tutorial-specific packages (e.g. transformers, diffusers, altair) are
-# installed at the top of the relevant tutorial notebooks.
-
-numpy>=2.0
-pandas>=2.2
-matplotlib>=3.10
-torch>=2.0
-torchvision>=0.15
-scikit-learn>=1.3
-scipy>=1.13
-Pillow>=10.0
-imageio>=2.30
-seaborn>=0.13
-nltk>=3.9
-tensorboard>=2.19
-ipywidgets>=8.0
-tqdm>=4.0
-requests>=2.31

From 98173ee4c484b97cee5f9b2ec93befcdbe2ef964 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 10 Apr 2026 15:30:44 -0400
Subject: [PATCH 09/34] W1D4: fix version check cells

- Tutorial1: update stale requirements path from tutorials/requirements_tutorials.txt
  to GitHub URL (requirements.txt was consolidated per Konstantine's suggestion)
- Tutorial2: add missing version check / install cell to Setup section

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    | 19 +++------------
 .../W1D4_Optimization/W1D4_Tutorial2.ipynb    | 24 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index edbf036e4..b441453bd 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -110,20 +110,7 @@
     "id": ""
    },
    "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
+   "source": "# @title Install and check dependencies\n# Most packages are pre-installed on Colab/Kaggle.\n# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\nimport importlib\n\nprint('Package versions:')\nfor _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n    try:\n        _mod = importlib.import_module(_pkg)\n        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n    except ImportError:\n        print(f'  {_pkg}: NOT FOUND')"
   },
   {
    "cell_type": "code",
@@ -812,7 +799,7 @@
     "\n",
     "$$f(x) = \\text{softmax}(W x + b)$$\n",
     "\n",
-    "Here $x \\in \\mathbb{R}^{784}$, $W \\in \\mathbb{R}^{10 \\times 784}$ and $b \\in \\mathbb{R}^{10}$. Notice that the dimensions of the weight matrix are $10 \\times 784$ as the input tensors are flattened images, i.e., $28 \\times 28 = 784$-dimensional tensors and the output layer consists of $10$ nodes. Also, note that the implementation of softmax encapsulates b in W i.e., It maps the rows of the input instead of the columns. That is, the i’th row of the output is the mapping of the i’th row of the input under W, plus the bias term. Refer Affine maps here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#affine-maps"
+    "Here $x \\in \\mathbb{R}^{784}$, $W \\in \\mathbb{R}^{10 \\times 784}$ and $b \\in \\mathbb{R}^{10}$. Notice that the dimensions of the weight matrix are $10 \\times 784$ as the input tensors are flattened images, i.e., $28 \\times 28 = 784$-dimensional tensors and the output layer consists of $10$ nodes. Also, note that the implementation of softmax encapsulates b in W i.e., It maps the rows of the input instead of the columns. That is, the i\u2019th row of the output is the mapping of the i\u2019th row of the input under W, plus the bias term. Refer Affine maps here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#affine-maps"
    ]
   },
   {
@@ -2109,4 +2096,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
index 991c4e6ec..595e8b789 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
@@ -106,6 +106,28 @@
     "feedback_prefix = \"W1D4_T2\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -373,7 +395,7 @@
     "For neuron $i$, the probability of seeing $k_{i}$ spikes in some interval given an underlying firing rate $\\lambda_{i}$ is:\n",
     "\n",
     "\\begin{equation}\n",
-    "\\mathcal{f(k_{i}:λ_{i})} = \\mathcal{Pr(X=k_{i})} = \\frac {\\lambda_{i}^{k_{i}}e^{-\\lambda_{i}}}{k_{i}!}\n",
+    "\\mathcal{f(k_{i}:\u03bb_{i})} = \\mathcal{Pr(X=k_{i})} = \\frac {\\lambda_{i}^{k_{i}}e^{-\\lambda_{i}}}{k_{i}!}\n",
     "\\end{equation}\n",
     "\n",
     "So this poisson distribution may be relevant if we want to, in a way, have a good model for the spiking of neurons."

From 0a3345cc65371601357fd35c4ca5700dffa5e31c Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 10 Apr 2026 15:35:50 -0400
Subject: [PATCH 10/34] W1D1/W1D2/W1D3/W1D4: incorporate manual review edits

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_BonusLecture.ipynb                   |  9 ++---
 .../W1D1_Tutorial1.ipynb                      | 36 ++++++++++++++-----
 .../W1D2_Tutorial1.ipynb                      | 11 +++---
 .../W1D2_Tutorial2.ipynb                      |  4 +--
 .../W1D2_Tutorial3.ipynb                      |  6 ++--
 .../W1D3_Tutorial1.ipynb                      |  6 ++--
 .../W1D3_Tutorial2.ipynb                      |  6 ++--
 .../W1D4_Optimization/W1D4_BonusLecture.ipynb | 25 ++++++++++---
 8 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
index 6655dc73e..071220ff0 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
@@ -2,9 +2,10 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "",
    "metadata": {},
-   "source": "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
+   ]
   },
   {
    "cell_type": "code",
@@ -16,7 +17,7 @@
    "outputs": [],
    "source": [
     "# @title Install dependencies\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import subprocess, sys, importlib\n",
     "\n",
     "for _pkg, _pip in {'altair': 'altair', 'vega_datasets': 'vega_datasets'}.items():\n",
@@ -352,4 +353,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 7c0fcfafc..5ac5426ee 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -101,7 +101,7 @@
    "source": [
     "# @title Install dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import subprocess, sys, importlib\n",
     "\n",
     "_to_install = {'pandas': 'pandas', 'imageio': 'imageio'}\n",
@@ -1140,7 +1140,11 @@
    "metadata": {
     "execution": {}
    },
-   "source": "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n\nAll of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
+   "source": [
+    "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n",
+    "\n",
+    "All of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
+   ]
   },
   {
    "cell_type": "code",
@@ -1586,7 +1590,9 @@
    "metadata": {
     "execution": {}
    },
-   "source": "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   "source": [
+    "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   ]
   },
   {
    "cell_type": "markdown",
@@ -2228,13 +2234,25 @@
    "metadata": {
     "execution": {}
    },
-   "source": "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n\nBy following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n\nOnce you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n\nFor more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   "source": [
+    "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n",
+    "\n",
+    "By following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n",
+    "\n",
+    "Once you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n",
+    "\n",
+    "For more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   ]
   },
   {
    "cell_type": "markdown",
-   "id": "",
    "metadata": {},
-   "source": "> **Colab GPU tips:**\n> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
+   "source": [
+    "> **Colab GPU tips:**\n",
+    "> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n",
+    "> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n",
+    "> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
+   ]
   },
   {
    "cell_type": "markdown",
@@ -4566,6 +4584,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "appendix",
    "metadata": {
     "execution": {}
    },
@@ -4590,8 +4609,7 @@
     "\n",
     "## Books for reference:\n",
     "- [https://www.deeplearningbook.org/](https://www.deeplearningbook.org/) (Deep Learning by Ian Goodfellow, Yoshua Bengio and Aaron Courville)"
-   ],
-   "id": "appendix"
+   ]
   }
  ],
  "metadata": {
@@ -4628,4 +4646,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index 5e7ad563a..f3b7a3f95 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -116,7 +116,7 @@
    "source": [
     "# @title Install and check dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import importlib\n",
     "\n",
     "print('Package versions:')\n",
@@ -141,7 +141,7 @@
     "import numpy as np\n",
     "from torch import nn\n",
     "from math import pi\n",
-    "import matplotlib.pyplot as plt",
+    "import matplotlib.pyplot as plt\n",
     "\n",
     "import ipywidgets as widgets\n",
     "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
@@ -1991,7 +1991,8 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "nma-dl-jax",
+   "language": "python",
    "name": "python3"
   },
   "language_info": {
@@ -2004,9 +2005,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.11"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
index 0fff5f482..80fe8a9e9 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
@@ -115,7 +115,7 @@
    "source": [
     "# @title Install and check dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import importlib\n",
     "\n",
     "print('Package versions:')\n",
@@ -2633,4 +2633,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
index 731188102..bc0cf5ac2 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
@@ -114,7 +114,7 @@
    "source": [
     "# @title Install and check dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import importlib\n",
     "\n",
     "print('Package versions:')\n",
@@ -143,7 +143,7 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "import torch.nn as nn\n",
-    "import torch.optim as optim",
+    "import torch.optim as optim\n",
     "\n",
     "import warnings\n",
     "from matplotlib import gridspec\n",
@@ -3197,4 +3197,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
index 058f68f95..f0b5f542c 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
@@ -116,7 +116,7 @@
    "source": [
     "# @title Install and check dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import importlib\n",
     "\n",
     "print('Package versions:')\n",
@@ -147,7 +147,7 @@
     "import torch.optim as optim\n",
     "from tqdm.auto import tqdm\n",
     "from IPython.display import display\n",
-    "from torch.utils.data import DataLoader, TensorDataset",
+    "from torch.utils.data import DataLoader, TensorDataset\n",
     "\n",
     "import ipywidgets as widgets\n"
    ]
@@ -2354,4 +2354,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
index 00ff3efc4..1a07ab468 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
@@ -114,7 +114,7 @@
    "source": [
     "# @title Install and check dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import importlib\n",
     "\n",
     "print('Package versions:')\n",
@@ -150,7 +150,7 @@
     "from torch.utils.data import DataLoader, TensorDataset\n",
     "\n",
     "from tqdm.auto import tqdm\n",
-    "from IPython.display import display",
+    "from IPython.display import display\n",
     "\n",
     "import ipywidgets as widgets\n"
    ]
@@ -2003,4 +2003,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
index 8113b803b..95bf67e90 100644
--- a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
@@ -115,7 +115,7 @@
    "source": [
     "# @title Install and check dependencies\n",
     "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See tutorials/requirements_tutorials.txt\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import importlib\n",
     "\n",
     "print('Package versions:')\n",
@@ -783,7 +783,11 @@
    "metadata": {
     "execution": {}
    },
-   "source": "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n\nComplete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
+   "source": [
+    "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n",
+    "\n",
+    "Complete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
+   ]
   },
   {
    "cell_type": "code",
@@ -1099,7 +1103,20 @@
    "metadata": {
     "execution": {}
    },
-   "source": "## Coding Exercise 7 *(optional)*: Implement RMSprop\n\nIn this exercise you will implement the update of the RMSprop optimizer:\n\n\\begin{align}\nv_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\nw_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n\\end{align}\n\nwhere the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n\nHere, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
+   "source": [
+    "## Coding Exercise 7 *(optional)*: Implement RMSprop\n",
+    "\n",
+    "In this exercise you will implement the update of the RMSprop optimizer:\n",
+    "\n",
+    "\\begin{align}\n",
+    "v_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\n",
+    "w_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n",
+    "\\end{align}\n",
+    "\n",
+    "where the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n",
+    "\n",
+    "Here, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
+   ]
   },
   {
    "cell_type": "code",
@@ -2094,4 +2111,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}

From ba26801569d94c12b798128bbf9879e87f41d7c3 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 10 Apr 2026 15:41:18 -0400
Subject: [PATCH 11/34] W1D1/W1D2/W1D3/W1D4: strip notebook metadata noise from
 editor

Revert JSON formatting artifacts introduced by opening notebooks in
VS Code/Jupyter: source fields changed from strings to arrays, and
id field reordering. No content changes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_BonusLecture.ipynb                   |  5 ++-
 .../W1D1_Tutorial1.ipynb                      | 32 ++++---------------
 .../W1D2_Tutorial1.ipynb                      |  5 ++-
 .../W1D4_Optimization/W1D4_BonusLecture.ipynb | 21 ++----------
 4 files changed, 13 insertions(+), 50 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
index 071220ff0..d7eb94fa5 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
@@ -2,10 +2,9 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "id": "",
    "metadata": {},
-   "source": [
-    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
-   ]
+   "source": "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
   },
   {
    "cell_type": "code",
diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 5ac5426ee..392cbde01 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -1140,11 +1140,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n",
-    "\n",
-    "All of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
-   ]
+   "source": "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n\nAll of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
   },
   {
    "cell_type": "code",
@@ -1590,9 +1586,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
-   ]
+   "source": "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
   },
   {
    "cell_type": "markdown",
@@ -2234,25 +2228,13 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n",
-    "\n",
-    "By following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n",
-    "\n",
-    "Once you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n",
-    "\n",
-    "For more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
-   ]
+   "source": "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n\nBy following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n\nOnce you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n\nFor more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
   },
   {
    "cell_type": "markdown",
+   "id": "",
    "metadata": {},
-   "source": [
-    "> **Colab GPU tips:**\n",
-    "> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n",
-    "> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n",
-    "> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
-   ]
+   "source": "> **Colab GPU tips:**\n> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
   },
   {
    "cell_type": "markdown",
@@ -4584,7 +4566,6 @@
   },
   {
    "cell_type": "markdown",
-   "id": "appendix",
    "metadata": {
     "execution": {}
    },
@@ -4609,7 +4590,8 @@
     "\n",
     "## Books for reference:\n",
     "- [https://www.deeplearningbook.org/](https://www.deeplearningbook.org/) (Deep Learning by Ian Goodfellow, Yoshua Bengio and Aaron Courville)"
-   ]
+   ],
+   "id": "appendix"
   }
  ],
  "metadata": {
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index e11471d2a..e3d9870e3 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -1992,8 +1992,7 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "nma-dl-jax",
-   "language": "python",
+   "display_name": "Python 3",
    "name": "python3"
   },
   "language_info": {
@@ -2006,7 +2005,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.7.11"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
index 95bf67e90..ddb497759 100644
--- a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
@@ -783,11 +783,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n",
-    "\n",
-    "Complete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
-   ]
+   "source": "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n\nComplete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
   },
   {
    "cell_type": "code",
@@ -1103,20 +1099,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "## Coding Exercise 7 *(optional)*: Implement RMSprop\n",
-    "\n",
-    "In this exercise you will implement the update of the RMSprop optimizer:\n",
-    "\n",
-    "\\begin{align}\n",
-    "v_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\n",
-    "w_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n",
-    "\\end{align}\n",
-    "\n",
-    "where the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n",
-    "\n",
-    "Here, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
-   ]
+   "source": "## Coding Exercise 7 *(optional)*: Implement RMSprop\n\nIn this exercise you will implement the update of the RMSprop optimizer:\n\n\\begin{align}\nv_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\nw_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n\\end{align}\n\nwhere the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n\nHere, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
   },
   {
    "cell_type": "code",

From dab8b608a5d99030b88d1935a707064e0cab0793 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 1 May 2026 17:28:34 -0400
Subject: [PATCH 12/34] W1D1_Tutorial1: restore bonus section inline; remove
 bonus notebook

- Bonus (60 years of ML research Altair visualization) merged back into
  Tutorial1 before the Appendix; altair/vega_datasets added to install
  cell and imports cell
- W1D1_BonusLecture.ipynb deleted (content now in Tutorial1)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_BonusLecture.ipynb                   | 355 ------------------
 .../W1D1_Tutorial1.ipynb                      | 235 +++++++++++-
 2 files changed, 225 insertions(+), 365 deletions(-)
 delete mode 100644 tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
deleted file mode 100644
index d7eb94fa5..000000000
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb
+++ /dev/null
@@ -1,355 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "",
-   "metadata": {},
-   "source": "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install dependencies\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import subprocess, sys, importlib\n",
-    "\n",
-    "for _pkg, _pip in {'altair': 'altair', 'vega_datasets': 'vega_datasets'}.items():\n",
-    "    try:\n",
-    "        importlib.import_module(_pkg)\n",
-    "    except ImportError:\n",
-    "        subprocess.check_call([sys.executable, '-m', 'pip', 'install', _pip, '-q'])\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and import feedback gadget\n",
-    "!pip3 install vibecheck datatops --quiet\n",
-    "\n",
-    "from vibecheck import DatatopsContentReviewContainer\n",
-    "def content_review(notebook_section: str):\n",
-    "    return DatatopsContentReviewContainer(\n",
-    "        '',\n",
-    "        notebook_section,\n",
-    "        {\n",
-    "            'url': 'https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab',\n",
-    "            'name': 'neuromatch_dl',\n",
-    "            'user_key': 'f379rz8y',\n",
-    "        },\n",
-    "    ).render()\n",
-    "\n",
-    "feedback_prefix = 'W1D1_BonusLecture'\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# Imports\n",
-    "import pandas as pd\n",
-    "import altair as alt\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Bonus - 60 years of Machine Learning Research in one Plot\n",
-    "\n",
-    "By [Hendrik Strobelt](http://hendrik.strobelt.com) (MIT-IBM Watson AI Lab) with support from Benjamin Hoover.\n",
-    "\n",
-    "In this notebook we visualize a subset* of 3,300 articles retreived from the AllenAI [S2ORC dataset](https://github.com/allenai/s2orc). We represent each paper by a position that is output of a dimensionality reduction method applied to a vector representation of each paper. The vector representation is the output of a neural network.\n",
-    "\n",
-    "**Note:** The selection is very biased on the keywords and methodology we used to filter. Please see the details section to learn about what we did."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Import `altair` and `vega_datasets`\n",
-    "\n",
-    "\n",
-    "# Source data files\n",
-    "# Position data file maps ID to x,y positions\n",
-    "# original link: http://gltr.io/temp/ml_regexv1_cs_ma_citation+_99perc.pos_umap_cosine_100_d0.1.json\n",
-    "POS_FILE = 'https://osf.io/qyrfn/download'\n",
-    "# original link: http://gltr.io/temp/ml_regexv1_cs_ma_citation+_99perc_clean.csv\n",
-    "# Metadata file maps ID to title, abstract, author,....\n",
-    "META_FILE = 'https://osf.io/vfdu6/download'\n",
-    "\n",
-    "# data loading and wrangling\n",
-    "def load_data():\n",
-    "  \"\"\"\n",
-    "  Loading the data\n",
-    "\n",
-    "  Args:\n",
-    "    None\n",
-    "\n",
-    "  Returns:\n",
-    "    Merged read dataFrame combining id and paper_id;\n",
-    "  \"\"\"\n",
-    "  positions = pd.read_json(POS_FILE)\n",
-    "  positions[['x', 'y']] = positions['pos'].to_list()\n",
-    "  meta = pd.read_csv(META_FILE)\n",
-    "  return positions.merge(meta, left_on='id', right_on='paper_id')\n",
-    "\n",
-    "\n",
-    "# load data\n",
-    "data = load_data()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Define Visualization using ALtair\n",
-    "YEAR_PERIOD = \"quinquennial\"  # @param\n",
-    "selection = alt.selection_multi(fields=[YEAR_PERIOD], bind='legend')\n",
-    "data[YEAR_PERIOD] = (data[\"year\"] / 5.0).apply(np.floor) * 5\n",
-    "chart = alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\"]], width=800,\n",
-    "                  height=800).mark_circle(radius=2, opacity=0.2).encode(\n",
-    "    alt.Color(YEAR_PERIOD+':O',\n",
-    "              scale=alt.Scale(scheme='viridis', reverse=False, clamp=True, domain=list(range(1955,2020,5))),\n",
-    "              # legend=alt.Legend(title='Total Records')\n",
-    "              ),\n",
-    "    alt.Size('citation_count',\n",
-    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])\n",
-    "              ),\n",
-    "       alt.X('x:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "       alt.Y('y:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "    tooltip=['title', 'authors'],\n",
-    "    # size='citation_count',\n",
-    "    # color=\"decade:O\",\n",
-    "    opacity=alt.condition(selection, alt.value(.8), alt.value(0.2)),\n",
-    "\n",
-    ").add_selection(\n",
-    "    selection\n",
-    ").interactive()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Lets look at the Visualization. Each dot represents one paper. Close dots mean that the respective papers are closer related than distant ones. The color indicates the 5-year period of when the paper was published. The dot size indicates the citation count (within S2ORC corpus) as of July 2020.\n",
-    "\n",
-    "The view is **interactive** and allows for three main interactions. Try them and play around:\n",
-    "1. Hover over a dot to see a tooltip (title, author)\n",
-    "2. Select a year in the legend (right) to filter dots\n",
-    "3. Zoom in/out with scroll -- double click resets view"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "chart"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Questions\n",
-    "\n",
-    "By playing around, can you find some answers to the following questions?\n",
-    "\n",
-    "1. Can you find topical clusters? What cluster might occur because of a filtering error?\n",
-    "2. Can you see a temporal trend in the data and clusters?\n",
-    "3. Can you determine when deep learning methods started booming ?\n",
-    "4. Can you find the key papers that where written before the DL \"winter\" that define milestones for a cluster? (tip: look for large dots of different color)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "1. As specified below, the data is already filtered for topics such as Computer Science/Mathematics.\n",
-    "Filtering errors could occur if keywords in a paper are incorrectly tagged or if cases don't match etc.\n",
-    "\n",
-    "2. To look for temporal trends in the data/clusters, observe the color transitions in the above\n",
-    "visualization. We see that a lot more papers were published in diversified topics,\n",
-    "as we transitioned out of AI Winters.\n",
-    "\n",
-    "3. Based on the color of the clusters, we can infer that deep learning methods\n",
-    "boomed between the 2010 and 2015 period.\n",
-    "\n",
-    "4. After filtering around the mid 1900's, hovering on the larger dots show\n",
-    "the key papers before the DL winters.\n",
-    "For instance, \"Neural networks and physical systems with emergent\n",
-    "collective computational abilities\" by John J Hopfield (1980's)\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Bonus_Section_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Methods\n",
-    "\n",
-    "Here is what we did:\n",
-    "1. Filtering of all papers who fullfilled the criterria:\n",
-    "  - are categorized as `Computer Science` or `Mathematics`\n",
-    "  - one of the following keywords appearing in title or abstract: `\"machine learning|artificial intelligence|neural network|(machine|computer) vision|perceptron|network architecture| RNN | CNN | LSTM | BLEU | MNIST | CIFAR |reinforcement learning|gradient descent| Imagenet \"`\n",
-    "2. Per year, remove all papers that are below the 99 percentile of citation count in that year\n",
-    "3. Embed each paper by using abstract + title in SPECTER model\n",
-    "4. Project based on embedding using UMAP\n",
-    "5. Visualize using Altair"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Find Authors"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Edit the `AUTHOR_FILTER` variable to full text search for authors.\n",
-    "\n",
-    "AUTHOR_FILTER = \"Rush \"  # @param space at the end means \"word border\"\n",
-    "\n",
-    "### Don't ignore case when searching...\n",
-    "FLAGS = 0\n",
-    "### uncomment do ignore case\n",
-    "# FLAGS = re.IGNORECASE\n",
-    "\n",
-    "## --- FILTER CODE.. make it your own ---\n",
-    "data['issel'] = data['authors'].str.contains(AUTHOR_FILTER, na=False, flags=FLAGS, )\n",
-    "if data['issel'].mean()<0.0000000001:\n",
-    "  print('No match found')\n",
-    "\n",
-    "## --- FROM HERE ON VIS CODE ---\n",
-    "alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\", \"issel\"]], width=800,\n",
-    "                  height=800) \\\n",
-    "    .mark_circle(stroke=\"black\", strokeOpacity=1).encode(\n",
-    "    alt.Color(YEAR_PERIOD+':O',\n",
-    "              scale=alt.Scale(scheme='viridis', reverse=False),\n",
-    "              # legend=alt.Legend(title='Total Records')\n",
-    "              ),\n",
-    "    alt.Size('citation_count',\n",
-    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])\n",
-    "              ),\n",
-    "    alt.StrokeWidth('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[0, 2]), legend=None),\n",
-    "\n",
-    "    alt.Opacity('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[.2, 1]), legend=None),\n",
-    "    alt.X('x:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "    alt.Y('y:Q',\n",
-    "        scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)\n",
-    "    ),\n",
-    "    tooltip=['title', 'authors'],\n",
-    ").interactive()"
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "collapsed_sections": [],
-   "include_colab_link": true,
-   "name": "W1D1_Tutorial1",
-   "provenance": [],
-   "toc_visible": true
-  },
-  "kernel": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "kernelspec": {
-   "display_name": "nma-dl-jax",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 392cbde01..a1065eb3d 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -104,7 +104,7 @@
     "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
     "import subprocess, sys, importlib\n",
     "\n",
-    "_to_install = {'pandas': 'pandas', 'imageio': 'imageio'}\n",
+    "_to_install = {'pandas': 'pandas', 'imageio': 'imageio', 'altair': 'altair', 'vega_datasets': 'vega_datasets'}\n",
     "for _pkg, _pip in _to_install.items():\n",
     "    try:\n",
     "        importlib.import_module(_pkg)\n",
@@ -113,7 +113,7 @@
     "\n",
     "# Print versions for reproducibility / bug reports\n",
     "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'pandas', 'matplotlib', 'torch', 'torchvision', 'sklearn', 'imageio']:\n",
+    "for _pkg in ['numpy', 'pandas', 'matplotlib', 'torch', 'torchvision', 'sklearn', 'imageio', 'altair']:\n",
     "    try:\n",
     "        _mod = importlib.import_module(_pkg)\n",
     "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
@@ -179,7 +179,10 @@
     "\n",
     "# IPython display utilities\n",
     "from IPython.core.interactiveshell import InteractiveShell\n",
-    "from IPython.display import Image, display\n"
+    "from IPython.display import Image, display\n",
+    "\n",
+    "import re\n",
+    "import altair as alt\n"
    ]
   },
   {
@@ -1140,7 +1143,11 @@
    "metadata": {
     "execution": {}
    },
-   "source": "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n\nAll of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
+   "source": [
+    "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n",
+    "\n",
+    "All of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
+   ]
   },
   {
    "cell_type": "code",
@@ -1586,7 +1593,9 @@
    "metadata": {
     "execution": {}
    },
-   "source": "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   "source": [
+    "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   ]
   },
   {
    "cell_type": "markdown",
@@ -2228,13 +2237,25 @@
    "metadata": {
     "execution": {}
    },
-   "source": "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n\nBy following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n\nOnce you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n\nFor more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   "source": [
+    "When using Colab notebooks, by default, you will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n",
+    "\n",
+    "By following *Runtime* → *Change runtime type* and selecting **GPU** from the *Hardware Accelerator* dropdown list, we can start playing with sending tensors to GPUs.\n",
+    "\n",
+    "Once you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n",
+    "\n",
+    "For more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
+   ]
   },
   {
    "cell_type": "markdown",
-   "id": "",
    "metadata": {},
-   "source": "> **Colab GPU tips:**\n> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
+   "source": [
+    "> **Colab GPU tips:**\n",
+    "> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n",
+    "> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n",
+    "> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
+   ]
   },
   {
    "cell_type": "markdown",
@@ -4566,6 +4587,201 @@
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Bonus - 60 years of Machine Learning Research in one Plot\n",
+    "\n",
+    "By [Hendrik Strobelt](http://hendrik.strobelt.com) (MIT-IBM Watson AI Lab) with support from Benjamin Hoover.\n",
+    "\n",
+    "In this notebook we visualize a subset* of 3,300 articles retrieved from the AllenAI [S2ORC dataset](https://github.com/allenai/s2orc). We represent each paper by a position that is the output of a dimensionality reduction method applied to a vector representation of each paper. The vector representation is the output of a neural network.\n",
+    "\n",
+    "**Note:** The selection is very biased on the keywords and methodology we used to filter. Please see the details section to learn about what we did."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Load visualization data\n",
+    "\n",
+    "# Source data files\n",
+    "POS_FILE = 'https://osf.io/qyrfn/download'\n",
+    "META_FILE = 'https://osf.io/vfdu6/download'\n",
+    "\n",
+    "def load_data():\n",
+    "  \"\"\"\n",
+    "  Loading the data\n",
+    "\n",
+    "  Args:\n",
+    "    None\n",
+    "\n",
+    "  Returns:\n",
+    "    Merged read dataFrame combining id and paper_id;\n",
+    "  \"\"\"\n",
+    "  positions = pd.read_json(POS_FILE)\n",
+    "  positions[['x', 'y']] = positions['pos'].to_list()\n",
+    "  meta = pd.read_csv(META_FILE)\n",
+    "  return positions.merge(meta, left_on='id', right_on='paper_id')\n",
+    "\n",
+    "\n",
+    "data = load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Define Visualization using Altair\n",
+    "YEAR_PERIOD = \"quinquennial\"  # @param\n",
+    "selection = alt.selection_multi(fields=[YEAR_PERIOD], bind='legend')\n",
+    "data[YEAR_PERIOD] = (data[\"year\"] / 5.0).apply(np.floor) * 5\n",
+    "chart = alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\"]], width=800,\n",
+    "                  height=800).mark_circle(radius=2, opacity=0.2).encode(\n",
+    "    alt.Color(YEAR_PERIOD+':O',\n",
+    "              scale=alt.Scale(scheme='viridis', reverse=False, clamp=True, domain=list(range(1955,2020,5)))),\n",
+    "    alt.Size('citation_count',\n",
+    "              scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])),\n",
+    "    alt.X('x:Q', scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)),\n",
+    "    alt.Y('y:Q', scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)),\n",
+    "    tooltip=['title', 'authors'],\n",
+    "    opacity=alt.condition(selection, alt.value(.8), alt.value(0.2)),\n",
+    ").add_selection(selection).interactive()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's look at the visualization. Each dot represents one paper. Dots that are close together represent papers that are more closely related than distant ones. The color indicates the 5-year period in which the paper was published. The dot size indicates the citation count (within the S2ORC corpus) as of July 2020.\n",
+    "\n",
+    "The view is **interactive** and allows for three main interactions. Try them and play around:\n",
+    "1. Hover over a dot to see a tooltip (title, author)\n",
+    "2. Select a year in the legend (right) to filter dots\n",
+    "3. Zoom in/out with scroll -- double click resets view"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Questions\n",
+    "\n",
+    "By playing around, can you find some answers to the following questions?\n",
+    "\n",
+    "1. Can you find topical clusters? What cluster might occur because of a filtering error?\n",
+    "2. Can you see a temporal trend in the data and clusters?\n",
+    "3. Can you determine when deep learning methods started booming?\n",
+    "4. Can you find the key papers that were written before the DL \"winter\" that define milestones for a cluster? (tip: look for large dots of different color)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "1. As specified below, the data is already filtered for topics such as Computer Science/Mathematics.\n",
+    "Filtering errors could occur if keywords in a paper are incorrectly tagged or if cases don't match etc.\n",
+    "\n",
+    "2. To look for temporal trends in the data/clusters, observe the color transitions in the above\n",
+    "visualization. We see that a lot more papers were published in diversified topics,\n",
+    "as we transitioned out of AI Winters.\n",
+    "\n",
+    "3. Based on the color of the clusters, we can infer that deep learning methods\n",
+    "boomed between the 2010 and 2015 period.\n",
+    "\n",
+    "4. After filtering around the mid 1900's, hovering on the larger dots shows\n",
+    "the key papers before the DL winters.\n",
+    "For instance, \\\"Neural networks and physical systems with emergent\n",
+    "collective computational abilities\\\" by John J Hopfield (1980's)\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Bonus_Section_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Methods\n",
+    "\n",
+    "Here is what we did:\n",
+    "1. Filtering of all papers that fulfilled the criteria:\n",
+    "  - are categorized as `Computer Science` or `Mathematics`\n",
+    "  - one of the following keywords appearing in title or abstract: `\"machine learning|artificial intelligence|neural network|(machine|computer) vision|perceptron|network architecture| RNN | CNN | LSTM | BLEU | MNIST | CIFAR |reinforcement learning|gradient descent| Imagenet \"`\n",
+    "2. Per year, remove all papers that are below the 99th percentile of citation count in that year\n",
+    "3. Embed each paper by using abstract + title in SPECTER model\n",
+    "4. Project based on embedding using UMAP\n",
+    "5. Visualize using Altair"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Find Authors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Edit the `AUTHOR_FILTER` variable to run a full-text search for authors.\n",
+    "\n",
+    "AUTHOR_FILTER = \"Rush \"  # @param space at the end means \"word border\"\n",
+    "\n",
+    "### Don't ignore case when searching...\n",
+    "FLAGS = 0\n",
+    "### uncomment to ignore case\n",
+    "# FLAGS = re.IGNORECASE\n",
+    "\n",
+    "data['issel'] = data['authors'].str.contains(AUTHOR_FILTER, na=False, flags=FLAGS)\n",
+    "if data['issel'].mean() < 0.0000000001:\n",
+    "    print('No match found')\n",
+    "\n",
+    "alt.Chart(data[[\"x\", \"y\", \"authors\", \"title\", YEAR_PERIOD, \"citation_count\", \"issel\"]], width=800,\n",
+    "          height=800).mark_circle(stroke=\"black\", strokeOpacity=1).encode(\n",
+    "    alt.Color(YEAR_PERIOD+':O', scale=alt.Scale(scheme='viridis', reverse=False)),\n",
+    "    alt.Size('citation_count', scale=alt.Scale(type=\"pow\", exponent=1, range=[15, 300])),\n",
+    "    alt.StrokeWidth('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[0, 2]), legend=None),\n",
+    "    alt.Opacity('issel:Q', scale=alt.Scale(type=\"linear\", domain=[0,1], range=[.2, 1]), legend=None),\n",
+    "    alt.X('x:Q', scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)),\n",
+    "    alt.Y('y:Q', scale=alt.Scale(zero=False), axis=alt.Axis(labels=False)),\n",
+    "    tooltip=['title', 'authors'],\n",
+    ").interactive()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "appendix",
    "metadata": {
     "execution": {}
    },
@@ -4590,8 +4806,7 @@
     "\n",
     "## Books for reference:\n",
     "- [https://www.deeplearningbook.org/](https://www.deeplearningbook.org/) (Deep Learning by Ian Goodfellow, Yoshua Bengio and Aaron Courville)"
-   ],
-   "id": "appendix"
+   ]
   }
  ],
  "metadata": {

From a31139120282f047f18a43a723d0c9326ef600ee Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 1 May 2026 18:27:04 -0400
Subject: [PATCH 13/34] W1D4: restore Tutorial1 as unified notebook; apply
 minimal fixes

Revert the Tutorial1/Tutorial1_Bonus split back to the original single
notebook from main. Apply only the two necessary changes:
- Insert import/version check cell (with correct GitHub URL for requirements)
- Mark Coding Exercises 4, 6, and 7 as *(optional)* to reduce student load

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    | 1744 ++++++++++++++++-
 1 file changed, 1713 insertions(+), 31 deletions(-)

diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index b441453bd..68298e04b 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -44,6 +44,9 @@
     "Objectives:\n",
     "*   Necessity and importance of optimization\n",
     "*   Introduction to commonly used optimization techniques\n",
+    "*   Optimization in non-convex loss landscapes\n",
+    "*   'Adaptive' hyperparameter tuning\n",
+    "*   Ethical concerns\n",
     "\n"
    ]
   },
@@ -107,7 +110,7 @@
    "execution_count": null,
    "metadata": {
     "cellView": "form",
-    "id": ""
+    "id": "import_check"
    },
    "outputs": [],
    "source": "# @title Install and check dependencies\n# Most packages are pre-installed on Colab/Kaggle.\n# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\nimport importlib\n\nprint('Package versions:')\nfor _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n    try:\n        _mod = importlib.import_module(_pkg)\n        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n    except ImportError:\n        print(f'  {_pkg}: NOT FOUND')"
@@ -150,6 +153,7 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
+    "import ipywidgets as widgets  # interactive display\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n",
     "plt.rc('axes', unicode_minus=False)"
@@ -1469,30 +1473,7 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 4 *(optional)*: Implement momentum\n",
-    "\n",
-    "In this exercise you will implement the momentum update given by:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "w_{t+1} = w_t - \\eta \\nabla J(w_t) + \\beta (w_t - w_{t-1})\n",
-    "\\end{equation}\n",
-    "\n",
-    "It is convenient to re-express this update rule in terms of a recursion. For that, we define 'velocity' as the quantity:\n",
-    "\\begin{equation}\n",
-    "v_{t-1} := w_{t} - w_{t-1}\n",
-    "\\end{equation}\n",
-    "\n",
-    "which leads to the two-step update rule:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "v_t = - \\eta \\nabla J(w_t) + \\beta (\\underbrace{w_t - w_{t-1}}_{v_{t-1}})\n",
-    "\\end{equation}\n",
-    "\n",
-    "\\begin{equation}\n",
-    "w_{t+1} \\leftarrow w_t + v_{t}\n",
-    "\\end{equation}\n",
-    "\n",
-    "Pay attention to the positive sign of the update in the last equation, given the definition of $v_t$, above."
+    "## Coding Exercise 4 *(optional)*: Implement momentum\n\nIn this exercise you will implement the momentum update given by:\n\n\\begin{equation}\nw_{t+1} = w_t - \\eta \\nabla J(w_t) + \\beta (w_t - w_{t-1})\n\\end{equation}\n\nIt is convenient to re-express this update rule in terms of a recursion. For that, we define 'velocity' as the quantity:\n\\begin{equation}\nv_{t-1} := w_{t} - w_{t-1}\n\\end{equation}\n\nwhich leads to the two-step update rule:\n\n\\begin{equation}\nv_t = - \\eta \\nabla J(w_t) + \\beta (\\underbrace{w_t - w_{t-1}}_{v_{t-1}})\n\\end{equation}\n\n\\begin{equation}\nw_{t+1} \\leftarrow w_t + v_{t}\n\\end{equation}\n\nPay attention to the positive sign of the update in the last equation, given the definition of $v_t$, above."
    ]
   },
   {
@@ -2016,19 +1997,1720 @@
   },
   {
    "cell_type": "markdown",
-   "id": "summary",
    "metadata": {
     "execution": {}
    },
    "source": [
     "---\n",
-    "# Summary\n",
+    "# Section 5: Non-convexity\n",
     "\n",
-    "* Optimization is necessary to create Deep Learning models that are guaranteed to converge\n",
-    "* Gradient descent leverages automatic differentiation to efficiently update model parameters\n",
-    "* Momentum helps overcome poor conditioning by accumulating gradient history across updates\n",
+    "*Time estimate: ~30 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "The introduction of even just 1 hidden layer in the neural network transforms the previous convex optimization problem into a non-convex one. And with great non-convexity, comes great responsibility... (Sorry, we couldn't help it!)\n",
+    "\n",
+    "**Note:** From this section onwards we will be dealing with non-convex optimization problems for the remainder of the tutorial."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 5: Overparameterization\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', '7vUpUEKKl5o'), ('Bilibili', 'BV16h41167Jr')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Overparameterization_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Take a couple of minutes to play with a more complex 3D visualization of the loss landscape of a neural network on a non-convex problem. Visit https://losslandscape.com/explorer.\n",
+    "\n",
+    "1. Explore the features on the bottom left corner. You can see an explanation for each icon by clicking on the ( i ) button located on the top right corner.\n",
+    "2. Use the 'gradient descent' feature to perform a thought experiment:\n",
+    "    -   Choose an initialization\n",
+    "    -   Choose the learning rate\n",
+    "    -   Mentally formulate your hypothesis about what kind of trajectory you expect to observe\n",
+    "3. Run the experiment and contrast your intuition with the observed behavior.\n",
+    "4. Repeat this experiment a handful of times for several initialization/learning rate configurations\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 5: Overparameterization to the rescue!\n",
+    "\n",
+    "As you may have seen, the non-convex nature of the surface can lead the optimization process to get stuck in undesirable local optima. There is ample empirical evidence supporting the claim that 'overparameterized' models are easier to train.\n",
+    "\n",
+    "We will explore this assertion in the context of our MLP training. For this, we initialize a fixed model and construct several models by small random perturbations to the original initialized weights. Now, we train each of these perturbed models and see how the loss evolves. If we were in the convex setting, we should reach very similar objective values upon convergence since all these models were very close at the beginning of training, and in convex problems, the local optimum is also the global optimum.\n",
+    "\n",
+    "Use the interactive plot below to visualize the loss progression for these perturbed models:\n",
+    "\n",
+    "1. Select different settings from the `hidden_dims` drop-down menu.\n",
+    "2. Explore the effect of the number of steps and learning rate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "\n",
+    "@widgets.interact_manual\n",
+    "def overparam(max_steps=widgets.IntSlider(150, 50, 500, 5),\n",
+    "              hidden_dims=widgets.Dropdown(options=[\"10\", \"20, 20\", \"100, 100\"],\n",
+    "                                           value=\"10\"),\n",
+    "              lr=widgets.FloatLogSlider(value=5e-2, min=-3, max=0, step=0.1),\n",
+    "              num_inits=widgets.IntSlider(7, 5, 10, 1)):\n",
+    "  \"\"\"\n",
+    "  Displays the overparameterization phenomenon as a widget\n",
+    "\n",
+    "  Args:\n",
+    "    max_steps: widget integer slider\n",
+    "      Maximum number of steps on the slider with default = 150\n",
+    "    hidden_dims: widget dropdown menu instance\n",
+    "      The number of hidden dimensions with default = 10\n",
+    "    lr: widget float slider\n",
+    "      Scalar specifying the learning rate or step-size for the update with default = 5e-2\n",
+    "    num_inits: widget integer slider\n",
+    "      Number of randomly perturbed copies of the base model to train\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
     "\n",
-    "Continue to the Bonus Lecture for sections on non-convexity, mini-batches, adaptive methods, ethical concerns, and a hands-on training exercise.\n"
+    "  X, y = train_set.data[subset_index, :], train_set.targets[subset_index]\n",
+    "\n",
+    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
+    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, 1, figsize=(5, 4))\n",
+    "\n",
+    "  for _ in tqdm(range(num_inits)):\n",
+    "    model = copy.deepcopy(base_model)\n",
+    "    random_update(model, noise_scale=2e-1)\n",
+    "    loss_hist = np.zeros((max_steps, 2))\n",
+    "    for step in range(max_steps):\n",
+    "      loss = loss_fn(model(X), y)\n",
+    "      gradient_update(loss, list(model.parameters()), lr=lr)\n",
+    "      loss_hist[step] = np.array([step, loss.item()])\n",
+    "\n",
+    "    plt.plot(loss_hist[:, 0], loss_hist[:, 1])\n",
+    "\n",
+    "  plt.xlabel('Iteration')\n",
+    "  plt.ylabel('Loss')\n",
+    "  plt.ylim(0, 3)\n",
+    "  plt.show()\n",
+    "\n",
+    "  num_params = sum([np.prod(_.shape) for _ in model.parameters()])\n",
+    "  print('Number of parameters in model:  ' + str(num_params))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Overparameterization_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Think! 5.1: Width and depth of the network\n",
+    "\n",
+    "- We see that as we increase the width/depth of the network, training becomes faster and more consistent across different initializations. What might be the reasons for this behavior?\n",
+    "\n",
+    "- What are some potential downsides of this approach to dealing with non-convexity?\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "- The exact mechanism for this phenomenon is still under active research.\n",
+    "Existing evidence points to the following: in the overparameterized setting,\n",
+    "there are many more 'good configurations' (values of the model\u2019s weights) that\n",
+    "lead to a low value of the objective. Furthermore, this large set of possible solutions\n",
+    "seems to be increasingly easy to find in the space of all possible\n",
+    "parameter configurations. As you increase the number of parameters, it becomes\n",
+    "more likely that your initialization will be close to one of these good parameter settings.\n",
+    "\n",
+    "- This approach will require more memory and computation. Furthermore, we need\n",
+    "to always be aware of the risk of overfitting: don\u2019t forget to do cross-validation\n",
+    "in order to be able to detect overfitting.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Width_and_depth_of_the_network_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 6: Full gradients are expensive\n",
+    "\n",
+    "*Time estimate: ~25 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "So far we have used only a small (fixed) subset of 500 training examples to perform the updates on the model parameters in our quest to minimize the loss. But what if we decided to use the entire training set? Does our current approach scale to datasets with tens of thousands, or millions, of datapoints?\n",
+    "\n",
+    "In this section we explore an efficient alternative to avoid having to perform computations on all the training examples before performing a parameter update."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 6: Mini-batches\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'hbqUxpNBUGk'), ('Bilibili', 'BV1ty4y1T7Uh')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Mini_batches_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 6.1: Cost of computation\n",
+    "\n",
+    "Evaluating a neural network is a relatively fast process. However, when repeated millions of times, the computational cost of performing forward and backward passes through the network starts to become significant.\n",
+    "\n",
+    "In the visualization below, we show the time (averaged over 5 runs) of computing a forward and backward pass with a changing number of input examples. Choose from the different options in the drop-down box and note how the vertical scale changes depending on the size of the network.\n",
+    "\n",
+    "**Remarks:** Note that the computational cost of a forward pass shows a clear linear relationship with the number of input examples, and the cost of the corresponding backward pass exhibits a similar computational complexity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "\n",
+    "def gradient_update(loss, params, lr=1e-3):\n",
+    "  \"\"\"\n",
+    "  Perform a gradient descent update on a given loss over a collection of parameters\n",
+    "\n",
+    "  Args:\n",
+    "    loss: Tensor\n",
+    "      A scalar tensor containing the loss through which the gradient will be computed\n",
+    "    params: List of iterables\n",
+    "      Collection of parameters with respect to which we compute gradients\n",
+    "    lr: Float\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
+    "  # successive backward calls\n",
+    "  zero_grad(params)\n",
+    "\n",
+    "  # Compute gradients on given objective\n",
+    "  loss.backward()\n",
+    "\n",
+    "  with torch.no_grad():\n",
+    "    for par in params:\n",
+    "       par.data -= lr * par.grad.data\n",
+    "\n",
+    "\n",
+    "def measure_update_time(model, num_points):\n",
+    "  \"\"\"\n",
+    "  Measuring the time for update\n",
+    "\n",
+    "  Args:\n",
+    "    model: an nn.Module inherited model\n",
+    "      Represents the ML/DL model\n",
+    "    num_points: integer\n",
+    "      The number of data points in the train_set\n",
+    "\n",
+    "  Returns:\n",
+    "    tuple of loss time and time for calculation of gradient\n",
+    "  \"\"\"\n",
+    "  X, y = train_set.data[:num_points], train_set.targets[:num_points]\n",
+    "  start_time = time.time()\n",
+    "  loss = loss_fn(model(X), y)\n",
+    "  loss_time = time.time()\n",
+    "  gradient_update(loss, list(model.parameters()), lr=0)\n",
+    "  gradient_time = time.time()\n",
+    "  return loss_time - start_time, gradient_time - loss_time\n",
+    "\n",
+    "\n",
+    "@widgets.interact\n",
+    "def computation_time(hidden_dims=widgets.Dropdown(options=[\"1\", \"100\", \"50, 50\"],\n",
+    "                                                  value=\"100\")):\n",
+    "  \"\"\"\n",
+    "  Demonstrating time taken for computation as a widget\n",
+    "\n",
+    "  Args:\n",
+    "    hidden_dims: widgets dropdown\n",
+    "      The number of hidden dimensions with default = 100\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
+    "  model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
+    "\n",
+    "  NUM_POINTS = [1, 5, 10, 100, 200, 500, 1000, 5000, 10000, 20000, 30000, 50000]\n",
+    "  times_list = []\n",
+    "  for _ in range(5):\n",
+    "    times_list.append(np.array([measure_update_time(model, _) for _ in NUM_POINTS]))\n",
+    "\n",
+    "  times = np.array(times_list).mean(axis=0)\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, 1, figsize=(5,4))\n",
+    "  plt.plot(NUM_POINTS, times[:, 0], label='Forward')\n",
+    "  plt.plot(NUM_POINTS, times[:, 1], label='Backward')\n",
+    "  plt.xlabel('Number of data points')\n",
+    "  plt.ylabel('Seconds')\n",
+    "  plt.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Cost_of_computation_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n\nComplete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "def sample_minibatch(input_data, target_data, num_points=100):\n",
+    "  \"\"\"\n",
+    "  Sample a minibatch of size num_point from the provided input-target data\n",
+    "\n",
+    "  Args:\n",
+    "    input_data: Tensor\n",
+    "      Multi-dimensional tensor containing the input data\n",
+    "    target_data: Tensor\n",
+    "      1D tensor containing the class labels\n",
+    "    num_points: Integer\n",
+    "      Number of elements to be included in minibatch with default=100\n",
+    "\n",
+    "  Returns:\n",
+    "    batch_inputs: Tensor\n",
+    "      Minibatch inputs\n",
+    "    batch_targets: Tensor\n",
+    "      Minibatch targets\n",
+    "  \"\"\"\n",
+    "  #################################################\n",
+    "  ## TODO for students: sample minibatch of data ##\n",
+    "  raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
+    "  #################################################\n",
+    "  # Sample a collection of IID indices from the existing data\n",
+    "  batch_indices = ...\n",
+    "  # Use batch_indices to extract entries from the input and target data tensors\n",
+    "  batch_inputs = input_data[...]\n",
+    "  batch_targets = target_data[...]\n",
+    "\n",
+    "  return batch_inputs, batch_targets\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Uncomment to test your function\n",
+    "# x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
+    "# print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "```\n",
+    "The input shape is torch.Size([100, 28, 28]) and the target shape is: torch.Size([100])\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove solution\n",
+    "def sample_minibatch(input_data, target_data, num_points=100):\n",
+    "  \"\"\"\n",
+    "  Sample a minibatch of size num_points from the provided input-target data\n",
+    "\n",
+    "  Args:\n",
+    "    input_data: Tensor\n",
+    "      Multi-dimensional tensor containing the input data\n",
+    "    target_data: Tensor\n",
+    "      1D tensor containing the class labels\n",
+    "    num_points: Integer\n",
+    "      Number of elements to be included in minibatch with default=100\n",
+    "\n",
+    "  Returns:\n",
+    "    batch_inputs: Tensor\n",
+    "      Minibatch inputs\n",
+    "    batch_targets: Tensor\n",
+    "      Minibatch targets\n",
+    "  \"\"\"\n",
+    "  # Sample a collection of IID indices from the existing data\n",
+    "  batch_indices = np.random.choice(len(input_data), num_points)\n",
+    "  # Use batch_indices to extract entries from the input and target data tensors\n",
+    "  batch_inputs = input_data[batch_indices, :]\n",
+    "  batch_targets = target_data[batch_indices]\n",
+    "\n",
+    "  return batch_inputs, batch_targets\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Uncomment to test your function\n",
+    "x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
+    "print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Implement_mini_batch_sampling_Exercise\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 6.2: *Compare* different minibatch sizes\n",
+    "\n",
+    "What are the trade-offs induced by the choice of minibatch size? The interactive plot below shows the training evolution of a 2-hidden layer MLP with 100 hidden units in each hidden layer. Different plots correspond to a different choice of minibatch size. We have a fixed time budget for all the cases, reflected in the horizontal axes of these plots."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "\n",
+    "@widgets.interact_manual\n",
+    "def minibatch_experiment(batch_sizes='20, 250, 1000',\n",
+    "                         lrs='5e-3, 5e-3, 5e-3',\n",
+    "                         time_budget=widgets.Dropdown(options=[\"2.5\", \"5\", \"10\"],\n",
+    "                                                      value=\"2.5\")):\n",
+    "  \"\"\"\n",
+    "  Demonstration of minibatch experiment\n",
+    "\n",
+    "  Args:\n",
+    "    batch_sizes: String\n",
+    "      Size of minibatches\n",
+    "    lrs: String\n",
+    "      Different learning rates\n",
+    "    time_budget: widget dropdown instance\n",
+    "      Different time budgets with default=2.5s\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  batch_sizes = [int(s) for s in batch_sizes.split(',')]\n",
+    "  lrs = [float(s) for s in lrs.split(',')]\n",
+    "\n",
+    "  LOSS_HIST = {_:[] for _ in batch_sizes}\n",
+    "\n",
+    "  X, y = train_set.data, train_set.targets\n",
+    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
+    "\n",
+    "  for id, batch_size in enumerate(tqdm(batch_sizes)):\n",
+    "    start_time = time.time()\n",
+    "    # Create a new copy of the model for each batch size\n",
+    "    model = copy.deepcopy(base_model)\n",
+    "    params = list(model.parameters())\n",
+    "    lr = lrs[id]\n",
+    "    # Fixed budget per choice of batch size\n",
+    "    while (time.time() - start_time) < float(time_budget):\n",
+    "      data, labels = sample_minibatch(X, y, batch_size)\n",
+    "      loss = loss_fn(model(data), labels)\n",
+    "      gradient_update(loss, params, lr=lr)\n",
+    "      LOSS_HIST[batch_size].append([time.time() - start_time,\n",
+    "                                    loss.item()])\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, len(batch_sizes), figsize=(10, 3))\n",
+    "  for ax, batch_size in zip(axs, batch_sizes):\n",
+    "    plot_data = np.array(LOSS_HIST[batch_size])\n",
+    "    ax.plot(plot_data[:, 0], plot_data[:, 1], label=batch_size,\n",
+    "            alpha=0.8)\n",
+    "    ax.set_title('Batch size: ' + str(batch_size))\n",
+    "    ax.set_xlabel('Seconds')\n",
+    "    ax.set_ylabel('Loss')\n",
+    "  plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "**Remarks:** SGD works! We have an algorithm that can be applied (with due precautions) to learn datasets of arbitrary size.\n",
+    "\n",
+    "However, **note the difference in the vertical scale** across the plots above. When using a larger minibatch, we can perform fewer parameter updates as the forward and backward passes are more expensive.\n",
+    "\n",
+    "This highlights the interplay between the minibatch size and the learning rate: when our minibatch is larger, we have a more confident estimator of the direction to move, and thus can afford a larger learning rate. On the other hand, extremely small minibatches are very fast computationally but are not representative of the data distribution and yield estimations of the gradient with high variance.\n",
+    "\n",
+    "We encourage you to tune the value of the learning rate for each of the minibatch sizes in the previous demo, to achieve a training loss steadily below 0.5 within 5 seconds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Compare_different_minibatch_sizes_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 7: Adaptive methods\n",
+    "\n",
+    "*Time estimate: ~25 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "By now, you should be aware that there are many knobs to turn when working on a machine learning problem. Some of these relate to the optimization algorithm, the choice of model, or the objective to minimize. Here are some prototypical examples:\n",
+    "\n",
+    "- Problem: loss function, regularization coefficients (Week 1, Day 5)\n",
+    "- Model: architecture, activation functions\n",
+    "- Optimizer: learning rate, batch size, momentum coefficient\n",
+    "\n",
+    "We concentrate on the choices that are directly related to optimization. In particular, we will explore some _automatic_ methods for setting the learning rate in a way that fixes the poor-conditioning problem and is robust across different problems.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 7: Adaptive Methods\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'Zr6r2kfmQUM'), ('Bilibili', 'BV1eq4y1W7JG')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Adaptive_Methods_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Coding Exercise 7 *(optional)*: Implement RMSprop\n\nIn this exercise you will implement the update of the RMSprop optimizer:\n\n\\begin{align}\nv_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\nw_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n\\end{align}\n\nwhere the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n\nHere, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
+    "  \"\"\"\n",
+    "  Perform an RMSprop update on a collection of parameters\n",
+    "\n",
+    "  Args:\n",
+    "    loss: Tensor\n",
+    "      A scalar tensor containing the loss whose gradient will be computed\n",
+    "    params: Iterable\n",
+    "      Collection of parameters with respect to which we compute gradients\n",
+    "    grad_sq: Iterable\n",
+    "      Moving average of squared gradients\n",
+    "    lr: Float\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "    alpha: Float\n",
+    "      Moving average parameter\n",
+    "    epsilon: Float\n",
+    "      Small constant added to the moving average for numerical stability\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
+    "  # successive backward calls\n",
+    "  zero_grad(params)\n",
+    "  # Compute gradients on given objective\n",
+    "  loss.backward()\n",
+    "\n",
+    "  with torch.no_grad():\n",
+    "    for (par, gsq) in zip(params, grad_sq):\n",
+    "      #################################################\n",
+    "      ## TODO for students: update the value of the parameter ##\n",
+    "      # Use gsq.data and par.grad\n",
+    "      raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
+    "      #################################################\n",
+    "      # Update estimate of gradient variance\n",
+    "      gsq.data = ...\n",
+    "      # Update parameters\n",
+    "      par.data -=  ...\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "set_seed(seed=SEED)\n",
+    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
+    "print('\\n The model3 parameters before the update are: \\n')\n",
+    "print_params(model3)\n",
+    "loss = loss_fn(model3(X), y)\n",
+    "# Initialize the moving average of squared gradients\n",
+    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Uncomment below to test your function\n",
+    "# rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
+    "# print('\\n The model3 parameters after the update are: \\n')\n",
+    "# print_params(model3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "```\n",
+    " The model3 parameters after the update are:\n",
+    "\n",
+    "main.0.weight tensor([[-0.0240,  0.0031,  0.0193,  ...,  0.0316,  0.0297, -0.0198],\n",
+    "        [-0.0063, -0.0318, -0.0109,  ..., -0.0093,  0.0232, -0.0255],\n",
+    "        [ 0.0218, -0.0253,  0.0320,  ...,  0.0102,  0.0248, -0.0203],\n",
+    "        ...,\n",
+    "        [-0.0027,  0.0136,  0.0089,  ...,  0.0123, -0.0324, -0.0166],\n",
+    "        [ 0.0159,  0.0281,  0.0233,  ..., -0.0133, -0.0197,  0.0182],\n",
+    "        [ 0.0186, -0.0376, -0.0205,  ..., -0.0293,  0.0077, -0.0019]])\n",
+    "main.0.bias tensor([-0.0313, -0.0011,  0.0122, -0.0342,  0.0045,  0.0199,  0.0329,  0.0265,\n",
+    "         0.0182, -0.0041])\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove solution\n",
+    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
+    "  \"\"\"\n",
+    "  Perform an RMSprop update on a collection of parameters\n",
+    "\n",
+    "  Args:\n",
+    "    loss: Tensor\n",
+    "      A scalar tensor containing the loss whose gradient will be computed\n",
+    "    params: Iterable\n",
+    "      Collection of parameters with respect to which we compute gradients\n",
+    "    grad_sq: Iterable\n",
+    "      Moving average of squared gradients\n",
+    "    lr: Float\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "    alpha: Float\n",
+    "      Moving average parameter\n",
+    "    epsilon: Float\n",
+    "      Small constant added to the moving average for numerical stability\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
+    "  # successive backward calls\n",
+    "  zero_grad(params)\n",
+    "  # Compute gradients on given objective\n",
+    "  loss.backward()\n",
+    "\n",
+    "  with torch.no_grad():\n",
+    "    for (par, gsq) in zip(params, grad_sq):\n",
+    "      # Update estimate of gradient variance\n",
+    "      gsq.data = alpha * gsq.data + (1 - alpha) * par.grad**2\n",
+    "      # Update parameters\n",
+    "      par.data -=  lr * (par.grad / (epsilon + gsq.data)**0.5)\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "set_seed(seed=SEED)\n",
+    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
+    "print('\\n The model3 parameters before the update are: \\n')\n",
+    "print_params(model3)\n",
+    "loss = loss_fn(model3(X), y)\n",
+    "# Initialize the moving average of squared gradients\n",
+    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
+    "\n",
+    "## Uncomment below to test your function\n",
+    "rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
+    "print('\\n The model3 parameters after the update are: \\n')\n",
+    "print_params(model3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Implement_RMSProp_Exercise\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Interactive Demo 7: Compare optimizers\n",
+    "\n",
+    "Below, we compare your implementations of **SGD**, **Momentum**, and **RMSprop**. If you have successfully coded all the exercises so far: congrats!\n",
+    "\n",
+    "You are now *in the know* of some of the most commonly used and powerful optimization tools for deep learning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @markdown Execute this cell to enable the widget!\n",
+    "X, y = train_set.data, train_set.targets\n",
+    "\n",
+    "@widgets.interact_manual\n",
+    "def compare_optimizers(\n",
+    "    batch_size=(25, 250, 5),\n",
+    "    lr=widgets.FloatLogSlider(value=2e-3, min=-5, max=0),\n",
+    "    max_steps=(50, 500, 5)):\n",
+    "  \"\"\"\n",
+    "  Demonstration to compare optimisers - stochastic gradient descent, momentum, RMSprop\n",
+    "\n",
+    "  Args:\n",
+    "    batch_size: Tuple\n",
+    "      Size of minibatches\n",
+    "    lr: Float log slider instance\n",
+    "      Scalar specifying the learning rate or step-size for the update\n",
+    "    max_steps: Tuple\n",
+    "      Max number of step sizes for incrementing\n",
+    "\n",
+    "  Returns:\n",
+    "    Nothing\n",
+    "  \"\"\"\n",
+    "  SGD_DICT = [gradient_update, 'SGD', 'black', '-', {'lr': lr}]\n",
+    "  MOM_DICT = [momentum_update, 'Momentum', 'red', '--', {'lr': lr, 'beta': 0.9}]\n",
+    "  RMS_DICT = [rmsprop_update, 'RMSprop', 'fuchsia', '-', {'lr': lr, 'alpha': 0.8}]\n",
+    "\n",
+    "  ALL_DICTS = [SGD_DICT, MOM_DICT, RMS_DICT]\n",
+    "\n",
+    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
+    "\n",
+    "  LOSS_HIST = {}\n",
+    "\n",
+    "  for opt_dict in tqdm(ALL_DICTS):\n",
+    "    update_fn, opt_name, color, lstyle, kwargs = opt_dict\n",
+    "    LOSS_HIST[opt_name] = []\n",
+    "\n",
+    "    model = copy.deepcopy(base_model)\n",
+    "    params = list(model.parameters())\n",
+    "\n",
+    "    if opt_name != 'SGD':\n",
+    "      aux_tensors = [torch.zeros_like(_) for _ in params]\n",
+    "\n",
+    "    for step in range(max_steps):\n",
+    "      data, labels = sample_minibatch(X, y, batch_size)\n",
+    "      loss = loss_fn(model(data), labels)\n",
+    "      if opt_name == 'SGD':\n",
+    "        update_fn(loss, params, **kwargs)\n",
+    "      else:\n",
+    "        update_fn(loss, params, aux_tensors, **kwargs)\n",
+    "      LOSS_HIST[opt_name].append(loss.item())\n",
+    "\n",
+    "  fig, axs = plt.subplots(1, len(ALL_DICTS), figsize=(9, 3))\n",
+    "  for ax, optim_dict in zip(axs, ALL_DICTS):\n",
+    "    opt_name = optim_dict[1]\n",
+    "    ax.plot(range(max_steps), LOSS_HIST[opt_name], alpha=0.8)\n",
+    "    ax.set_title(opt_name)\n",
+    "    ax.set_xlabel('Iteration')\n",
+    "    ax.set_ylabel('Loss')\n",
+    "    ax.set_ylim(0, 2.5)\n",
+    "  plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Compare_optimizers_Interactive_Demo\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 7.1: Compare optimizers\n",
+    "\n",
+    "Tune the three methods above - **SGD**, **Momentum**, and **RMSProp** - to make each excel and discuss your findings. How do the methods compare in terms of robustness to small changes of the hyperparameters? How easy was it to find a good hyperparameter configuration?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "Stochastic Gradient Descent (SGD): Performs updates one example at a time.\n",
+    "Momentum: Helps accelerate SGD in the relevant direction and dampens\n",
+    "oscillations, especially in ravines.\n",
+    "RMSProp: Allows each parameter to be updated at an 'appropriate' rate decided\n",
+    "based on magnitudes of past recent updates;\n",
+    "i.e., areas where the surface curves much more steeply in one dimension than\n",
+    "in another, which are common around local optima.\n",
+    "\n",
+    "Robustness: RMSProp > Momentum > SGD\n",
+    "Since each example affects SGD by updating the parameters, it's not\n",
+    "considered very robust.\n",
+    "Adagrad greatly improved the robustness of SGD and is used for training\n",
+    "large-scale neural nets.\n",
+    "Momentum is quite robust: the momentum term increases for dimensions whose\n",
+    "gradients point in the same directions\n",
+    "and reduces updates for dimensions whose gradients change directions.\n",
+    "RMSProp is very robust; it combines the idea of only using the sign of\n",
+    "the gradient with the idea of adapting the step size separately\n",
+    "for each weight in a mini-batch.\n",
+    "\n",
+    "Generally, non-adaptive methods consistently produce more robust models\n",
+    "than adaptive methods. Refer https://arxiv.org/pdf/1911.03784.pdf - for more details\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Compare_optimizers_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "**Remarks:** Note that RMSprop allows us to use a 'per-dimension' learning rate _without having to tune one learning rate for each dimension **ourselves**_. The method uses information collected about the variance of the gradients throughout training to **adapt** the step size for each of the parameters automatically. The savings in tuning efforts of RMSprop over SGD or 'plain' momentum are undisputed on this task.\n",
+    "\n",
+    "Moreover, adaptive optimization methods are currently a highly active research domain, with many related algorithms such as Adam, AMSGrad, and Adagrad being used in practical applications and investigated theoretically."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Locality of Gradients\n",
+    "\n",
+    "As we've seen throughout this tutorial, poor conditioning can be a significant burden on convergence to an optimum while using gradient-based optimization. Of the methods we've seen to deal with this issue, notice how both momentum and adaptive learning rates incorporate past gradient values into their update schemes. Why do we use past values of our loss function's gradient while updating our current MLP weights?\n",
+    "\n",
+    "Recall from *W1D2* that the gradient of a function, $\\nabla f(w_t)$, is a **local** property and computes the direction of maximum change of $f(w_t)$ at the point $w_t$. However, when we train our MLP model we are hoping to find the **global** optimum for our training loss. By incorporating past values of our function's gradient into our optimization schemes, we use more information about the overall shape of our function than just a single gradient alone can provide."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 7.2: Loss function and optimization\n",
+    "\n",
+    "Can you think of other ways we can incorporate more information about our loss function into our optimization schemes?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "We could consider incorporating the curvature of our function directly into our\n",
+    "optimization schemes. Methods that use this are often called Newton's methods\n",
+    "or Hessian based optimization methods.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Loss_function_and_optimization_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 8: Ethical concerns\n",
+    "\n",
+    "*Time estimate: ~15 mins*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 8: Ethical concerns\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', '0EthSI0cknI'), ('Bilibili', 'BV1TU4y1G7Je')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Ethical_concerns_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Summary\n",
+    "\n",
+    "* Optimization is necessary to create Deep Learning models that are guaranteed to converge\n",
+    "* Stochastic Gradient Descent and Momentum are two commonly used optimization techniques\n",
+    "* RMSProp is a way of adaptive hyperparameter tuning which utilises a per-dimension learning rate\n",
+    "* Poor choice of optimization objectives can lead to unforeseen, undesirable consequences\n",
+    "\n",
+    "If you have time left, you can read the Bonus material, where we put it all together and we compare our model with a benchmark model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Bonus: Putting it all together\n",
+    "\n",
+    "*Time estimate: ~40 mins*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "We have progressively built a sophisticated optimization algorithm, which is able to deal with a non-convex, poorly conditioned problem concerning tens of thousands of training examples. Now we present _you_ with a small challenge: beat us! :P\n",
+    "\n",
+    "Your mission is to train an MLP model that can compete with a benchmark model which we have pre-trained for you. In this section you will be able to use the full Pytorch power: loading the data, defining the model, sampling minibatches as well as Pytorch's **optimizer implementations**.\n",
+    "\n",
+    "There is a big engineering component behind the design of optimizers and their implementation can sometimes become tricky. So unless you are directly doing research in optimization, it's recommended to use an implementation provided by a widely reviewed open-source library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 9: Putting it all together\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'DP9c13vLiOM'), ('Bilibili', 'BV1MK4y1u7u2')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Putting_it_all_together_Bonus_Video\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Download parameters of the benchmark model\n",
+    "import requests\n",
+    "\n",
+    "fname = 'benchmark_model.pt'\n",
+    "url = \"https://osf.io/sj4e8/download\"\n",
+    "r = requests.get(url, allow_redirects=True)\n",
+    "with open(fname, 'wb') as fh:\n",
+    "  fh.write(r.content)\n",
+    "\n",
+    "# Load the benchmark model's parameters\n",
+    "DEVICE = set_device()\n",
+    "if DEVICE == \"cuda\":\n",
+    "  benchmark_state_dict = torch.load(fname)\n",
+    "else:\n",
+    "  benchmark_state_dict = torch.load(fname, map_location=torch.device('cpu'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# Create MLP object and update weights with those of saved model\n",
+    "benchmark_model = MLP(in_dim=784, out_dim=10,\n",
+    "                      hidden_dims=[200, 100, 50]).to(DEVICE)\n",
+    "benchmark_model.load_state_dict(benchmark_state_dict)\n",
+    "\n",
+    "\n",
+    "# Define helper function to evaluate models\n",
+    "def eval_model(model, data_loader, num_batches=np.inf, device='cpu'):\n",
+    "  \"\"\"\n",
+    "  Evaluate a model on (a subset of) the minibatches of a data loader\n",
+    "\n",
+    "  Args:\n",
+    "    model: nn.Module derived class\n",
+    "      The model which is to be evaluated\n",
+    "    data_loader: Iterable\n",
+    "      A configured dataloading utility\n",
+    "    num_batches: Integer or np.inf\n",
+    "      Maximum number of minibatches to evaluate (default: all of them)\n",
+    "    device: String\n",
+    "      Device on which the model and the data are placed (default: 'cpu')\n",
+    "\n",
+    "  Returns:\n",
+    "    Mean loss and mean accuracy over the evaluated minibatches\n",
+    "  \"\"\"\n",
+    "\n",
+    "  loss_log, acc_log = [], []\n",
+    "  model.to(device=device)\n",
+    "\n",
+    "  # We are just evaluating the model, no need to compute gradients\n",
+    "  with torch.no_grad():\n",
+    "    for batch_id, batch in enumerate(data_loader):\n",
+    "      # batch_id is 0-based: stop once exactly num_batches have been evaluated\n",
+    "      if batch_id >= num_batches:\n",
+    "        break\n",
+    "      # Extract minibatch data\n",
+    "      data, labels = batch[0].to(device), batch[1].to(device)\n",
+    "      # Evaluate model and loss on minibatch\n",
+    "      preds = model(data)\n",
+    "      loss_log.append(loss_fn(preds, labels).item())\n",
+    "      acc_log.append(torch.mean(1. * (preds.argmax(dim=1) == labels)).item())\n",
+    "\n",
+    "  return np.mean(loss_log), np.mean(acc_log)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "We define an optimizer in the following steps:\n",
+    "\n",
+    "1. Load the corresponding class that implements the parameter updates and other internal management activities, including:\n",
+    "    - create auxiliary variables,\n",
+    "    - update moving averages,\n",
+    "    - adjust the learning rate.\n",
+    "2. Pass the parameters of the PyTorch model that the optimizer has control over. Note that different optimizers can potentially control different parameter groups.\n",
+    "3. Specify hyperparameters, including learning rate, momentum, moving average factors, etc.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Exercise Bonus: Train your own model\n",
+    "\n",
+    "Now, train the model with your preferred optimizer and find a good combination of hyperparameter settings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "#################################################\n",
+    "## TODO for students: adjust training settings ##\n",
+    "\n",
+    "# The three parameters below are in your full control\n",
+    "MAX_EPOCHS = 2  # select number of epochs to train\n",
+    "LR = 1e-5  # choose the step size\n",
+    "BATCH_SIZE = 64  # number of examples per minibatch\n",
+    "\n",
+    "# Define the model and associated optimizer -- you may change its architecture!\n",
+    "my_model = MLP(in_dim=784, out_dim=10, hidden_dims=[200, 100, 50]).to(DEVICE)\n",
+    "\n",
+    "# You can take your pick from many different optimizers\n",
+    "# Check the optimizer documentation and hyperparameter meaning before using!\n",
+    "# More details on Pytorch optimizers: https://pytorch.org/docs/stable/optim.html\n",
+    "# optimizer = torch.optim.SGD(my_model.parameters(), lr=LR, momentum=0.9)\n",
+    "# optimizer = torch.optim.RMSprop(my_model.parameters(), lr=LR, alpha=0.99)\n",
+    "# optimizer = torch.optim.Adagrad(my_model.parameters(), lr=LR)\n",
+    "optimizer = torch.optim.Adam(my_model.parameters(), lr=LR)\n",
+    "#################################################"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "set_seed(seed=SEED)\n",
+    "# Print training stats every LOG_FREQ minibatches\n",
+    "LOG_FREQ = 200\n",
+    "# Frequency for evaluating the validation metrics\n",
+    "VAL_FREQ = 200\n",
+    "# Load data using a Pytorch Dataset\n",
+    "train_set_orig, test_set_orig = load_mnist_data(change_tensors=False)\n",
+    "\n",
+    "# We separate 10,000 training samples to create a validation set\n",
+    "train_set_orig, val_set_orig = torch.utils.data.random_split(train_set_orig, [50000, 10000])\n",
+    "\n",
+    "# Create the corresponding DataLoaders for training, validation and test\n",
+    "g_seed = torch.Generator()\n",
+    "g_seed.manual_seed(SEED)\n",
+    "\n",
+    "train_loader = torch.utils.data.DataLoader(train_set_orig,\n",
+    "                                           shuffle=True,\n",
+    "                                           batch_size=BATCH_SIZE,\n",
+    "                                           num_workers=2,\n",
+    "                                           worker_init_fn=seed_worker,\n",
+    "                                           generator=g_seed)\n",
+    "val_loader = torch.utils.data.DataLoader(val_set_orig,\n",
+    "                                         shuffle=True,\n",
+    "                                         batch_size=256,\n",
+    "                                         num_workers=2,\n",
+    "                                         worker_init_fn=seed_worker,\n",
+    "                                         generator=g_seed)\n",
+    "test_loader = torch.utils.data.DataLoader(test_set_orig,\n",
+    "                                          batch_size=256,\n",
+    "                                          num_workers=2,\n",
+    "                                          worker_init_fn=seed_worker,\n",
+    "                                          generator=g_seed)\n",
+    "\n",
+    "# Run training\n",
+    "metrics = {'train_loss':[],\n",
+    "           'train_acc':[],\n",
+    "           'val_loss':[],\n",
+    "           'val_acc':[],\n",
+    "           'val_idx':[]}\n",
+    "\n",
+    "step_idx = 0\n",
+    "for epoch in tqdm(range(MAX_EPOCHS)):\n",
+    "\n",
+    "  running_loss, running_acc = 0., 0.\n",
+    "\n",
+    "  for batch_id, batch in enumerate(train_loader):\n",
+    "    step_idx += 1\n",
+    "    # Extract minibatch data and labels\n",
+    "    data, labels = batch[0].to(DEVICE), batch[1].to(DEVICE)\n",
+    "    # Just like before, refresh gradient accumulators.\n",
+    "    # Note that this is now a method of the optimizer.\n",
+    "    optimizer.zero_grad()\n",
+    "    # Evaluate model and loss on minibatch\n",
+    "    preds = my_model(data)\n",
+    "    loss = loss_fn(preds, labels)\n",
+    "    acc = torch.mean(1.0 * (preds.argmax(dim=1) == labels))\n",
+    "    # Compute gradients\n",
+    "    loss.backward()\n",
+    "    # Update parameters\n",
+    "    # Note how all the magic in the update of the parameters is encapsulated by\n",
+    "    # the optimizer class.\n",
+    "    optimizer.step()\n",
+    "    # Log metrics for plotting\n",
+    "    metrics['train_loss'].append(loss.cpu().item())\n",
+    "    metrics['train_acc'].append(acc.cpu().item())\n",
+    "\n",
+    "    if batch_id % VAL_FREQ == (VAL_FREQ - 1):\n",
+    "      # Get an estimate of the validation accuracy with 100 batches\n",
+    "      val_loss, val_acc = eval_model(my_model, val_loader,\n",
+    "                                     num_batches=100,\n",
+    "                                     device=DEVICE)\n",
+    "      metrics['val_idx'].append(step_idx)\n",
+    "      metrics['val_loss'].append(val_loss)\n",
+    "      metrics['val_acc'].append(val_acc)\n",
+    "\n",
+    "      print(f\"[VALID] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
+    "            f\"Loss: {val_loss:.3f} - Acc: {100*val_acc:.3f}%\")\n",
+    "\n",
+    "    # print statistics\n",
+    "    running_loss += loss.cpu().item()\n",
+    "    running_acc += acc.cpu().item()\n",
+    "    # Print every LOG_FREQ minibatches\n",
+    "    if batch_id % LOG_FREQ == (LOG_FREQ-1):\n",
+    "      print(f\"[TRAIN] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
+    "            f\"Loss: {running_loss / LOG_FREQ:.3f} - \"\n",
+    "            f\"Acc: {100 * running_acc / LOG_FREQ:.3f}%\")\n",
+    "\n",
+    "      running_loss, running_acc = 0., 0."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n",
+    "\n",
+    "ax[0].plot(range(len(metrics['train_loss'])), metrics['train_loss'],\n",
+    "           alpha=0.8, label='Train')\n",
+    "ax[0].plot(metrics['val_idx'], metrics['val_loss'], label='Valid')\n",
+    "ax[0].set_xlabel('Iteration')\n",
+    "ax[0].set_ylabel('Loss')\n",
+    "ax[0].legend()\n",
+    "\n",
+    "ax[1].plot(range(len(metrics['train_acc'])), metrics['train_acc'],\n",
+    "           alpha=0.8, label='Train')\n",
+    "ax[1].plot(metrics['val_idx'], metrics['val_acc'], label='Valid')\n",
+    "ax[1].set_xlabel('Iteration')\n",
+    "ax[1].set_ylabel('Accuracy')\n",
+    "ax[1].legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Train_your_own_model_Bonus_Exercise\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! Bonus: Metrics\n",
+    "\n",
+    "Which metric did you optimize when searching for the right configuration? The training set loss? Accuracy? Validation/test set metrics? Why? Discuss!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# to_remove explanation\n",
+    "\n",
+    "\"\"\"\n",
+    "  Remember the discussion in Section 1 about surrogate objectives.\n",
+    "Our optimization methods minimize the loss, but at the end of the day we care about test accuracy.\n",
+    "\n",
+    "  However, we can't directly optimize for test accuracy and the finite size of our\n",
+    "datasets leads us to (cross-)validation:\n",
+    "\n",
+    "  1. We minimize the loss (empirical risk minimization) on our *training set*.\n",
+    "  2. We choose models and hyperparameters on the *validation set*.\n",
+    "  3. We use the *test set* in order to report the final performance of our model on unseen data.\n",
+    "\"\"\";"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Metrics_Bonus_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "### Evaluation\n",
+    "\n",
+    "We _finally_ can evaluate and compare the performance of the models on previously unseen examples.\n",
+    "\n",
+    "Which model would you keep? (\\*drum roll*)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "print('Your model...')\n",
+    "train_loss, train_accuracy = eval_model(my_model, train_loader, device=DEVICE)\n",
+    "test_loss, test_accuracy = eval_model(my_model, test_loader, device=DEVICE)\n",
+    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
+    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')\n",
+    "\n",
+    "print('\\nBenchmark model')\n",
+    "train_loss, train_accuracy = eval_model(benchmark_model, train_loader, device=DEVICE)\n",
+    "test_loss, test_accuracy = eval_model(benchmark_model, test_loader, device=DEVICE)\n",
+    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
+    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')"
    ]
   }
  ],

From b7be6bc56b630b7ba4ba4eac92a29ebf4ceaf6c1 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 1 May 2026 18:29:36 -0400
Subject: [PATCH 14/34] W1D4: remove BonusLecture and Tutorial2 (DL Case Study)
 notebooks

These were separated out in the split-Tutorial1 commit but are no longer
needed now that Tutorial1 is restored as a single unified notebook.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D4_Optimization/W1D4_BonusLecture.ipynb | 2097 -----------------
 .../W1D4_Optimization/W1D4_Tutorial2.ipynb    | 1403 -----------
 2 files changed, 3500 deletions(-)
 delete mode 100644 tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
 delete mode 100644 tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb

diff --git a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb b/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
deleted file mode 100644
index ddb497759..000000000
--- a/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb
+++ /dev/null
@@ -1,2097 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "execution": {},
-    "id": "view-in-github"
-   },
-   "source": [
-    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D4_Optimization/W1D4_BonusLecture.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "# Bonus Lecture: Optimization techniques (continued)\n",
-    "\n",
-    "**Week 1, Day 4: Optimization**\n",
-    "\n",
-    "**By Neuromatch Academy**\n",
-    "\n",
-    "__Content creators:__ Jose Gallego-Posada, Ioannis Mitliagkas\n",
-    "\n",
-    "__Content reviewers:__ Piyush Chauhan, Vladimir Haltakov, Siwei Bai, Kelson Shilling-Scrivo\n",
-    "\n",
-    "__Content editors:__ Charles J Edelson, Gagana B, Spiros Chavlis\n",
-    "\n",
-    "__Production editors:__ Arush Tagade, R. Krishnakumaran, Gagana B, Spiros Chavlis\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Tutorial Objectives\n",
-    "\n",
-    "Objectives:\n",
-    "*   Optimization in non-convex loss landscapes\n",
-    "*   Mini-batch sampling and stochastic gradients\n",
-    "*   'Adaptive' hyperparameter tuning\n",
-    "*   Ethical concerns\n",
-    "*   Putting it all together: training your own model\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Tutorial slides\n",
-    "from IPython.display import IFrame\n",
-    "link_id = \"ft2sz\"\n",
-    "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n",
-    "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and import feedback gadget\n",
-    "\n",
-    "!pip3 install vibecheck datatops --quiet\n",
-    "\n",
-    "from vibecheck import DatatopsContentReviewContainer\n",
-    "def content_review(notebook_section: str):\n",
-    "    return DatatopsContentReviewContainer(\n",
-    "        \"\",  # No text prompt\n",
-    "        notebook_section,\n",
-    "        {\n",
-    "            \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
-    "            \"name\": \"neuromatch_dl\",\n",
-    "            \"user_key\": \"f379rz8y\",\n",
-    "        },\n",
-    "    ).render()\n",
-    "\n",
-    "\n",
-    "feedback_prefix = \"W1D4_T1\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# Imports\n",
-    "import copy\n",
-    "\n",
-    "import ipywidgets as widgets\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "\n",
-    "import time\n",
-    "import torch\n",
-    "import torchvision\n",
-    "import torchvision.datasets as datasets\n",
-    "import torch.nn.functional as F\n",
-    "import torch.nn as nn\n",
-    "import torch.optim as optim\n",
-    "from tqdm.auto import tqdm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Figure settings\n",
-    "import logging\n",
-    "logging.getLogger('matplotlib.font_manager').disabled = True\n",
-    "\n",
-    "%config InlineBackend.figure_format = 'retina'\n",
-    "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n",
-    "plt.rc('axes', unicode_minus=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Helper functions\n",
-    "def print_params(model):\n",
-    "  \"\"\"\n",
-    "  Lists the name and current value of the model's\n",
-    "  named parameters\n",
-    "\n",
-    "  Args:\n",
-    "    model: an nn.Module inherited model\n",
-    "      Represents the ML/DL model\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  for name, param in model.named_parameters():\n",
-    "    if param.requires_grad:\n",
-    "      print(name, param.data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Set random seed\n",
-    "\n",
-    "# @markdown Executing `set_seed(seed=seed)` you are setting the seed\n",
-    "\n",
-    "# for DL its critical to set the random seed so that students can have a\n",
-    "# baseline to compare their results to expected results.\n",
-    "# Read more here: https://pytorch.org/docs/stable/notes/randomness.html\n",
-    "\n",
-    "# Call the `set_seed` function in the exercises to ensure reproducibility.\n",
-    "import random\n",
-    "import torch\n",
-    "\n",
-    "def set_seed(seed=None, seed_torch=True):\n",
-    "  \"\"\"\n",
-    "  Handles variability by controlling sources of randomness\n",
-    "  through set seed values\n",
-    "\n",
-    "  Args:\n",
-    "    seed: Integer\n",
-    "      Set the seed value to given integer.\n",
-    "      If no seed, set seed value to random integer in the range 2^32\n",
-    "    seed_torch: Bool\n",
-    "      Seeds the random number generator for all devices to\n",
-    "      offer some guarantees on reproducibility\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  if seed is None:\n",
-    "    seed = np.random.choice(2 ** 32)\n",
-    "  random.seed(seed)\n",
-    "  np.random.seed(seed)\n",
-    "  if seed_torch:\n",
-    "    torch.manual_seed(seed)\n",
-    "    torch.cuda.manual_seed_all(seed)\n",
-    "    torch.cuda.manual_seed(seed)\n",
-    "    torch.backends.cudnn.benchmark = False\n",
-    "    torch.backends.cudnn.deterministic = True\n",
-    "  print(f'Random seed {seed} has been set.')\n",
-    "\n",
-    "\n",
-    "# In case that `DataLoader` is used\n",
-    "def seed_worker(worker_id):\n",
-    "  \"\"\"\n",
-    "  DataLoader will reseed workers following randomness in\n",
-    "  multi-process data loading algorithm.\n",
-    "\n",
-    "  Args:\n",
-    "    worker_id: integer\n",
-    "      ID of subprocess to seed. 0 means that\n",
-    "      the data will be loaded in the main process\n",
-    "      Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  worker_seed = torch.initial_seed() % 2**32\n",
-    "  np.random.seed(worker_seed)\n",
-    "  random.seed(worker_seed)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Set device (GPU or CPU). Execute `set_device()`\n",
-    "# especially if torch modules are used.\n",
-    "\n",
-    "# inform the user if the notebook uses GPU or CPU.\n",
-    "\n",
-    "def set_device():\n",
-    "  \"\"\"\n",
-    "  Set the device. CUDA if available, CPU otherwise\n",
-    "\n",
-    "  Args:\n",
-    "    None\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "  if device != \"cuda\":\n",
-    "    print(\"WARNING: For this notebook to perform best, \"\n",
-    "        \"if possible, in the menu under `Runtime` -> \"\n",
-    "        \"`Change runtime type.`  select `GPU` \")\n",
-    "  else:\n",
-    "    print(\"GPU is enabled in this notebook.\")\n",
-    "\n",
-    "  return device"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "SEED = 2021\n",
-    "set_seed(seed=SEED)\n",
-    "DEVICE = set_device()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 5: Non-convexity\n",
-    "\n",
-    "*Time estimate: ~30 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "The introduction of even just 1 hidden layer in the neural network transforms the previous convex optimization problem into a non-convex one. And with great non-convexity, comes great responsibility... (Sorry, we couldn't help it!)\n",
-    "\n",
-    "**Note:** From this section onwards we will be dealing with non-convex optimization problems for the remainder of the tutorial."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 5: Overparameterization\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', '7vUpUEKKl5o'), ('Bilibili', 'BV16h41167Jr')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Overparameterization_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Take a couple of minutes to play with a more complex 3D visualization of the loss landscape of a neural network on a non-convex problem. Visit https://losslandscape.com/explorer.\n",
-    "\n",
-    "1. Explore the features on the bottom left corner. You can see an explanation for each icon by clicking on the ( i ) button located on the top right corner.\n",
-    "2. Use the 'gradient descent' feature to perform a thought experiment:\n",
-    "    -   Choose an initialization\n",
-    "    -   Choose the learning rate\n",
-    "    -   Mentally formulate your hypothesis about what kind of trajectory you expect to observe\n",
-    "3. Run the experiment and contrast your intuition with the observed behavior.\n",
-    "4. Repeat this experiment a handful of times for several initialization/learning rate configurations\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 5: Overparameterization to the rescue!\n",
-    "\n",
-    "As you may have seen, the non-convex nature of the surface can lead the optimization process to get stuck in undesirable local-optima. There is ample empirical evidence supporting the claim that 'overparameterized' models are easier to train.\n",
-    "\n",
-    "We will explore this assertion in the context of our MLP training. For this, we initialize a fixed model and construct several models by small random perturbations to the original initialized weights. Now, we train each of these perturbed models and see how the loss evolves. If we were in the convex setting, we should reach very similar objective values upon convergence since all these models were very close at the beginning of training, and in convex problems, the local optimum is also the global optimum.\n",
-    "\n",
-    "Use the interactive plot below to visualize the loss progression for these perturbed models:\n",
-    "\n",
-    "1. Select different settings from the `hidden_dims` drop-down menu.\n",
-    "2. Explore the effect of the number of steps and learning rate."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "\n",
-    "@widgets.interact_manual\n",
-    "def overparam(max_steps=widgets.IntSlider(150, 50, 500, 5),\n",
-    "              hidden_dims=widgets.Dropdown(options=[\"10\", \"20, 20\", \"100, 100\"],\n",
-    "                                           value=\"10\"),\n",
-    "              lr=widgets.FloatLogSlider(value=5e-2, min=-3, max=0, step=0.1),\n",
-    "              num_inits=widgets.IntSlider(7, 5, 10, 1)):\n",
-    "  \"\"\"\n",
-    "  Displays the overparameterization phenomenon as a widget\n",
-    "\n",
-    "  Args:\n",
-    "    max_steps: widget integer slider\n",
-    "      Maximum number of steps on the slider with default = 150\n",
-    "    hidden_dims: widget dropdown menu instance\n",
-    "      The number of hidden dimensions with default = 10\n",
-    "    lr: widget float slider\n",
-    "      Scalar specifying the learning rate or step-size for the update with default = 5e-2\n",
-    "    num_inits: widget integer slider\n",
-    "      Scalar number of epochs\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "\n",
-    "  X, y = train_set.data[subset_index, :], train_set.targets[subset_index]\n",
-    "\n",
-    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
-    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, 1, figsize=(5, 4))\n",
-    "\n",
-    "  for _ in tqdm(range(num_inits)):\n",
-    "    model = copy.deepcopy(base_model)\n",
-    "    random_update(model, noise_scale=2e-1)\n",
-    "    loss_hist = np.zeros((max_steps, 2))\n",
-    "    for step in range(max_steps):\n",
-    "      loss = loss_fn(model(X), y)\n",
-    "      gradient_update(loss, list(model.parameters()), lr=lr)\n",
-    "      loss_hist[step] = np.array([step, loss.item()])\n",
-    "\n",
-    "    plt.plot(loss_hist[:, 0], loss_hist[:, 1])\n",
-    "\n",
-    "  plt.xlabel('Iteration')\n",
-    "  plt.ylabel('Loss')\n",
-    "  plt.ylim(0, 3)\n",
-    "  plt.show()\n",
-    "\n",
-    "  num_params = sum([np.prod(_.shape) for _ in model.parameters()])\n",
-    "  print('Number of parameters in model:  ' + str(num_params))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Overparameterization_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Think! 5.1: Width and depth of the network\n",
-    "\n",
-    "- We see that as we increase the width/depth of the network, training becomes faster and more consistent across different initializations. What might be the reasons for this behavior?\n",
-    "\n",
-    "- What are some potential downsides of this approach to dealing with non-convexity?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "- The exact mechanism for this phenomenon is still under active research.\n",
-    "Existing evidence points to the following: in the overparameterized setting,\n",
-    "there are many more 'good configurations' (values of the model’s weights) that\n",
-    "lead to a low value of the objective. Furthermore, this large set of possible solutions\n",
-    "seems to be increasingly easy to find in the space of all possible\n",
-    "parameter configurations. As you increase the number of parameters, it becomes\n",
-    "more likely that your initialization will be close to one of these good parameter settings.\n",
-    "\n",
-    "- This approach will require more memory and computation. Furthermore, we need\n",
-    "to always be aware of the risk of overfitting: don’t forget to do cross-validation\n",
-    "in order to be able to detect overfitting.\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Width_and_depth_of_the_network_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 6: Full gradients are expensive\n",
-    "\n",
-    "*Time estimate: ~25 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "So far we have used only a small (fixed) subset of 500 training examples to perform the updates on the model parameters in our quest to minimize the loss. But what if we decided to use the entire training set? Does our current approach scale to datasets with tens of thousands or millions of data points?\n",
-    "\n",
-    "In this section we explore an efficient alternative to avoid having to perform computations on all the training examples before performing a parameter update."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 6: Mini-batches\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'hbqUxpNBUGk'), ('Bilibili', 'BV1ty4y1T7Uh')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Mini_batches_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 6.1: Cost of computation\n",
-    "\n",
-    "Evaluating a neural network is a relatively fast process. However, when repeated millions of times, the computational cost of performing forward and backward passes through the network starts to become significant.\n",
-    "\n",
-    "In the visualization below, we show the time (averaged over 5 runs) of computing a forward and backward pass with a changing number of input examples. Choose from the different options in the drop-down box and note how the vertical scale changes depending on the size of the network.\n",
-    "\n",
-    "**Remarks:** Note that the computational cost of a forward pass shows a clear linear relationship with the number of input examples, and the cost of the corresponding backward pass exhibits a similar computational complexity."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "\n",
-    "def gradient_update(loss, params, lr=1e-3):\n",
-    "  \"\"\"\n",
-    "  Perform a gradient descent update on a given loss over a collection of parameters\n",
-    "\n",
-    "  Args:\n",
-    "    loss: Tensor\n",
-    "      A scalar tensor containing the loss through which the gradient will be computed\n",
-    "    params: List of iterables\n",
-    "      Collection of parameters with respect to which we compute gradients\n",
-    "    lr: Float\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
-    "  # successive backward calls\n",
-    "  zero_grad(params)\n",
-    "\n",
-    "  # Compute gradients on given objective\n",
-    "  loss.backward()\n",
-    "\n",
-    "  with torch.no_grad():\n",
-    "    for par in params:\n",
-    "       par.data -= lr * par.grad.data\n",
-    "\n",
-    "\n",
-    "def measure_update_time(model, num_points):\n",
-    "  \"\"\"\n",
-    "  Measuring the time for update\n",
-    "\n",
-    "  Args:\n",
-    "    model: an nn.Module inherited model\n",
-    "      Represents the ML/DL model\n",
-    "    num_points: integer\n",
-    "      The number of data points in the train_set\n",
-    "\n",
-    "  Returns:\n",
-    "    tuple of loss time and time for calculation of gradient\n",
-    "  \"\"\"\n",
-    "  X, y = train_set.data[:num_points], train_set.targets[:num_points]\n",
-    "  start_time = time.time()\n",
-    "  loss = loss_fn(model(X), y)\n",
-    "  loss_time = time.time()\n",
-    "  gradient_update(loss, list(model.parameters()), lr=0)\n",
-    "  gradient_time = time.time()\n",
-    "  return loss_time - start_time, gradient_time - loss_time\n",
-    "\n",
-    "\n",
-    "@widgets.interact\n",
-    "def computation_time(hidden_dims=widgets.Dropdown(options=[\"1\", \"100\", \"50, 50\"],\n",
-    "                                                  value=\"100\")):\n",
-    "  \"\"\"\n",
-    "  Demonstrating time taken for computation as a widget\n",
-    "\n",
-    "  Args:\n",
-    "    hidden_dims: widgets dropdown\n",
-    "      The number of hidden dimensions with default = 100\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
-    "  model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
-    "\n",
-    "  NUM_POINTS = [1, 5, 10, 100, 200, 500, 1000, 5000, 10000, 20000, 30000, 50000]\n",
-    "  times_list = []\n",
-    "  for _ in range(5):\n",
-    "    times_list.append(np.array([measure_update_time(model, _) for _ in NUM_POINTS]))\n",
-    "\n",
-    "  times = np.array(times_list).mean(axis=0)\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, 1, figsize=(5,4))\n",
-    "  plt.plot(NUM_POINTS, times[:, 0], label='Forward')\n",
-    "  plt.plot(NUM_POINTS, times[:, 1], label='Backward')\n",
-    "  plt.xlabel('Number of data points')\n",
-    "  plt.ylabel('Seconds')\n",
-    "  plt.legend()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Cost_of_computation_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n\nComplete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "def sample_minibatch(input_data, target_data, num_points=100):\n",
-    "  \"\"\"\n",
-    "  Sample a minibatch of size num_point from the provided input-target data\n",
-    "\n",
-    "  Args:\n",
-    "    input_data: Tensor\n",
-    "      Multi-dimensional tensor containing the input data\n",
-    "    target_data: Tensor\n",
-    "      1D tensor containing the class labels\n",
-    "    num_points: Integer\n",
-    "      Number of elements to be included in minibatch with default=100\n",
-    "\n",
-    "  Returns:\n",
-    "    batch_inputs: Tensor\n",
-    "      Minibatch inputs\n",
-    "    batch_targets: Tensor\n",
-    "      Minibatch targets\n",
-    "  \"\"\"\n",
-    "  #################################################\n",
-    "  ## TODO for students: sample minibatch of data ##\n",
-    "  raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
-    "  #################################################\n",
-    "  # Sample a collection of IID indices from the existing data\n",
-    "  batch_indices = ...\n",
-    "  # Use batch_indices to extract entries from the input and target data tensors\n",
-    "  batch_inputs = input_data[...]\n",
-    "  batch_targets = target_data[...]\n",
-    "\n",
-    "  return batch_inputs, batch_targets\n",
-    "\n",
-    "\n",
-    "\n",
-    "## Uncomment to test your function\n",
-    "# x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
-    "# print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "```\n",
-    "The input shape is torch.Size([100, 28, 28]) and the target shape is: torch.Size([100])\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove solution\n",
-    "def sample_minibatch(input_data, target_data, num_points=100):\n",
-    "  \"\"\"\n",
-    "  Sample a minibatch of size num_point from the provided input-target data\n",
-    "\n",
-    "  Args:\n",
-    "    input_data: Tensor\n",
-    "      Multi-dimensional tensor containing the input data\n",
-    "    target_data: Tensor\n",
-    "      1D tensor containing the class labels\n",
-    "    num_points: Integer\n",
-    "      Number of elements to be included in minibatch with default=100\n",
-    "\n",
-    "  Returns:\n",
-    "    batch_inputs: Tensor\n",
-    "      Minibatch inputs\n",
-    "    batch_targets: Tensor\n",
-    "      Minibatch targets\n",
-    "  \"\"\"\n",
-    "  # Sample a collection of IID indices from the existing data\n",
-    "  batch_indices = np.random.choice(len(input_data), num_points)\n",
-    "  # Use batch_indices to extract entries from the input and target data tensors\n",
-    "  batch_inputs = input_data[batch_indices, :]\n",
-    "  batch_targets = target_data[batch_indices]\n",
-    "\n",
-    "  return batch_inputs, batch_targets\n",
-    "\n",
-    "\n",
-    "\n",
-    "## Uncomment to test your function\n",
-    "x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
-    "print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Implement_mini_batch_sampling_Exercise\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 6.2: *Compare* different minibatch sizes\n",
-    "\n",
-    "What are the trade-offs induced by the choice of minibatch size? The interactive plot below shows the training evolution of a 2-hidden layer MLP with 100 hidden units in each hidden layer. Different plots correspond to a different choice of minibatch size. We have a fixed time budget for all the cases, reflected in the horizontal axes of these plots."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "\n",
-    "@widgets.interact_manual\n",
-    "def minibatch_experiment(batch_sizes='20, 250, 1000',\n",
-    "                         lrs='5e-3, 5e-3, 5e-3',\n",
-    "                         time_budget=widgets.Dropdown(options=[\"2.5\", \"5\", \"10\"],\n",
-    "                                                      value=\"2.5\")):\n",
-    "  \"\"\"\n",
-    "  Demonstration of minibatch experiment\n",
-    "\n",
-    "  Args:\n",
-    "    batch_sizes: String\n",
-    "      Size of minibatches\n",
-    "    lrs: String\n",
-    "      Different learning rates\n",
-    "    time_budget: widget dropdown instance\n",
-    "      Different time budgets with default=2.5s\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  batch_sizes = [int(s) for s in batch_sizes.split(',')]\n",
-    "  lrs = [float(s) for s in lrs.split(',')]\n",
-    "\n",
-    "  LOSS_HIST = {_:[] for _ in batch_sizes}\n",
-    "\n",
-    "  X, y = train_set.data, train_set.targets\n",
-    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
-    "\n",
-    "  for id, batch_size in enumerate(tqdm(batch_sizes)):\n",
-    "    start_time = time.time()\n",
-    "    # Create a new copy of the model for each batch size\n",
-    "    model = copy.deepcopy(base_model)\n",
-    "    params = list(model.parameters())\n",
-    "    lr = lrs[id]\n",
-    "    # Fixed budget per choice of batch size\n",
-    "    while (time.time() - start_time) < float(time_budget):\n",
-    "      data, labels = sample_minibatch(X, y, batch_size)\n",
-    "      loss = loss_fn(model(data), labels)\n",
-    "      gradient_update(loss, params, lr=lr)\n",
-    "      LOSS_HIST[batch_size].append([time.time() - start_time,\n",
-    "                                    loss.item()])\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, len(batch_sizes), figsize=(10, 3))\n",
-    "  for ax, batch_size in zip(axs, batch_sizes):\n",
-    "    plot_data = np.array(LOSS_HIST[batch_size])\n",
-    "    ax.plot(plot_data[:, 0], plot_data[:, 1], label=batch_size,\n",
-    "            alpha=0.8)\n",
-    "    ax.set_title('Batch size: ' + str(batch_size))\n",
-    "    ax.set_xlabel('Seconds')\n",
-    "    ax.set_ylabel('Loss')\n",
-    "  plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "**Remarks:** SGD works! We have an algorithm that can be applied (with due precautions) to learn datasets of arbitrary size.\n",
-    "\n",
-    "However, **note the difference in the vertical scale** across the plots above. When using a larger minibatch, we can perform fewer parameter updates as the forward and backward passes are more expensive.\n",
-    "\n",
-    "This highlights the interplay between the minibatch size and the learning rate: when our minibatch is larger, we have a more confident estimator of the direction to move, and thus can afford a larger learning rate. On the other hand, extremely small minibatches are very fast computationally but are not representative of the data distribution and yield estimations of the gradient with high variance.\n",
-    "\n",
-    "We encourage you to tune the value of the learning rate for each of the minibatch sizes in the previous demo, to achieve a training loss steadily below 0.5 within 5 seconds."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Compare_different_minibatch_sizes_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 7: Adaptive methods\n",
-    "\n",
-    "*Time estimate: ~25 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "By now, you should be aware that there are many knobs to turn when working on a machine learning problem. Some of these relate to the optimization algorithm, the choice of model, or the objective to minimize. Here are some prototypical examples:\n",
-    "\n",
-    "- Problem: loss function, regularization coefficients (Week 1, Day 5)\n",
-    "- Model: architecture, activation functions\n",
-    "- Optimizer: learning rate, batch size, momentum coefficient\n",
-    "\n",
-    "We concentrate on the choices that are directly related to optimization. In particular, we will explore some _automatic_ methods for setting the learning rate in a way that fixes the poor-conditioning problem and is robust across different problems.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 7: Adaptive Methods\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'Zr6r2kfmQUM'), ('Bilibili', 'BV1eq4y1W7JG')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Adaptive_Methods_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": "## Coding Exercise 7 *(optional)*: Implement RMSprop\n\nIn this exercise you will implement the update of the RMSprop optimizer:\n\n\\begin{align}\nv_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\nw_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n\\end{align}\n\nwhere the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n\nHere, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
-    "  \"\"\"\n",
-    "  Perform an RMSprop update on a collection of parameters\n",
-    "\n",
-    "  Args:\n",
-    "    loss: Tensor\n",
-    "      A scalar tensor containing the loss whose gradient will be computed\n",
-    "    params: Iterable\n",
-    "      Collection of parameters with respect to which we compute gradients\n",
-    "    grad_sq: Iterable\n",
-    "      Moving average of squared gradients\n",
-    "    lr: Float\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "    alpha: Float\n",
-    "      Moving average parameter\n",
-    "    epsilon: Float\n",
-    "      quotient for numerical stability\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
-    "  # successive backward calls\n",
-    "  zero_grad(params)\n",
-    "  # Compute gradients on given objective\n",
-    "  loss.backward()\n",
-    "\n",
-    "  with torch.no_grad():\n",
-    "    for (par, gsq) in zip(params, grad_sq):\n",
-    "      #################################################\n",
-    "      ## TODO for students: update the value of the parameter ##\n",
-    "      # Use gsq.data and par.grad\n",
-    "      raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
-    "      #################################################\n",
-    "      # Update estimate of gradient variance\n",
-    "      gsq.data = ...\n",
-    "      # Update parameters\n",
-    "      par.data -=  ...\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "set_seed(seed=SEED)\n",
-    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
-    "print('\\n The model3 parameters before the update are: \\n')\n",
-    "print_params(model3)\n",
-    "loss = loss_fn(model3(X), y)\n",
-    "# Initialize the moving average of squared gradients\n",
-    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
-    "\n",
-    "\n",
-    "\n",
-    "## Uncomment below to test your function\n",
-    "# rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
-    "# print('\\n The model3 parameters after the update are: \\n')\n",
-    "# print_params(model3)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "```\n",
-    " The model3 parameters after the update are:\n",
-    "\n",
-    "main.0.weight tensor([[-0.0240,  0.0031,  0.0193,  ...,  0.0316,  0.0297, -0.0198],\n",
-    "        [-0.0063, -0.0318, -0.0109,  ..., -0.0093,  0.0232, -0.0255],\n",
-    "        [ 0.0218, -0.0253,  0.0320,  ...,  0.0102,  0.0248, -0.0203],\n",
-    "        ...,\n",
-    "        [-0.0027,  0.0136,  0.0089,  ...,  0.0123, -0.0324, -0.0166],\n",
-    "        [ 0.0159,  0.0281,  0.0233,  ..., -0.0133, -0.0197,  0.0182],\n",
-    "        [ 0.0186, -0.0376, -0.0205,  ..., -0.0293,  0.0077, -0.0019]])\n",
-    "main.0.bias tensor([-0.0313, -0.0011,  0.0122, -0.0342,  0.0045,  0.0199,  0.0329,  0.0265,\n",
-    "         0.0182, -0.0041])\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove solution\n",
-    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
-    "  \"\"\"\n",
-    "  Perform an RMSprop update on a collection of parameters\n",
-    "\n",
-    "  Args:\n",
-    "    loss: Tensor\n",
-    "      A scalar tensor containing the loss whose gradient will be computed\n",
-    "    params: Iterable\n",
-    "      Collection of parameters with respect to which we compute gradients\n",
-    "    grad_sq: Iterable\n",
-    "      Moving average of squared gradients\n",
-    "    lr: Float\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "    alpha: Float\n",
-    "      Moving average parameter\n",
-    "    epsilon: Float\n",
-    "      quotient for numerical stability\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
-    "  # successive backward calls\n",
-    "  zero_grad(params)\n",
-    "  # Compute gradients on given objective\n",
-    "  loss.backward()\n",
-    "\n",
-    "  with torch.no_grad():\n",
-    "    for (par, gsq) in zip(params, grad_sq):\n",
-    "      # Update estimate of gradient variance\n",
-    "      gsq.data = alpha * gsq.data + (1 - alpha) * par.grad**2\n",
-    "      # Update parameters\n",
-    "      par.data -=  lr * (par.grad / (epsilon + gsq.data)**0.5)\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "set_seed(seed=SEED)\n",
-    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
-    "print('\\n The model3 parameters before the update are: \\n')\n",
-    "print_params(model3)\n",
-    "loss = loss_fn(model3(X), y)\n",
-    "# Initialize the moving average of squared gradients\n",
-    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
-    "\n",
-    "## Uncomment below to test your function\n",
-    "rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
-    "print('\\n The model3 parameters after the update are: \\n')\n",
-    "print_params(model3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Implement_RMSProp_Exercise\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Interactive Demo 7: Compare optimizers\n",
-    "\n",
-    "Below, we compare your implementations of **SGD**, **Momentum**, and **RMSprop**. If you have successfully coded all the exercises so far: congrats!\n",
-    "\n",
-    "You are now *in the know* of some of the most commonly used and powerful optimization tools for deep learning."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @markdown Execute this cell to enable the widget!\n",
-    "X, y = train_set.data, train_set.targets\n",
-    "\n",
-    "@widgets.interact_manual\n",
-    "def compare_optimizers(\n",
-    "    batch_size=(25, 250, 5),\n",
-    "    lr=widgets.FloatLogSlider(value=2e-3, min=-5, max=0),\n",
-    "    max_steps=(50, 500, 5)):\n",
-    "  \"\"\"\n",
-    "  Demonstration to compare optimisers - stochastic gradient descent, momentum, RMSprop\n",
-    "\n",
-    "  Args:\n",
-    "    batch_size: Tuple\n",
-    "      Size of minibatches\n",
-    "    lr: Float log slider instance\n",
-    "      Scalar specifying the learning rate or step-size for the update\n",
-    "    max_steps: Tuple\n",
-    "      Max number of step sizes for incrementing\n",
-    "\n",
-    "  Returns:\n",
-    "    Nothing\n",
-    "  \"\"\"\n",
-    "  SGD_DICT = [gradient_update, 'SGD', 'black', '-', {'lr': lr}]\n",
-    "  MOM_DICT = [momentum_update, 'Momentum', 'red', '--', {'lr': lr, 'beta': 0.9}]\n",
-    "  RMS_DICT = [rmsprop_update, 'RMSprop', 'fuchsia', '-', {'lr': lr, 'alpha': 0.8}]\n",
-    "\n",
-    "  ALL_DICTS = [SGD_DICT, MOM_DICT, RMS_DICT]\n",
-    "\n",
-    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
-    "\n",
-    "  LOSS_HIST = {}\n",
-    "\n",
-    "  for opt_dict in tqdm(ALL_DICTS):\n",
-    "    update_fn, opt_name, color, lstyle, kwargs = opt_dict\n",
-    "    LOSS_HIST[opt_name] = []\n",
-    "\n",
-    "    model = copy.deepcopy(base_model)\n",
-    "    params = list(model.parameters())\n",
-    "\n",
-    "    if opt_name != 'SGD':\n",
-    "      aux_tensors = [torch.zeros_like(_) for _ in params]\n",
-    "\n",
-    "    for step in range(max_steps):\n",
-    "      data, labels = sample_minibatch(X, y, batch_size)\n",
-    "      loss = loss_fn(model(data), labels)\n",
-    "      if opt_name == 'SGD':\n",
-    "        update_fn(loss, params, **kwargs)\n",
-    "      else:\n",
-    "        update_fn(loss, params, aux_tensors, **kwargs)\n",
-    "      LOSS_HIST[opt_name].append(loss.item())\n",
-    "\n",
-    "  fig, axs = plt.subplots(1, len(ALL_DICTS), figsize=(9, 3))\n",
-    "  for ax, optim_dict in zip(axs, ALL_DICTS):\n",
-    "    opt_name = optim_dict[1]\n",
-    "    ax.plot(range(max_steps), LOSS_HIST[opt_name], alpha=0.8)\n",
-    "    ax.set_title(opt_name)\n",
-    "    ax.set_xlabel('Iteration')\n",
-    "    ax.set_ylabel('Loss')\n",
-    "    ax.set_ylim(0, 2.5)\n",
-    "  plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Compare_optimizers_Interactive_Demo\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! 7.1: Compare optimizers\n",
-    "\n",
-    "Tune the three methods above - **SGD**, **Momentum**, and **RMSProp** - to make each excel and discuss your findings. How do the methods compare in terms of robustness to small changes of the hyperparameters? How easy was it to find a good hyperparameter configuration?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "Stochastic Gradient Descent (SGD): Performs updates one example at a time.\n",
-    "Momentum: Helps accelerate SGD in the relevant direction and dampens\n",
-    "oscillations, especially in ravines.\n",
-    "RMSProp: Allows each parameter to be updated at an 'appropriate' rate decided\n",
-    "based on magnitudes of past recent updates;\n",
-    "i.e., areas where the surface curves much more steeply in one dimension than\n",
-    "in another, which are common around local optima.\n",
-    "\n",
-    "Robustness: RMSProp > Momentum > SGD\n",
-    "Since, each example affects SGD by updating hyperparameters, it's not\n",
-    "considered very robust.\n",
-    "Adagrad greatly improved the robustness of SGD and is used for training\n",
-    "large-scale neural nets.\n",
-    "Momentum is quite robust: the momentum term increases for dimensions whose\n",
-    "gradients point in the same directions\n",
-    "and reduces updates for dimensions whose gradients change directions.\n",
-    "RMSProp is very robust; This combines the idea of only using the sign of\n",
-    "the gradient with the idea of adapting the step size separately\n",
-    "for each weight in a mini-batch.\n",
-    "\n",
-    "Generally, non-adaptive methods consistently produce more robust models\n",
-    "than adaptive methods. Refer https://arxiv.org/pdf/1911.03784.pdf - for more details\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Compare_optimizers_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "**Remarks:** Note that RMSprop allows us to use a 'per-dimension' learning rate _without having to tune one learning rate for each dimension **ourselves**_. The method uses information collected about the variance of the gradients throughout training to **adapt** the step size for each of the parameters automatically. The savings in tuning efforts of RMSprop over SGD or 'plain' momentum are undisputed on this task.\n",
-    "\n",
-    "Moreover, adaptive optimization methods are currently a highly active research domain, with many related algorithms like Adam, AMSgrad, Adagrad being used in practical applications and theoretically investigated."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Locality of Gradients\n",
-    "\n",
-    "As we've seen throughout this tutorial, poor conditioning can be a significant burden on convergence to an optimum while using gradient-based optimization. Of the methods we've seen to deal with this issue, notice how both momentum and adaptive learning rates incorporate past gradient values into their update schemes. Why do we use past values of our loss function's gradient while updating our current MLP weights?\n",
-    "\n",
-    "Recall from *W1D2* that the gradient of a function, $\\nabla f(w_t)$, is a **local** property and computes the direction of maximum change of $f(w_t)$ at the point $w_t$. However, when we train our MLP model we are hoping to find the **global** optimum for our training loss. By incorporating past values of our function's gradient into our optimization schemes, we use more information about the overall shape of our function than just a single gradient alone can provide."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! 7.2: Loss function and optimization\n",
-    "\n",
-    "Can you think of other ways we can incorporate more information about our loss function into our optimization schemes?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "We could consider incorporating the curvature of our function directly into our\n",
-    "optimization schemes. Methods that use this are often called Newton's methods\n",
-    "or Hessian based optimization methods.\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Loss_function_and_optimization_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 8: Ethical concerns\n",
-    "\n",
-    "*Time estimate: ~15mins*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 8: Ethical concerns\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', '0EthSI0cknI'), ('Bilibili', 'BV1TU4y1G7Je')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Ethical_concerns_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Summary\n",
-    "\n",
-    "* Optimization is necessary to create Deep Learning models that are guaranteed to converge\n",
-    "* Stochastic Gradient Descent and Momentum are two commonly used optimization techniques\n",
-    "* RMSProp is a way of adaptive hyperparameter tuning which utilises a per-dimension learning rate\n",
-    "* Poor choice of optimization objectives can lead to unforeseen, undesirable consequences\n",
-    "\n",
-    "If you have time left, you can read the Bonus material, where we put it all together and we compare our model with a benchmark model."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Bonus: Putting it all together\n",
-    "\n",
-    "*Time estimate: ~40 mins*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "We have progressively built a sophisticated optimization algorithm, which is able to deal with a non-convex, poorly conditioned problem concerning tens of thousands of training examples. Now we present _you_ with a small challenge: beat us! :P\n",
-    "\n",
-    "Your mission is to train an MLP model that can compete with a benchmark model which we have pre-trained for you. In this section you will be able to use the full Pytorch power: loading the data, defining the model, sampling minibatches as well as Pytorch's **optimizer implementations**.\n",
-    "\n",
-    "There is a big engineering component behind the design of optimizers and their implementation can sometimes become tricky. So unless you are directly doing research in optimization, it's recommended to use an implementation provided by a widely reviewed open-source library."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 9: Putting it all together\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'DP9c13vLiOM'), ('Bilibili', 'BV1MK4y1u7u2')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Putting_it_all_together_Bonus_Video\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Download parameters of the benchmark model\n",
-    "import requests\n",
-    "\n",
-    "fname = 'benchmark_model.pt'\n",
-    "url = \"https://osf.io/sj4e8/download\"\n",
-    "r = requests.get(url, allow_redirects=True)\n",
-    "with open(fname, 'wb') as fh:\n",
-    "  fh.write(r.content)\n",
-    "\n",
-    "# Load the benchmark model's parameters\n",
-    "DEVICE = set_device()\n",
-    "if DEVICE == \"cuda\":\n",
-    "  benchmark_state_dict = torch.load(fname)\n",
-    "else:\n",
-    "  benchmark_state_dict = torch.load(fname, map_location=torch.device('cpu'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# Create MLP object and update weights with those of saved model\n",
-    "benchmark_model = MLP(in_dim=784, out_dim=10,\n",
-    "                      hidden_dims=[200, 100, 50]).to(DEVICE)\n",
-    "benchmark_model.load_state_dict(benchmark_state_dict)\n",
-    "\n",
-    "\n",
-    "# Define helper function to evaluate models\n",
-    "def eval_model(model, data_loader, num_batches=np.inf, device='cpu'):\n",
-    "  \"\"\"\n",
-    "  To evaluate a given model\n",
-    "\n",
-    "  Args:\n",
-    "    model: nn.Module derived class\n",
-    "      The model which is to be evaluated\n",
-    "    data_loader: Iterable\n",
-    "      A configured dataloading utility\n",
-    "    num_batches: Integer\n",
-    "      Size of minibatches\n",
-    "    device: String\n",
-    "      Sets the device. CUDA if available, CPU otherwise\n",
-    "\n",
-    "  Returns:\n",
-    "    mean of log loss and mean of log accuracy\n",
-    "  \"\"\"\n",
-    "\n",
-    "  loss_log, acc_log = [], []\n",
-    "  model.to(device=device)\n",
-    "\n",
-    "  # We are just evaluating the model, no need to compute gradients\n",
-    "  with torch.no_grad():\n",
-    "    for batch_id, batch in enumerate(data_loader):\n",
-    "      # If we only evaluate a number of batches, stop after we reach that number\n",
-    "      if batch_id > num_batches:\n",
-    "        break\n",
-    "      # Extract minibatch data\n",
-    "      data, labels = batch[0].to(device), batch[1].to(device)\n",
-    "      # Evaluate model and loss on minibatch\n",
-    "      preds = model(data)\n",
-    "      loss_log.append(loss_fn(preds, labels).item())\n",
-    "      acc_log.append(torch.mean(1. * (preds.argmax(dim=1) == labels)).item())\n",
-    "\n",
-    "  return np.mean(loss_log), np.mean(acc_log)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "We define an optimizer in the following steps:\n",
-    "\n",
-    "1. Load the corresponding class that implements the parameter updates and other internal management activities, including:\n",
-    "    - create auxiliary variables,\n",
-    "    - update moving averages,\n",
-    "    - adjust the learning rate.\n",
-    "2. Pass the parameters of the Pytorch model that the optimizer has control over. Note that different optimizers can potentially control different parameter groups.\n",
-    "3. Specify hyperparameters, including learning rate, momentum, moving average factors, etc.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Exercise Bonus: Train your own model\n",
-    "\n",
-    "Now, train the model with your preferred optimizer and find a good combination of hyperparameter settings."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "#################################################\n",
-    "## TODO for students: adjust training settings ##\n",
-    "\n",
-    "# The three parameters below are in your full control\n",
-    "MAX_EPOCHS = 2  # select number of epochs to train\n",
-    "LR = 1e-5  # choose the step size\n",
-    "BATCH_SIZE = 64  # number of examples per minibatch\n",
-    "\n",
-    "# Define the model and associated optimizer -- you may change its architecture!\n",
-    "my_model = MLP(in_dim=784, out_dim=10, hidden_dims=[200, 100, 50]).to(DEVICE)\n",
-    "\n",
-    "# You can take your pick from many different optimizers\n",
-    "# Check the optimizer documentation and hyperparameter meaning before using!\n",
-    "# More details on Pytorch optimizers: https://pytorch.org/docs/stable/optim.html\n",
-    "# optimizer = torch.optim.SGD(my_model.parameters(), lr=LR, momentum=0.9)\n",
-    "# optimizer = torch.optim.RMSprop(my_model.parameters(), lr=LR, alpha=0.99)\n",
-    "# optimizer = torch.optim.Adagrad(my_model.parameters(), lr=LR)\n",
-    "optimizer = torch.optim.Adam(my_model.parameters(), lr=LR)\n",
-    "#################################################"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "set_seed(seed=SEED)\n",
-    "# Print training stats every LOG_FREQ minibatches\n",
-    "LOG_FREQ = 200\n",
-    "# Frequency for evaluating the validation metrics\n",
-    "VAL_FREQ = 200\n",
-    "# Load data using a Pytorch Dataset\n",
-    "train_set_orig, test_set_orig = load_mnist_data(change_tensors=False)\n",
-    "\n",
-    "# We separate 10,000 training samples to create a validation set\n",
-    "train_set_orig, val_set_orig = torch.utils.data.random_split(train_set_orig, [50000, 10000])\n",
-    "\n",
-    "# Create the corresponding DataLoaders for training and test\n",
-    "g_seed = torch.Generator()\n",
-    "g_seed.manual_seed(SEED)\n",
-    "\n",
-    "train_loader = torch.utils.data.DataLoader(train_set_orig,\n",
-    "                                           shuffle=True,\n",
-    "                                           batch_size=BATCH_SIZE,\n",
-    "                                           num_workers=2,\n",
-    "                                           worker_init_fn=seed_worker,\n",
-    "                                           generator=g_seed)\n",
-    "val_loader = torch.utils.data.DataLoader(val_set_orig,\n",
-    "                                         shuffle=True,\n",
-    "                                         batch_size=256,\n",
-    "                                         num_workers=2,\n",
-    "                                         worker_init_fn=seed_worker,\n",
-    "                                         generator=g_seed)\n",
-    "test_loader = torch.utils.data.DataLoader(test_set_orig,\n",
-    "                                          batch_size=256,\n",
-    "                                          num_workers=2,\n",
-    "                                          worker_init_fn=seed_worker,\n",
-    "                                          generator=g_seed)\n",
-    "\n",
-    "# Run training\n",
-    "metrics = {'train_loss':[],\n",
-    "           'train_acc':[],\n",
-    "           'val_loss':[],\n",
-    "           'val_acc':[],\n",
-    "           'val_idx':[]}\n",
-    "\n",
-    "step_idx = 0\n",
-    "for epoch in tqdm(range(MAX_EPOCHS)):\n",
-    "\n",
-    "  running_loss, running_acc = 0., 0.\n",
-    "\n",
-    "  for batch_id, batch in enumerate(train_loader):\n",
-    "    step_idx += 1\n",
-    "    # Extract minibatch data and labels\n",
-    "    data, labels = batch[0].to(DEVICE), batch[1].to(DEVICE)\n",
-    "    # Just like before, refresh gradient accumulators.\n",
-    "    # Note that this is now a method of the optimizer.\n",
-    "    optimizer.zero_grad()\n",
-    "    # Evaluate model and loss on minibatch\n",
-    "    preds = my_model(data)\n",
-    "    loss = loss_fn(preds, labels)\n",
-    "    acc = torch.mean(1.0 * (preds.argmax(dim=1) == labels))\n",
-    "    # Compute gradients\n",
-    "    loss.backward()\n",
-    "    # Update parameters\n",
-    "    # Note how all the magic in the update of the parameters is encapsulated by\n",
-    "    # the optimizer class.\n",
-    "    optimizer.step()\n",
-    "    # Log metrics for plotting\n",
-    "    metrics['train_loss'].append(loss.cpu().item())\n",
-    "    metrics['train_acc'].append(acc.cpu().item())\n",
-    "\n",
-    "    if batch_id % VAL_FREQ == (VAL_FREQ - 1):\n",
-    "      # Get an estimate of the validation accuracy with 100 batches\n",
-    "      val_loss, val_acc = eval_model(my_model, val_loader,\n",
-    "                                     num_batches=100,\n",
-    "                                     device=DEVICE)\n",
-    "      metrics['val_idx'].append(step_idx)\n",
-    "      metrics['val_loss'].append(val_loss)\n",
-    "      metrics['val_acc'].append(val_acc)\n",
-    "\n",
-    "      print(f\"[VALID] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
-    "            f\"Loss: {val_loss:.3f} - Acc: {100*val_acc:.3f}%\")\n",
-    "\n",
-    "    # print statistics\n",
-    "    running_loss += loss.cpu().item()\n",
-    "    running_acc += acc.cpu().item()\n",
-    "    # Print every LOG_FREQ minibatches\n",
-    "    if batch_id % LOG_FREQ == (LOG_FREQ-1):\n",
-    "      print(f\"[TRAIN] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
-    "            f\"Loss: {running_loss / LOG_FREQ:.3f} - \"\n",
-    "            f\"Acc: {100 * running_acc / LOG_FREQ:.3f}%\")\n",
-    "\n",
-    "      running_loss, running_acc = 0., 0."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n",
-    "\n",
-    "ax[0].plot(range(len(metrics['train_loss'])), metrics['train_loss'],\n",
-    "           alpha=0.8, label='Train')\n",
-    "ax[0].plot(metrics['val_idx'], metrics['val_loss'], label='Valid')\n",
-    "ax[0].set_xlabel('Iteration')\n",
-    "ax[0].set_ylabel('Loss')\n",
-    "ax[0].legend()\n",
-    "\n",
-    "ax[1].plot(range(len(metrics['train_acc'])), metrics['train_acc'],\n",
-    "           alpha=0.8, label='Train')\n",
-    "ax[1].plot(metrics['val_idx'], metrics['val_acc'], label='Valid')\n",
-    "ax[1].set_xlabel('Iteration')\n",
-    "ax[1].set_ylabel('Accuracy')\n",
-    "ax[1].legend()\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Train_your_own_model_Bonus_Exercise\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! Bonus: Metrics\n",
-    "\n",
-    "Which metric did you optimize when searching for the right configuration? The training set loss? Accuracy? Validation/test set metrics? Why? Discuss!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# to_remove explanation\n",
-    "\n",
-    "\"\"\"\n",
-    "  Remember the discussion in Section 1 about surrogate objectives.\n",
-    "Our optimization methods minimize the loss, but at the end of the day we care about test accuracy.\n",
-    "\n",
-    "  However, we can't directly optimize for test accuracy and the finite size of our\n",
-    "datasets leads us to (cross-)validation:\n",
-    "\n",
-    "  1. We minimize the loss (empirical risk minimization) on our *training set*.\n",
-    "  2. We choose models and hyperparameters on the *validation set*.\n",
-    "  3. We use the *test set* in order to report the final performance of our model on unseen data.\n",
-    "\"\"\";"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Metrics_Bonus_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "### Evaluation\n",
-    "\n",
-    "We _finally_ can evaluate and compare the performance of the models on previously unseen examples.\n",
-    "\n",
-    "Which model would you keep? (\\*drum roll*)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "print('Your model...')\n",
-    "train_loss, train_accuracy = eval_model(my_model, train_loader, device=DEVICE)\n",
-    "test_loss, test_accuracy = eval_model(my_model, test_loader, device=DEVICE)\n",
-    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
-    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')\n",
-    "\n",
-    "print('\\nBenchmark model')\n",
-    "train_loss, train_accuracy = eval_model(benchmark_model, train_loader, device=DEVICE)\n",
-    "test_loss, test_accuracy = eval_model(benchmark_model, test_loader, device=DEVICE)\n",
-    "print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
-    "print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')"
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "collapsed_sections": [],
-   "include_colab_link": true,
-   "name": "W1D4_Tutorial1",
-   "provenance": [],
-   "toc_visible": true
-  },
-  "kernel": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.11"
-  },
-  "toc-autonumbering": true,
-  "varInspector": {
-   "cols": {
-    "lenName": 16,
-    "lenType": 16,
-    "lenVar": 40
-   },
-   "kernels_config": {
-    "python": {
-     "delete_cmd_postfix": "",
-     "delete_cmd_prefix": "del ",
-     "library": "var_list.py",
-     "varRefreshCmd": "print(var_dic_list())"
-    },
-    "r": {
-     "delete_cmd_postfix": ") ",
-     "delete_cmd_prefix": "rm(",
-     "library": "var_list.r",
-     "varRefreshCmd": "cat(var_dic_list()) "
-    }
-   },
-   "types_to_exclude": [
-    "module",
-    "function",
-    "builtin_function_or_method",
-    "instance",
-    "_Feature"
-   ],
-   "window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
deleted file mode 100644
index 595e8b789..000000000
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb
+++ /dev/null
@@ -1,1403 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "execution": {},
-    "id": "view-in-github"
-   },
-   "source": [
-    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D4_Optimization/W1D4_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "# Tutorial 2: Deep Learning Case Study 1: Cost Functions\n",
-    "\n",
-    "**Week 1, Day 4: Optimization**\n",
-    "\n",
-    "**By Neuromatch Academy**\n",
-    "\n",
-    "\n",
-    "__Content creators:__ Konrad Kording, Lyle Ungar, Ashish Sahoo\n",
-    "\n",
-    "__Content reviewers:__ Kelson Shilling-Scrivo\n",
-    "\n",
-    "__Content editors:__ Kelson Shilling-Scrivo\n",
-    "\n",
-    "__Production editors:__ Gagana B, Spiros Chavlis\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Tutorial Objectives\n",
-    "\n",
-    "In this tutorial, you will practice thinking like a deep learning practitioner and determine how to design cost functions for different scenarios.\n",
-    "\n",
-    "By the end of this tutorial, you will be better able to:\n",
-    "\n",
-    "* Appreciate the importance of cost function engineering\n",
-    "* Translate domain knowledge into cost functions\n",
-    "* Ask questions about DL systems and customer needs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Tutorial slides\n",
-    "from IPython.display import IFrame\n",
-    "link_id = \"szcjn\"\n",
-    "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n",
-    "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and import feedback gadget\n",
-    "\n",
-    "!pip3 install vibecheck datatops --quiet\n",
-    "\n",
-    "from vibecheck import DatatopsContentReviewContainer\n",
-    "def content_review(notebook_section: str):\n",
-    "    return DatatopsContentReviewContainer(\n",
-    "        \"\",  # No text prompt\n",
-    "        notebook_section,\n",
-    "        {\n",
-    "            \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
-    "            \"name\": \"neuromatch_dl\",\n",
-    "            \"user_key\": \"f379rz8y\",\n",
-    "        },\n",
-    "    ).render()\n",
-    "\n",
-    "\n",
-    "feedback_prefix = \"W1D4_T2\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form"
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 1: Intro to Deep Learning Case Study\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 1: Intro to DL Case Study\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'iEqd0MY5pxI'), ('Bilibili', 'BV1hL4y1P73s')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Intro_to_DL_Case_Study_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "This tutorial is a bit different from others - there will be no coding! Instead you will watch a series of vignettes about various scenarios where you want to use a neural network. This tutorial will focus on cost functions; a tutorial you will see later in the course will be similar but focused on designing architectures.\n",
-    "\n",
-    "Each section below will start with a vignette where either Lyle or Konrad is trying to figure out how to set up a neural network for a specific problem. Try to think of questions you want to ask them as you watch, then pay attention to what questions Lyle and Konrad are asking. Were they what you would have asked? How do their questions help quickly clarify the situation?\n",
-    "\n",
-    "\n",
-    "You will work together as a group to try to come up with cost functions for each example, with hints available along the way. This may be difficult - deep learning in the real world often is! So try your best but don't get discouraged if you don't reach the solution - you'll learn a lot from the process of trying to.\n",
-    "\n",
-    "You have already seen cost functions (sometimes also called objective functions or loss functions) for deep neural networks - you need one to perform gradient descent and train a neural network.  It turns out what cost function you choose to minimize is incredibly important - it is how you define success of your network after all, so you want to define success in a good way! And cost functions are not one size fits all - you need to carefully choose cost functions according to what you want your neural network to do - as you will see in the following scenarios.\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 2: Cost function for neurons\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 2: Spiking Neuron Predictions Vignette\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'CC4gMRrE31g'), ('Bilibili', 'BV1Jt4y187UU')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Spiking_Neuron_Predictions_Video\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 3: Spiking Neuron Predictions Set-up\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'vJ7MixhmDh8'), ('Bilibili', 'BV1X94y1y7SH')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Spiking_Neuron_Predictions_SetUp_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Konrad, a neuroscientist, wants to predict what neurons in someone's motor cortex are doing while they are riding a motorcycle.\n",
-    "\n",
-    "Upon discussion with Lyle, it emerges that we have data on 12 parameters of motorcycle riding, including acceleration, angle, braking, and degrees of leaning. These inputs are fairly smooth over time; for example, the angle of the motorcycle typically does not change much in 100 ms.\n",
-    "\n",
-    "We also have recorded data on the timing of spikes of $N$ neurons in motor cortex. The underlying firing rate is smooth but every millisecond spikes are random and independent. This means we can assume that the number of spikes in a short interval can be modeled using a Poisson distribution with an underlying firing rate for that interval $\\lambda$.\n",
-    "\n",
-    "For neuron $i$, the probability of seeing $k_{i}$ spikes in some interval given an underlying firing rate $\\lambda_{i}$ is:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\mathcal{f(k_{i}:\\lambda_{i})} = \\mathcal{Pr}(X=k_{i}) = \\frac {\\lambda_{i}^{k_{i}}e^{-\\lambda_{i}}}{k_{i}!}\n",
-    "\\end{equation}\n",
-    "\n",
-    "So this Poisson distribution may be relevant if we want to, in a way, have a good model for the spiking of neurons."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! 1: Designing a cost function to predict neural activities\n",
-    "\n",
-    "Given everything you know, how would you design a cost function for a neural network that Konrad is training to predict neural activity given the motorcycle riding parameters? Remember that we are predicting the activity of all $N$ neurons, not just one. Try to write out an equation!\n",
-    "\n",
-    "\n",
-    "Please discuss as a group. If you get stuck, you can uncover the hints below one at a time. Please spend some time discussing before uncovering the next hint though! You are being real deep learning scientists now and the answers won't be easy.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 1 </font></summary>\n",
-    "\n",
-    "You get time-stamps for the spikes. You will want to do binning into 50 ms bins. You get $k_{i, t}$ for every neuron $i$ and time bin $t$, the spike count for that neuron in that time bin.  What will the neural network predict?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 2 </font></summary>\n",
-    "\n",
-    "For each bin you can use your neural network model to predict an estimate of $\\lambda_{i,t}$, the number of spikes for neuron $i$ expected at that time bin $t$. The network should get as input the relevant aspects of the motorcycle riding at the relevant times (and potentially of the previous times)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 3 </font></summary>\n",
-    "\n",
-    "You need an equation relating $\\lambda_{i,t}$ (the model prediction) with $k_{i, t}$ (your data) where changing $\\lambda_{i,t}$  to minimize or maximize the number resulting from this equation results in better predictions.  What do we already know about the relationship between $\\lambda_{i,t}$ and $k_{i, t}$ that helps us here?\n",
-    "\n",
-    "Once you have that, how do you extend to incorporate all neurons and time bins?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 4 </font></summary>\n",
-    "\n",
-    "We can treat the bins independently as the spikes are random and independent every millisecond."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for the solution </font></summary>\n",
-    "\n",
-    "\n",
-    "First, we will convert our spike timing data to the number of spikes per time bin for time bins of size 50 ms. This gives us $k_{i,t}$ for every neuron $i$ and time bin $t$.\n",
-    "\n",
-    "We are assuming a Poisson distribution for our spiking. That means that we get the probability of seeing spike count $k_{i, t}$  given underlying firing rate $\\lambda_{i, t}$ using this equation:\n",
-    "\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\mathcal{f(k_{i,t}:\\lambda_{i,t})} = \\mathcal{Pr}(X=k_{i,t}) = \\frac {\\lambda_{i,t}^{k_{i,t}}e^{-\\lambda_{i,t}}}{k_{i,t}!}\n",
-    "\\end{equation}\n",
-    "\n",
-    "That seems a pretty good thing to optimize to make our predictions as good as possible! We want a high probability of seeing the actual spike count we recorded given the neural network prediction of the underlying firing rate.\n",
-    "\n",
-    "We will make this negative later so we have an equation that we want to minimize rather than maximize, so we can use all our normal tricks for minimization (instead of maximization). First though, let's scale up to include all our neurons and time bins.\n",
-    "\n",
-    "We can treat each time bin as independent because, while the underlying probability of firing changes slowly, every millisecond spiking is random and independent. From probability, we know that we can compute the probability of a set of independent events (all the spike counts) by multiplying the probabilities of each event. So the probability of seeing all of our data given the neural network predictions is all of our probabilities of $k_{i,t}$ multiplied together:\n",
-    "\n",
-    "\\begin{align}\n",
-    "\\mathcal{Pr}(\\text{all_data}) &= \\prod_{i=1}^{N}\\prod_{t=1}^{T} \\mathcal{Pr}(X=k_{i,t})\\\\\n",
-    "&= \\prod_{i=1}^{N}\\prod_{t=1}^{T} \\frac {\\lambda_{i,t}^{k_{i,t}}e^{-\\lambda_{i,t}}}{k_{i,t}!}\n",
-    "\\end{align}\n",
-    "\n",
-    "This is also known as our likelihood!\n",
-    "\n",
-    "We usually use the log likelihood instead of the likelihood when minimizing or maximizing for numerical computation reasons. We can convert the above equation to log likelihood:\n",
-    "\n",
-    "\\begin{align}\n",
-    "\\text{log likelihood} &= \\sum_{i=1}^N\\sum_{t=1}^{T} \\text{log}(\\mathcal{Pr}(X=k_{i,t})) \\\\\n",
-    "&= \\sum_{i=1}^N\\sum_{t=1}^{T} k_{i,t} \\text{log}(\\lambda_{i,t}) - \\lambda_{i,t} - \\text{log}(k_{i,t}!)\n",
-    "\\end{align}\n",
-    "\n",
-    "And last but not least, we want to make it negative so we can minimize instead of maximize:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\text{negative log likelihood}\n",
-    "= \\sum_{i=1}^N\\sum_{t=1}^{T} - k_{i,t} \\text{log}(\\lambda_{i,t}) + \\lambda_{i,t} + \\text{log}(k_{i,t}!)\n",
-    "\\end{equation}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Designing_a_cost_function_to_predict_neural_activities_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 4: Spiking Neurons Wrap-up\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'fb6A03B2U5g'), ('Bilibili', 'BV1K94y117rH')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Spiking_Neurons_WrapUp_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Check out the papers mentioned in the above video:\n",
-    "\n",
-    "- [Fast inference in generalized linear models via expected log likelihood](https://link.springer.com/article/10.1007/s10827-013-0466-4)\n",
-    "\n",
-    "- [Machine Learning for Neural Decoding](https://www.eneuro.org/content/7/4/ENEURO.0506-19.2020)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## (Bonus) Think!: Non-Poisson neurons\n",
-    "\n",
-    "If you have time, discuss the following. The spiking distributions don't seem quite Poisson. Find a good replacement for your cost function."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_NonPoisson_neurons_Bonus_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 3: How can an ANN know its uncertainty"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 5: ANN Uncertainty Vignette\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'b2N2OJ2u4AM'), ('Bilibili', 'BV1UN4y1u7Ws')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_Vignette_Video\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 6: ANN Uncertainty Set-up\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'Reh-gNiOwkQ'), ('Bilibili', 'BV1B34y1W7F8')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_SetUp_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Lyle wants to build an artificial neural network that has a measure of its own uncertainty about its predictions. He wants the neural network to give a prediction/estimate and an uncertainty, or standard deviation, measurement on it.\n",
-    "\n",
-    "Let's say Lyle wants to estimate the location of an atom in a chemical molecule based on various inputs. He wants to have the estimate of the location and an estimate of the variance. We don't train neural networks on one data point at a time though - he wants a cost function that takes in N data points (input and atom location pairings).\n",
-    "\n",
-    "We think we may be able to use a Gaussian distribution to help Lyle here:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "g(x) = \\frac{1}{\\sigma\\sqrt{2\\pi}} \\text{exp} \\left( -\\frac{1}{2}\\frac{(x-\\mu)^2}{\\sigma^2} \\right)\n",
-    "\\end{equation}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! 2: Designing a cost function so we measure uncertainty\n",
-    "\n",
-    "Given everything you know, how would you design a cost function for a neural network that Lyle is training so that he can get the estimate and the uncertainty of the estimate? Try to write out an equation!\n",
-    "\n",
-    "Please discuss as a group. If you get stuck, you can uncover the hints below one at a time. Please spend some time discussing before uncovering the next hint, though! You are being real deep learning scientists now, and the answers won't be easy."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 1 </font></summary>\n",
-    "\n",
-    "Look at the Gaussian equation. What is the true location? Where is there the estimate of location? Where is there the uncertainty?\n",
-    "\n",
-    "What do you want the neural network to predict for one data point (recorded location) given the inputs?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 2 </font></summary>\n",
-    "\n",
-    "What did you learn from working through Section 2 that you can use here?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 3 </font></summary>\n",
-    "\n",
-    "In section 2, you learned that you want to go from probabilities to negative log likelihoods to form cost functions."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for the solution </font></summary>\n",
-    "\n",
-    "For a given set of inputs, we want the neural network to predict the location of the atom and the uncertainty of that estimate. Standard deviation is a great measure of uncertainty so we can predict the mean and standard deviation of the location (instead of just the mean as is more common).\n",
-    "\n",
-    "So how do we a design a cost function that involves the mean and standard deviation?  We can assume a Gaussian distribution over the location. The neural network can predict the mean of that Gaussian (that's the estimate of the location) and the standard deviation of that Gaussian (that's the uncertainty measure) for a given set of inputs.\n",
-    "\n",
-    "Now that we've got that figured out, we can take a very similar approach to what we did in Section 2 with spiking neurons. For a given data point $i$, the neural network predicts the mean ($\\mu_i$) and standard deviation ($\\sigma_i$) of the location given the inputs. We can then compute the probability of seeing the actual recorded location ($x_i$) given these predictions:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "g(x) = \\frac{1}{\\sigma\\sqrt{2\\pi}} \\text{exp}\\left( -\\frac{1}{2}\\frac{(x_i-\\mu_i)^2}{\\sigma_i^2} \\right)\n",
-    "\\end{equation}\n",
-    "\n",
-    "The location of the atom is independent in each data point so we can get the overall likelihood by multiplying the probabilities for the individual data points.\n",
-    "\\begin{equation}\n",
-    "\\text{likelihood} = \\prod_{i=1}^N\\frac{1}{\\sigma\\sqrt{2\\pi}} \\text{exp}\\left( -\\frac{1}{2}\\frac{(x_i-\\mu_i)^2}{\\sigma_i^2} \\right)\n",
-    "\\end{equation}\n",
-    "\n",
-    "\n",
-    "And, as before, we want to take the log of this for numerical reasons and convert to negative log likelihood:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\text{negative log likelihood} = \\sum_{i=1}^N \\text{log} \\left( \\frac{1}{\\sigma\\sqrt{2\\pi}} \\text{exp}\\left( -\\frac{1}{2}\\frac{(x_i-\\mu_i)^2}{\\sigma_i^2} \\right) \\right)\n",
-    "\\end{equation}\n",
-    "\n",
-    "Changing the parameters of the neural network so it predicts $\\mu_i$ and $\\sigma_i$ that minimize this equation will give us (hopefully fairly accurate) predictions of the location and the network uncertainty about the location!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 7: ANN Uncertainty Wrap-up\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'QBKAFRaC8SY'), ('Bilibili', 'BV1zv4y1M7C8')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_WrapUp_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Check out the papers mentioned in the above video:\n",
-    "\n",
-    "- [Rapid prediction of NMR spectral properties with quantified uncertainty](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0374-3)\n",
-    "\n",
-    "- [Deep imitation learning for molecular inverse problems](https://papers.nips.cc/paper/2019/file/b0bef4c9a6e50d43880191492d4fc827-Paper.pdf)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## (Bonus) Think!: Negative standard deviations\n",
-    "\n",
-    "If the standard deviation is negative, the negative log-likelihood will fail as you'd take the log of a negative number. What should we do to ensure we don't run into this while training our neural network?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Negative_standard_deviations_Bonus_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Section 4: Embedding faces"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 8: Embedding Faces Vignette\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'tF0iYBAnyrI'), ('Bilibili', 'BV1NY411K7f6')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Embedding_Faces_Vignette_Video\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 9: Embedding Faces Set-up\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'JrzicfOxqP0'), ('Bilibili', 'BV1fv4y1M7eQ')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Embedding_Faces_SetUp_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Konrad needs help recognizing faces. He wants to build a network that embeds photos of faces so that photos of the same person are nearby in the embedding space and photos of different people are far in the embedding space. We can't just use pixel space because the pixels will be very different between a photo of someone straight on vs. from their side!\n",
-    "\n",
-    "We will use a neural network to go from the pixels of each image to an embedding space. Let's say you have a convolutional neural network with m units in the last layer. If you feed a face photo $i$ through the CNN, the activities of the units in the last layer form an $m$ dimensional vector $\\bar{y}_i$ - this is an embedding of that face photo in $m$ dimensional space.\n",
-    "\n",
-    "We think we might be able to incorporate Euclidean distance to help us here. The Euclidean distance between two vectors is:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "d(\\bar{y}_i, \\bar{y}_j) = \\sqrt{\\sum_{c=1}^m(\\bar{y}_{i_c} - \\bar{y}_{j_c})^2}\n",
-    "\\end{equation}\n",
-    "\n",
-    "<br>\n",
-    "\n",
-    "**Note:** a minor remark here, there is an indexing error in the video where it says $i$ instead of $j$."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "## Think! 3: Designing a cost function for face embedding\n",
-    "\n",
-    "Given everything you know, how would you design a cost function for a neural network that Konrad is training so that he can get a helpful embedding of faces? Try to write out an equation!\n",
-    "\n",
-    "Please discuss as a group. If you get stuck, you can uncover the hints below one at a time. Please spend some time discussing before uncovering the next hint, though! You are being real deep learning scientists now, and the answers won't be easy."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 1 </font></summary>\n",
-    "\n",
-    "How do we want to deal with the same faces? Can we just build a cost function based on similar faces? What would happen?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 2 </font></summary>\n",
-    "\n",
-    "You need to also include different faces. How do you want to deal with different faces?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for hint 3 </font></summary>\n",
-    "\n",
-    "Similar faces should have low Euclidean distance between their embeddings. Different faces should have high Euclidean distance between their embeddings. Can we phrase this with 3 faces?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "<details>\n",
-    "<summary> <font color='green'>Click here for the solution </font></summary>\n",
-    "\n",
-    "We want the same faces to have similar embeddings. Let's say we have one photo of Lyle $a$ and another photo of Lyle $p$. We want the embeddings of those photos to be very similar: we want the Euclidean distance between $\\bar{y}_a$ and $\\bar{y}_p$ (the activitys of the last layer of the CNN when photo $a$ and $p$ are fed through) to be small.\n",
-    "\n",
-    "So one possible cost function is:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\text{Cost function} = d(\\bar{y}_a, \\bar{y}_p)\n",
-    "\\end{equation}\n",
-    "\n",
-    "Imagine if we just feed in pairs of the same face and minimize that though. There would be no motivation to ever have different embeddings, we would be only minimizing the distance between embeddings. If the CNN was smart, it would just have the same embedding for every single photo - then the cost function would equal 0!\n",
-    "\n",
-    "This is clearly not what we want. We want to motivate the CNN to have similar embeddings only when the faces are the same. This means we need to also train it to maximize distance when the faces are different.\n",
-    "\n",
-    "We could choose another two photos of different people and maximize that distance but then there's no relation to the embeddings we've already established of the two photos of Lyle.  Instead, we will add one more photo to the mix: a photo of Konrad $n$. We want the distance of this photo to be far from our original photos of Lyle $a$ and $p$.  So we want the distance between $a$ and $p$ to be small and the distance between $a$ and $n$ for example to be large:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\text{Cost function} = d(\\bar{y}_a, \\bar{y}_p) - d(\\bar{y}_a, \\bar{y}_n)\n",
-    "\\end{equation}\n",
-    "\n",
-    "We could compare $n$ to both $a$ and $p$:\n",
-    "\\begin{equation}\n",
-    "\\text{Cost function} = d(\\bar{y}_a, \\bar{y}_p) - d(\\bar{y}_a, \\bar{y}_n) - d(\\bar{y}_p, \\bar{y}_n)\n",
-    "\\end{equation}\n",
-    "\n",
-    "But then the cost function is a bit unbalanced, there are two dissimiliarty terms and they might dominate (so achieving the similarity is less important). So let's go with just including one dissimilarity term.\n",
-    "\n",
-    "This is an established cost function - triplet loss! We chose the subscripts $a$, $p$, and $n$ for a reason: we have an anchor image, a positive image (the same person's face as the anchor) and a negative image (a different person's face as the anchor). We can then sum over N data points where each data point is a set of three images:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\text{Cost function} = \\sum_{i=1}^N [d(\\bar{y}_{a, i}, \\bar{y}_{p, i}) - d(\\bar{y}_{a, i}, \\bar{y}_{n, i})]\n",
-    "\\end{equation}\n",
-    "\n",
-    "There's one little addition in triplet loss. Instead of just using the above cost function, researchers add a constant $\\alpha$ and then make the cost function 0 if it becomes negative. Why do you think they do this?\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\text{Cost function} = \\text{max} \\left( \\sum_{i=1}^N \\left[ d(\\bar{y}_{a, i}, \\bar{y}_{p, i}) - d(\\bar{y}_{a, i}, \\bar{y}_{n, i}) + \\alpha \\right], 0 \\right)\n",
-    "\\end{equation}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Embedding_Faces_Discussion\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Video 10: Embedding Faces Wrap-up\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'mVk1W7x6Nps'), ('Bilibili', 'BV1nf4y1f7oL')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "execution": {}
-   },
-   "outputs": [],
-   "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Embedding_Faces_WrapUp_Video\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "Check out the papers mentioned in the above video:\n",
-    "\n",
-    "- [Large Scale Online Learning of Image Similarity Through Ranking](https://www.jmlr.org/papers/volume11/chechik10a/chechik10a.pdf)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
-   "source": [
-    "---\n",
-    "# Summary\n",
-    "\n",
-    "Today we have seen a range of different cost functions. So we want to dwell a bit on what we want people to take away from these exercises. We have seen several cost functions:\n",
-    "\n",
-    "* Log Poisson likelihood for neurons\n",
-    "* Uncertainty as a modeled entity\n",
-    "* Face embeddings\n",
-    "\n",
-    "What we saw in all these cases is that these cost functions emerge from insights into the problem domain. We saw how one needs to, in a way, pull these insights out of the domain experts. And how, at the same time, the cost functions come from computational insights. Coming up with the proper cost functions requires listening to what domain experts say and probing the things they may mean but not say."
-   ]
-  }
- ],
- "metadata": {
-  "colab": {
-   "collapsed_sections": [],
-   "include_colab_link": true,
-   "name": "W2D2_Tutorial2",
-   "provenance": [],
-   "toc_visible": true
-  },
-  "kernel": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file

From fd45c6120f63b3ce66343452007cd949b4e2751e Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Fri, 1 May 2026 18:43:22 -0400
Subject: [PATCH 15/34] update the content page

---
 tutorials/materials.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tutorials/materials.yml b/tutorials/materials.yml
index 47ff6c7d1..59b21a5dc 100644
--- a/tutorials/materials.yml
+++ b/tutorials/materials.yml
@@ -44,9 +44,7 @@
   slides:
   - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/ft2sz/?direct%26mode=render%26action=download%26mode=render
     title: Tutorial 1
-  - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/szcjn/?direct%26mode=render%26action=download%26mode=render
-    title: Tutorial 2
-  tutorials: 2
+  tutorials: 1
 
 - day: W2D1
   category: Fine Tuning

From f785c3fc3892cb8ea188e4d8f5b28ae8620b7331 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sat, 9 May 2026 22:00:25 -0400
Subject: [PATCH 16/34] W1D1: replace Meet our content creators with Meet our
 day leads

Restore upstream lecturer list (organized by week) and rename section
from "Meet our content creators" to "Meet our day leads".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_Tutorial1.ipynb                      | 52 +------------------
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index a1065eb3d..bb4e4f5fa 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -4533,57 +4533,7 @@
    "metadata": {
     "execution": {}
    },
-   "source": [
-    "## Meet our content creators\n",
-    "\n",
-    "* [Alish Dipani](https://alishdipani.github.io/) \n",
-    "* [Alexander Ecker](https://eckerlab.org/) \n",
-    "* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/) \n",
-    "* [Andrew Saxe](https://www.saxelab.org/) \n",
-    "* Arash Ash\n",
-    "* [Arna Ghosh](https://arnaghosh.github.io/) \n",
-    "* Bikram Khastgir\n",
-    "* [Binxu Wang](https://animadversio.github.io/) \n",
-    "* [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
-    "* [Colleen Gillon](https://colleenjg.github.io/)\n",
-    "* Dawn Estes McKnight\n",
-    "* [Egor Zverev](https://egozverev.github.io/)\n",
-    "* [He He](https://hhexiy.github.io/) \n",
-    "* [Ioannis Mitliagkas](http://mitliagkas.github.io/)\n",
-    "* [Jose Gallego-Posada](https://gallego-posada.github.io/) \n",
-    "* [Jordan Matelsky](https://jordan.matelsky.com/) \n",
-    "* Kevin Machado Gamboa\n",
-    "* [Konrad Kording](https://kordinglab.com) *\n",
-    "* Kushaan Gupta\n",
-    "* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/) \n",
-    "* [Mandana Samiei](https://mandanasmi.github.io/) \n",
-    "* Matthew Sargent\n",
-    "* Mohitrajhu Lingan Kumaraian\n",
-    "* [Pablo Samuel Castro](https://psc-g.github.io/)\n",
-    "* Rajaswa Patil\n",
-    "* Ravi Teja Konkimalla\n",
-    "* [Raymond Chua](https://raymondchua.github.io/)\n",
-    "* [Richard Gerum](https://rgerum.github.io/)\n",
-    "* [Rohan Saha](https://www.rohansaha.in/) \n",
-    "* [Saeed Salehi](https://saeedsalehi.com/) \n",
-    "* Saeed Najafi\n",
-    "* [Shaonan Wang](https://wangshaonan.github.io/)\n",
-    "* Shubh Pachchigar\n",
-    "* [Spiros Chavlis](https://spiroschv.github.io/)\n",
-    "* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n",
-    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php)\n",
-    "* [Timo Lüddecke](https://timojl.github.io/) \n",
-    "* [Vikash Gilja](https://tnel.ucsd.edu/bio) \n",
-    "* [Vladimir Haltakov](https://haltakov.net/) \n",
-    "\n",
-    "### Past contributors\n",
-    "* [Akash Srivastava](https://akashgit.github.io/) \n",
-    "* [Feryal Behbahani](https://feryal.github.io/) \n",
-    "* [James Evans](https://sociology.uchicago.edu/directory/james-evans) \n",
-    "* [Jane Wang](http://www.janexwang.com/) \n",
-    "* [Josh Vogelstein](https://jovo.me/) \n",
-    "* [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/) \n"
-   ]
+   "source": "## Meet our day leads\n\n### Week 1: the building blocks\n* [Konrad Kording](https://kordinglab.com)\n* [Andrew Saxe](https://www.saxelab.org/)\n* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n* [Ioannis Mitliagkas](http://mitliagkas.github.io/)\n* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/)\n\n### Week 2: making things work\n* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/)\n* [Alexander Ecker](https://eckerlab.org/)\n* [James Evans](https://sociology.uchicago.edu/directory/james-evans)\n* [He He](https://hhexiy.github.io/)\n* [Vikash Gilja](https://tnel.ucsd.edu/bio) and [Akash Srivastava](https://akashgit.github.io/)\n\n### Week 3: more magic\n* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n* [Jane Wang](http://www.janexwang.com/) and [Feryal Behbahani](https://feryal.github.io/)\n* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n* [Josh Vogelstein](https://jovo.me/) and [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/)"
   },
   {
    "cell_type": "markdown",

From 4bc855d99a679546f398a4d2696a7354b111683f Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 00:48:46 -0400
Subject: [PATCH 17/34] W1D1, W1D4: restore source list format

Convert single-string source fields back to list-of-strings format.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_Tutorial1.ipynb                      | 25 +++++++++++++++++--
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    | 16 ++++++++++--
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index bb4e4f5fa..e1ee4f661 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -4533,8 +4533,29 @@
    "metadata": {
     "execution": {}
    },
-   "source": "## Meet our day leads\n\n### Week 1: the building blocks\n* [Konrad Kording](https://kordinglab.com)\n* [Andrew Saxe](https://www.saxelab.org/)\n* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n* [Ioannis Mitliagkas](http://mitliagkas.github.io/)\n* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/)\n\n### Week 2: making things work\n* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/)\n* [Alexander Ecker](https://eckerlab.org/)\n* [James Evans](https://sociology.uchicago.edu/directory/james-evans)\n* [He He](https://hhexiy.github.io/)\n* [Vikash Gilja](https://tnel.ucsd.edu/bio) and [Akash Srivastava](https://akashgit.github.io/)\n\n### Week 3: more magic\n* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n* [Jane Wang](http://www.janexwang.com/) and [Feryal Behbahani](https://feryal.github.io/)\n* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n* [Josh Vogelstein](https://jovo.me/) and [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/)"
-  },
+   "source": [
+    "## Meet our day leads\n",
+    "\n",
+    "### Week 1: the building blocks\n",
+    "* [Konrad Kording](https://kordinglab.com)\n",
+    "* [Andrew Saxe](https://www.saxelab.org/)\n",
+    "* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n",
+    "* [Ioannis Mitliagkas](http://mitliagkas.github.io/)\n",
+    "* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/)\n",
+    "\n",
+    "### Week 2: making things work\n",
+    "* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/)\n",
+    "* [Alexander Ecker](https://eckerlab.org/)\n",
+    "* [James Evans](https://sociology.uchicago.edu/directory/james-evans)\n",
+    "* [He He](https://hhexiy.github.io/)\n",
+    "* [Vikash Gilja](https://tnel.ucsd.edu/bio) and [Akash Srivastava](https://akashgit.github.io/)\n",
+    "\n",
+    "### Week 3: more magic\n",
+    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
+    "* [Jane Wang](http://www.janexwang.com/) and [Feryal Behbahani](https://feryal.github.io/)\n",
+    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
+    "* [Josh Vogelstein](https://jovo.me/) and [Vincenzo Lomonaco](https://www.vincenzolomonaco.com/)"
+   ]  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index 68298e04b..83e5a6cb8 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -113,8 +113,20 @@
     "id": "import_check"
    },
    "outputs": [],
-   "source": "# @title Install and check dependencies\n# Most packages are pre-installed on Colab/Kaggle.\n# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\nimport importlib\n\nprint('Package versions:')\nfor _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n    try:\n        _mod = importlib.import_module(_pkg)\n        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n    except ImportError:\n        print(f'  {_pkg}: NOT FOUND')"
-  },
+   "source": [
+    "# @title Install and check dependencies\n",
+    "# Most packages are pre-installed on Colab/Kaggle.\n",
+    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
+    "import importlib\n",
+    "\n",
+    "print('Package versions:')\n",
+    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
+    "    try:\n",
+    "        _mod = importlib.import_module(_pkg)\n",
+    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
+    "    except ImportError:\n",
+    "        print(f'  {_pkg}: NOT FOUND')"
+   ]  },
   {
    "cell_type": "code",
    "execution_count": null,

From 86a04d16c4e51204070a31e55521f3773687340a Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 01:16:34 -0400
Subject: [PATCH 18/34] W1D1: reset kernelspec to generic Python 3

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_Tutorial1.ipynb                      | 784 +++++-------------
 1 file changed, 211 insertions(+), 573 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index e1ee4f661..aa37e269a 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -4,7 +4,6 @@
    "cell_type": "markdown",
    "metadata": {
     "colab_type": "text",
-    "execution": {},
     "id": "view-in-github"
    },
    "source": [
@@ -13,9 +12,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "# Tutorial 1: PyTorch\n",
     "**Week 1, Day 1: Basics and PyTorch**\n",
@@ -34,9 +31,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "---\n",
     "# Tutorial Objectives\n",
@@ -55,8 +50,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -69,9 +63,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "---\n",
     "# Setup"
@@ -79,9 +71,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Throughout your Neuromatch tutorials, most (probably all!) notebooks contain setup cells. These cells will import the required Python packages (e.g., PyTorch, NumPy); set global or environment variables, and load in helper functions for things like plotting. In some tutorials, you will notice that we install some dependencies even if they are preinstalled on Google Colab or Kaggle. This happens because we have added automation to our repository through [GitHub Actions](https://docs.github.com/en/actions/learn-github-actions/introduction-to-github-actions).\n",
     "\n",
@@ -94,8 +84,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -125,8 +114,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -152,9 +140,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Imports\n",
@@ -189,8 +175,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -207,8 +192,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -270,9 +254,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Important note: Colab users**\n",
     "\n",
@@ -285,9 +267,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "# Section 1: Welcome to Neuromatch Deep learning course\n",
     "\n",
@@ -298,8 +278,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -352,9 +331,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "This will be an intensive 3 week adventure. We will all learn Deep Learning (DL) in a group. Groups need standards. Read our\n",
     "[Code of Conduct](https://docs.google.com/document/d/1eHKIkaNbAlbx_92tLQelXnicKXEcvFzlyzzeWjEtifM/edit?usp=sharing).\n"
@@ -364,8 +341,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -377,8 +353,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -431,9 +406,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Discuss with your pod: What do you hope to get out of this course? [in about 100 words]**"
    ]
@@ -442,8 +415,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -453,9 +425,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "---\n",
     "# Section 2: The Basics of PyTorch\n",
@@ -465,9 +435,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "PyTorch is a Python-based scientific computing package targeted at two sets of\n",
     "audiences:\n",
@@ -487,9 +455,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 2.1: Creating Tensors\n"
    ]
@@ -498,8 +464,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -554,8 +519,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -565,18 +529,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "There are various ways of creating tensors, and when doing any real deep learning project, we will usually have to do so."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Construct tensors directly:**\n",
     "\n",
@@ -587,9 +547,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# We can construct a tensor directly from some common python iterables,\n",
@@ -614,9 +572,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Some common tensor constructors:**\n",
     "\n",
@@ -626,9 +582,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# The numerical arguments we pass to these constructors\n",
@@ -644,18 +598,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Notice that `.empty()` does not return zeros, but seemingly random numbers. Unlike `.zeros()`, which initialises the elements of the tensor with zeros, `.empty()` just allocates the memory. It is hence a bit faster if you are looking to just create a tensor."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Creating random tensors and tensors like other tensors:**\n",
     "\n",
@@ -665,9 +615,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# There are also constructors for random numbers\n",
@@ -693,9 +641,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "*Reproducibility*:\n",
     "\n",
@@ -722,9 +668,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Here, we define for you a function called `set_seed` that does the job for you!"
    ]
@@ -732,9 +676,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def set_seed(seed=None, seed_torch=True):\n",
@@ -767,9 +709,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Now, let's use the `set_seed` function in the previous example. Execute the cell multiple times to verify that the numbers printed are always the same."
    ]
@@ -777,9 +717,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def simplefun(seed=True, my_seed=None):\n",
@@ -810,9 +748,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "simplefun(seed=True, my_seed=0)  # Turn `seed` to `False` or change `my_seed`"
@@ -820,9 +756,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Numpy-like number ranges:**\n",
     "---\n",
@@ -832,9 +766,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "a = torch.arange(0, 10, step=1)\n",
@@ -851,9 +783,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 2.1: Creating Tensors\n",
     "\n",
@@ -878,9 +808,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def tensor_creation(Z):\n",
@@ -925,9 +853,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -968,9 +894,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "```\n",
     "All correct!\n",
@@ -981,8 +905,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -992,9 +915,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 2.2: Operations in PyTorch\n",
     "\n",
@@ -1007,8 +928,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -1063,8 +983,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -1074,9 +993,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Tensor-Tensor operations**\n",
     "\n",
@@ -1086,9 +1003,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "a = torch.ones(5, 3)\n",
@@ -1108,9 +1023,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "However, in PyTorch, most common Python operators are overridden.\n",
     "The common standard arithmetic operators ($+$, $-$, $*$, $/$, and $**$) have all been lifted to elementwise operations"
@@ -1119,9 +1032,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.tensor([1, 2, 4, 8])\n",
@@ -1131,18 +1042,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Tensor Methods**"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n",
     "\n",
@@ -1152,9 +1059,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.rand(3, 3)\n",
@@ -1173,9 +1078,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Matrix Operations**\n",
     "\n",
@@ -1187,9 +1090,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 2.2 : Simple tensor operations\n",
     "\n",
@@ -1224,9 +1125,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def simple_operations(a1: torch.Tensor, a2: torch.Tensor, a3: torch.Tensor):\n",
@@ -1269,9 +1168,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -1309,9 +1206,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "```\n",
     "tensor([[20, 24],\n",
@@ -1322,9 +1217,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def dot_product(b1: torch.Tensor, b2: torch.Tensor):\n",
@@ -1365,9 +1258,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -1404,9 +1295,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "```\n",
     "tensor(82)\n",
@@ -1417,8 +1306,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -1428,9 +1316,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 2.3 Manipulating Tensors in Pytorch"
    ]
@@ -1439,8 +1325,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -1495,8 +1380,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -1506,9 +1390,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Indexing**\n",
     "\n",
@@ -1520,9 +1402,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.arange(0, 10)\n",
@@ -1534,9 +1414,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "When we have multidimensional tensors, indexing rules work the same way as NumPy."
    ]
@@ -1544,9 +1422,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# make a 5D tensor\n",
@@ -1559,9 +1435,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Flatten and reshape**\n",
     "\n",
@@ -1571,9 +1445,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "z = torch.arange(12).reshape(6, 2)\n",
@@ -1590,18 +1462,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Squeezing tensors**\n",
     "\n",
@@ -1613,9 +1481,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.randn(1, 10)\n",
@@ -1627,9 +1493,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Because of that pesky singleton dimension, `x[0]` gave us the first row instead!"
    ]
@@ -1637,9 +1501,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Let's get rid of that singleton dimension and see what happens now\n",
@@ -1651,9 +1513,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Adding singleton dimensions works a similar way, and is often used when tensors\n",
@@ -1669,9 +1529,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Permutation**\n",
     "\n",
@@ -1681,9 +1539,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# `x` has dimensions [color,image_height,image_width]\n",
@@ -1700,27 +1556,21 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "You may also see `.transpose()` used. This works in a similar way as permute, but can only swap two dimensions at once."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Concatenation**"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "In this example, we concatenate two matrices along rows (axis 0, the first element of the shape) vs. columns (axis 1, the second element of the shape). We can see that the first output tensor’s axis-0 length (`6`) is the sum of the two input tensors’ axis-0 lengths (`3+3`); while the second output tensor’s axis-1 length (`8`) is the sum of the two input tensors’ axis-1 lengths (`4+4`)."
    ]
@@ -1728,9 +1578,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Create two tensors of the same shape\n",
@@ -1750,9 +1598,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Conversion to Other Python Objects**\n",
     "\n",
@@ -1764,9 +1610,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.randn(5)\n",
@@ -1781,9 +1625,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "To convert a size-1 tensor to a Python scalar, we can invoke the item function or Python’s built-in functions."
    ]
@@ -1791,9 +1633,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "a = torch.tensor([3.5])\n",
@@ -1802,9 +1642,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 2.3: Manipulating Tensors\n",
     "Using a combination of the methods discussed above, complete the functions below."
@@ -1812,9 +1650,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Function A**\n",
     "\n",
@@ -1907,9 +1743,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def functionA(my_tensor1, my_tensor2):\n",
@@ -2011,9 +1845,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -2103,9 +1935,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "```\n",
     "tensor([24, 24])\n",
@@ -2123,8 +1953,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2134,9 +1963,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 2.4: GPUs"
    ]
@@ -2145,8 +1972,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2201,8 +2027,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2212,9 +2037,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "\n",
     "By default, when we create a tensor it will *not* live on the GPU!"
@@ -2223,9 +2046,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.randn(10)\n",
@@ -2234,9 +2055,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "When using Colab notebooks, by default, will not have access to a GPU. In order to start using GPUs we need to request one. We can do this by going to the runtime tab at the top of the page.\n",
     "\n",
@@ -2259,18 +2078,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Now we have a GPU.**\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "The cell below should return `True`."
    ]
@@ -2278,9 +2093,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "print(torch.cuda.is_available())"
@@ -2288,9 +2101,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "[CUDA](https://developer.nvidia.com/cuda-toolkit) is an API developed by Nvidia for interfacing with GPUs. PyTorch provides us with a layer of abstraction, and allows us to launch CUDA kernels using pure Python.\n",
     "\n",
@@ -2308,9 +2119,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def set_device():\n",
@@ -2338,9 +2147,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Let's make some CUDA tensors!"
    ]
@@ -2348,9 +2155,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# common device agnostic way of writing code that can run on cpu OR gpu\n",
@@ -2372,9 +2177,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Operations between cpu tensors and cuda tensors**\n",
     "\n",
@@ -2384,9 +2187,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.tensor([0, 1, 2], device=DEVICE)\n",
@@ -2398,9 +2199,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "We cannot combine CUDA tensors and CPU tensors in this fashion. If we want to compute an operation that combines tensors on different devices, we need to move them first! We can use the `.to()` method as before, or the `.cpu()` and `.cuda()` methods. Note that using the `.cuda()` will throw an error, if CUDA is not enabled in your machine.\n",
     "\n",
@@ -2410,9 +2209,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "x = torch.tensor([0, 1, 2], device=DEVICE)\n",
@@ -2430,9 +2227,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 2.4: Just how much faster are GPUs?\n",
     "\n",
@@ -2448,9 +2243,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "dim = 10000\n",
@@ -2460,9 +2253,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def simpleFun(dim, device):\n",
@@ -2509,9 +2300,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -2553,9 +2342,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Sample output (depends on your hardware)\n",
     "\n",
@@ -2569,8 +2356,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2580,9 +2366,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Discuss!**\n",
     "\n",
@@ -2592,9 +2376,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove explanation\n",
@@ -2616,8 +2398,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2627,9 +2408,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 2.5: Datasets and Dataloaders"
    ]
@@ -2638,8 +2417,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2694,8 +2472,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2705,9 +2482,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "When training neural network models you will be working with large amounts of data. Fortunately, PyTorch offers some great tools that help you organize and manipulate your data samples."
    ]
@@ -2715,9 +2490,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Imports moved to Setup section above; kept here for reference\n",
@@ -2728,9 +2501,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Datasets**\n",
     "\n",
@@ -2742,9 +2513,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Download and load the images from the CIFAR10 dataset\n",
@@ -2761,9 +2530,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "We have 50,000 samples loaded. Now, let's take a look at one of them in detail. Each sample consists of an image and its corresponding label."
    ]
@@ -2771,9 +2538,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Choose a random sample\n",
@@ -2785,18 +2550,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Color images are modeled as 3 dimensional tensors. The first dimension corresponds to the channels ($\\text{C}$) of the image (in this case we have RGB images). The second dimensions is the height ($\\text{H}$) of the image and the third is the width ($\\text{W}$). We can denote this image format as $\\text{C} \\times \\text{H} \\times \\text{W}$."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 2.5: Display an image from the dataset\n",
     "\n",
@@ -2824,9 +2585,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# TODO: Uncomment the following line to see the error that arises from the current image format\n",
@@ -2840,9 +2599,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -2859,8 +2616,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2872,8 +2628,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2928,8 +2683,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -2939,9 +2693,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Training and Test Datasets**\n",
     "\n",
@@ -2951,9 +2703,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Load the training samples\n",
@@ -2977,8 +2727,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3031,9 +2780,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Dataloader**\n",
     "\n",
@@ -3043,9 +2790,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Create dataloaders with\n",
@@ -3055,9 +2800,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "*Reproducibility:* DataLoader will reseed workers following Randomness in multi-process data loading algorithm. Use `worker_init_fn()` and a `generator` to preserve reproducibility:\n",
     "\n",
@@ -3084,18 +2827,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Important:** For the `seed_worker` to have an effect, `num_workers` should be 2 or more."
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "We can now query the next batch from the data loader and inspect it. For this we need to convert the dataloader object to a Python iterator using the function `iter` and then we can query the next batch using the function `next`.\n",
     "\n",
@@ -3105,9 +2844,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Load the next batch\n",
@@ -3121,9 +2858,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Transformations**\n",
     "\n",
@@ -3132,9 +2867,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 2.6: Load the CIFAR10 dataset as grayscale images\n",
     "\n",
@@ -3144,9 +2877,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def my_data_load():\n",
@@ -3184,9 +2915,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -3221,8 +2950,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3232,9 +2960,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "---\n",
     "# Section 3: Neural Networks\n",
@@ -3244,9 +2970,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Now it's time for you to create your first neural network using PyTorch. This section will walk you through the process of:\n",
     "\n",
@@ -3260,8 +2984,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3316,8 +3039,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3327,9 +3049,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 3.1: Data Loading\n",
     "\n",
@@ -3340,8 +3060,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3358,9 +3077,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Now we can load the data from the CSV file using the Pandas library. Pandas provides many functions for reading files in various formats. When loading data from a CSV file, we can reference the columns directly by their names."
    ]
@@ -3368,9 +3085,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Load the data from the CSV file in a Pandas DataFrame\n",
@@ -3393,9 +3108,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Prepare Data for PyTorch**\n",
     "\n",
@@ -3405,9 +3118,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Initialize the device variable\n",
@@ -3432,9 +3143,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 3.2: Create a Simple Neural Network"
    ]
@@ -3443,8 +3152,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3499,8 +3207,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3510,9 +3217,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "For this example we want to have a simple neural network consisting of 3 layers:\n",
     "\n",
@@ -3550,9 +3255,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Inherit from nn.Module - the base class for neural network modules provided by Pytorch\n",
@@ -3639,9 +3342,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Check that your network works**\n",
     "\n",
@@ -3651,9 +3352,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Create new NaiveNet and transfer it to the device\n",
@@ -3665,9 +3364,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Coding Exercise 3.2: Classify some samples\n",
     "\n",
@@ -3679,9 +3376,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "## Get the samples\n",
@@ -3700,9 +3395,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove solution\n",
@@ -3721,9 +3414,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "```\n",
     "Sample input:\n",
@@ -3749,8 +3440,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3760,9 +3450,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Section 3.3: Train Your Neural Network"
    ]
@@ -3771,8 +3459,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3827,8 +3514,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3838,9 +3524,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Now it is time to train your network on your dataset. Don't worry if you don't fully understand everything yet - we will cover training in much more details in the next days. For now, the goal is just to see your network in action!\n",
     "\n",
@@ -3851,8 +3535,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -3909,9 +3592,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Implement the train function given a training dataset X and correcsponding labels y\n",
@@ -3979,9 +3660,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "**Plot the loss during training**\n",
     "\n",
@@ -3991,9 +3670,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "plt.plot(np.linspace(1, len(losses), len(losses)), losses)\n",
@@ -4005,8 +3682,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4032,8 +3708,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4088,8 +3763,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4099,9 +3773,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Exercise 3.3: Tweak your Network\n",
     "\n",
@@ -4117,9 +3789,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove explanation\n",
@@ -4136,8 +3806,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4149,8 +3818,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4205,8 +3873,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4216,9 +3883,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "Exclusive OR (XOR) logical operation gives a true (`1`) output when the number of true inputs is odd. That is, a true output result if one, and only one, of the inputs to the gate is true. If both inputs are false (`0`) or both are true or false output results. Mathematically speaking, XOR represents the inequality function, i.e., the output is true if the inputs are not alike; otherwise, the output is false.\n",
     "\n",
@@ -4238,9 +3903,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "### Interactive Demo 3.3: Solving XOR\n",
     "\n",
@@ -4262,8 +3925,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4283,8 +3945,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4302,8 +3963,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4313,9 +3973,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "---\n",
     "# Section 4: Ethics And Course Info"
@@ -4325,8 +3983,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4381,8 +4038,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4394,8 +4050,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4450,8 +4105,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4463,8 +4117,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4519,8 +4172,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "cellView": "form",
-    "execution": {}
+    "cellView": "form"
    },
    "outputs": [],
    "source": [
@@ -4530,9 +4182,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "source": [
     "## Meet our day leads\n",
     "\n",
@@ -4555,7 +4205,8 @@
     "* [Jane Wang](http://www.janexwang.com/) and [Feryal Behbahani](https://feryal.github.io/)\n",
     "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
     "* [Josh Vogelstein](https://jovo.me/) and [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/)"
-   ]  },
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -4573,9 +4224,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# @title Load visualization data\n",
@@ -4606,9 +4255,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# @title Define Visualization using ALtair\n",
@@ -4657,9 +4304,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# to_remove explanation\n",
@@ -4685,9 +4330,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# @title Submit your feedback\n",
@@ -4720,9 +4363,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {}
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# @title Edit the `AUTHOR_FILTER` variable to full text search for authors.\n",
@@ -4752,10 +4393,8 @@
   },
   {
    "cell_type": "markdown",
-   "id": "appendix",
-   "metadata": {
-    "execution": {}
-   },
+   "id": "213",
+   "metadata": {},
    "source": [
     "---\n",
     "# Appendix\n",
@@ -4795,8 +4434,7 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "nma-dl-jax",
-   "language": "python",
+   "display_name": "Python 3",
    "name": "python3"
   },
   "language_info": {

From 097b32d1f716279ebf4be9a940cd1bc6cdc74b70 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 03:42:03 -0400
Subject: [PATCH 19/34] Update W1D1 lecturers to the current content lecturers

Also comment out the outdated "Video 17: Syllabus" cell and its feedback cell.

---
 .../W1D1_Tutorial1.ipynb                      | 118 +++++++++---------
 1 file changed, 57 insertions(+), 61 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index aa37e269a..adac31eda 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -4121,51 +4121,52 @@
    },
    "outputs": [],
    "source": [
-    "# @title Video 17: Syllabus\n",
-    "from ipywidgets import widgets\n",
-    "from IPython.display import YouTubeVideo\n",
-    "from IPython.display import IFrame\n",
-    "from IPython.display import display\n",
-    "\n",
-    "\n",
-    "class PlayVideo(IFrame):\n",
-    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "    self.id = id\n",
-    "    if source == 'Bilibili':\n",
-    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "    elif source == 'Osf':\n",
-    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "  tab_contents = []\n",
-    "  for i, video_id in enumerate(video_ids):\n",
-    "    out = widgets.Output()\n",
-    "    with out:\n",
-    "      if video_ids[i][0] == 'Youtube':\n",
-    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "                             height=H, fs=fs, rel=0)\n",
-    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "      else:\n",
-    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "                          height=H, fs=fs, autoplay=False)\n",
-    "        if video_ids[i][0] == 'Bilibili':\n",
-    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "        elif video_ids[i][0] == 'Osf':\n",
-    "          print(f'Video available at https://osf.io/{video.id}')\n",
-    "      display(video)\n",
-    "    tab_contents.append(out)\n",
-    "  return tab_contents\n",
-    "\n",
-    "\n",
-    "video_ids = [('Youtube', 'cDvAqG_hAvQ'), ('Bilibili', 'BV1iB4y1N7uQ')]\n",
-    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "tabs = widgets.Tab()\n",
-    "tabs.children = tab_contents\n",
-    "for i in range(len(tab_contents)):\n",
-    "  tabs.set_title(i, video_ids[i][0])\n",
-    "display(tabs)"
+    "# NOTE: The syllabus video is outdated, so this cell is intentionally commented out.\n",
+    "# # @title Video 17: Syllabus\n",
+    "# from ipywidgets import widgets\n",
+    "# from IPython.display import YouTubeVideo\n",
+    "# from IPython.display import IFrame\n",
+    "# from IPython.display import display\n",
+    "\n",
+    "\n",
+    "# class PlayVideo(IFrame):\n",
+    "#   def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "#     self.id = id\n",
+    "#     if source == 'Bilibili':\n",
+    "#       src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "#     elif source == 'Osf':\n",
+    "#       src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "#     super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "# def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "#   tab_contents = []\n",
+    "#   for i, video_id in enumerate(video_ids):\n",
+    "#     out = widgets.Output()\n",
+    "#     with out:\n",
+    "#       if video_ids[i][0] == 'Youtube':\n",
+    "#         video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "#                              height=H, fs=fs, rel=0)\n",
+    "#         print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "#       else:\n",
+    "#         video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "#                           height=H, fs=fs, autoplay=False)\n",
+    "#         if video_ids[i][0] == 'Bilibili':\n",
+    "#           print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "#         elif video_ids[i][0] == 'Osf':\n",
+    "#           print(f'Video available at https://osf.io/{video.id}')\n",
+    "#       display(video)\n",
+    "#     tab_contents.append(out)\n",
+    "#   return tab_contents\n",
+    "\n",
+    "\n",
+    "# video_ids = [('Youtube', 'cDvAqG_hAvQ'), ('Bilibili', 'BV1iB4y1N7uQ')]\n",
+    "# tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "# tabs = widgets.Tab()\n",
+    "# tabs.children = tab_contents\n",
+    "# for i in range(len(tab_contents)):\n",
+    "#   tabs.set_title(i, video_ids[i][0])\n",
+    "# display(tabs)"
    ]
   },
   {
@@ -4176,35 +4177,29 @@
    },
    "outputs": [],
    "source": [
-    "# @title Submit your feedback\n",
-    "content_review(f\"{feedback_prefix}_Syllabus_Video\")"
+    "# # @title Submit your feedback\n",
+    "# content_review(f\"{feedback_prefix}_Syllabus_Video\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Meet our day leads\n",
+    "## Meet our lecturers\n",
     "\n",
-    "### Week 1: the building blocks\n",
     "* [Konrad Kording](https://kordinglab.com)\n",
     "* [Andrew Saxe](https://www.saxelab.org/)\n",
     "* [Surya Ganguli](https://ganguli-gang.stanford.edu/)\n",
     "* [Ioannis Mitliagkas](http://mitliagkas.github.io/)\n",
     "* [Lyle Ungar](https://www.cis.upenn.edu/~ungar/)\n",
-    "\n",
-    "### Week 2: making things work\n",
     "* [Alona Fyshe](https://webdocs.cs.ualberta.ca/~alona/)\n",
     "* [Alexander Ecker](https://eckerlab.org/)\n",
-    "* [James Evans](https://sociology.uchicago.edu/directory/james-evans)\n",
+    "* [Vikash Gilja](https://tnel.ucsd.edu/bio) \n",
+    "* [Binxu Wang](https://animadversio.github.io/) \n",
     "* [He He](https://hhexiy.github.io/)\n",
-    "* [Vikash Gilja](https://tnel.ucsd.edu/bio) and [Akash Srivastava](https://akashgit.github.io/)\n",
-    "\n",
-    "### Week 3: more magic\n",
-    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
-    "* [Jane Wang](http://www.janexwang.com/) and [Feryal Behbahani](https://feryal.github.io/)\n",
-    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) and [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
-    "* [Josh Vogelstein](https://jovo.me/) and [Vincenzo Lamonaco](https://www.vincenzolomonaco.com/)"
+    "* [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
+    "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) \n",
+    "* [Pablo Samuel Castro](https://psc-g.github.io/)"
    ]
   },
   {
@@ -4434,7 +4429,8 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "nma-dl",
+   "language": "python",
    "name": "python3"
   },
   "language_info": {
@@ -4447,7 +4443,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.7.16"
   }
  },
  "nbformat": 4,

From 253fbdf0b36403904d8277f3d9049543a2ad8e04 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 05:16:47 -0400
Subject: [PATCH 20/34] update W1D1 slides

---
 .../W1D1_Tutorial1.ipynb                      | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index adac31eda..71fb2f6bc 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -48,15 +48,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "cellView": "form"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "If you want to download the slides: https://osf.io/download/dg4h7/\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <iframe\n",
+       "            width=\"854\"\n",
+       "            height=\"480\"\n",
+       "            src=\"https://mfr.ca-1.osf.io/render?url=https://osf.io/dg4h7/?direct%26mode=render%26action=download%26mode=render\"\n",
+       "            frameborder=\"0\"\n",
+       "            allowfullscreen\n",
+       "            \n",
+       "        ></iframe>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<IPython.lib.display.IFrame at 0x7ff6d925af10>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# @title Tutorial slides\n",
     "from IPython.display import IFrame\n",
-    "link_id = \"wcjrv\"\n",
+    "link_id = \"dg4h7\"  # previous slides id: \"wcjrv\"\n",
     "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n",
     "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)"
    ]

From 0d50ee01c848295d99839e49aaff3cf1c91eb344 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 05:41:15 -0400
Subject: [PATCH 21/34] W1D1-W1D4: add Jiaxin Cindy Tu to content reviewers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../W1D1_Tutorial1.ipynb                      | 428 ++++++++++++------
 .../W1D2_Tutorial1.ipynb                      |   2 +-
 .../W1D2_Tutorial2.ipynb                      |   2 +-
 .../W1D2_Tutorial3.ipynb                      |   2 +-
 .../W1D3_Tutorial1.ipynb                      |   2 +-
 .../W1D3_Tutorial2.ipynb                      |   2 +-
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    |   2 +-
 7 files changed, 303 insertions(+), 137 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 71fb2f6bc..06cd220b6 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -8,7 +8,9 @@
    },
    "source": [
     "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -22,12 +24,14 @@
     "\n",
     "__Content creators:__ Shubh Pachchigar, Vladimir Haltakov, Matthew Sargent, Konrad Kording\n",
     "\n",
-    "__Content reviewers:__ Deepak Raya, Siwei Bai, Kelson Shilling-Scrivo\n",
+    "__Content reviewers:__ Deepak Raya, Siwei Bai, Kelson Shilling-Scrivo, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Anoop Kulkarni, Spiros Chavlis\n",
     "\n",
     "__Production editors:__ Arush Tagade, Spiros Chavlis"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -44,45 +48,17 @@
     "* Train NaiveNet\n",
     "* Get to know your pod\n",
     "* Start thinking about the course as a whole"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "cellView": "form"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "If you want to download the slides: https://osf.io/download/dg4h7/\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "        <iframe\n",
-       "            width=\"854\"\n",
-       "            height=\"480\"\n",
-       "            src=\"https://mfr.ca-1.osf.io/render?url=https://osf.io/dg4h7/?direct%26mode=render%26action=download%26mode=render\"\n",
-       "            frameborder=\"0\"\n",
-       "            allowfullscreen\n",
-       "            \n",
-       "        ></iframe>\n",
-       "        "
-      ],
-      "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7ff6d925af10>"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# @title Tutorial slides\n",
     "from IPython.display import IFrame\n",
@@ -97,7 +73,9 @@
    "source": [
     "---\n",
     "# Setup"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -108,7 +86,9 @@
     "Be sure to run all of the cells in the setup section. Feel free to expand them and have a look at what you are loading in, but you should be able to fulfill the learning objectives of every tutorial without having to look at these cells.\n",
     "\n",
     "If you start building your own projects built on this code base we highly recommend looking at them in more detail."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -293,7 +273,9 @@
     "If you want to quickly try out something or take a look at the data, you can use scratch code cells. They allow you to run Python code, but will not mess up the structure of your notebook.\n",
     "\n",
     "To open a new scratch cell go to *Insert* → *Scratch code cell*."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -302,7 +284,9 @@
     "# Section 1: Welcome to Neuromatch Deep learning course\n",
     "\n",
     "*Time estimate: ~25mins*"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -365,7 +349,9 @@
    "source": [
     "This will be an intensive 3 week adventure. We will all learn Deep Learning (DL) in a group. Groups need standards. Read our\n",
     "[Code of Conduct](https://docs.google.com/document/d/1eHKIkaNbAlbx_92tLQelXnicKXEcvFzlyzzeWjEtifM/edit?usp=sharing).\n"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -439,7 +425,9 @@
    "metadata": {},
    "source": [
     "**Discuss with your pod: What do you hope to get out of this course? [in about 100 words]**"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -461,7 +449,9 @@
     "# Section 2: The Basics of PyTorch\n",
     "\n",
     "*Time estimate: ~2 hours 05 mins*"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -481,14 +471,18 @@
     "- A clean, modular API for building and deploying **deep learning models**.\n",
     "\n",
     "You can find more information about PyTorch in the Appendix."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Section 2.1: Creating Tensors\n"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -562,7 +556,9 @@
    "metadata": {},
    "source": [
     "There are various ways of creating tensors, and when doing any real deep learning project, we will usually have to do so."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -572,7 +568,9 @@
     "\n",
     "---\n",
     "\n"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -607,7 +605,9 @@
     "**Some common tensor constructors:**\n",
     "\n",
     "---"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -631,7 +631,9 @@
    "metadata": {},
    "source": [
     "Notice that `.empty()` does not return zeros, but seemingly random numbers. Unlike `.zeros()`, which initialises the elements of the tensor with zeros, `.empty()` just allocates the memory. It is hence a bit faster if you are looking to just create a tensor."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -640,7 +642,9 @@
     "**Creating random tensors and tensors like other tensors:**\n",
     "\n",
     "---"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -694,14 +698,18 @@
     "import numpy as np\n",
     "np.random.seed(0)\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Here, we define for you a function called `set_seed` that does the job for you!"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -742,7 +750,9 @@
    "metadata": {},
    "source": [
     "Now, let's use the `set_seed` function in the previous example. Execute the cell multiple times to verify that the numbers printed are always the same."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -791,7 +801,9 @@
     "**Numpy-like number ranges:**\n",
     "---\n",
     "The ```.arange()``` and ```.linspace()``` behave how you would expect them to if you are familar with numpy."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -833,7 +845,9 @@
     "<br>\n",
     "\n",
     "$^\\dagger$: $\\mathcal{U(\\alpha, \\beta)}$ denotes the [uniform distribution](https://en.wikipedia.org/wiki/Continuous_uniform_distribution) from $\\alpha$ to $\\beta$, with $\\alpha, \\beta \\in \\mathbb{R}$.\n"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -929,7 +943,9 @@
     "```\n",
     "All correct!\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -952,7 +968,9 @@
     "**Tensor-Tensor operations**\n",
     "\n",
     "We can perform operations on tensors using methods under `torch.`"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1028,7 +1046,9 @@
     "**Tensor-Tensor operations**\n",
     "\n",
     "We can perform operations on tensors using methods under `torch.`."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1057,7 +1077,9 @@
    "source": [
     "However, in PyTorch, most common Python operators are overridden.\n",
     "The common standard arithmetic operators ($+$, $-$, $*$, $/$, and $**$) have all been lifted to elementwise operations"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1075,7 +1097,9 @@
    "metadata": {},
    "source": [
     "**Tensor Methods**"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -1084,7 +1108,9 @@
     "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n",
     "\n",
     "All of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1116,7 +1142,9 @@
     "\n",
     "Transposes of 2D tensors are obtained using `torch.t()` or `Tensor.T`. Note the lack of brackets for `Tensor.T` - it is an attribute, not a method.\n",
     "\n"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -1150,7 +1178,9 @@
     "\\end{equation}\n",
     "\n",
     "The code block below that computes these expressions using PyTorch is incomplete - fill in the missing lines."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1242,7 +1272,9 @@
     "tensor([[20, 24],\n",
     "        [31, 27]])\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1330,7 +1362,9 @@
     "```\n",
     "tensor(82)\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1349,7 +1383,9 @@
    "metadata": {},
    "source": [
     "## Section 2.3 Manipulating Tensors in Pytorch"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1427,7 +1463,9 @@
     "Just as in numpy, elements in a tensor can be accessed by index. As in any numpy array, the first element has index 0 and ranges are specified to include the first to last_element-1. We can access elements according to their relative position to the end of the list by using negative indices. Indexing is also referred to as slicing.\n",
     "\n",
     "For example, `[-1]` selects the last element; `[1:3]` selects the second and the third elements, and `[:-2]` will select all elements excluding the last and second-to-last elements."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1447,7 +1485,9 @@
    "metadata": {},
    "source": [
     "When we have multidimensional tensors, indexing rules work the same way as NumPy."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1470,7 +1510,9 @@
     "**Flatten and reshape**\n",
     "\n",
     "There are various methods for reshaping tensors. It is common to have to express 2D data in 1D format. Similarly, it is also common to have to reshape a 1D tensor into a 2D tensor. We can achieve this with the `.flatten()` and `.reshape()` methods."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1495,7 +1537,9 @@
    "metadata": {},
    "source": [
     "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -1506,7 +1550,9 @@
     "When processing batches of data, you will quite often be left with singleton dimensions. E.g., `[1,10]` or `[256, 1, 3]`. This dimension can quite easily mess up your matrix operations if you don't plan on it being there...\n",
     "\n",
     "In order to compress tensors along their singleton dimensions we can use the `.squeeze()` method. We can use the `.unsqueeze()` method to do the opposite."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1526,7 +1572,9 @@
    "metadata": {},
    "source": [
     "Because of that pesky singleton dimension, `x[0]` gave us the first row instead!"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1564,7 +1612,9 @@
     "**Permutation**\n",
     "\n",
     "Sometimes our dimensions will be in the wrong order! For example, we may be dealing with RGB images with dim $[3\\times48\\times64]$, but our pipeline expects the colour dimension to be the last dimension, i.e., $[48\\times64\\times3]$. To get around this we can use the `.permute()` method."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1589,21 +1639,27 @@
    "metadata": {},
    "source": [
     "You may also see `.transpose()` used. This works in a similar way as permute, but can only swap two dimensions at once."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "**Concatenation**"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "In this example, we concatenate two matrices along rows (axis 0, the first element of the shape) vs. columns (axis 1, the second element of the shape). We can see that the first output tensor’s axis-0 length (`6`) is the sum of the two input tensors’ axis-0 lengths (`3+3`); while the second output tensor’s axis-1 length (`8`) is the sum of the two input tensors’ axis-1 lengths (`4+4`)."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1635,7 +1691,9 @@
     "Converting a tensor to a numpy.ndarray, or vice versa, is easy, and the converted result does not share memory. This minor inconvenience is quite important: when you perform operations on the CPU or GPUs, you do not want to halt computation, waiting to see whether the NumPy package of Python might want to be doing something else with the same chunk of memory.\n",
     "\n",
     "When converting to a NumPy array, the information being tracked by the tensor will be lost, i.e., the computational graph. This will be covered in detail when you are introduced to autograd tomorrow!"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1658,7 +1716,9 @@
    "metadata": {},
    "source": [
     "To convert a size-1 tensor to a Python scalar, we can invoke the item function or Python’s built-in functions."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1676,7 +1736,9 @@
    "source": [
     "### Coding Exercise 2.3: Manipulating Tensors\n",
     "Using a combination of the methods discussed above, complete the functions below."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -1768,7 +1830,9 @@
     "<br>\n",
     "\n",
     "**Hint:** `torch.numel()` is an easy way of finding the number of elements in a tensor."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1977,7 +2041,9 @@
     "        [-1,  5]])\n",
     "tensor([ 1, -1, -1,  3,  2,  3,  0])\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -1996,7 +2062,9 @@
    "metadata": {},
    "source": [
     "## Section 2.4: GPUs"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2071,7 +2139,9 @@
    "source": [
     "\n",
     "By default, when we create a tensor it will *not* live on the GPU!"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2094,7 +2164,9 @@
     "Once you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n",
     "\n",
     "For more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -2104,21 +2176,27 @@
     "> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n",
     "> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n",
     "> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "**Now we have a GPU.**\n"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "The cell below should return `True`."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2144,7 +2222,9 @@
     "```\n",
     "\n",
     "Let's define the function using the PyTorch package `torch.cuda`, which is lazily initialized, so we can always import it, and use `is_available()` to determine if our system supports CUDA."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2180,7 +2260,9 @@
    "metadata": {},
    "source": [
     "Let's make some CUDA tensors!"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2212,7 +2294,9 @@
     "**Operations between cpu tensors and cuda tensors**\n",
     "\n",
     "Note that the type of the tensor changed after calling `.to()`. What happens if we try and perform operations on tensors on devices?"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2234,7 +2318,9 @@
     "We cannot combine CUDA tensors and CPU tensors in this fashion. If we want to compute an operation that combines tensors on different devices, we need to move them first! We can use the `.to()` method as before, or the `.cpu()` and `.cuda()` methods. Note that using the `.cuda()` will throw an error, if CUDA is not enabled in your machine.\n",
     "\n",
     "Generally, in this course, all Deep Learning is done on the GPU, and any computation is done on the CPU, so sometimes we have to pass things back and forth, so you'll see us call."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2268,7 +2354,9 @@
     "- Matrix multiplication\n",
     "\n",
     "The operations should be able to perfomed on either the CPU or GPU specified by the parameter `device`. We will use the helper function `timeFun(f, dim, iterations, device)`."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2380,7 +2468,9 @@
     "time taken for 1 iterations of simpleFun(10000, cpu): 23.74070\n",
     "time taken for 1 iterations of simpleFun(10000, cuda): 0.87535\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2401,7 +2491,9 @@
     "**Discuss!**\n",
     "\n",
     "Try and reduce the dimensions of the tensors and increase the iterations. You can get to a point where the cpu only function is faster than the GPU function. Why might this be?"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2441,7 +2533,9 @@
    "metadata": {},
    "source": [
     "## Section 2.5: Datasets and Dataloaders"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2515,7 +2609,9 @@
    "metadata": {},
    "source": [
     "When training neural network models you will be working with large amounts of data. Fortunately, PyTorch offers some great tools that help you organize and manipulate your data samples."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2538,7 +2634,9 @@
     "The `torchvision` package gives you easy access to many of the publicly available datasets. Let's load the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset, which contains color images of 10 different classes, like vehicles and animals.\n",
     "\n",
     "Creating an object of type `datasets.CIFAR10` will automatically download and load all images from the dataset. The resulting data structure can be treated as a list containing data samples and their corresponding labels."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2563,7 +2661,9 @@
    "metadata": {},
    "source": [
     "We have 50,000 samples loaded. Now, let's take a look at one of them in detail. Each sample consists of an image and its corresponding label."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2583,7 +2683,9 @@
    "metadata": {},
    "source": [
     "Color images are modeled as 3 dimensional tensors. The first dimension corresponds to the channels ($\\text{C}$) of the image (in this case we have RGB images). The second dimensions is the height ($\\text{H}$) of the image and the third is the width ($\\text{W}$). We can denote this image format as $\\text{C} \\times \\text{H} \\times \\text{W}$."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -2610,7 +2712,9 @@
     "print(input_var.size())\n",
     "print(input_var)\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2728,7 +2832,9 @@
     "**Training and Test Datasets**\n",
     "\n",
     "When loading a dataset, you can specify if you want to load the training or the test samples using the `train` argument. We can load the training and test datasets separately. For simplicity, today we will not use both datasets separately, but this topic will be adressed in the next days."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2815,7 +2921,9 @@
     "**Dataloader**\n",
     "\n",
     "Another important concept is the `Dataloader`. It is a wrapper around the `Dataset` that splits it into minibatches (important for training the neural network) and makes the data iterable. The `shuffle` argument is used to shuffle the order of the samples across the minibatches."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2853,14 +2961,18 @@
     "    generator=g_seed\n",
     "    )\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "**Important:** For the `seed_worker` to have an effect, `num_workers` should be 2 or more."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -2869,7 +2981,9 @@
     "We can now query the next batch from the data loader and inspect it. For this we need to convert the dataloader object to a Python iterator using the function `iter` and then we can query the next batch using the function `next`.\n",
     "\n",
     "We can now see that we have a 4D tensor. This is because we have a 64 images in the batch ($B$) and each image has 3 dimensions: channels ($C$), height ($H$) and width ($W$). So, the size of the 4D tensor is $B \\times C \\times H \\times W$."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2893,7 +3007,9 @@
     "**Transformations**\n",
     "\n",
     "Another useful feature when loading a dataset is applying transformations on the data - color conversions, normalization, cropping, rotation etc. There are many predefined transformations in the `torchvision.transforms` package and you can also combine them using the `Compose` transform. Checkout the [pytorch documentation](https://pytorch.org/vision/stable/transforms.html) for details."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -2902,7 +3018,9 @@
     "### Coding Exercise 2.6: Load the CIFAR10 dataset as grayscale images\n",
     "\n",
     "The goal of this excercise is to load the images from the CIFAR10 dataset as grayscale images. Note that we rerun the `set_seed` function to ensure reproducibility."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -2996,7 +3114,9 @@
     "# Section 3: Neural Networks\n",
     "\n",
     "*Time estimate: ~1 hour 30 mins (excluding video)*"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -3008,7 +3128,9 @@
     "- Training the network\n",
     "- Visualizing the results of the network\n",
     "- Tweaking the network"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3084,7 +3206,9 @@
     "## Section 3.1: Data Loading\n",
     "\n",
     "First we need some sample data to train our network on. You can use the function below to generate an example dataset consisting of 2D points along two interleaving half circles. The data will be stored in a file called `sample_data.csv`. You can inspect the file directly in Colab by going to Files on the left side and opening the CSV file."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3110,7 +3234,9 @@
    "metadata": {},
    "source": [
     "Now we can load the data from the CSV file using the Pandas library. Pandas provides many functions for reading files in various formats. When loading data from a CSV file, we can reference the columns directly by their names."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3143,7 +3269,9 @@
     "**Prepare Data for PyTorch**\n",
     "\n",
     "Now let's prepare the data in a format suitable for PyTorch - convert everything into tensors."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3176,7 +3304,9 @@
    "metadata": {},
    "source": [
     "## Section 3.2: Create a Simple Neural Network"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3280,7 +3410,9 @@
     "<br>\n",
     "\n",
     "**Note:** You can use the `__call__` method of a module directly and it will invoke the `forward` method: `net()` does the same as `net.forward()`."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3377,7 +3509,9 @@
     "**Check that your network works**\n",
     "\n",
     "Create an instance of your model and visualize it."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3401,7 +3535,9 @@
     "Now, let's pass some of the points of our dataset through the network and see if it works. You should not expect the network to actually classify the points correctly, because it has not been trained yet.\n",
     "\n",
     "The goal here is just to get some experience with the data structures that are passed to the forward and predict methods and their results."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3464,7 +3600,9 @@
     "Predicted labels:\n",
     " tensor([0, 0, 1, 0, 0], device='cuda:0')\n",
     "```"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3483,7 +3621,9 @@
    "metadata": {},
    "source": [
     "## Section 3.3: Train Your Neural Network"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3559,7 +3699,9 @@
     "Now it is time to train your network on your dataset. Don't worry if you don't fully understand everything yet - we will cover training in much more details in the next days. For now, the goal is just to see your network in action!\n",
     "\n",
     "You will usually implement the `train` method directly when implementing your class `NaiveNet`. Here, we will implement it as a function outside of the class in order to have it in a separate cell."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3695,7 +3837,9 @@
     "**Plot the loss during training**\n",
     "\n",
     "Plot the loss during the training to see how it reduces and converges."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3814,7 +3958,9 @@
     "- Add one additional hidden layer\n",
     "\n",
     "Can you get the network to better fit the data?"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -3929,7 +4075,9 @@
     "\\end{matrix}\n",
     "\n",
     "Here, with `0`, we denote `False`, and with `1` we denote `True` in boolean terms."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -3949,7 +4097,9 @@
     "\\end{equation}\n",
     "\n",
     "Try to set the weights and biases to implement this function after you played enough :)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -4007,7 +4157,9 @@
    "source": [
     "---\n",
     "# Section 4: Ethics And Course Info"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -4230,7 +4382,9 @@
     "* [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
     "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) \n",
     "* [Pablo Samuel Castro](https://psc-g.github.io/)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -4244,7 +4398,9 @@
     "In this notebook we visualize a subset* of 3,300 articles retreived from the AllenAI [S2ORC dataset](https://github.com/allenai/s2orc). We represent each paper by a position that is output of a dimensionality reduction method applied to a vector representation of each paper. The vector representation is the output of a neural network.\n",
     "\n",
     "**Note:** The selection is very biased on the keywords and methodology we used to filter. Please see the details section to learn about what we did."
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -4310,7 +4466,9 @@
     "1. Hover over a dot to see a tooltip (title, author)\n",
     "2. Select a year in the legend (right) to filter dots\n",
     "3. Zoom in/out with scroll -- double click resets view"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -4324,7 +4482,9 @@
     "2. Can you see a temporal trend in the data and clusters?\n",
     "3. Can you determine when deep learning methods started booming ?\n",
     "4. Can you find the key papers that where written before the DL \"winter\" that define milestones for a cluster? (tip: look for large dots of different color)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -4376,14 +4536,18 @@
     "3. Embed each paper by using abstract + title in SPECTER model\n",
     "4. Project based on embedding using UMAP\n",
     "5. Visualize using Altair"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Find Authors"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
@@ -4441,7 +4605,9 @@
     "\n",
     "## Books for reference:\n",
     "- [https://www.deeplearningbook.org/](https://www.deeplearningbook.org/) (Deep Learning by Ian Goodfellow, Yoshua Bengio and Aaron Courville)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   }
  ],
  "metadata": {
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index e3d9870e3..d557bf67c 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -26,7 +26,7 @@
     "\n",
     "__Content creators:__ Saeed Salehi, Vladimir Haltakov, Andrew Saxe\n",
     "\n",
-    "__Content reviewers:__ Polina Turishcheva, Antoine De Comite, Kelson Shilling-Scrivo\n",
+    "__Content reviewers:__ Polina Turishcheva, Antoine De Comite, Kelson Shilling-Scrivo, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Anoop Kulkarni, Spiros Chavlis\n",
     "\n",
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
index 80fe8a9e9..965b6302e 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
@@ -25,7 +25,7 @@
     "\n",
     "__Content creators:__ Saeed Salehi, Andrew Saxe\n",
     "\n",
-    "__Content reviewers:__ Polina Turishcheva, Antoine De Comite, Kelson Shilling-Scrivo\n",
+    "__Content reviewers:__ Polina Turishcheva, Antoine De Comite, Kelson Shilling-Scrivo, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Anoop Kulkarni\n",
     "\n",
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
index bc0cf5ac2..61076b987 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
@@ -24,7 +24,7 @@
     "\n",
     "__Content creators:__ Saeed Salehi, Spiros Chavlis, Andrew Saxe\n",
     "\n",
-    "__Content reviewers:__ Polina Turishcheva, Antoine De Comite\n",
+    "__Content reviewers:__ Polina Turishcheva, Antoine De Comite, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Anoop Kulkarni\n",
     "\n",
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
index f0b5f542c..7eb734daf 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
@@ -25,7 +25,7 @@
     "\n",
     "__Content creators:__ Arash Ash, Surya Ganguli\n",
     "\n",
-    "__Content reviewers:__ Saeed Salehi, Felix Bartsch, Yu-Fang Yang, Antoine De Comite, Melvin Selim Atay, Kelson Shilling-Scrivo\n",
+    "__Content reviewers:__ Saeed Salehi, Felix Bartsch, Yu-Fang Yang, Antoine De Comite, Melvin Selim Atay, Kelson Shilling-Scrivo, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Gagana B, Kelson Shilling-Scrivo, Spiros Chavlis\n",
     "\n",
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
index 1a07ab468..0bb19772b 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
@@ -25,7 +25,7 @@
     "\n",
     "__Content creators:__ Arash Ash, Surya Ganguli\n",
     "\n",
-    "__Content reviewers:__ Saeed Salehi, Felix Bartsch, Yu-Fang Yang, Melvin Selim Atay, Kelson Shilling-Scrivo\n",
+    "__Content reviewers:__ Saeed Salehi, Felix Bartsch, Yu-Fang Yang, Melvin Selim Atay, Kelson Shilling-Scrivo, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Gagana B, Kelson Shilling-Scrivo, Spiros Chavlis\n",
     "\n",
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index 83e5a6cb8..3df862687 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -25,7 +25,7 @@
     "\n",
     "__Content creators:__ Jose Gallego-Posada, Ioannis Mitliagkas\n",
     "\n",
-    "__Content reviewers:__ Piyush Chauhan, Vladimir Haltakov, Siwei Bai, Kelson Shilling-Scrivo\n",
+    "__Content reviewers:__ Piyush Chauhan, Vladimir Haltakov, Siwei Bai, Kelson Shilling-Scrivo, Jiaxin Cindy Tu\n",
     "\n",
     "__Content editors:__ Charles J Edelson, Gagana B, Spiros Chavlis\n",
     "\n",

From 266969e352eee6abe17dba0d0793bcc8409a7586 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 22:19:44 -0400
Subject: [PATCH 22/34] add missing package dependencies to requirements.txt
 and environment.yml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 environment.yml  | 22 ++++++++++++++++++++++
 requirements.txt | 15 +++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/environment.yml b/environment.yml
index e69e0c342..3ce8d2d35 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,4 +11,26 @@ dependencies:
   - ipywidgets
   - pathlib
   - tqdm
+  - pandas
+  - pillow
+  - imageio
+  - seaborn
+  - nltk
+  - tensorboard
+  - flask
+  - coloredlogs
   - pip
+  - pip:
+    - altair
+    - datasets
+    - diffusers
+    - evaluate
+    - fasttext
+    - flair
+    - flasgger
+    - flask-restful
+    - pytorch-pretrained-biggan
+    - textattack
+    - tokenizers
+    - transformers
+    - vibecheck
diff --git a/requirements.txt b/requirements.txt
index b5eac4f7c..c97896efa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,18 @@ tensorboard>=2.19
 ipywidgets>=8.0
 tqdm>=4.0
 requests>=2.31
+altair
+coloredlogs
+datasets
+diffusers
+evaluate
+fasttext
+flair
+flasgger
+flask
+flask-restful
+pytorch-pretrained-biggan
+textattack
+tokenizers
+transformers
+vibecheck

From fc6119408a3bae737f774357e405c3921037a61c Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 22:24:22 -0400
Subject: [PATCH 23/34] restore W2D2 folder from upstream/main

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../README.md                                 |    0
 .../W2D2_BonusLecture.ipynb                   |    0
 .../W2D2_Tutorial1.ipynb                      |    0
 .../W2D2_Tutorial2.ipynb                      | 1381 +++++++++++++++++
 .../instructor/W2D2_BonusLecture.ipynb        |    0
 .../instructor/W2D2_Tutorial1.ipynb           |    0
 .../instructor/W2D2_Tutorial2.ipynb           |    0
 .../W2D2_Tutorial1_Solution_0adbc972.py       |    0
 .../W2D2_Tutorial1_Solution_1279086f.py       |    0
 .../W2D2_Tutorial1_Solution_168b8fcf.py       |    0
 .../W2D2_Tutorial1_Solution_18b18cac.py       |    0
 .../W2D2_Tutorial1_Solution_240aa557.py       |    0
 .../W2D2_Tutorial1_Solution_309474b2.py       |    0
 .../W2D2_Tutorial1_Solution_3ef24bd7.py       |    0
 .../W2D2_Tutorial1_Solution_4f643447.py       |    0
 .../W2D2_Tutorial1_Solution_6e9ea2ef.py       |    0
 .../W2D2_Tutorial1_Solution_78a81e50.py       |    0
 .../W2D2_Tutorial1_Solution_7c652c63.py       |    0
 .../W2D2_Tutorial1_Solution_7cc3340b.py       |    0
 .../W2D2_Tutorial1_Solution_800ed014.py       |    0
 .../W2D2_Tutorial1_Solution_82e644f4.py       |    0
 .../W2D2_Tutorial1_Solution_ae125a93.py       |    0
 .../W2D2_Tutorial1_Solution_c295e530.py       |    0
 .../static/Backpropagation.gif                |  Bin
 .../static/PoolingConvolution.svg             |    0
 .../W2D2_Tutorial1_Solution_0adbc972_3.png    |  Bin
 .../W2D2_Tutorial1_Solution_1279086f_3.png    |  Bin
 .../W2D2_Tutorial1_Solution_240aa557_3.png    |  Bin
 .../W2D2_Tutorial1_Solution_78a81e50_2.png    |  Bin
 .../W2D2_Tutorial1_Solution_78a81e50_3.png    |  Bin
 .../W2D2_Tutorial1_Solution_78a81e50_4.png    |  Bin
 .../static/chicago_skyline_shrunk_v2.bmp      |  Bin
 .../static/correlation.svg                    |    0
 .../static/img_params.png                     |  Bin
 .../static/interactive_demo2.2.html           |    0
 .../static/interactive_demo2.html             |    0
 .../static/interactive_demo3.3.html           |    0
 .../static/interactive_demo3.html             |    0
 .../static/relu.png                           |  Bin
 .../static/think0.png                         |  Bin
 .../static/twain.txt                          |    0
 .../student/W2D2_BonusLecture.ipynb           |    0
 .../student/W2D2_Tutorial1.ipynb              |    0
 .../student/W2D2_Tutorial2.ipynb              |    0
 44 files changed, 1381 insertions(+)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/README.md (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/W2D2_BonusLecture.ipynb (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/W2D2_Tutorial1.ipynb (100%)
 create mode 100644 tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/instructor/W2D2_BonusLecture.ipynb (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/instructor/W2D2_Tutorial1.ipynb (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/instructor/W2D2_Tutorial2.ipynb (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_0adbc972.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_1279086f.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_168b8fcf.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_18b18cac.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_240aa557.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_309474b2.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_4f643447.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_78a81e50.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_7c652c63.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_7cc3340b.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_800ed014.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_82e644f4.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_ae125a93.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/solutions/W2D2_Tutorial1_Solution_c295e530.py (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/Backpropagation.gif (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/PoolingConvolution.svg (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/W2D2_Tutorial1_Solution_0adbc972_3.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/W2D2_Tutorial1_Solution_1279086f_3.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/W2D2_Tutorial1_Solution_240aa557_3.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/W2D2_Tutorial1_Solution_78a81e50_2.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/W2D2_Tutorial1_Solution_78a81e50_3.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/W2D2_Tutorial1_Solution_78a81e50_4.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/chicago_skyline_shrunk_v2.bmp (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/correlation.svg (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/img_params.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/interactive_demo2.2.html (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/interactive_demo2.html (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/interactive_demo3.3.html (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/interactive_demo3.html (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/relu.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/think0.png (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/static/twain.txt (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/student/W2D2_BonusLecture.ipynb (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/student/W2D2_Tutorial1.ipynb (100%)
 rename tutorials/{W2D2_Convnets => W2D2_ConvnetsAndDlThinking}/student/W2D2_Tutorial2.ipynb (100%)

diff --git a/tutorials/W2D2_Convnets/README.md b/tutorials/W2D2_ConvnetsAndDlThinking/README.md
similarity index 100%
rename from tutorials/W2D2_Convnets/README.md
rename to tutorials/W2D2_ConvnetsAndDlThinking/README.md
diff --git a/tutorials/W2D2_Convnets/W2D2_BonusLecture.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_BonusLecture.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/W2D2_BonusLecture.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/W2D2_BonusLecture.ipynb
diff --git a/tutorials/W2D2_Convnets/W2D2_Tutorial1.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial1.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/W2D2_Tutorial1.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial1.ipynb
diff --git a/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb
new file mode 100644
index 000000000..7df248018
--- /dev/null
+++ b/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb
@@ -0,0 +1,1381 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "execution": {},
+    "id": "view-in-github"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W2D2_ConvnetsAndDlThinking/W2D2_Tutorial2.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "# Tutorial 2: Deep Learning Thinking 1: Cost Functions\n",
+    "\n",
+    "**Week 2, Day 2: Convnets and DL Thinking**\n",
+    "\n",
+    "**By Neuromatch Academy**\n",
+    "\n",
+    "\n",
+    "__Content creators:__ Konrad Kording, Lyle Ungar, Ashish Sahoo\n",
+    "\n",
+    "__Content reviewers:__ Kelson Shilling-Scrivo\n",
+    "\n",
+    "__Content editors:__ Kelson Shilling-Scrivo\n",
+    "\n",
+    "__Production editors:__ Gagana B, Spiros Chavlis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Tutorial Objectives\n",
+    "\n",
+    "In this tutorial, you will practice thinking like a deep learning practitioner and determine how to design cost functions for different scenarios.\n",
+    "\n",
+    "By the end of this tutorial, you will be better able to:\n",
+    "\n",
+    "* Appreciate the importance of cost function engineering\n",
+    "* Translate domain knowledge into cost functions\n",
+    "* Ask questions about DL systems and customer needs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Tutorial slides\n",
+    "from IPython.display import IFrame\n",
+    "link_id = \"szcjn\"\n",
+    "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n",
+    "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Install and import feedback gadget\n",
+    "\n",
+    "!pip3 install vibecheck datatops --quiet\n",
+    "\n",
+    "from vibecheck import DatatopsContentReviewContainer\n",
+    "def content_review(notebook_section: str):\n",
+    "    return DatatopsContentReviewContainer(\n",
+    "        \"\",  # No text prompt\n",
+    "        notebook_section,\n",
+    "        {\n",
+    "            \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
+    "            \"name\": \"neuromatch_dl\",\n",
+    "            \"user_key\": \"f379rz8y\",\n",
+    "        },\n",
+    "    ).render()\n",
+    "\n",
+    "\n",
+    "feedback_prefix = \"W2D2_T2\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 1: Intro to Deep Learning Thinking\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 1: Intro to DL Thinking\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'iEqd0MY5pxI'), ('Bilibili', 'BV1hL4y1P73s')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Intro_to_DL_Thinking_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "This tutorial is a bit different from others - there will be no coding! Instead you will watch a series of vignettes about various scenarios where you want to use a neural network. This tutorial will focus on cost functions; a tutorial you will see later in the course will be similar but focused on designing architectures.\n",
+    "\n",
+    "Each section below will start with a vignette where either Lyle or Konrad is trying to figure out how to set up a neural network for a specific problem. Try to think of questions you want to ask them as you watch, then pay attention to what questions Lyle and Konrad are asking. Were they what you would have asked? How do their questions help quickly clarify the situation?\n",
+    "\n",
+    "\n",
+    "You will work together as a group to try to come up with cost functions for each example, with hints available along the way. This may be difficult - deep learning in the real world often is! So try your best but don't get discouraged if you don't reach the solution - you'll learn a lot from the process of trying to.\n",
+    "\n",
+    "You have already seen cost functions (sometimes also called objective functions or loss functions) for deep neural networks - you need one to perform gradient descent and train a neural network.  It turns out what cost function you choose to minimize is incredibly important - it is how you define success of your network after all, so you want to define success in a good way! And cost functions are not one size fits all - you need to carefully choose cost functions according to what you want your neural network to do - as you will see in the following scenarios.\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 2: Cost function for neurons\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 2: Spiking Neuron Predictions Vignette\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'CC4gMRrE31g'), ('Bilibili', 'BV1Jt4y187UU')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Spiking_Neuron_Predictions_Video\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 3: Spiking Neuron Predictions Set-up\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'vJ7MixhmDh8'), ('Bilibili', 'BV1X94y1y7SH')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Spiking_Neuron_Predictions_SetUp_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Konrad, a neuroscientist, wants to predict what neurons in someone's motor cortex are doing while they are riding a motorcycle.\n",
+    "\n",
+    "Upon discussion with Lyle, it emerges that we have data on 12 parameters of motorcycle riding, including acceleration, angle, braking, degrees of leaning. These inputs are fairly smooth over time; for example, the angle of the motorcycle typically does not change much in 100 ms.\n",
+    "\n",
+    "We also have recorded data on the timing of spikes of $N$ neurons in motor cortex. The underlying firing rate is smooth but every millisecond spikes are random and independent. This means we can assume that the number of spikes in a short interval can be modeled using a Poisson distribution with an underlying firing rate for that interval $\\lambda$.\n",
+    "\n",
+    "For neuron $i$, the probability of seeing $k_{i}$ spikes in some interval given an underlying firing rate $\\lambda_{i}$ is:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\mathcal{f(k_{i}:\\lambda_{i})} = \\mathcal{Pr}(X=k_{i}) = \\frac {\\lambda_{i}^{k_{i}}e^{-\\lambda_{i}}}{k_{i}!}\n",
+    "\\end{equation}\n",
+    "\n",
+    "So this Poisson distribution may be relevant if we want to, in a way, have a good model for the spiking of neurons."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 1: Designing a cost function to predict neural activities\n",
+    "\n",
+    "Given everything you know, how would you design a cost function for a neural network that Konrad is training to predict neural activity given the motorcycle riding parameters? Remember that we are predicting the activity of all $N$ neurons, not just one. Try to write out an equation!\n",
+    "\n",
+    "\n",
+    "Please discuss as a group. If you get stuck, you can uncover the hints below one at a time. Please spend some time discussing before uncovering the next hint though! You are being real deep learning scientists now and the answers won't be easy.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 1 </font></summary>\n",
+    "\n",
+    "You get time-stamps for the spikes. You will want to do binning into 50 ms bins. You get $k_{i, t}$ for every neuron $i$ and time bin $t$, the spike count for that neuron in that time bin.  What will the neural network predict?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 2 </font></summary>\n",
+    "\n",
+    "For each bin you can use your neural network model to predict an estimate of $\\lambda_{i,t}$, the number of spikes for neuron $i$ expected at that time bin $t$. The network should get as input the relevant aspects of the motorcycle riding at the relevant times (and potentially of the previous times)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 3 </font></summary>\n",
+    "\n",
+    "You need an equation relating $\\lambda_{i,t}$ (the model prediction) with $k_{i, t}$ (your data) where changing $\\lambda_{i,t}$  to minimize or maximize the number resulting from this equation results in better predictions.  What do we already know about the relationship between $\\lambda_{i,t}$ and $k_{i, t}$ that helps us here?\n",
+    "\n",
+    "Once you have that, how do you extend to incorporate all neurons and time bins?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 4 </font></summary>\n",
+    "\n",
+    "We can treat the bins independently as the spikes are random and independent every millisecond."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for the solution </font></summary>\n",
+    "\n",
+    "\n",
+    "First, we will convert our spike timing data to the number of spikes per time bin for time bins of size 50 ms. This gives us $k_{i,t}$ for every neuron $i$ and time bin $t$.\n",
+    "\n",
+    "We are assuming a Poisson distribution for our spiking. That means that we get the probability of seeing spike count $k_{i, t}$  given underlying firing rate $\\lambda_{i, t}$ using this equation:\n",
+    "\n",
+    "\n",
+    "\\begin{equation}\n",
+    "f(k_{i,t}; \\lambda_{i,t}) = \\mathcal{Pr}(X=k_{i,t}) = \\frac {\\lambda_{i,t}^{k_{i,t}}e^{-\\lambda_{i,t}}}{k_{i,t}!}\n",
+    "\\end{equation}\n",
+    "\n",
+    "That seems a pretty good thing to optimize to make our predictions as good as possible! We want a high probability of seeing the actual spike count we recorded given the neural network prediction of the underlying firing rate.\n",
+    "\n",
+    "We will make this negative later so we have an equation that we want to minimize rather than maximize, so we can use all our normal tricks for minimization (instead of maximization). First though, let's scale up to include all our neurons and time bins.\n",
+    "\n",
+    "We can treat each time bin as independent because, while the underlying probability of firing changes slowly, every millisecond spiking is random and independent. From probability, we know that we can compute the probability of a set of independent events (all the spike counts) by multiplying the probabilities of each event. So the probability of seeing all of our data given the neural network predictions is all of our probabilities of $k_{i,t}$ multiplied together:\n",
+    "\n",
+    "\\begin{align}\n",
+    "\\mathcal{Pr}(\\text{all data}) &= \\prod_{i=1}^{N}\\prod_{t=1}^{T} \\mathcal{Pr}(X=k_{i,t})\\\\\n",
+    "&= \\prod_{i=1}^{N}\\prod_{t=1}^{T} \\frac {\\lambda_{i,t}^{k_{i,t}}e^{-\\lambda_{i,t}}}{k_{i,t}!}\n",
+    "\\end{align}\n",
+    "\n",
+    "This is also known as our likelihood!\n",
+    "\n",
+    "We usually use the log likelihood instead of the likelihood when minimizing or maximizing for numerical computation reasons. We can convert the above equation to log likelihood:\n",
+    "\n",
+    "\\begin{align}\n",
+    "\\text{log likelihood} &= \\sum_{i=1}^N\\sum_{t=1}^{T} \\text{log}(\\mathcal{Pr}(X=k_{i,t})) \\\\\n",
+    "&= \\sum_{i=1}^N\\sum_{t=1}^{T} \\left[ k_{i,t} \\text{log}(\\lambda_{i,t}) - \\lambda_{i,t} - \\text{log}(k_{i,t}!) \\right]\n",
+    "\\end{align}\n",
+    "\n",
+    "And last but not least, we want to make it negative so we can minimize instead of maximize:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\text{negative log likelihood}\n",
+    "= \\sum_{i=1}^N\\sum_{t=1}^{T} \\left[ - k_{i,t} \\text{log}(\\lambda_{i,t}) + \\lambda_{i,t} + \\text{log}(k_{i,t}!) \\right]\n",
+    "\\end{equation}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Designing_a_cost_function_to_predict_neural_activities_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 4: Spiking Neurons Wrap-up\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'fb6A03B2U5g'), ('Bilibili', 'BV1K94y117rH')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Spiking_Neurons_WrapUp_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Check out the papers mentioned in the above video:\n",
+    "\n",
+    "- [Fast inference in generalized linear models via expected log likelihood](https://link.springer.com/article/10.1007/s10827-013-0466-4)\n",
+    "\n",
+    "- [Machine Learning for Neural Decoding](https://www.eneuro.org/content/7/4/ENEURO.0506-19.2020)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## (Bonus) Think!: Non-Poisson neurons\n",
+    "\n",
+    "If you have time discuss the following. The spiking distributions don't seem quite Poisson.  Find a good replacement for your cost function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_NonPoisson_neurons_Bonus_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 3: How can an ANN know its uncertainty"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 5: ANN Uncertainty Vignette\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'b2N2OJ2u4AM'), ('Bilibili', 'BV1UN4y1u7Ws')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_Vignette_Video\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 6: ANN Uncertainty Set-up\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'Reh-gNiOwkQ'), ('Bilibili', 'BV1B34y1W7F8')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_SetUp_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Lyle wants to build an artificial neural network that has a measure of its own uncertainty about its predictions. He wants the neural network to give a prediction/estimate and an uncertainty, or standard deviation, measurement on it.\n",
+    "\n",
+    "Let's say Lyle wants to estimate the location of an atom in a chemical molecule based on various inputs. He wants to have the estimate of the location and an estimate of the variance. We don't train neural networks on one data point at a time though - he wants a cost function that takes in N data points (input and atom location pairings).\n",
+    "\n",
+    "We think we may be able to use a Gaussian distribution to help Lyle here:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "g(x) = \\frac{1}{\\sigma\\sqrt{2\\pi}} \\text{exp} \\left( -\\frac{1}{2}\\frac{(x-\\mu)^2}{\\sigma^2} \\right)\n",
+    "\\end{equation}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 2: Designing a cost function so we measure uncertainty\n",
+    "\n",
+    "Given everything you know, how would you design a cost function for a neural network that Lyle is training so that he can get the estimate and the uncertainty of the estimate? Try to write out an equation!\n",
+    "\n",
+    "Please discuss as a group. If you get stuck, you can uncover the hints below one at a time. Please spend some time discussing before uncovering the next hint, though! You are being real deep learning scientists now, and the answers won't be easy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 1 </font></summary>\n",
+    "\n",
+    "Look at the Gaussian equation. Where is the true location? Where is the estimate of the location? Where is the uncertainty?\n",
+    "\n",
+    "What do you want the neural network to predict for one data point (recorded location) given the inputs?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 2 </font></summary>\n",
+    "\n",
+    "What did you learn from working through Section 2 that you can use here?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 3 </font></summary>\n",
+    "\n",
+    "In section 2, you learned that you want to go from probabilities to negative log likelihoods to form cost functions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for the solution </font></summary>\n",
+    "\n",
+    "For a given set of inputs, we want the neural network to predict the location of the atom and the uncertainty of that estimate. Standard deviation is a great measure of uncertainty so we can predict the mean and standard deviation of the location (instead of just the mean as is more common).\n",
+    "\n",
+    "So how do we design a cost function that involves the mean and standard deviation?  We can assume a Gaussian distribution over the location. The neural network can predict the mean of that Gaussian (that's the estimate of the location) and the standard deviation of that Gaussian (that's the uncertainty measure) for a given set of inputs.\n",
+    "\n",
+    "Now that we've got that figured out, we can take a very similar approach to what we did in Section 2 with spiking neurons. For a given data point $i$, the neural network predicts the mean ($\\mu_i$) and standard deviation ($\\sigma_i$) of the location given the inputs. We can then compute the probability of seeing the actual recorded location ($x_i$) given these predictions:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "g(x_i) = \\frac{1}{\\sigma_i\\sqrt{2\\pi}} \\text{exp}\\left( -\\frac{1}{2}\\frac{(x_i-\\mu_i)^2}{\\sigma_i^2} \\right)\n",
+    "\\end{equation}\n",
+    "\n",
+    "The location of the atom is independent in each data point so we can get the overall likelihood by multiplying the probabilities for the individual data points.\n",
+    "\\begin{equation}\n",
+    "\\text{likelihood} = \\prod_{i=1}^N\\frac{1}{\\sigma_i\\sqrt{2\\pi}} \\text{exp}\\left( -\\frac{1}{2}\\frac{(x_i-\\mu_i)^2}{\\sigma_i^2} \\right)\n",
+    "\\end{equation}\n",
+    "\n",
+    "\n",
+    "And, as before, we want to take the log of this for numerical reasons and convert to negative log likelihood:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\text{negative log likelihood} = -\\sum_{i=1}^N \\text{log} \\left( \\frac{1}{\\sigma_i\\sqrt{2\\pi}} \\text{exp}\\left( -\\frac{1}{2}\\frac{(x_i-\\mu_i)^2}{\\sigma_i^2} \\right) \\right)\n",
+    "\\end{equation}\n",
+    "\n",
+    "Changing the parameters of the neural network so it predicts $\\mu_i$ and $\\sigma_i$ that minimize this equation will give us (hopefully fairly accurate) predictions of the location and the network uncertainty about the location!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 7: ANN Uncertainty Wrap-up\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'QBKAFRaC8SY'), ('Bilibili', 'BV1zv4y1M7C8')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_ANN_Uncertainty_WrapUp_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Check out the papers mentioned in the above video:\n",
+    "\n",
+    "- [Rapid prediction of NMR spectral properties with quantified uncertainty](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0374-3)\n",
+    "\n",
+    "- [Deep imitation learning for molecular inverse problems](https://papers.nips.cc/paper/2019/file/b0bef4c9a6e50d43880191492d4fc827-Paper.pdf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## (Bonus) Think!: Negative standard deviations\n",
+    "\n",
+    "If the standard deviation is negative, the negative log-likelihood will fail as you'd take the log of a negative number. What should we do to ensure we don't run into this while training our neural network?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Negative_standard_deviations_Bonus_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Section 4: Embedding faces"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 8: Embedding Faces Vignette\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'tF0iYBAnyrI'), ('Bilibili', 'BV1NY411K7f6')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Embedding_Faces_Vignette_Video\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 9: Embedding Faces Set-up\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'JrzicfOxqP0'), ('Bilibili', 'BV1fv4y1M7eQ')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Embedding_Faces_SetUp_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Konrad needs help recognizing faces. He wants to build a network that embeds photos of faces so that photos of the same person are nearby in the embedding space and photos of different people are far in the embedding space. We can't just use pixel space because the pixels will be very different between a photo of someone straight on vs. from their side!\n",
+    "\n",
+    "We will use a neural network to go from the pixels of each image to an embedding space. Let's say you have a convolutional neural network with $m$ units in the last layer. If you feed a face photo $i$ through the CNN, the activities of the units in the last layer form an $m$-dimensional vector $\\bar{y}_i$ - this is an embedding of that face photo in $m$-dimensional space.\n",
+    "\n",
+    "We think we might be able to incorporate Euclidean distance to help us here. The Euclidean distance between two vectors is:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "d(\\bar{y}_i, \\bar{y}_j) = \\sqrt{\\sum_{c=1}^m(\\bar{y}_{i_c} - \\bar{y}_{j_c})^2}\n",
+    "\\end{equation}\n",
+    "\n",
+    "<br>\n",
+    "\n",
+    "**Note:** a minor remark here, there is an indexing error in the video where it says $i$ instead of $j$."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "## Think! 3: Designing a cost function for face embedding\n",
+    "\n",
+    "Given everything you know, how would you design a cost function for a neural network that Konrad is training so that he can get a helpful embedding of faces? Try to write out an equation!\n",
+    "\n",
+    "Please discuss as a group. If you get stuck, you can uncover the hints below one at a time. Please spend some time discussing before uncovering the next hint, though! You are being real deep learning scientists now, and the answers won't be easy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 1 </font></summary>\n",
+    "\n",
+    "How do we want to deal with the same faces? Can we just build a cost function based on similar faces? What would happen?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 2 </font></summary>\n",
+    "\n",
+    "You need to also include different faces. How do you want to deal with different faces?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for hint 3 </font></summary>\n",
+    "\n",
+    "Similar faces should have low Euclidean distance between their embeddings. Different faces should have high Euclidean distance between their embeddings. Can we phrase this with 3 faces?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "<details>\n",
+    "<summary> <font color='green'>Click here for the solution </font></summary>\n",
+    "\n",
+    "We want the same faces to have similar embeddings. Let's say we have one photo of Lyle $a$ and another photo of Lyle $p$. We want the embeddings of those photos to be very similar: we want the Euclidean distance between $\\bar{y}_a$ and $\\bar{y}_p$ (the activities of the last layer of the CNN when photo $a$ and $p$ are fed through) to be small.\n",
+    "\n",
+    "So one possible cost function is:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\text{Cost function} = d(\\bar{y}_a, \\bar{y}_p)\n",
+    "\\end{equation}\n",
+    "\n",
+    "Imagine if we just feed in pairs of the same face and minimize that though. There would be no motivation to ever have different embeddings, we would be only minimizing the distance between embeddings. If the CNN was smart, it would just have the same embedding for every single photo - then the cost function would equal 0!\n",
+    "\n",
+    "This is clearly not what we want. We want to motivate the CNN to have similar embeddings only when the faces are the same. This means we need to also train it to maximize distance when the faces are different.\n",
+    "\n",
+    "We could choose another two photos of different people and maximize that distance but then there's no relation to the embeddings we've already established of the two photos of Lyle.  Instead, we will add one more photo to the mix: a photo of Konrad $n$. We want the distance of this photo to be far from our original photos of Lyle $a$ and $p$.  So we want the distance between $a$ and $p$ to be small and the distance between $a$ and $n$ for example to be large:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\text{Cost function} = d(\\bar{y}_a, \\bar{y}_p) - d(\\bar{y}_a, \\bar{y}_n)\n",
+    "\\end{equation}\n",
+    "\n",
+    "We could compare $n$ to both $a$ and $p$:\n",
+    "\\begin{equation}\n",
+    "\\text{Cost function} = d(\\bar{y}_a, \\bar{y}_p) - d(\\bar{y}_a, \\bar{y}_n) - d(\\bar{y}_p, \\bar{y}_n)\n",
+    "\\end{equation}\n",
+    "\n",
+    "But then the cost function is a bit unbalanced, there are two dissimiliarty terms and they might dominate (so achieving the similarity is less important). So let's go with just including one dissimilarity term.\n",
+    "\n",
+    "This is an established cost function - triplet loss! We chose the subscripts $a$, $p$, and $n$ for a reason: we have an anchor image, a positive image (the same person's face as the anchor) and a negative image (a different person's face as the anchor). We can then sum over N data points where each data point is a set of three images:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\text{Cost function} = \\sum_{i=1}^N [d(\\bar{y}_{a, i}, \\bar{y}_{p, i}) - d(\\bar{y}_{a, i}, \\bar{y}_{n, i})]\n",
+    "\\end{equation}\n",
+    "\n",
+    "There's one little addition in triplet loss. Instead of just using the above cost function, researchers add a constant $\\alpha$ and then make the cost function 0 if it becomes negative. Why do you think they do this?\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\text{Cost function} = \\text{max} \\left( \\sum_{i=1}^N \\left[ d(\\bar{y}_{a, i}, \\bar{y}_{p, i}) - d(\\bar{y}_{a, i}, \\bar{y}_{n, i}) + \\alpha \\right], 0 \\right)\n",
+    "\\end{equation}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Embedding_Faces_Discussion\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Video 10: Embedding Faces Wrap-up\n",
+    "from ipywidgets import widgets\n",
+    "from IPython.display import YouTubeVideo\n",
+    "from IPython.display import IFrame\n",
+    "from IPython.display import display\n",
+    "\n",
+    "\n",
+    "class PlayVideo(IFrame):\n",
+    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
+    "    self.id = id\n",
+    "    if source == 'Bilibili':\n",
+    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
+    "    elif source == 'Osf':\n",
+    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
+    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
+    "\n",
+    "\n",
+    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
+    "  tab_contents = []\n",
+    "  for i, video_id in enumerate(video_ids):\n",
+    "    out = widgets.Output()\n",
+    "    with out:\n",
+    "      if video_ids[i][0] == 'Youtube':\n",
+    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
+    "                             height=H, fs=fs, rel=0)\n",
+    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
+    "      else:\n",
+    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
+    "                          height=H, fs=fs, autoplay=False)\n",
+    "        if video_ids[i][0] == 'Bilibili':\n",
+    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
+    "        elif video_ids[i][0] == 'Osf':\n",
+    "          print(f'Video available at https://osf.io/{video.id}')\n",
+    "      display(video)\n",
+    "    tab_contents.append(out)\n",
+    "  return tab_contents\n",
+    "\n",
+    "\n",
+    "video_ids = [('Youtube', 'mVk1W7x6Nps'), ('Bilibili', 'BV1nf4y1f7oL')]\n",
+    "tab_contents = display_videos(video_ids, W=854, H=480)\n",
+    "tabs = widgets.Tab()\n",
+    "tabs.children = tab_contents\n",
+    "for i in range(len(tab_contents)):\n",
+    "  tabs.set_title(i, video_ids[i][0])\n",
+    "display(tabs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "execution": {}
+   },
+   "outputs": [],
+   "source": [
+    "# @title Submit your feedback\n",
+    "content_review(f\"{feedback_prefix}_Embedding_Faces_WrapUp_Video\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "Check out the papers mentioned in the above video:\n",
+    "\n",
+    "- [Large Scale Online Learning of Image Similarity Through Ranking](https://www.jmlr.org/papers/volume11/chechik10a/chechik10a.pdf)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {}
+   },
+   "source": [
+    "---\n",
+    "# Summary\n",
+    "\n",
+    "Today we have seen a range of different cost functions. So we want to dwell a bit on what we want people to take away from these exercises. We have seen several cost functions:\n",
+    "\n",
+    "* Log Poisson likelihood for neurons\n",
+    "* Uncertainty as a modeled entity\n",
+    "* Face embeddings\n",
+    "\n",
+    "What we saw in all these cases is that these cost functions emerge from insights into the problem domain. We saw how one needs to, in a way, pull these insights out of the domain experts. And how, at the same time, the cost functions come from computational insights. Coming up with the proper cost functions requires listening to what domain experts say and probing the things they may mean but not say."
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [],
+   "include_colab_link": true,
+   "name": "W2D2_Tutorial2",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernel": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tutorials/W2D2_Convnets/instructor/W2D2_BonusLecture.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_BonusLecture.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/instructor/W2D2_BonusLecture.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_BonusLecture.ipynb
diff --git a/tutorials/W2D2_Convnets/instructor/W2D2_Tutorial1.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial1.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/instructor/W2D2_Tutorial1.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial1.ipynb
diff --git a/tutorials/W2D2_Convnets/instructor/W2D2_Tutorial2.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial2.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/instructor/W2D2_Tutorial2.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/instructor/W2D2_Tutorial2.ipynb
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_0adbc972.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_0adbc972.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_0adbc972.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_0adbc972.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_1279086f.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_1279086f.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_1279086f.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_1279086f.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_168b8fcf.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_168b8fcf.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_168b8fcf.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_168b8fcf.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_18b18cac.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_18b18cac.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_18b18cac.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_18b18cac.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_240aa557.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_240aa557.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_240aa557.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_240aa557.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_309474b2.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_309474b2.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_309474b2.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_309474b2.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_3ef24bd7.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_4f643447.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_4f643447.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_4f643447.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_4f643447.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_6e9ea2ef.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_78a81e50.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_78a81e50.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_78a81e50.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_78a81e50.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7c652c63.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7c652c63.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7c652c63.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7c652c63.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7cc3340b.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7cc3340b.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_7cc3340b.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_7cc3340b.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_800ed014.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_800ed014.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_800ed014.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_800ed014.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_82e644f4.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_82e644f4.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_82e644f4.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_82e644f4.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_ae125a93.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_ae125a93.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_ae125a93.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_ae125a93.py
diff --git a/tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_c295e530.py b/tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_c295e530.py
similarity index 100%
rename from tutorials/W2D2_Convnets/solutions/W2D2_Tutorial1_Solution_c295e530.py
rename to tutorials/W2D2_ConvnetsAndDlThinking/solutions/W2D2_Tutorial1_Solution_c295e530.py
diff --git a/tutorials/W2D2_Convnets/static/Backpropagation.gif b/tutorials/W2D2_ConvnetsAndDlThinking/static/Backpropagation.gif
similarity index 100%
rename from tutorials/W2D2_Convnets/static/Backpropagation.gif
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/Backpropagation.gif
diff --git a/tutorials/W2D2_Convnets/static/PoolingConvolution.svg b/tutorials/W2D2_ConvnetsAndDlThinking/static/PoolingConvolution.svg
similarity index 100%
rename from tutorials/W2D2_Convnets/static/PoolingConvolution.svg
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/PoolingConvolution.svg
diff --git a/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_0adbc972_3.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_0adbc972_3.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_0adbc972_3.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_0adbc972_3.png
diff --git a/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_1279086f_3.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_1279086f_3.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_1279086f_3.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_1279086f_3.png
diff --git a/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_240aa557_3.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_240aa557_3.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_240aa557_3.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_240aa557_3.png
diff --git a/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_2.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_2.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_2.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_2.png
diff --git a/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_3.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_3.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_3.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_3.png
diff --git a/tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_4.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_4.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/W2D2_Tutorial1_Solution_78a81e50_4.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/W2D2_Tutorial1_Solution_78a81e50_4.png
diff --git a/tutorials/W2D2_Convnets/static/chicago_skyline_shrunk_v2.bmp b/tutorials/W2D2_ConvnetsAndDlThinking/static/chicago_skyline_shrunk_v2.bmp
similarity index 100%
rename from tutorials/W2D2_Convnets/static/chicago_skyline_shrunk_v2.bmp
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/chicago_skyline_shrunk_v2.bmp
diff --git a/tutorials/W2D2_Convnets/static/correlation.svg b/tutorials/W2D2_ConvnetsAndDlThinking/static/correlation.svg
similarity index 100%
rename from tutorials/W2D2_Convnets/static/correlation.svg
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/correlation.svg
diff --git a/tutorials/W2D2_Convnets/static/img_params.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/img_params.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/img_params.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/img_params.png
diff --git a/tutorials/W2D2_Convnets/static/interactive_demo2.2.html b/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.2.html
similarity index 100%
rename from tutorials/W2D2_Convnets/static/interactive_demo2.2.html
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.2.html
diff --git a/tutorials/W2D2_Convnets/static/interactive_demo2.html b/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.html
similarity index 100%
rename from tutorials/W2D2_Convnets/static/interactive_demo2.html
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo2.html
diff --git a/tutorials/W2D2_Convnets/static/interactive_demo3.3.html b/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.3.html
similarity index 100%
rename from tutorials/W2D2_Convnets/static/interactive_demo3.3.html
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.3.html
diff --git a/tutorials/W2D2_Convnets/static/interactive_demo3.html b/tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.html
similarity index 100%
rename from tutorials/W2D2_Convnets/static/interactive_demo3.html
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/interactive_demo3.html
diff --git a/tutorials/W2D2_Convnets/static/relu.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/relu.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/relu.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/relu.png
diff --git a/tutorials/W2D2_Convnets/static/think0.png b/tutorials/W2D2_ConvnetsAndDlThinking/static/think0.png
similarity index 100%
rename from tutorials/W2D2_Convnets/static/think0.png
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/think0.png
diff --git a/tutorials/W2D2_Convnets/static/twain.txt b/tutorials/W2D2_ConvnetsAndDlThinking/static/twain.txt
similarity index 100%
rename from tutorials/W2D2_Convnets/static/twain.txt
rename to tutorials/W2D2_ConvnetsAndDlThinking/static/twain.txt
diff --git a/tutorials/W2D2_Convnets/student/W2D2_BonusLecture.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_BonusLecture.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/student/W2D2_BonusLecture.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_BonusLecture.ipynb
diff --git a/tutorials/W2D2_Convnets/student/W2D2_Tutorial1.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial1.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/student/W2D2_Tutorial1.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial1.ipynb
diff --git a/tutorials/W2D2_Convnets/student/W2D2_Tutorial2.ipynb b/tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial2.ipynb
similarity index 100%
rename from tutorials/W2D2_Convnets/student/W2D2_Tutorial2.ipynb
rename to tutorials/W2D2_ConvnetsAndDlThinking/student/W2D2_Tutorial2.ipynb

From 4909c29d57fa97c67247721d375f81c16ab17f7d Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 22:26:49 -0400
Subject: [PATCH 24/34] restore tutorials/materials.yml from upstream/main

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tutorials/materials.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tutorials/materials.yml b/tutorials/materials.yml
index 59b21a5dc..cd7ddb7a3 100644
--- a/tutorials/materials.yml
+++ b/tutorials/materials.yml
@@ -60,15 +60,17 @@
 
 - day: W2D2
   category: ConvNets and Generative Models
-  name: Convnets
+  name: Convnets And Dl Thinking
   playlist: https://youtube.com/playlist?list=PLkBQOLLbi18MYWrGf6xW2d8CRyF83m592
 
   slides:
   - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/s8xz5/?direct%26mode=render%26action=download%26mode=render
     title: Tutorial 1
+  - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/szcjn/?direct%26mode=render%26action=download%26mode=render
+    title: Tutorial 2
   - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/r9pjc/?direct%26mode=render%26action=download%26mode=render
     title: Bonus Lecture
-  tutorials: 1
+  tutorials: 2
 
 - day: W2D3
   category: ConvNets and Generative Models

From 1dc75c9ded76c847fc05fc1f84c9cc45c0dd58dd Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Tue, 9 Jun 2026 22:29:50 -0400
Subject: [PATCH 25/34] add week1 changes to materials.yml

---
 tutorials/materials.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/materials.yml b/tutorials/materials.yml
index cd7ddb7a3..5fd80e017 100644
--- a/tutorials/materials.yml
+++ b/tutorials/materials.yml
@@ -4,7 +4,7 @@
   playlist: https://youtube.com/playlist?list=PLkBQOLLbi18MGF3aRYZ-Ya_lexek248qJ
 
   slides:
-  - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/wcjrv/?direct%26mode=render%26action=download%26mode=render
+  - link: https://mfr.ca-1.osf.io/render?url=https://osf.io/dg4h7/?direct%26mode=render%26action=download%26mode=render
     title: Tutorial 1
   tutorials: 1
 

From 42194832b292864b3da88bb295428d01bb2066c6 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Wed, 10 Jun 2026 03:18:06 -0400
Subject: [PATCH 26/34] update requirements

---
 environment.yml  | 11 +++++++++--
 requirements.txt |  7 +++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index 3ce8d2d35..405874d64 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
 name: nma-dl
 dependencies:
-  - python=3.7
+  - python=3.9
   - requests
   - numpy
   - scipy
@@ -8,6 +8,7 @@ dependencies:
   - scikit-learn
   - pytorch
   - torchvision
+  - torchaudio
   - ipywidgets
   - pathlib
   - tqdm
@@ -19,17 +20,23 @@ dependencies:
   - tensorboard
   - flask
   - coloredlogs
+  - conda-forge::fasttext
+  - conda-forge::imageio-ffmpeg
   - pip
   - pip:
+    - accelerate
     - altair
     - datasets
     - diffusers
     - evaluate
-    - fasttext
+    - facenet-pytorch
     - flair
     - flasgger
     - flask-restful
+    - libsixel-python
+    - pyngrok
     - pytorch-pretrained-biggan
+    - python-Levenshtein
     - textattack
     - tokenizers
     - transformers
diff --git a/requirements.txt b/requirements.txt
index c97896efa..fc4b1f8ee 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,27 +11,34 @@ pandas>=2.2
 matplotlib>=3.10
 torch>=2.0
 torchvision>=0.15
+torchaudio>=2.0
 scikit-learn>=1.3
 scipy>=1.13
 Pillow>=10.0
 imageio>=2.30
+imageio-ffmpeg
 seaborn>=0.13
 nltk>=3.9
 tensorboard>=2.19
 ipywidgets>=8.0
 tqdm>=4.0
 requests>=2.31
+accelerate
 altair
 coloredlogs
 datasets
 diffusers
 evaluate
+facenet-pytorch
 fasttext
 flair
 flasgger
 flask
 flask-restful
+libsixel-python
+pyngrok
 pytorch-pretrained-biggan
+python-Levenshtein
 textattack
 tokenizers
 transformers

From 51e76beb8abd45dbb48ad83572cf477f676e159b Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Wed, 10 Jun 2026 04:10:10 -0400
Subject: [PATCH 27/34] update week1 install dependencies

---
 .../W1D1_Tutorial1.ipynb                      | 446 +++++-------------
 1 file changed, 125 insertions(+), 321 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 06cd220b6..672cdc14e 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -8,9 +8,7 @@
    },
    "source": [
     "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a> &nbsp; <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb\" target=\"_parent\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" alt=\"Open in Kaggle\"/></a>"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -29,9 +27,7 @@
     "__Content editors:__ Anoop Kulkarni, Spiros Chavlis\n",
     "\n",
     "__Production editors:__ Arush Tagade, Spiros Chavlis"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -48,9 +44,7 @@
     "* Train NaiveNet\n",
     "* Get to know your pod\n",
     "* Start thinking about the course as a whole"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -73,9 +67,7 @@
    "source": [
     "---\n",
     "# Setup"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -86,9 +78,34 @@
     "Be sure to run all of the cells in the setup section. Feel free to expand them and have a look at what you are loading in, but you should be able to fulfill the learning objectives of every tutorial without having to look at these cells.\n",
     "\n",
     "If you start building your own projects built on this code base we highly recommend looking at them in more detail."
-   ],
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form"
+   },
    "outputs": [],
-   "execution_count": null
+   "source": [
+    "# @title Install and import feedback gadget\n",
+    "!pip3 install vibecheck datatops --quiet\n",
+    "\n",
+    "from vibecheck import DatatopsContentReviewContainer\n",
+    "def content_review(notebook_section: str):\n",
+    "    return DatatopsContentReviewContainer(\n",
+    "        \"\",  # No text prompt\n",
+    "        notebook_section,\n",
+    "        {\n",
+    "            \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
+    "            \"name\": \"neuromatch_dl\",\n",
+    "            \"user_key\": \"f379rz8y\",\n",
+    "        },\n",
+    "    ).render()\n",
+    "\n",
+    "\n",
+    "feedback_prefix = \"W1D1_T1\""
+   ]
   },
   {
    "cell_type": "code",
@@ -120,33 +137,6 @@
     "        print(f'  {_pkg}: NOT FOUND')\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form"
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and import feedback gadget\n",
-    "!pip3 install vibecheck datatops --quiet\n",
-    "\n",
-    "from vibecheck import DatatopsContentReviewContainer\n",
-    "def content_review(notebook_section: str):\n",
-    "    return DatatopsContentReviewContainer(\n",
-    "        \"\",  # No text prompt\n",
-    "        notebook_section,\n",
-    "        {\n",
-    "            \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
-    "            \"name\": \"neuromatch_dl\",\n",
-    "            \"user_key\": \"f379rz8y\",\n",
-    "        },\n",
-    "    ).render()\n",
-    "\n",
-    "\n",
-    "feedback_prefix = \"W1D1_T1\""
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -273,9 +263,7 @@
     "If you want to quickly try out something or take a look at the data, you can use scratch code cells. They allow you to run Python code, but will not mess up the structure of your notebook.\n",
     "\n",
     "To open a new scratch cell go to *Insert* → *Scratch code cell*."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -284,9 +272,7 @@
     "# Section 1: Welcome to Neuromatch Deep learning course\n",
     "\n",
     "*Time estimate: ~25mins*"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -349,9 +335,7 @@
    "source": [
     "This will be an intensive 3 week adventure. We will all learn Deep Learning (DL) in a group. Groups need standards. Read our\n",
     "[Code of Conduct](https://docs.google.com/document/d/1eHKIkaNbAlbx_92tLQelXnicKXEcvFzlyzzeWjEtifM/edit?usp=sharing).\n"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -425,9 +409,7 @@
    "metadata": {},
    "source": [
     "**Discuss with your pod: What do you hope to get out of this course? [in about 100 words]**"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -449,9 +431,7 @@
     "# Section 2: The Basics of PyTorch\n",
     "\n",
     "*Time estimate: ~2 hours 05 mins*"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -471,18 +451,14 @@
     "- A clean, modular API for building and deploying **deep learning models**.\n",
     "\n",
     "You can find more information about PyTorch in the Appendix."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Section 2.1: Creating Tensors\n"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -556,9 +532,7 @@
    "metadata": {},
    "source": [
     "There are various ways of creating tensors, and when doing any real deep learning project, we will usually have to do so."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -568,9 +542,7 @@
     "\n",
     "---\n",
     "\n"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -605,9 +577,7 @@
     "**Some common tensor constructors:**\n",
     "\n",
     "---"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -631,9 +601,7 @@
    "metadata": {},
    "source": [
     "Notice that `.empty()` does not return zeros, but seemingly random numbers. Unlike `.zeros()`, which initialises the elements of the tensor with zeros, `.empty()` just allocates the memory. It is hence a bit faster if you are looking to just create a tensor."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -642,9 +610,7 @@
     "**Creating random tensors and tensors like other tensors:**\n",
     "\n",
     "---"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -698,18 +664,14 @@
     "import numpy as np\n",
     "np.random.seed(0)\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Here, we define for you a function called `set_seed` that does the job for you!"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -750,9 +712,7 @@
    "metadata": {},
    "source": [
     "Now, let's use the `set_seed` function in the previous example. Execute the cell multiple times to verify that the numbers printed are always the same."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -801,9 +761,7 @@
     "**Numpy-like number ranges:**\n",
     "---\n",
     "The ```.arange()``` and ```.linspace()``` behave how you would expect them to if you are familar with numpy."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -845,9 +803,7 @@
     "<br>\n",
     "\n",
     "$^\\dagger$: $\\mathcal{U(\\alpha, \\beta)}$ denotes the [uniform distribution](https://en.wikipedia.org/wiki/Continuous_uniform_distribution) from $\\alpha$ to $\\beta$, with $\\alpha, \\beta \\in \\mathbb{R}$.\n"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -943,9 +899,7 @@
     "```\n",
     "All correct!\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -968,9 +922,7 @@
     "**Tensor-Tensor operations**\n",
     "\n",
     "We can perform operations on tensors using methods under `torch.`"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1046,9 +998,7 @@
     "**Tensor-Tensor operations**\n",
     "\n",
     "We can perform operations on tensors using methods under `torch.`."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1077,9 +1027,7 @@
    "source": [
     "However, in PyTorch, most common Python operators are overridden.\n",
     "The common standard arithmetic operators ($+$, $-$, $*$, $/$, and $**$) have all been lifted to elementwise operations"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1097,9 +1045,7 @@
    "metadata": {},
    "source": [
     "**Tensor Methods**"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -1108,9 +1054,7 @@
     "Tensors also have a number of common arithmetic operations built in. A full list of **all** methods can be found  in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix)) (there are a lot!)\n",
     "\n",
     "All of these operations should have similar syntax to their numpy equivalents (feel free to skip if you already know this!)."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1142,9 +1086,7 @@
     "\n",
     "Transposes of 2D tensors are obtained using `torch.t()` or `Tensor.T`. Note the lack of brackets for `Tensor.T` - it is an attribute, not a method.\n",
     "\n"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -1178,9 +1120,7 @@
     "\\end{equation}\n",
     "\n",
     "The code block below that computes these expressions using PyTorch is incomplete - fill in the missing lines."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1272,9 +1212,7 @@
     "tensor([[20, 24],\n",
     "        [31, 27]])\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1362,9 +1300,7 @@
     "```\n",
     "tensor(82)\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1383,9 +1319,7 @@
    "metadata": {},
    "source": [
     "## Section 2.3 Manipulating Tensors in Pytorch"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1463,9 +1397,7 @@
     "Just as in numpy, elements in a tensor can be accessed by index. As in any numpy array, the first element has index 0 and ranges are specified to include the first to last_element-1. We can access elements according to their relative position to the end of the list by using negative indices. Indexing is also referred to as slicing.\n",
     "\n",
     "For example, `[-1]` selects the last element; `[1:3]` selects the second and the third elements, and `[:-2]` will select all elements excluding the last and second-to-last elements."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1485,9 +1417,7 @@
    "metadata": {},
    "source": [
     "When we have multidimensional tensors, indexing rules work the same way as NumPy."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1510,9 +1440,7 @@
     "**Flatten and reshape**\n",
     "\n",
     "There are various methods for reshaping tensors. It is common to have to express 2D data in 1D format. Similarly, it is also common to have to reshape a 1D tensor into a 2D tensor. We can achieve this with the `.flatten()` and `.reshape()` methods."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1537,9 +1465,7 @@
    "metadata": {},
    "source": [
     "You will also see the `.view()` methods used a lot to reshape tensors. There is a subtle difference between `.view()` and `.reshape()`, though for now we will just use `.reshape()`. The documentation can be found in the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -1550,9 +1476,7 @@
     "When processing batches of data, you will quite often be left with singleton dimensions. E.g., `[1,10]` or `[256, 1, 3]`. This dimension can quite easily mess up your matrix operations if you don't plan on it being there...\n",
     "\n",
     "In order to compress tensors along their singleton dimensions we can use the `.squeeze()` method. We can use the `.unsqueeze()` method to do the opposite."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1572,9 +1496,7 @@
    "metadata": {},
    "source": [
     "Because of that pesky singleton dimension, `x[0]` gave us the first row instead!"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1612,9 +1534,7 @@
     "**Permutation**\n",
     "\n",
     "Sometimes our dimensions will be in the wrong order! For example, we may be dealing with RGB images with dim $[3\\times48\\times64]$, but our pipeline expects the colour dimension to be the last dimension, i.e., $[48\\times64\\times3]$. To get around this we can use the `.permute()` method."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1639,27 +1559,21 @@
    "metadata": {},
    "source": [
     "You may also see `.transpose()` used. This works in a similar way as permute, but can only swap two dimensions at once."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "**Concatenation**"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "In this example, we concatenate two matrices along rows (axis 0, the first element of the shape) vs. columns (axis 1, the second element of the shape). We can see that the first output tensor’s axis-0 length (`6`) is the sum of the two input tensors’ axis-0 lengths (`3+3`); while the second output tensor’s axis-1 length (`8`) is the sum of the two input tensors’ axis-1 lengths (`4+4`)."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1691,9 +1605,7 @@
     "Converting a tensor to a numpy.ndarray, or vice versa, is easy, and the converted result does not share memory. This minor inconvenience is quite important: when you perform operations on the CPU or GPUs, you do not want to halt computation, waiting to see whether the NumPy package of Python might want to be doing something else with the same chunk of memory.\n",
     "\n",
     "When converting to a NumPy array, the information being tracked by the tensor will be lost, i.e., the computational graph. This will be covered in detail when you are introduced to autograd tomorrow!"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1716,9 +1628,7 @@
    "metadata": {},
    "source": [
     "To convert a size-1 tensor to a Python scalar, we can invoke the item function or Python’s built-in functions."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -1736,9 +1646,7 @@
    "source": [
     "### Coding Exercise 2.3: Manipulating Tensors\n",
     "Using a combination of the methods discussed above, complete the functions below."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -1830,9 +1738,7 @@
     "<br>\n",
     "\n",
     "**Hint:** `torch.numel()` is an easy way of finding the number of elements in a tensor."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2041,9 +1947,7 @@
     "        [-1,  5]])\n",
     "tensor([ 1, -1, -1,  3,  2,  3,  0])\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2062,9 +1966,7 @@
    "metadata": {},
    "source": [
     "## Section 2.4: GPUs"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2139,9 +2041,7 @@
    "source": [
     "\n",
     "By default, when we create a tensor it will *not* live on the GPU!"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2164,9 +2064,7 @@
     "Once you have done this your runtime will restart. ⚠️ **Make sure to rerun the Setup cells at the top of the notebook before continuing.**\n",
     "\n",
     "For more information on the GPU usage policy see the **Appendix** at the bottom of this notebook ([Colab link](#scrollTo=appendix))."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -2176,27 +2074,21 @@
     "> - **Switch back to CPU when done:** `Runtime → Change runtime type → Hardware Accelerator: None`. Free Colab GPU time is limited and shared. In future tutorials we will note at the top if GPU is not required.\n",
     "> - **End your session properly when finished:** `Runtime → Disconnect and delete runtime`. Closing the browser tab does **not** free the GPU — the session keeps running for up to 90 minutes idle or 12 hours total.\n",
     "> - **Avoid opening multiple GPU notebooks** at the same time across tabs."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "**Now we have a GPU.**\n"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "The cell below should return `True`."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2222,9 +2114,7 @@
     "```\n",
     "\n",
     "Let's define the function using the PyTorch package `torch.cuda`, which is lazily initialized, so we can always import it, and use `is_available()` to determine if our system supports CUDA."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2260,9 +2150,7 @@
    "metadata": {},
    "source": [
     "Let's make some CUDA tensors!"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2294,9 +2182,7 @@
     "**Operations between cpu tensors and cuda tensors**\n",
     "\n",
     "Note that the type of the tensor changed after calling `.to()`. What happens if we try and perform operations on tensors on devices?"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2318,9 +2204,7 @@
     "We cannot combine CUDA tensors and CPU tensors in this fashion. If we want to compute an operation that combines tensors on different devices, we need to move them first! We can use the `.to()` method as before, or the `.cpu()` and `.cuda()` methods. Note that using the `.cuda()` will throw an error, if CUDA is not enabled in your machine.\n",
     "\n",
     "Generally, in this course, all Deep Learning is done on the GPU, and any computation is done on the CPU, so sometimes we have to pass things back and forth, so you'll see us call."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2354,9 +2238,7 @@
     "- Matrix multiplication\n",
     "\n",
     "The operations should be able to perfomed on either the CPU or GPU specified by the parameter `device`. We will use the helper function `timeFun(f, dim, iterations, device)`."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2468,9 +2350,7 @@
     "time taken for 1 iterations of simpleFun(10000, cpu): 23.74070\n",
     "time taken for 1 iterations of simpleFun(10000, cuda): 0.87535\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2491,9 +2371,7 @@
     "**Discuss!**\n",
     "\n",
     "Try and reduce the dimensions of the tensors and increase the iterations. You can get to a point where the cpu only function is faster than the GPU function. Why might this be?"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2533,9 +2411,7 @@
    "metadata": {},
    "source": [
     "## Section 2.5: Datasets and Dataloaders"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2609,9 +2485,7 @@
    "metadata": {},
    "source": [
     "When training neural network models you will be working with large amounts of data. Fortunately, PyTorch offers some great tools that help you organize and manipulate your data samples."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2634,9 +2508,7 @@
     "The `torchvision` package gives you easy access to many of the publicly available datasets. Let's load the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset, which contains color images of 10 different classes, like vehicles and animals.\n",
     "\n",
     "Creating an object of type `datasets.CIFAR10` will automatically download and load all images from the dataset. The resulting data structure can be treated as a list containing data samples and their corresponding labels."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2661,9 +2533,7 @@
    "metadata": {},
    "source": [
     "We have 50,000 samples loaded. Now, let's take a look at one of them in detail. Each sample consists of an image and its corresponding label."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2683,9 +2553,7 @@
    "metadata": {},
    "source": [
     "Color images are modeled as 3 dimensional tensors. The first dimension corresponds to the channels ($\\text{C}$) of the image (in this case we have RGB images). The second dimensions is the height ($\\text{H}$) of the image and the third is the width ($\\text{W}$). We can denote this image format as $\\text{C} \\times \\text{H} \\times \\text{W}$."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -2712,9 +2580,7 @@
     "print(input_var.size())\n",
     "print(input_var)\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2832,9 +2698,7 @@
     "**Training and Test Datasets**\n",
     "\n",
     "When loading a dataset, you can specify if you want to load the training or the test samples using the `train` argument. We can load the training and test datasets separately. For simplicity, today we will not use both datasets separately, but this topic will be adressed in the next days."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2921,9 +2785,7 @@
     "**Dataloader**\n",
     "\n",
     "Another important concept is the `Dataloader`. It is a wrapper around the `Dataset` that splits it into minibatches (important for training the neural network) and makes the data iterable. The `shuffle` argument is used to shuffle the order of the samples across the minibatches."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -2961,18 +2823,14 @@
     "    generator=g_seed\n",
     "    )\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "**Important:** For the `seed_worker` to have an effect, `num_workers` should be 2 or more."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -2981,9 +2839,7 @@
     "We can now query the next batch from the data loader and inspect it. For this we need to convert the dataloader object to a Python iterator using the function `iter` and then we can query the next batch using the function `next`.\n",
     "\n",
     "We can now see that we have a 4D tensor. This is because we have a 64 images in the batch ($B$) and each image has 3 dimensions: channels ($C$), height ($H$) and width ($W$). So, the size of the 4D tensor is $B \\times C \\times H \\times W$."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3007,9 +2863,7 @@
     "**Transformations**\n",
     "\n",
     "Another useful feature when loading a dataset is applying transformations on the data - color conversions, normalization, cropping, rotation etc. There are many predefined transformations in the `torchvision.transforms` package and you can also combine them using the `Compose` transform. Checkout the [pytorch documentation](https://pytorch.org/vision/stable/transforms.html) for details."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -3018,9 +2872,7 @@
     "### Coding Exercise 2.6: Load the CIFAR10 dataset as grayscale images\n",
     "\n",
     "The goal of this excercise is to load the images from the CIFAR10 dataset as grayscale images. Note that we rerun the `set_seed` function to ensure reproducibility."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3114,9 +2966,7 @@
     "# Section 3: Neural Networks\n",
     "\n",
     "*Time estimate: ~1 hour 30 mins (excluding video)*"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -3128,9 +2978,7 @@
     "- Training the network\n",
     "- Visualizing the results of the network\n",
     "- Tweaking the network"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3206,9 +3054,7 @@
     "## Section 3.1: Data Loading\n",
     "\n",
     "First we need some sample data to train our network on. You can use the function below to generate an example dataset consisting of 2D points along two interleaving half circles. The data will be stored in a file called `sample_data.csv`. You can inspect the file directly in Colab by going to Files on the left side and opening the CSV file."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3234,9 +3080,7 @@
    "metadata": {},
    "source": [
     "Now we can load the data from the CSV file using the Pandas library. Pandas provides many functions for reading files in various formats. When loading data from a CSV file, we can reference the columns directly by their names."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3269,9 +3113,7 @@
     "**Prepare Data for PyTorch**\n",
     "\n",
     "Now let's prepare the data in a format suitable for PyTorch - convert everything into tensors."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3304,9 +3146,7 @@
    "metadata": {},
    "source": [
     "## Section 3.2: Create a Simple Neural Network"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3410,9 +3250,7 @@
     "<br>\n",
     "\n",
     "**Note:** You can use the `__call__` method of a module directly and it will invoke the `forward` method: `net()` does the same as `net.forward()`."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3509,9 +3347,7 @@
     "**Check that your network works**\n",
     "\n",
     "Create an instance of your model and visualize it."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3535,9 +3371,7 @@
     "Now, let's pass some of the points of our dataset through the network and see if it works. You should not expect the network to actually classify the points correctly, because it has not been trained yet.\n",
     "\n",
     "The goal here is just to get some experience with the data structures that are passed to the forward and predict methods and their results."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3600,9 +3434,7 @@
     "Predicted labels:\n",
     " tensor([0, 0, 1, 0, 0], device='cuda:0')\n",
     "```"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3621,9 +3453,7 @@
    "metadata": {},
    "source": [
     "## Section 3.3: Train Your Neural Network"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3699,9 +3529,7 @@
     "Now it is time to train your network on your dataset. Don't worry if you don't fully understand everything yet - we will cover training in much more details in the next days. For now, the goal is just to see your network in action!\n",
     "\n",
     "You will usually implement the `train` method directly when implementing your class `NaiveNet`. Here, we will implement it as a function outside of the class in order to have it in a separate cell."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3837,9 +3665,7 @@
     "**Plot the loss during training**\n",
     "\n",
     "Plot the loss during the training to see how it reduces and converges."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -3958,9 +3784,7 @@
     "- Add one additional hidden layer\n",
     "\n",
     "Can you get the network to better fit the data?"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -4075,9 +3899,7 @@
     "\\end{matrix}\n",
     "\n",
     "Here, with `0`, we denote `False`, and with `1` we denote `True` in boolean terms."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -4097,9 +3919,7 @@
     "\\end{equation}\n",
     "\n",
     "Try to set the weights and biases to implement this function after you played enough :)"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -4157,9 +3977,7 @@
    "source": [
     "---\n",
     "# Section 4: Ethics And Course Info"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -4382,9 +4200,7 @@
     "* [Blake Richards](https://www.mcgill.ca/neuro/blake-richards-phd)\n",
     "* [Tim Lillicrap](https://contrastiveconvergence.net/~timothylillicrap/index.php) \n",
     "* [Pablo Samuel Castro](https://psc-g.github.io/)"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -4398,9 +4214,7 @@
     "In this notebook we visualize a subset* of 3,300 articles retreived from the AllenAI [S2ORC dataset](https://github.com/allenai/s2orc). We represent each paper by a position that is output of a dimensionality reduction method applied to a vector representation of each paper. The vector representation is the output of a neural network.\n",
     "\n",
     "**Note:** The selection is very biased on the keywords and methodology we used to filter. Please see the details section to learn about what we did."
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -4466,9 +4280,7 @@
     "1. Hover over a dot to see a tooltip (title, author)\n",
     "2. Select a year in the legend (right) to filter dots\n",
     "3. Zoom in/out with scroll -- double click resets view"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -4482,9 +4294,7 @@
     "2. Can you see a temporal trend in the data and clusters?\n",
     "3. Can you determine when deep learning methods started booming ?\n",
     "4. Can you find the key papers that where written before the DL \"winter\" that define milestones for a cluster? (tip: look for large dots of different color)"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -4536,18 +4346,14 @@
     "3. Embed each paper by using abstract + title in SPECTER model\n",
     "4. Project based on embedding using UMAP\n",
     "5. Visualize using Altair"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Find Authors"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "code",
@@ -4605,9 +4411,7 @@
     "\n",
     "## Books for reference:\n",
     "- [https://www.deeplearningbook.org/](https://www.deeplearningbook.org/) (Deep Learning by Ian Goodfellow, Yoshua Bengio and Aaron Courville)"
-   ],
-   "outputs": [],
-   "execution_count": null
+   ]
   }
  ],
  "metadata": {
@@ -4639,7 +4443,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.16"
+   "version": "3.9.23"
   }
  },
  "nbformat": 4,

From a58d4e6e57b832c27a070dd65bfa926e16f20ec4 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Wed, 10 Jun 2026 22:25:28 -0400
Subject: [PATCH 28/34] make W3D2 name in materials.yml match the tutorial folder name

---
 tutorials/materials.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/materials.yml b/tutorials/materials.yml
index 5fd80e017..7ed129610 100644
--- a/tutorials/materials.yml
+++ b/tutorials/materials.yml
@@ -128,7 +128,7 @@
 
 - day: W3D2
   category: Natural Language Processing
-  name: DL Case Study
+  name: DL Case Study 2
   playlist: https://youtube.com/playlist?list=PLkBQOLLbi18M6GYMqOSsswELhdxOUWQgR
 
   slides:

From 0a6b14e988cd5b9fc858da4350dfcb7dbc82bf7b Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Thu, 11 Jun 2026 23:35:22 -0400
Subject: [PATCH 29/34] restore jupyter notebook cell sources to list format
 and remove unused cells

---
 .../W1D1_Tutorial1.ipynb                      | 12 ------
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    | 42 +++++++++++++++++--
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 672cdc14e..bd9a8be6b 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -2487,18 +2487,6 @@
     "When training neural network models you will be working with large amounts of data. Fortunately, PyTorch offers some great tools that help you organize and manipulate your data samples."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Imports moved to Setup section above; kept here for reference\n",
-    "# from torchvision import datasets\n",
-    "# from torchvision.transforms import ToTensor, Compose, Grayscale\n",
-    "# from torch.utils.data import DataLoader\n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index 3df862687..d2727984a 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -1485,7 +1485,30 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 4 *(optional)*: Implement momentum\n\nIn this exercise you will implement the momentum update given by:\n\n\\begin{equation}\nw_{t+1} = w_t - \\eta \\nabla J(w_t) + \\beta (w_t - w_{t-1})\n\\end{equation}\n\nIt is convenient to re-express this update rule in terms of a recursion. For that, we define 'velocity' as the quantity:\n\\begin{equation}\nv_{t-1} := w_{t} - w_{t-1}\n\\end{equation}\n\nwhich leads to the two-step update rule:\n\n\\begin{equation}\nv_t = - \\eta \\nabla J(w_t) + \\beta (\\underbrace{w_t - w_{t-1}}_{v_{t-1}})\n\\end{equation}\n\n\\begin{equation}\nw_{t+1} \\leftarrow w_t + v_{t}\n\\end{equation}\n\nPay attention to the positive sign of the update in the last equation, given the definition of $v_t$, above."
+    "## Coding Exercise 4 *(optional)*: Implement momentum\n",
+    "\n",
+    "In this exercise you will implement the momentum update given by:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "w_{t+1} = w_t - \\eta \\nabla J(w_t) + \\beta (w_t - w_{t-1})\n",
+    "\\end{equation}\n",
+    "\n",
+    "It is convenient to re-express this update rule in terms of a recursion. For that, we define 'velocity' as the quantity:\n",
+    "\\begin{equation}\n",
+    "v_{t-1} := w_{t} - w_{t-1}\n",
+    "\\end{equation}\n",
+    "\n",
+    "which leads to the two-step update rule:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "v_t = - \\eta \\nabla J(w_t) + \\beta (\\underbrace{w_t - w_{t-1}}_{v_{t-1}})\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
+    "w_{t+1} \\leftarrow w_t + v_{t}\n",
+    "\\end{equation}\n",
+    "\n",
+    "Pay attention to the positive sign of the update in the last equation, given the definition of $v_t$, above."
    ]
   },
   {
@@ -2476,7 +2499,9 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n\nComplete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
+    "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n",
+    "\n",
+    "Complete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
    ]
   },
   {
@@ -2794,7 +2819,18 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 7 *(optional)*: Implement RMSprop\n\nIn this exercise you will implement the update of the RMSprop optimizer:\n\n\\begin{align}\nv_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\nw_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n\\end{align}\n\nwhere the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n\nHere, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
+    "## Coding Exercise 7 *(optional)*: Implement RMSprop\n",
+    "\n",
+    "In this exercise you will implement the update of the RMSprop optimizer:\n",
+    "\n",
+    "\\begin{align}\n",
+    "v_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\n",
+    "w_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n",
+    "\\end{align}\n",
+    "\n",
+    "where the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n",
+    "\n",
+    "Here, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
    ]
   },
   {

From 92f0a242a963d8861b61e64e56692aafe36ff91b Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Thu, 11 Jun 2026 23:41:32 -0400
Subject: [PATCH 30/34] remove the commented out sections

---
 .../W1D1_Tutorial1.ipynb                      | 70 +------------------
 1 file changed, 1 insertion(+), 69 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index bd9a8be6b..13aa359ea 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -56,7 +56,7 @@
    "source": [
     "# @title Tutorial slides\n",
     "from IPython.display import IFrame\n",
-    "link_id = \"dg4h7\" # link_id = \"wcjrv\"\n",
+    "link_id = \"dg4h7\" \n",
     "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n",
     "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)"
    ]
@@ -4101,74 +4101,6 @@
     "content_review(f\"{feedback_prefix}_Be_a_group_Video\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form"
-   },
-   "outputs": [],
-   "source": [
-    "# This syllabus video was outdated\n",
-    "# # @title Video 17: Syllabus\n",
-    "# from ipywidgets import widgets\n",
-    "# from IPython.display import YouTubeVideo\n",
-    "# from IPython.display import IFrame\n",
-    "# from IPython.display import display\n",
-    "\n",
-    "\n",
-    "# class PlayVideo(IFrame):\n",
-    "#   def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
-    "#     self.id = id\n",
-    "#     if source == 'Bilibili':\n",
-    "#       src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
-    "#     elif source == 'Osf':\n",
-    "#       src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
-    "#     super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
-    "\n",
-    "\n",
-    "# def display_videos(video_ids, W=400, H=300, fs=1):\n",
-    "#   tab_contents = []\n",
-    "#   for i, video_id in enumerate(video_ids):\n",
-    "#     out = widgets.Output()\n",
-    "#     with out:\n",
-    "#       if video_ids[i][0] == 'Youtube':\n",
-    "#         video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
-    "#                              height=H, fs=fs, rel=0)\n",
-    "#         print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
-    "#       else:\n",
-    "#         video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
-    "#                           height=H, fs=fs, autoplay=False)\n",
-    "#         if video_ids[i][0] == 'Bilibili':\n",
-    "#           print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
-    "#         elif video_ids[i][0] == 'Osf':\n",
-    "#           print(f'Video available at https://osf.io/{video.id}')\n",
-    "#       display(video)\n",
-    "#     tab_contents.append(out)\n",
-    "#   return tab_contents\n",
-    "\n",
-    "\n",
-    "# video_ids = [('Youtube', 'cDvAqG_hAvQ'), ('Bilibili', 'BV1iB4y1N7uQ')]\n",
-    "# tab_contents = display_videos(video_ids, W=854, H=480)\n",
-    "# tabs = widgets.Tab()\n",
-    "# tabs.children = tab_contents\n",
-    "# for i in range(len(tab_contents)):\n",
-    "#   tabs.set_title(i, video_ids[i][0])\n",
-    "# display(tabs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form"
-   },
-   "outputs": [],
-   "source": [
-    "# # @title Submit your feedback\n",
-    "# content_review(f\"{feedback_prefix}_Syllabus_Video\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},

From 6965dda0388bb27181d5712470cb5d635ed3596b Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 14 Jun 2026 13:07:35 -0400
Subject: [PATCH 31/34] remove import check cells and restore W1D4 to remove
 the special character replacement by json.dump

---
 .../W1D1_Tutorial1.ipynb                      | 15 ++------
 .../W1D2_Tutorial1.ipynb                      | 28 ++-------------
 .../W1D2_Tutorial2.ipynb                      | 23 ------------
 .../W1D2_Tutorial3.ipynb                      | 23 ------------
 .../W1D3_Tutorial1.ipynb                      | 23 ------------
 .../W1D3_Tutorial2.ipynb                      | 23 ------------
 .../W1D4_Optimization/W1D4_Tutorial1.ipynb    | 36 ++++---------------
 7 files changed, 13 insertions(+), 158 deletions(-)

diff --git a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
index 13aa359ea..dcc96cc02 100644
--- a/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
+++ b/tutorials/W1D1_BasicsAndPytorch/W1D1_Tutorial1.ipynb
@@ -125,16 +125,7 @@
     "    try:\n",
     "        importlib.import_module(_pkg)\n",
     "    except ImportError:\n",
-    "        subprocess.check_call([sys.executable, '-m', 'pip', 'install', _pip, '-q'])\n",
-    "\n",
-    "# Print versions for reproducibility / bug reports\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'pandas', 'matplotlib', 'torch', 'torchvision', 'sklearn', 'imageio', 'altair']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')\n"
+    "        subprocess.check_call([sys.executable, '-m', 'pip', 'install', _pip, '-q'])"
    ]
   },
   {
@@ -4237,8 +4228,8 @@
     "\n",
     "4. After filtering around the mid 1900's, hovering on the larger dots show\n",
     "the key papers before the DL winters.\n",
-    "For instance, \\\"Neural networks and physical systems with emergent\n",
-    "collective computational abilities\\\" by John J Hopfield (1980's)\n",
+    "For instance, \"Neural networks and physical systems with emergent\n",
+    "collective computational abilities\" by John J Hopfield (1980's)\n",
     "\"\"\";"
    ]
   },
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index d557bf67c..41293068d 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -106,29 +106,6 @@
     "feedback_prefix = \"W1D2_T1\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1992,7 +1969,8 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "nma-dl-jax",
+   "language": "python",
    "name": "python3"
   },
   "language_info": {
@@ -2005,7 +1983,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.11"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
index 965b6302e..c54202613 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
@@ -104,29 +104,6 @@
     "feedback_prefix = \"W1D2_T2\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
index 61076b987..8f1e3b923 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
@@ -103,29 +103,6 @@
     "feedback_prefix = \"W1D2_T3\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
index 7eb734daf..aab54b954 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial1.ipynb
@@ -105,29 +105,6 @@
     "feedback_prefix = \"W1D3_T1\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
index 0bb19772b..74f4e54b2 100644
--- a/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
+++ b/tutorials/W1D3_MultiLayerPerceptrons/W1D3_Tutorial2.ipynb
@@ -103,29 +103,6 @@
     "feedback_prefix = \"W1D3_T2\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": ""
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
index d2727984a..b91c8d435 100644
--- a/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
+++ b/tutorials/W1D4_Optimization/W1D4_Tutorial1.ipynb
@@ -105,28 +105,6 @@
     "feedback_prefix = \"W1D4_T1\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "cellView": "form",
-    "id": "import_check"
-   },
-   "outputs": [],
-   "source": [
-    "# @title Install and check dependencies\n",
-    "# Most packages are pre-installed on Colab/Kaggle.\n",
-    "# Running locally? See https://github.com/NeuromatchAcademy/course-content-dl/blob/main/requirements.txt\n",
-    "import importlib\n",
-    "\n",
-    "print('Package versions:')\n",
-    "for _pkg in ['numpy', 'matplotlib', 'torch', 'torchvision', 'tqdm']:\n",
-    "    try:\n",
-    "        _mod = importlib.import_module(_pkg)\n",
-    "        print(f'  {_pkg}: {getattr(_mod, \"__version__\", \"unknown\")}')\n",
-    "    except ImportError:\n",
-    "        print(f'  {_pkg}: NOT FOUND')"
-   ]  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -815,7 +793,7 @@
     "\n",
     "$$f(x) = \\text{softmax}(W x + b)$$\n",
     "\n",
-    "Here $x \\in \\mathbb{R}^{784}$, $W \\in \\mathbb{R}^{10 \\times 784}$ and $b \\in \\mathbb{R}^{10}$. Notice that the dimensions of the weight matrix are $10 \\times 784$ as the input tensors are flattened images, i.e., $28 \\times 28 = 784$-dimensional tensors and the output layer consists of $10$ nodes. Also, note that the implementation of softmax encapsulates b in W i.e., It maps the rows of the input instead of the columns. That is, the i\u2019th row of the output is the mapping of the i\u2019th row of the input under W, plus the bias term. Refer Affine maps here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#affine-maps"
+    "Here $x \\in \\mathbb{R}^{784}$, $W \\in \\mathbb{R}^{10 \\times 784}$ and $b \\in \\mathbb{R}^{10}$. Notice that the dimensions of the weight matrix are $10 \\times 784$ as the input tensors are flattened images, i.e., $28 \\times 28 = 784$-dimensional tensors and the output layer consists of $10$ nodes. Also, note that the implementation of softmax encapsulates b in W, i.e., it maps the rows of the input instead of the columns. That is, the i’th row of the output is the mapping of the i’th row of the input under W, plus the bias term. Refer to Affine maps here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#affine-maps"
    ]
   },
   {
@@ -1485,7 +1463,7 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 4 *(optional)*: Implement momentum\n",
+    "## Coding Exercise 4: Implement momentum\n",
     "\n",
     "In this exercise you will implement the momentum update given by:\n",
     "\n",
@@ -2257,14 +2235,14 @@
     "\"\"\"\n",
     "- The exact mechanism for this phenomenon is still under active research.\n",
     "Existing evidence points to the following: in the overparameterized setting,\n",
-    "there are many more 'good configurations' (values of the model\u2019s weights) that\n",
+    "there are many more 'good configurations' (values of the model’s weights) that\n",
     "lead to a low value of the objective. Furthermore, this large set of possible solutions\n",
     "seems to be increasingly easy to find in the space of all possible\n",
     "parameter configurations. As you increase the number of parameters, it becomes\n",
     "more likely that your initialization will be close to one of these good parameter settings.\n",
     "\n",
     "- This approach will require more memory and computation. Furthermore, we need\n",
-    "to always be aware of the risk of overfitting: don\u2019t forget to do cross-validation\n",
+    "to always be aware of the risk of overfitting: don’t forget to do cross-validation\n",
     "in order to be able to detect overfitting.\n",
     "\"\"\";"
    ]
@@ -2499,7 +2477,7 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 6 *(optional)*: Implement minibatch sampling\n",
+    "## Coding Exercise 6: Implement minibatch sampling\n",
     "\n",
     "Complete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
    ]
@@ -2819,7 +2797,7 @@
     "execution": {}
    },
    "source": [
-    "## Coding Exercise 7 *(optional)*: Implement RMSprop\n",
+    "## Coding Exercise 7 (optional): Implement RMSprop\n",
     "\n",
     "In this exercise you will implement the update of the RMSprop optimizer:\n",
     "\n",
@@ -3826,4 +3804,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}

From a2289f2e0a1514c8803fd4a72f0f1b0b8ebca3eb Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 14 Jun 2026 13:16:28 -0400
Subject: [PATCH 32/34] revert W1D2

---
 .../W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb  | 13 ++++++-------
 .../W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb  | 10 +++++-----
 .../W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb  | 15 ++++++++-------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index 41293068d..69d28b804 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -42,7 +42,7 @@
     "---\n",
     "# Tutorial Objectives\n",
     "\n",
-    "Day 2 Tutorial 1 will continue on building PyTorch skillset and motivate its core functionality: Autograd. In this notebook, we will cover the key concepts and ideas of:\n",
+    "Day 2 Tutorial 1 will continue on buiding PyTorch skillset and motivate its core functionality: Autograd. In this notebook, we will cover the key concepts and ideas of:\n",
     "\n",
     "* Gradient descent\n",
     "* PyTorch Autograd\n",
@@ -74,7 +74,7 @@
     "---\n",
     "# Setup\n",
     "\n",
-    "This a GPU-Free tutorial!"
+    "This a GPU-Free tutorial!\n"
    ]
   },
   {
@@ -119,10 +119,7 @@
     "import numpy as np\n",
     "from torch import nn\n",
     "from math import pi\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "import ipywidgets as widgets\n",
-    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
+    "import matplotlib.pyplot as plt"
    ]
   },
   {
@@ -138,6 +135,7 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
+    "import ipywidgets as widgets  # Interactive display\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"
    ]
@@ -153,6 +151,7 @@
    "source": [
     "# @title Plotting functions\n",
     "\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
     "\n",
     "def ex3_plot(model, x, y, ep, lss):\n",
     "  \"\"\"\n",
@@ -899,7 +898,7 @@
     "execution": {}
    },
    "source": [
-    "## Section 1.3: Computational Graphs and Backprop"
+    "## Section 1.3: Computational Graphs and Backprop\n"
    ]
   },
   {
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
index c54202613..5d3a0feb2 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial2.ipynb
@@ -114,13 +114,9 @@
    "source": [
     "# Imports\n",
     "import time\n",
-    "import torch\n",
     "import numpy as np\n",
     "import matplotlib\n",
-    "import matplotlib.pyplot as plt\n",
-    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
-    "from ipywidgets import HBox, interactive_output, ToggleButton, Layout\n",
-    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
+    "import matplotlib.pyplot as plt"
    ]
   },
   {
@@ -136,6 +132,9 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
+    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
+    "from ipywidgets import HBox, interactive_output, ToggleButton, Layout\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
     "\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"
@@ -841,6 +840,7 @@
     "\n",
     "# Call `set_seed` function in the exercises to ensure reproducibility.\n",
     "import random\n",
+    "import torch\n",
     "\n",
     "def set_seed(seed=None, seed_torch=True):\n",
     "  \"\"\"\n",
diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
index 8f1e3b923..f71d1c36d 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial3.ipynb
@@ -120,13 +120,7 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "import torch.nn as nn\n",
-    "import torch.optim as optim\n",
-    "\n",
-    "import warnings\n",
-    "from matplotlib import gridspec\n",
-    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
-    "from ipywidgets import FloatLogSlider, Layout, VBox, interactive_output\n",
-    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n"
+    "import torch.optim as optim"
    ]
   },
   {
@@ -142,7 +136,14 @@
     "import logging\n",
     "logging.getLogger('matplotlib.font_manager').disabled = True\n",
     "\n",
+    "from matplotlib import gridspec\n",
+    "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n",
+    "from ipywidgets import FloatLogSlider, Layout, VBox\n",
+    "from ipywidgets import interactive_output\n",
+    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
     "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
     "\n",
     "%config InlineBackend.figure_format = 'retina'\n",
     "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")"

From 19acfdd5fd53a71e286f93ef74c97cd6f4f48cdb Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 14 Jun 2026 15:58:33 -0400
Subject: [PATCH 33/34] revert W1D2

---
 .../W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb      | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
index 69d28b804..02fa7571b 100644
--- a/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
+++ b/tutorials/W1D2_LinearDeepLearning/W1D2_Tutorial1.ipynb
@@ -42,7 +42,7 @@
     "---\n",
     "# Tutorial Objectives\n",
     "\n",
-    "Day 2 Tutorial 1 will continue on buiding PyTorch skillset and motivate its core functionality: Autograd. In this notebook, we will cover the key concepts and ideas of:\n",
+    "Day 2 Tutorial 1 will continue on building PyTorch skillset and motivate its core functionality: Autograd. In this notebook, we will cover the key concepts and ideas of:\n",
     "\n",
     "* Gradient descent\n",
     "* PyTorch Autograd\n",
@@ -74,7 +74,7 @@
     "---\n",
     "# Setup\n",
     "\n",
-    "This a GPU-Free tutorial!\n"
+    "This is a GPU-Free tutorial!"
    ]
   },
   {
@@ -898,7 +898,7 @@
     "execution": {}
    },
    "source": [
-    "## Section 1.3: Computational Graphs and Backprop\n"
+    "## Section 1.3: Computational Graphs and Backprop"
    ]
   },
   {
@@ -1968,8 +1968,7 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "nma-dl-jax",
-   "language": "python",
+   "display_name": "Python 3",
    "name": "python3"
   },
   "language_info": {
@@ -1982,7 +1981,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.20"
   }
  },
  "nbformat": 4,

From ef6889ebc633038a3167fdbab6b05f25543baa22 Mon Sep 17 00:00:00 2001
From: Cindy Tu <cindyhfls@gmail.com>
Date: Sun, 14 Jun 2026 16:01:46 -0400
Subject: [PATCH 34/34] update requirements

---
 requirements.txt | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fc4b1f8ee..1f58cc85f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,5 @@
 # Requirements for Neuromatch Academy Deep Learning tutorials
-# These packages are pre-installed on Google Colab/Kaggle.
-# For local setup: pip install -r requirements_tutorials.txt
-#
-# Python >= 3.10 required
-# Tutorial-specific packages (e.g. transformers, diffusers, altair) are
-# installed at the top of the relevant tutorial notebooks.
+# Some of these packages are pre-installed on Google Colab/Kaggle.
 
 numpy>=2.0
 pandas>=2.2