diff --git a/.gitignore b/.gitignore
index 70702445..70255b20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,7 @@ _build/
mydask.png
dataframes/data
.idea/
+.devcontainer/
+.data/
+.vector_cache/
+__pycache__
\ No newline at end of file
diff --git a/machine-learning/Untitled1.ipynb b/machine-learning/Untitled1.ipynb
deleted file mode 100644
index 2fd64429..00000000
--- a/machine-learning/Untitled1.ipynb
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "cells": [],
- "metadata": {},
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/machine-learning/model.py b/machine-learning/model.py
new file mode 100644
index 00000000..6f48270d
--- /dev/null
+++ b/machine-learning/model.py
@@ -0,0 +1,53 @@
+# more details can be found here: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+import torchtext
+import numpy as np
+
+
+class CNN(nn.Module):
+    def __init__(self, n_filters=100, filter_sizes=(2, 3, 4), output_dim=2, dropout=0.2, pretrained_embeddings=None, TEXT=None):
+        super().__init__()
+        self.TEXT = TEXT
+        # pretrained_embeddings is used to initialize the embedding layer
+        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
+        self.embedding.weight.requires_grad = False  # save some computation
+        embedding_dim = self.embedding.embedding_dim
+        # NB: these are 2-D convolutions over [sent len, emb dim], hence nn.Conv2d
+        self.conv_0 = nn.Conv2d(in_channels=1,
+                                out_channels=n_filters,
+                                kernel_size=(filter_sizes[0], embedding_dim))
+        self.conv_1 = nn.Conv2d(in_channels=1,
+                                out_channels=n_filters,
+                                kernel_size=(filter_sizes[1], embedding_dim))
+        self.conv_2 = nn.Conv2d(in_channels=1,
+                                out_channels=n_filters,
+                                kernel_size=(filter_sizes[2], embedding_dim))
+        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, text):
+        # # bit of a hack to preprocess data inside the network
+        # if isinstance(text, np.ndarray):
+        #     text = self.TEXT.process(text)
+
+        # text = [batch size, sent len]
+        embedded = self.embedding(text)
+        # embedded = [batch size, sent len, emb dim]
+        embedded = embedded.unsqueeze(1)
+        # embedded = [batch size, 1, sent len, emb dim]
+        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
+        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
+        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
+        # conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
+        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
+        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
+        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
+        # pooled_n = [batch size, n_filters]
+        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
+        # cat = [batch size, n_filters * len(filter_sizes)]
+        logits = self.fc(cat)
+        # logits = [batch_size, output_dim]
+        return F.softmax(logits, dim=-1)  # NeuralNetClassifier applies the log (see notebook)
\ No newline at end of file
diff --git a/machine-learning/skorch-hyperparam-opt.ipynb b/machine-learning/skorch-hyperparam-opt.ipynb
new file mode 100644
index 00000000..d551c689
--- /dev/null
+++ b/machine-learning/skorch-hyperparam-opt.ipynb
@@ -0,0 +1,1784 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip install -q dask_cuda torch torchtext skorch\n",
+ "# !pip -q install dask[dataframe] --upgrade"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Hyperparameter optimization with Skorch\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup Dask Cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import random\n",
+ "import time\n",
+ "\n",
+ "import dask.array as da\n",
+ "from dask_cuda import LocalCUDACluster\n",
+ "from dask_ml.model_selection import HyperbandSearchCV\n",
+ "from distributed import Client\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from scipy.stats import loguniform\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.model_selection import RandomizedSearchCV\n",
+ "import skorch\n",
+ "from skorch import NeuralNetClassifier\n",
+ "from skorch.helper import SliceDataset\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import torch.nn.functional as F\n",
+ "import torch.optim as optim\n",
+ "from torch.utils.data import Dataset, DataLoader\n",
+ "import torchtext\n",
+ "from torchtext import data\n",
+ "from torchtext import datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "\n",
+ "Client\n",
+ "\n",
+ " | \n",
+ "\n",
+ "Cluster\n",
+ "\n",
+ " - Workers: 1
\n",
+ " - Cores: 1
\n",
+ " - Memory: 31.63 GB
\n",
+ " \n",
+ " | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# if you have GPU(s), use dask_cuda to automatically make use of them in your dask cluster\n",
+ "if torch.cuda.is_available():\n",
+ " cluster = LocalCUDACluster()\n",
+ " client = Client(cluster)\n",
+ "else:\n",
+ " client = Client(processes=False, threads_per_worker=4,\n",
+ " n_workers=1, memory_limit='2GB')\n",
+ "client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# for reproducibility\n",
+ "# NB: enabling reproducibility can significantly slow down runtimes\n",
+ "reproducible = False\n",
+ "if reproducible:\n",
+ " SEED = 42\n",
+ " random.seed(SEED)\n",
+ " np.random.seed(SEED)\n",
+ " torch.manual_seed(SEED)\n",
+ " torch.backends.cudnn.deterministic = True"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this solves many of our later problems but isn't an ideal solution\n",
+ "# accuracy will take a hit\n",
+ "FIX_LENGTH = 512"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# a few seconds to download IMDB dataset (84Mb, will be cached)\n",
+ "# approx. 10 minutes to download glove embeddings (862Mb, will be cached)\n",
+ "\n",
+ "# set up fields\n",
+ "TEXT = data.Field(lower=True, batch_first=True, fix_length=FIX_LENGTH)\n",
+ "LABEL = data.Field(sequential=False, unk_token=None)\n",
+ "\n",
+ "# make splits for data\n",
+ "train, test = datasets.IMDB.splits(TEXT, LABEL)\n",
+ "\n",
+ "# work with 5k datapoints for faster iteration times\n",
+ "split_ratio = 5_000 / len(train)\n",
+ "train, discard = train.split(split_ratio=split_ratio)\n",
+ "\n",
+ "split_ratio = 5_000 / len(test)\n",
+ "test, discard = test.split(split_ratio=split_ratio)\n",
+ "\n",
+ "# will be used to initialize model embeddings layer\n",
+ "vocab = torchtext.vocab.GloVe(name='6B', dim=100)\n",
+ "\n",
+ "# build the vocabulary\n",
+ "max_size = 25_000 # shorten for demonstrative purposes\n",
+ "TEXT.build_vocab(train, vectors=vocab, max_size=max_size)\n",
+ "LABEL.build_vocab(train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['', '', 'the', 'and', 'a']"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# itos := index-to-string\n",
+ "# note the 2 extra tokens added for us: '', ''\n",
+ "TEXT.vocab.itos[:5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "defaultdict(None, {'pos': 0, 'neg': 1})"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# stoi := string-to-index\n",
+ "# check on the meaning of these zeroes and ones\n",
+ "LABEL.vocab.stoi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert (len(TEXT.vocab.itos) == max_size + 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['i', \"couldn't\", 'hold', 'back', 'the', 'tears', 'when', 'i', 'watched', 'this'] ...\n",
+ "\n",
+ "pos\n"
+ ]
+ }
+ ],
+ "source": [
+ "# peek at the data\n",
+ "print(train.examples[0].text[:10], '...')\n",
+ "print()\n",
+ "print(train.examples[0].label)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# custom dataset class required to work with Skorch\n",
+ "class TorchDataset(Dataset):\n",
+ " def __init__(self, dataset):\n",
+ " self.dataset = dataset\n",
+ "\n",
+ " def __getitem__(self, idx):\n",
+ " example = self.dataset.examples[idx]\n",
+ " return example.text, example.label\n",
+ " \n",
+ " def __len__(self):\n",
+ " return len(self.dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_dataset = TorchDataset(train)\n",
+ "test_dataset = TorchDataset(test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokens, label = train_dataset[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['i', \"couldn't\", 'hold', 'back', 'the', 'tears', 'when', 'i', 'watched', 'this'] ...\n",
+ "\n",
+ "pos\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(tokens[:10], '...')\n",
+ "print()\n",
+ "print(label)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# custom collate function for DataLoader\n",
+ "def pad_batch(batch, TEXT, LABEL):\n",
+ " text, label = list(zip(*batch))\n",
+ " # numericalized and padded text representation\n",
+ " text_processed = TEXT.process(text)\n",
+ " label_processed = LABEL.process(label)\n",
+ " return text_processed, label_processed\n",
+ "\n",
+ "from functools import partial\n",
+ "\n",
+ "pad_batch_partial = partial(pad_batch, TEXT=TEXT, LABEL=LABEL)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_batch_partial)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "batch = next(iter(train_dataloader))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "processed_examples, labels = batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tensor([ 9, 20, 7, 3765, 23, 8, 54, 692, 2, 2384]) ...\n",
+ "\n",
+ "tensor(1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(processed_examples[0][:10], '...')\n",
+ "print()\n",
+ "print(labels[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define your network"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# would have defined the class in this notebook but was getting the following error from Hyperband\n",
+ "# PicklingError: Can't pickle : attribute lookup CNN on __main__ failed\n",
+ "from model import CNN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tensor([[0.5000, 0.5000],\n",
+ " [0.5174, 0.4826],\n",
+ " [0.4599, 0.5401],\n",
+ " [0.3472, 0.6528],\n",
+ " [0.3888, 0.6112],\n",
+ " [0.4259, 0.5741],\n",
+ " [0.3734, 0.6266],\n",
+ " [0.3527, 0.6473],\n",
+ " [0.4275, 0.5725],\n",
+ " [0.4277, 0.5723],\n",
+ " [0.5281, 0.4719],\n",
+ " [0.4183, 0.5817],\n",
+ " [0.4409, 0.5591],\n",
+ " [0.4205, 0.5795],\n",
+ " [0.4820, 0.5180],\n",
+ " [0.3552, 0.6448],\n",
+ " [0.3843, 0.6157],\n",
+ " [0.3047, 0.6953],\n",
+ " [0.5312, 0.4688],\n",
+ " [0.4069, 0.5931],\n",
+ " [0.3691, 0.6309],\n",
+ " [0.3541, 0.6459],\n",
+ " [0.2763, 0.7237],\n",
+ " [0.4770, 0.5230],\n",
+ " [0.3749, 0.6251],\n",
+ " [0.4165, 0.5835],\n",
+ " [0.4208, 0.5792],\n",
+ " [0.5268, 0.4732],\n",
+ " [0.4046, 0.5954],\n",
+ " [0.5047, 0.4953],\n",
+ " [0.3795, 0.6205],\n",
+ " [0.4030, 0.5970]], device='cuda:0', grad_fn=)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# smoketest\n",
+ "model = CNN(pretrained_embeddings=TEXT.vocab.vectors).to(device)\n",
+ "gpu_batch = batch[0].to(device)\n",
+ "model_out = model(gpu_batch)\n",
+ "print(model_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "del model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "del gpu_batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.cuda.empty_cache()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick attempt at model training to debug any issues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# if you want to use a custom DataLoader, you must use NeuralNet\n",
+ "# also, not immediately obvious that for NeuralNet you are responsible for applying the log function\n",
+ "# whereas for NeuralNetClassifier, you are not\n",
+ "\n",
+ "# NB: not ideal to be using softmax + log + NLLLoss\n",
+ "# see discussion: https://github.com/skorch-dev/skorch/issues/637\n",
+ "skorch_model = NeuralNetClassifier(\n",
+ " CNN,\n",
+ " device=device,\n",
+ " max_epochs=2,\n",
+ " lr=0.001,\n",
+ " optimizer=optim.Adam,\n",
+ " criterion=nn.NLLLoss,\n",
+ " iterator_train=DataLoader,\n",
+ " iterator_train__shuffle=True,\n",
+ " iterator_train__batch_size=32,\n",
+ " iterator_train__collate_fn=pad_batch_partial,\n",
+ " iterator_train__num_workers=8,\n",
+ " iterator_valid=DataLoader,\n",
+ " iterator_valid__shuffle=False,\n",
+ " iterator_valid__batch_size=64,\n",
+ " iterator_valid__collate_fn=pad_batch_partial,\n",
+ " iterator_valid__num_workers=8,\n",
+ " train_split=skorch.dataset.CVSplit(.2), # NB: this witholds 20% of the training data for validation\n",
+ " module__n_filters=100,\n",
+ " module__filter_sizes=(2,3,4),\n",
+ " module__dropout=0.2,\n",
+ " module__pretrained_embeddings=TEXT.vocab.vectors,\n",
+ " verbose=2)\n",
+ "# getting the following error when trying to compute accuracy\n",
+ "# ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets\n",
+ "# callbacks=callbacks)"
+ ]
+ },
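+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an aside: a cleaner alternative, per the skorch discussion linked above, is to have `forward` return raw logits and pair them with `nn.CrossEntropyLoss`, which applies `log_softmax` internally. A minimal sketch, assuming a hypothetical `CNN` variant whose `forward` ends with `return self.fc(cat)` rather than a softmax:\n",
+    "\n",
+    "```python\n",
+    "# sketch only -- assumes CNN returns raw logits instead of softmax probabilities\n",
+    "skorch_model_logits = NeuralNetClassifier(\n",
+    "    CNN,\n",
+    "    criterion=nn.CrossEntropyLoss,  # = log_softmax + NLLLoss in one step\n",
+    "    optimizer=optim.Adam,\n",
+    "    lr=0.001,\n",
+    "    max_epochs=2,\n",
+    "    device=device,\n",
+    ")\n",
+    "```"
+   ]
+  },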
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " epoch train_loss valid_acc valid_loss dur\n",
+ "------- ------------ ----------- ------------ ------\n",
+ " 1 \u001b[36m0.6258\u001b[0m \u001b[32m0.7930\u001b[0m \u001b[35m0.5002\u001b[0m 2.0929\n",
+ " 2 \u001b[36m0.4405\u001b[0m \u001b[32m0.8250\u001b[0m \u001b[35m0.3986\u001b[0m 1.9010\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[initialized](\n",
+ " module_=CNN(\n",
+ " (embedding): Embedding(25002, 100)\n",
+ " (conv_0): Conv1d(1, 100, kernel_size=(2, 100), stride=(1,))\n",
+ " (conv_1): Conv1d(1, 100, kernel_size=(3, 100), stride=(1,))\n",
+ " (conv_2): Conv1d(1, 100, kernel_size=(4, 100), stride=(1,))\n",
+ " (fc): Linear(in_features=300, out_features=2, bias=True)\n",
+ " (dropout): Dropout(p=0.2, inplace=False)\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "skorch_model.fit(train_dataset, y=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://github.com/skorch-dev/skorch/issues/641\n",
+ "\n",
+ "# skorch_model.score(test_dataset)\n",
+ "# TypeError: score() missing 1 required positional argument: 'y'\n",
+ "# skorch_model.score(test_dataset, y=None)\n",
+ "# ValueError: Expected array-like (array or non-string sequence), got None\n",
+ "\n",
+ "# can monkey patch skorch_model to achieve native scoring\n",
+ "# def score(self, X, y=None): \n",
+ "# ds = self.get_dataset(X) \n",
+ "# target_iterator = self.get_iterator(ds, training=False) \n",
+ " \n",
+ "# y_true = np.concatenate([skorch.utils.to_numpy(y) for _, y in target_iterator]) \n",
+ "# y_pred = self.predict(X)\n",
+ " \n",
+ "# return accuracy_score(y_true, y_pred) "
+ ]
+ },
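+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, attaching the monkey-patched `score` from the commented-out code above would look roughly like this (a sketch, not part of the recorded run):\n",
+    "\n",
+    "```python\n",
+    "import types\n",
+    "\n",
+    "def score(self, X, y=None):\n",
+    "    # iterate over the dataset once to collect the true labels\n",
+    "    ds = self.get_dataset(X)\n",
+    "    target_iterator = self.get_iterator(ds, training=False)\n",
+    "    y_true = np.concatenate([skorch.utils.to_numpy(y) for _, y in target_iterator])\n",
+    "    y_pred = self.predict(X)\n",
+    "    return accuracy_score(y_true, y_pred)\n",
+    "\n",
+    "# bind the function as a method on this particular instance\n",
+    "skorch_model.score = types.MethodType(score, skorch_model)\n",
+    "print(skorch_model.score(test_dataset))\n",
+    "```"
+   ]
+  },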
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# score manually\n",
+ "test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False, collate_fn=pad_batch_partial, num_workers=8)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.8006\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test set accuracy\n",
+ "test_preds = skorch_model.predict(test_dataset)\n",
+ "processed_test_data = next(iter(test_dataloader))\n",
+ "test_labels = processed_test_data[1].numpy()\n",
+ "print(accuracy_score(test_labels, test_preds))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([0, 1]), array([2554, 2446]))"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# random guessing would 50% accuracy so the model is indeed training well\n",
+ "np.unique(test_labels, return_counts=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NB: this has no effect on GPU memory usage. If I keyboard interrupt, the workers get\n",
+ "# restarted and memory usage goes down. Deleting these \"handler\" objects doesn't delete\n",
+ "# GPU memory references on the workers. \n",
+ "# del skorch_model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Grid search with Skorch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define parameter grid\n",
+ "params = {'module__filter_sizes': [(1, 2, 3), (2, 3, 4), (3, 4, 5)], \n",
+ " 'module__n_filters': [25, 50, 100],\n",
+ " 'module__dropout': loguniform(1e-1, 3e-1),\n",
+ " 'batch_size': [32, 64],\n",
+ " }\n",
+ "\n",
+ "skorch_search = RandomizedSearchCV(skorch_model, params, n_iter=2, cv=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This errors out with: TypeError: fit() missing 1 required positional argument: 'y'\n",
+ "# skorch_search.fit(train_dataset, y=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://github.com/skorch-dev/skorch/issues/605#issuecomment-650580286"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ValueError: Dataset does not have consistent lengths.\n",
+ "# dummy_y = np.zeros((len(train_dataset)))\n",
+ "# skorch_search.fit(train_dataset, y=dummy_y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ValueError: Dataset does not have consistent lengths.\n",
+ "# y = torch.cat([LABEL.process([pair[1]]) for pair in train_dataset]).numpy()\n",
+ "# skorch_search.fit(train_dataset, y=y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ValueError: Dataset does not have consistent lengths.\n",
+ "# skorch_search.fit(train_dataset, y=SliceDataset(train_dataset, idx=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Grid search with Hyperband"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is a really unfortunate hack to make deep learning batching semantics work with `Skorch` and `Dask`. The downside here is that we're no longer padding to the longest sequence in the batch, rather we're padding to the longest sequence in the *dataset*, which results in signifcantly more computation and thus significantly more time to train a model.\n",
+ "\n",
+ "Our solution was to set a max sequence length but that's not an ideal solution since you're still performing extra computation and accuracy does suffer."
+ ]
+ },
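+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the trade-off concrete, here is a small illustrative sketch (the sequence lengths are made up):\n",
+    "\n",
+    "```python\n",
+    "# per-batch padding: each batch is only padded to its own longest sequence\n",
+    "batch_lengths = [40, 55, 62]          # token counts in one batch\n",
+    "per_batch_width = max(batch_lengths)  # 62 columns of convolution work\n",
+    "\n",
+    "# dataset-level padding (what the Dask/Skorch setup below forces on us):\n",
+    "# every batch is padded out to FIX_LENGTH, mostly with <pad> tokens\n",
+    "fixed_width = 512\n",
+    "print(fixed_width / per_batch_width)  # ~8x the computation for this batch\n",
+    "```"
+   ]
+  },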
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train=True shuffles the data\n",
+ "train_iter_skorch = torchtext.data.Iterator(train, batch_size=len(train), train=True, sort=False, device='cpu')\n",
+ "test_iter_skorch = torchtext.data.Iterator(test, batch_size=len(test), train=False, sort=False, device='cpu')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# takes some time to numericalize the whole dataset\n",
+ "\n",
+ "# also notice that skorch and dask expect numpy arrays, which isn't ideal since it ties you to the cpu.\n",
+ "# meanwhile, projects like https://rapids.ai/ are moving toward all GPU computation, avoiding the cpu altogether.\n",
+ "for batch in train_iter_skorch:\n",
+ " X_train = batch.text.numpy()\n",
+ " y_train = batch.label.numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for batch in test_iter_skorch:\n",
+ " X_test = batch.text.numpy()\n",
+ " y_test = batch.label.numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5000, 512)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# notice how awfully large the second dimension is\n",
+ "X_train.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://ml.dask.org/hyper-parameter-search.html#hyperband-parameters-rule-of-thumb\n",
+ "EPOCHS = 10\n",
+ "NUM_TRAINING_EXAMPLES = len(train)*.8\n",
+ "n_examples = EPOCHS * NUM_TRAINING_EXAMPLES\n",
+ "n_params = 12\n",
+ "\n",
+ "# it's not immediately obvious to beginners how all these parameters interact with each other\n",
+ "max_iter = n_params\n",
+ "chunk_size = n_examples // n_params"
+ ]
+ },
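+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Plugging in this run's numbers: `NUM_TRAINING_EXAMPLES = 5000 * 0.8 = 4000`, so `n_examples = 10 * 4000 = 40000`, and with `n_params = 12` we get `max_iter = 12` and `chunk_size = 40000 // 12 = 3333`. Intuitively: each `partial_fit` call feeds a model one chunk of ~3333 examples, and the longest-surviving models receive `max_iter` such calls, i.e. roughly the `n_examples` a single model needs to converge."
+   ]
+  },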
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# suppose we want to set max_iter to be the commensurate with the number of examples required\n",
+ "# for the model converge (as cited in the documentation)\n",
+ "\n",
+ "# it's a bit unclear how n_params relates to BOTH the number of data points required\n",
+ "# for the model to converge AND how many hyperparameters to try out (i.e. n_iter in RandomizedSearchCV)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Chunk size: 3333.0\n",
+ "Total chunks: 2\n",
+ "Last chunk size: 1667.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# choose chunk size so that the remainder is not a tiny number\n",
+ "print(f'Chunk size: {chunk_size}')\n",
+ "print(f'Total chunks: {math.ceil(len(train) / chunk_size)}')\n",
+ "last_chunk_size = len(train) % chunk_size\n",
+ "if last_chunk_size == 0: # i.e. chunk_size evenly divides X_train\n",
+ " last_chunk_size = chunk_size\n",
+ "print(f'Last chunk size: {last_chunk_size}')\n",
+ "\n",
+ "assert (len(train) % chunk_size > 10 or len(train) % chunk_size == 0), 'Choose another chunk size'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = da.from_array(X_train, chunks=(chunk_size, X_train.shape[-1]))\n",
+ "y = da.from_array(y_train, chunks=(chunk_size))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " | Array | Chunk | \n",
+ " \n",
+ " \n",
+ " Bytes | 20.48 MB | 13.65 MB | \n",
+ " Shape | (5000, 512) | (3333, 512) | \n",
+ " Count | 3 Tasks | 2 Chunks | \n",
+ " Type | int64 | numpy.ndarray | \n",
+ " \n",
+ " \n",
+ " | \n",
+ "\n",
+ "\n",
+ " | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "dask.array"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "TLDR; you can't use dask arrays with `torch.utils.data.Dataloader`, which means you have to do all your data preparation ahead of time"
+ ]
+ },
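+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the constraint and the workaround this notebook uses (illustrative, not part of the recorded run):\n",
+    "\n",
+    "```python\n",
+    "# slicing a dask array yields another lazy dask array, which\n",
+    "# DataLoader's default_collate cannot convert into a torch.Tensor\n",
+    "chunk = X[:32]  # still a dask.array; nothing is materialized yet\n",
+    "\n",
+    "# workaround: materialize everything to numpy ahead of time,\n",
+    "# then hand torch the concrete arrays\n",
+    "materialized = chunk.compute()            # numpy.ndarray\n",
+    "tensors = torch.from_numpy(materialized)  # this works\n",
+    "```"
+   ]
+  },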
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# raw_train_dataset = [x for x in train_dataset]\n",
+ "# raw_train_dataset_array = np.array(raw_train_dataset, dtype=object)\n",
+ "# dask_dataset = da.from_array(raw_train_dataset_array, chunks=(chunk_size))\n",
+ "# dask_dataset[0].compute()\n",
+ "\n",
+ "# TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found \n",
+ "# data_iter = DataLoader(dask_dataset)\n",
+ "# next(iter(data_iter))\n",
+ "\n",
+ "# # TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object\n",
+ "# np_data_iter = DataLoader(raw_train_dataset_array)\n",
+ "# next(iter(np_data_iter))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reinitialize and set train_split=None to let hyperband handle validation set splitting\n",
+ "skorch_model = NeuralNetClassifier(\n",
+ " CNN,\n",
+ " device=device,\n",
+ " lr=0.001,\n",
+ " optimizer=optim.Adam,\n",
+ " criterion=nn.NLLLoss,\n",
+ " iterator_train__batch_size=32,\n",
+ " iterator_valid__batch_size=64,\n",
+ " train_split=None, # let hyperband handle it\n",
+ " module__n_filters=100,\n",
+ " module__filter_sizes=(2, 3, 4),\n",
+ " module__dropout=0.2,\n",
+ " module__pretrained_embeddings=TEXT.vocab.vectors,\n",
+ " # module__TEXT=TEXT,\n",
+ " batch_size=32,\n",
+ " verbose=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define parameter grid\n",
+ "params = {'module__filter_sizes': [(1, 2, 3), (2, 3, 4), (3, 4, 5)], \n",
+ " 'module__n_filters': [25, 50, 100],\n",
+ " 'module__dropout': loguniform(1e-1, 3e-1),\n",
+ " 'batch_size': [32, 64],\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "search = HyperbandSearchCV(\n",
+ " skorch_model,\n",
+ " params,\n",
+ " max_iter=max_iter,\n",
+ " verbose=True,\n",
+ " test_size=0.2 # validation size\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "85"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.metadata[\"partial_fit_calls\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "17"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.metadata['n_models']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# to clear up any confusion, every time partial_fit is called, we're passing in chunk_size number of\n",
+ "# data points. Then skorch handles the batch size either by being set explicitly or as part of the param grid.\n",
+ "\n",
+ "# to compare this grid search to number of epochs, we have 26 partial_fit calls * 10k data points = 260k examples\n",
+ "# with a training set size of 25k * .8 = 20k data points, this is 13 epochs!\n",
+ "# considering that it takes approximately 5 epochs to train a model, you would get through less than 3 sets of \n",
+ "# hyperparameters if manually searching. Instead we'll search through ~5."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running training on a Nvidia Tesla T4\\ "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[CV, bracket=2] creating 9 models\n",
+ "[CV, bracket=1] creating 5 models\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.7/site-packages/distributed/worker.py:3351: UserWarning: Large object of size 10.00 MB detected in task graph: \n",
+ " [[u ... .0000]]),\n",
+ "), 0]\n",
+ "Consider scattering large objects ahead of time\n",
+ "with client.scatter to reduce scheduler burden and \n",
+ "keep data on workers\n",
+ "\n",
+ " future = client.submit(func, big_data) # bad\n",
+ "\n",
+ " big_future = client.scatter(big_data) # good\n",
+ " future = client.submit(func, big_future) # good\n",
+ " % (format_bytes(len(b)), s)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[CV, bracket=0] creating 3 models\n",
+ "[CV, bracket=0] For training there are between 1333 and 2666 examples in each chunk\n",
+ "[CV, bracket=1] For training there are between 1333 and 2666 examples in each chunk\n",
+ "[CV, bracket=2] For training there are between 1333 and 2666 examples in each chunk\n",
+ "[CV, bracket=0] validation score of 0.7982 received after 1 partial_fit calls\n",
+ "[CV, bracket=1] validation score of 0.8032 received after 1 partial_fit calls\n",
+ "[CV, bracket=2] validation score of 0.7842 received after 1 partial_fit calls\n",
+ "[CV, bracket=0] validation score of 0.8551 received after 12 partial_fit calls\n",
+ "[CV, bracket=1] validation score of 0.8322 received after 4 partial_fit calls\n",
+ "[CV, bracket=2] validation score of 0.8212 received after 3 partial_fit calls\n",
+ "[CV, bracket=1] validation score of 0.8162 received after 12 partial_fit calls\n",
+ "[CV, bracket=2] validation score of 0.8062 received after 9 partial_fit calls\n",
+ "Time to complete grid search: 372.02 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "# notice how the number of training datapoints relates to the chunk size and our test_size\n",
+ "# Train set chunk size: 800 = 1000*(1-.2)\n",
+ "# Validation set chunk size: 200 = 1000*.2\n",
+ "start = time.time()\n",
+ "search.fit(X, y)\n",
+ "end = time.time()\n",
+ "duration = round(end - start, 2)\n",
+ "print(f'Time to complete grid search: {duration} seconds')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Integration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`HyperbandSearchCV` follows the Scikit-learn API and mirrors Scikit-learn's `RandomizedSearchCV`. This means that it \"just works\". All the Scikit-learn attributes and methods are available:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8551448551448552"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.best_score_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[initialized](\n",
+ " module_=CNN(\n",
+ " (embedding): Embedding(25002, 100)\n",
+ " (conv_0): Conv1d(1, 50, kernel_size=(1, 100), stride=(1,))\n",
+ " (conv_1): Conv1d(1, 50, kernel_size=(2, 100), stride=(1,))\n",
+ " (conv_2): Conv1d(1, 50, kernel_size=(3, 100), stride=(1,))\n",
+ " (fc): Linear(in_features=150, out_features=2, bias=True)\n",
+ " (dropout): Dropout(p=0.12476236679704862, inplace=False)\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.best_estimator_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'param_module__n_filters': array([100, 100, 25, 50, 25, 25, 25, 50, 50, 25, 100, 25, 100,\n",
+ " 100, 50, 100, 100]),\n",
+ " 'param_batch_size': array([32, 64, 64, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 32, 32, 64, 32]),\n",
+ " 'param_module__dropout': array([0.14233276, 0.15454961, 0.273504 , 0.26444226, 0.13400896,\n",
+ " 0.19302409, 0.15142034, 0.23147478, 0.28516434, 0.15385363,\n",
+ " 0.18147304, 0.29957988, 0.10990513, 0.29003914, 0.12476237,\n",
+ " 0.14516028, 0.10459089]),\n",
+ " 'std_partial_fit_time': array([0. , 1.42792781, 0. , 0. , 0.8889792 ,\n",
+ " 0.94623792, 0. , 0. , 0. , 0.84567356,\n",
+ " 1.6097213 , 0.79452193, 1.30084145, 1.54377656, 0.04432201,\n",
+ " 0.26346576, 0.31667006]),\n",
+ " 'param_module__filter_sizes': array([[3, 4, 5],\n",
+ " [2, 3, 4],\n",
+ " [1, 2, 3],\n",
+ " [2, 3, 4],\n",
+ " [2, 3, 4],\n",
+ " [3, 4, 5],\n",
+ " [1, 2, 3],\n",
+ " [3, 4, 5],\n",
+ " [1, 2, 3],\n",
+ " [2, 3, 4],\n",
+ " [2, 3, 4],\n",
+ " [1, 2, 3],\n",
+ " [1, 2, 3],\n",
+ " [3, 4, 5],\n",
+ " [1, 2, 3],\n",
+ " [3, 4, 5],\n",
+ " [3, 4, 5]]),\n",
+ " 'test_score': array([0.73626374, 0.80619381, 0.77822178, 0.77922078, 0.80619381,\n",
+ " 0.78821179, 0.75224775, 0.77322677, 0.77322677, 0.80819181,\n",
+ " 0.81718282, 0.7982018 , 0.82817183, 0.81618382, 0.85514486,\n",
+ " 0.84615385, 0.84515485]),\n",
+ " 'model_id': array(['bracket=2-0', 'bracket=2-1', 'bracket=2-2', 'bracket=2-3',\n",
+ " 'bracket=2-4', 'bracket=2-5', 'bracket=2-6', 'bracket=2-7',\n",
+ " 'bracket=2-8', 'bracket=1-0', 'bracket=1-1', 'bracket=1-2',\n",
+ " 'bracket=1-3', 'bracket=1-4', 'bracket=0-0', 'bracket=0-1',\n",
+ " 'bracket=0-2'], dtype='\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " param_module__n_filters | \n",
+ " param_batch_size | \n",
+ " param_module__dropout | \n",
+ " std_partial_fit_time | \n",
+ " param_module__filter_sizes | \n",
+ " test_score | \n",
+ " model_id | \n",
+ " bracket | \n",
+ " rank_test_score | \n",
+ " std_score_time | \n",
+ " params | \n",
+ " mean_partial_fit_time | \n",
+ " mean_score_time | \n",
+ " partial_fit_calls | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100 | \n",
+ " 32 | \n",
+ " 0.142333 | \n",
+ " 0.000000 | \n",
+ " [3, 4, 5] | \n",
+ " 0.736264 | \n",
+ " bracket=2-0 | \n",
+ " 2 | \n",
+ " 9 | \n",
+ " 0.000000 | \n",
+ " {'batch_size': 32, 'module__dropout': 0.142332... | \n",
+ " 3.082313 | \n",
+ " 0.165004 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 64 | \n",
+ " 0.154550 | \n",
+ " 1.427928 | \n",
+ " [2, 3, 4] | \n",
+ " 0.806194 | \n",
+ " bracket=2-1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0.003680 | \n",
+ " {'batch_size': 64, 'module__dropout': 0.154549... | \n",
+ " 4.203769 | \n",
+ " 0.176866 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 25 | \n",
+ " 64 | \n",
+ " 0.273504 | \n",
+ " 0.000000 | \n",
+ " [1, 2, 3] | \n",
+ " 0.778222 | \n",
+ " bracket=2-2 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 0.000000 | \n",
+ " {'batch_size': 64, 'module__dropout': 0.273504... | \n",
+ " 1.613707 | \n",
+ " 0.053710 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 50 | \n",
+ " 32 | \n",
+ " 0.264442 | \n",
+ " 0.000000 | \n",
+ " [2, 3, 4] | \n",
+ " 0.779221 | \n",
+ " bracket=2-3 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 0.000000 | \n",
+ " {'batch_size': 32, 'module__dropout': 0.264442... | \n",
+ " 1.775565 | \n",
+ " 0.075516 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 25 | \n",
+ " 32 | \n",
+ " 0.134009 | \n",
+ " 0.888979 | \n",
+ " [2, 3, 4] | \n",
+ " 0.806194 | \n",
+ " bracket=2-4 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0.000247 | \n",
+ " {'batch_size': 32, 'module__dropout': 0.134008... | \n",
+ " 2.550697 | \n",
+ " 0.062964 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " param_module__n_filters param_batch_size param_module__dropout \\\n",
+ "0 100 32 0.142333 \n",
+ "1 100 64 0.154550 \n",
+ "2 25 64 0.273504 \n",
+ "3 50 32 0.264442 \n",
+ "4 25 32 0.134009 \n",
+ "\n",
+ " std_partial_fit_time param_module__filter_sizes test_score model_id \\\n",
+ "0 0.000000 [3, 4, 5] 0.736264 bracket=2-0 \n",
+ "1 1.427928 [2, 3, 4] 0.806194 bracket=2-1 \n",
+ "2 0.000000 [1, 2, 3] 0.778222 bracket=2-2 \n",
+ "3 0.000000 [2, 3, 4] 0.779221 bracket=2-3 \n",
+ "4 0.888979 [2, 3, 4] 0.806194 bracket=2-4 \n",
+ "\n",
+ " bracket rank_test_score std_score_time \\\n",
+ "0 2 9 0.000000 \n",
+ "1 2 1 0.003680 \n",
+ "2 2 5 0.000000 \n",
+ "3 2 4 0.000000 \n",
+ "4 2 1 0.000247 \n",
+ "\n",
+ " params mean_partial_fit_time \\\n",
+ "0 {'batch_size': 32, 'module__dropout': 0.142332... 3.082313 \n",
+ "1 {'batch_size': 64, 'module__dropout': 0.154549... 4.203769 \n",
+ "2 {'batch_size': 64, 'module__dropout': 0.273504... 1.613707 \n",
+ "3 {'batch_size': 32, 'module__dropout': 0.264442... 1.775565 \n",
+ "4 {'batch_size': 32, 'module__dropout': 0.134008... 2.550697 \n",
+ "\n",
+ " mean_score_time partial_fit_calls \n",
+ "0 0.165004 1 \n",
+ "1 0.176866 9 \n",
+ "2 0.053710 1 \n",
+ "3 0.075516 1 \n",
+ "4 0.062964 3 "
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cv_results = pd.DataFrame(search.cv_results_)\n",
+ "cv_results.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8106"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.score(X_test, y_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " | Array | Chunk | \n",
+ " \n",
+ " \n",
+ " Bytes | 40.00 kB | 40.00 kB | \n",
+ " Shape | (5000,) | (5000,) | \n",
+ " Count | 2 Tasks | 1 Chunks | \n",
+ " Type | int64 | numpy.ndarray | \n",
+ " \n",
+ " \n",
+ " | \n",
+ "\n",
+ "\n",
+ " | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "dask.array<_predict, shape=(5000,), dtype=int64, chunksize=(5000,), chunktype=numpy.ndarray>"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0, 1, 0, ..., 1, 0, 1])"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "search.predict(X_test).compute()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It also has some other attributes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " model_id | \n",
+ " params | \n",
+ " partial_fit_calls | \n",
+ " partial_fit_time | \n",
+ " score | \n",
+ " score_time | \n",
+ " elapsed_wall_time | \n",
+ " bracket | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " bracket=0-0 | \n",
+ " {'batch_size': 32, 'module__dropout': 0.124762... | \n",
+ " 1 | \n",
+ " 3.679612 | \n",
+ " 0.798202 | \n",
+ " 0.069165 | \n",
+ " 30.430613 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " bracket=0-1 | \n",
+ " {'batch_size': 64, 'module__dropout': 0.145160... | \n",
+ " 1 | \n",
+ " 6.269561 | \n",
+ " 0.791209 | \n",
+ " 0.167421 | \n",
+ " 30.430616 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " bracket=0-2 | \n",
+ " {'batch_size': 32, 'module__dropout': 0.104590... | \n",
+ " 1 | \n",
+ " 6.193064 | \n",
+ " 0.789211 | \n",
+ " 0.166522 | \n",
+ " 30.430617 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " bracket=1-0 | \n",
+ " {'batch_size': 32, 'module__dropout': 0.153853... | \n",
+ " 1 | \n",
+ " 1.683563 | \n",
+ " 0.787213 | \n",
+ " 0.062644 | \n",
+ " 36.104408 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " bracket=1-1 | \n",
+ " {'batch_size': 64, 'module__dropout': 0.181473... | \n",
+ " 1 | \n",
+ " 3.002587 | \n",
+ " 0.803197 | \n",
+ " 0.171636 | \n",
+ " 36.104410 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " model_id params \\\n",
+ "0 bracket=0-0 {'batch_size': 32, 'module__dropout': 0.124762... \n",
+ "1 bracket=0-1 {'batch_size': 64, 'module__dropout': 0.145160... \n",
+ "2 bracket=0-2 {'batch_size': 32, 'module__dropout': 0.104590... \n",
+ "3 bracket=1-0 {'batch_size': 32, 'module__dropout': 0.153853... \n",
+ "4 bracket=1-1 {'batch_size': 64, 'module__dropout': 0.181473... \n",
+ "\n",
+ " partial_fit_calls partial_fit_time score score_time \\\n",
+ "0 1 3.679612 0.798202 0.069165 \n",
+ "1 1 6.269561 0.791209 0.167421 \n",
+ "2 1 6.193064 0.789211 0.166522 \n",
+ "3 1 1.683563 0.787213 0.062644 \n",
+ "4 1 3.002587 0.803197 0.171636 \n",
+ "\n",
+ " elapsed_wall_time bracket \n",
+ "0 30.430613 0 \n",
+ "1 30.430616 0 \n",
+ "2 30.430617 0 \n",
+ "3 36.104408 1 \n",
+ "4 36.104410 1 "
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hist = pd.DataFrame(search.history_)\n",
+ "hist.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This illustrates the history after every `partial_fit` call. There's also an attributed `model_history_` that records the history for each model (it's a reorganization of `history_`)."
+ ]
+ },
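+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For example, a sketch of pulling one model's score trajectory out of `model_history_` (using the `'bracket=0-0'` model ID seen in the history above):\n",
+    "\n",
+    "```python\n",
+    "# model_history_ maps model_id -> list of history records for that model\n",
+    "records = search.model_history_['bracket=0-0']\n",
+    "scores = [r['score'] for r in records]  # validation score per partial_fit call\n",
+    "print(scores)\n",
+    "```"
+   ]
+  },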
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Learn more"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook covered basic usage `HyperbandSearchCV`. The following documentation and resources might be useful to learn more about `HyperbandSearchCV`, including some of the finer use cases:\n",
+ "\n",
+ "* [A talk](https://www.youtube.com/watch?v=x67K9FiPFBQ) introducing `HyperbandSearchCV` to the SciPy 2019 audience and the [corresponding paper](https://conference.scipy.org/proceedings/scipy2019/pdfs/scott_sievert.pdf)\n",
+ "* [HyperbandSearchCV's documentation](https://ml.dask.org/modules/generated/dask_ml.model_selection.HyperbandSearchCV.html)\n",
+ "\n",
+ "Performance comparisons can be found in the SciPy 2019 talk/paper."
+ ]
+ }
+ ],
+ "metadata": {
+ "environment": {
+ "name": "pytorch-gpu.1-4.m46",
+ "type": "gcloud",
+ "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}