Commit 0688cc6

🦾 new loss API
1 parent 1cb4457 commit 0688cc6

4 files changed: +251, -50 lines

README.md (+89, -4)

@@ -1,4 +1,4 @@
-# layer-to-layer-pytorch
+# L2L execution algorithm PyTorch [WIP]
 
 <div align="center">
 
@@ -12,12 +12,97 @@
 [![Semantic Versions](https://img.shields.io/badge/%F0%9F%9A%80-semantic%20versions-informational.svg)](https://github.com/TezRomacH/layer-to-layer-pytorch/releases)
 [![License](https://img.shields.io/github/license/TezRomacH/layer-to-layer-pytorch)](https://github.com/TezRomacH/layer-to-layer-pytorch/blob/master/LICENSE)
 
-PyTorch implementation of L2L execution algorithm
+PyTorch implementation of the L2L execution algorithm from the paper [Training Large Neural Networks with Constant Memory using a New Execution Algorithm](https://arxiv.org/abs/2002.05645)
 </div>
 
-## 🚀 Features [WIP]
+## [Not ready yet]
+
+## 🚀 Example
+
+You need to define a torch model whose layers are all registered in an `nn.ModuleList`.
+
+For example:
+
+```python
+from typing import Optional
+
+import torch
+from torch import nn, optim
+
+
+class M(nn.Module):
+    def __init__(self, depth: int, dim: int, hidden_dim: Optional[int] = None):
+        super().__init__()
+        hidden_dim = hidden_dim or dim
+        self.layers = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.Linear(dim, hidden_dim),
+                    nn.BatchNorm1d(hidden_dim),
+                    nn.LeakyReLU(),
+                )
+            ]
+            + [
+                nn.Sequential(
+                    nn.Linear(hidden_dim, hidden_dim),
+                    nn.BatchNorm1d(hidden_dim),
+                    nn.LeakyReLU(),
+                )
+                for _ in range(depth)
+            ]
+            + [nn.Linear(hidden_dim, dim), nn.Sigmoid()]
+        )
+
+    def forward(self, batch: torch.Tensor) -> torch.Tensor:
+        x = batch
+        for layer in self.layers:
+            x = layer(x)
+
+        return x
 
-## Installation [Not yet ready]
+```
+
+Then you can use the L2L wrapper over this model:
+
+```python
+from layer_to_layer_pytorch.l2l import Layer2Layer
+
+model = M(depth=5, dim=40).train()  # on CPU
+
+l2l_model = Layer2Layer(
+    model,
+    layers_attr="layers",  # name of the attribute that holds the ModuleList
+    microbatch_size=100,   # size of a microbatch inside a minibatch, as in the original paper
+    verbose=False,         # set to True to show tqdm progress bars
+)
+```
+
+And train it (almost) like a regular torch model:
+
+```python
+from tqdm.auto import tqdm, trange
+
+x = torch.rand(1_000, 40)  # on CPU
+y = torch.rand(1_000, 40)  # on CPU
+
+losses = []
+loss_fn = nn.MSELoss(reduction="sum")  # L2L averages the loss over microbatches itself, so we only store the values
+
+optimizer = optim.AdamW(l2l_model.main_model.parameters(), lr=0.001)  # the optimizer works on the main (CPU) copy of the model
+
+for i in trange(5000):
+    l2l_model.zero_grad()
+    l2l_model.forward(x)
+
+    with l2l_model.l2l_loss(loss_fn=loss_fn) as loss:  # APEX-like loss API
+        loss_value = loss(x, y)
+        loss.backward()
+
+        if i % 50 == 0:
+            tqdm.write(f"[{i}] loss = {loss_value.item()}")
+        losses.append(loss_value.item())
+
+    optimizer.step()
+```
+
+## Installation
 
 ```bash
 pip install layer-to-layer-pytorch

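The `microbatch_size=100` argument in the README example controls how each minibatch is chopped up: every layer processes the minibatch in chunks of 100 rows, so GPU memory scales with the microbatch rather than the full batch. A minimal, standalone illustration of that split with the shapes used above (plain PyTorch, not part of the library):

```python
import torch

x = torch.rand(1_000, 40)      # minibatch from the README example
microbatches = x.split(100)    # microbatch_size=100 -> 10 chunks per layer

assert len(microbatches) == 10
assert microbatches[0].shape == (100, 40)
```

These 10 chunks are what the `num_steps = input.shape[0] // microbatch_size` expressions in `l2l.py` count.
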
layer_to_layer_pytorch/__init__.py (+1, -0)

@@ -13,4 +13,5 @@
     __version__ = "unknown"
 
 from layer_to_layer_pytorch.l2l import Layer2Layer
+from layer_to_layer_pytorch.loss import L2LLoss
 from layer_to_layer_pytorch.types import Device, LossFn, TensorOrTensorArray

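With the added import, `L2LLoss` is re-exported from the package root alongside `Layer2Layer`. A quick sanity check, assuming a build that includes this commit is installed:

```python
# Both import paths should resolve to the same class after this commit.
from layer_to_layer_pytorch import L2LLoss
from layer_to_layer_pytorch.loss import L2LLoss as L2LLossDirect

assert L2LLoss is L2LLossDirect
```
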
layer_to_layer_pytorch/l2l.py (+80, -36)

@@ -7,6 +7,7 @@
 from torch import nn
 
 from layer_to_layer_pytorch.helpers import enumerator, iterator, zipper
+from layer_to_layer_pytorch.loss import L2LLoss
 from layer_to_layer_pytorch.types import Device, LossFn, TensorOrTensorArray
 
 
@@ -54,7 +55,7 @@ def zero_grad(self) -> None:
 
     @torch.no_grad()
     def forward(self, batch: torch.Tensor, **kwargs) -> torch.Tensor:
-        layers: nn.ModuleList = getattr(self.main_model, self.layers_attr)
+        layers: nn.ModuleList = self._get_layers()
 
         # layer by layer forward pass. only activations are stored
         for idx, l in enumerator(
@@ -72,13 +73,7 @@ def forward(self, batch: torch.Tensor, **kwargs) -> torch.Tensor:
             else:
                 input = self._activations[idx - 1]
 
-            # forward with microbatching
-            batch_size = input.shape[0]
-            microbatch_size = (
-                batch_size
-                if self.microbatch_size is None
-                else self.microbatch_size
-            )
+            microbatch_size = self._get_microbatch_size(input)
             num_steps: int = input.shape[0] // microbatch_size
 
             for microbatch in iterator(
@@ -103,12 +98,13 @@ def calculate_gradients(
         target: torch.Tensor,
         loss_fn: LossFn,
         loss_kwargs: dict = None,
+        skip_last_layer: bool = False,
         **forward_kwargs,
-    ) -> torch.Tensor:
+    ) -> Optional[torch.Tensor]:
         if loss_kwargs is None:
             loss_kwargs = {}
         # layer by layer backward pass (in reverse order)
-        layers: nn.ModuleList = getattr(self.main_model, self.layers_attr)
+        layers: nn.ModuleList = self._get_layers()
         losses: List[torch.Tensor] = []
         num_steps_in_loss: int = 1
 
@@ -122,26 +118,38 @@ def calculate_gradients(
             layer = copy.deepcopy(l).to(self.gpu_device)
             f_idx: int = self.num_layers - idx - 1
 
+            if idx == 0 and skip_last_layer:
+                microbatch_size = self._get_microbatch_size(
+                    self._activations[f_idx]
+                )
+                num_steps: int = (
+                    self._activations[f_idx].shape[0] // microbatch_size
+                )
+                self._copy_grad_to_main_model(
+                    idx,
+                    num_steps,
+                    local_params=layer.parameters(),
+                    main_params=layers[f_idx].parameters(),
+                )
+                continue
+
             for param in layer.parameters():
                 param.requires_grad = True
 
             input: torch.Tensor
             output: torch.Tensor
 
-            if f_idx == 0:
+            if idx == 0:  # last layer
+                input = self._activations[f_idx]
+                output = target
+            elif f_idx == 0:  # first layer
                 input = batch
-                output = self._grads[idx - 1]
-            else:
+                output = self._activations[f_idx]
+            else:  # any other layer
                 input = self._activations[f_idx - 1]
-                output = target
-
-            batch_size = input.shape[0]
-            microbatch_size = (
-                batch_size
-                if self.microbatch_size is None
-                else self.microbatch_size
-            )
+                output = self._activations[f_idx]
 
+            microbatch_size = self._get_microbatch_size(input)
             num_steps: int = input.shape[0] // microbatch_size
             if idx == 0:
                 num_steps_in_loss = num_steps
@@ -160,7 +168,12 @@ def calculate_gradients(
 
                 microtarget = microtarget.to(self.gpu_device)
 
-                activation: torch.Tensor = layer(microbatch, **forward_kwargs)
+                if idx == 0:
+                    activation = microbatch
+                else:
+                    activation: torch.Tensor = layer(
+                        microbatch, **forward_kwargs
+                    )
 
                 if idx == 0:
                     loss = loss_fn(activation, microtarget, **loss_kwargs)
@@ -172,27 +185,58 @@
                     activation.backward(microtarget)
                     self._grads[idx].append(microbatch.grad.cpu())
 
-            for local_param, main_param in zip(
-                layer.parameters(), layers[f_idx].parameters()
-            ):
-                if main_param.grad is None:
-                    main_param.grad = local_param.grad.cpu() / num_steps
-                else:
-                    main_param.grad += local_param.grad.cpu() / num_steps
+            self._copy_grad_to_main_model(
+                idx,
+                num_steps,
+                local_params=layer.parameters(),
+                main_params=layers[f_idx].parameters(),
+            )
 
+        self._grads = list(reversed(self._grads))
+
+        if not skip_last_layer:
             with torch.no_grad():
-                self._grads[idx] = (
-                    torch.cat(self._grads[idx], dim=0).cpu() / num_steps
-                )
+                loss_value = torch.tensor(np.sum(losses) / num_steps_in_loss)
 
-        self._grads = list(reversed(self._grads))
-        with torch.no_grad():
-            loss_value = torch.tensor(np.sum(losses) / num_steps_in_loss)
+            return loss_value
 
-        return loss_value
+        return None
 
     def __call__(self, batch: torch.Tensor) -> torch.Tensor:
         return self.forward(batch)
 
+    def _get_microbatch_size(self, batch: torch.Tensor) -> int:
+        batch_size = batch.shape[0]
+        return (
+            batch_size if self.microbatch_size is None else self.microbatch_size
+        )
+
+    def _get_layers(self) -> nn.ModuleList:
+        return getattr(self.main_model, self.layers_attr)
+
+    def _copy_grad_to_main_model(
+        self, idx: int, num_steps: int, local_params, main_params
+    ):
+        for local_param, main_param in zip(local_params, main_params):
+            if main_param.grad is None:
+                main_param.grad = local_param.grad.cpu() / num_steps
+            else:
+                main_param.grad += local_param.grad.cpu() / num_steps
+
+        with torch.no_grad():
+            self._grads[idx] = (
+                torch.cat(self._grads[idx], dim=0).cpu() / num_steps
+            )
+
+    def l2l_loss(
+        self, loss_fn: LossFn, store_grad_on_calc: bool = True, **forward_kwargs
+    ) -> L2LLoss:
+        return L2LLoss(
+            model=self,
+            loss_fn=loss_fn,
+            store_grad_on_calc=store_grad_on_calc,
+            **forward_kwargs,
+        )
+
 
 __all__ = ["Layer2Layer"]

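The new `Layer2Layer.l2l_loss` method is a thin factory around `L2LLoss`. A minimal sketch of the two equivalent ways to obtain the loss object; the `Tiny` model below is hypothetical, and the wrapper construction only assumes the arguments already shown in the README example:

```python
import torch
from torch import nn

from layer_to_layer_pytorch.l2l import Layer2Layer
from layer_to_layer_pytorch.loss import L2LLoss


class Tiny(nn.Module):
    """Hypothetical two-layer model, only used to build a wrapper."""

    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(8, 8), nn.Linear(8, 8)])

    def forward(self, batch: torch.Tensor) -> torch.Tensor:
        x = batch
        for layer in self.layers:
            x = layer(x)
        return x


l2l_model = Layer2Layer(
    Tiny().train(), layers_attr="layers", microbatch_size=4, verbose=False
)
loss_fn = nn.MSELoss(reduction="sum")

# The factory method added in this commit ...
loss_a = l2l_model.l2l_loss(loss_fn=loss_fn)

# ... simply constructs L2LLoss with the wrapper as `model`:
loss_b = L2LLoss(model=l2l_model, loss_fn=loss_fn, store_grad_on_calc=True)
```

With `store_grad_on_calc=True` (the default), calling the loss object runs the last layer's backward pass immediately, which is why its `backward()` method passes `skip_last_layer=True` to `calculate_gradients`.
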
layer_to_layer_pytorch/loss.py (+81, -10)

@@ -1,14 +1,85 @@
-# from typing import Callable
+from typing import Callable, List
 
-# import torch
-# from torch import nn
+import numpy as np
+import torch
+from torch import nn
 
-# from layer_to_layer_pytorch.types import LossFn
-# from layer_to_layer_pytorch.l2l import Layer2Layer
+from layer_to_layer_pytorch.helpers import zipper
+from layer_to_layer_pytorch.types import LossFn
 
-# class L2LLoss:
-#     def __init__(self, model: Layer2Layer, loss_fn: LossFn):
-#         self.model = model
-#         self.loss_fn = loss_fn
 
-#     def __call__(self, batch: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+class L2LLoss:
+    def __init__(
+        self,
+        model,
+        loss_fn: LossFn,
+        store_grad_on_calc: bool = True,
+        **forward_kwargs,
+    ):
+        self.model = model
+        self.loss_fn = loss_fn
+        self.store_grad_on_calc = store_grad_on_calc
+        self.forward_kwargs = forward_kwargs or {}
+
+        self._batch = None
+        self._target = None
+
+    def __call__(
+        self, batch: torch.Tensor, target: torch.Tensor
+    ) -> torch.Tensor:
+        self._batch = batch
+        self._target = target
+
+        microbatch_size = self.model._get_microbatch_size(batch)
+        num_steps_in_loss = batch.shape[0] // microbatch_size
+        losses: List[torch.Tensor] = []
+
+        layer: nn.Module = self.model._get_layers()[-1].to(
+            self.model.gpu_device
+        )
+
+        for microbatch, microtarget in zipper(
+            batch.split(microbatch_size),
+            target.split(microbatch_size),
+            verbose=False,
+            desc="Microbatching",
+            total=num_steps_in_loss,
+            leave=False,
+        ):
+            microbatch = microbatch.to(self.model.gpu_device)
+            microbatch.requires_grad = True
+
+            microtarget = microtarget.to(self.model.gpu_device)
+
+            activation: torch.Tensor = layer(microbatch, **self.forward_kwargs)
+
+            loss = self.loss_fn(activation, microtarget)
+            losses.append(loss.item())
+
+            if self.store_grad_on_calc:
+                loss.backward()
+                self.model._grads[-1].append(microbatch.grad.cpu())
+
+        with torch.no_grad():
+            loss_value = torch.tensor(np.sum(losses) / num_steps_in_loss)
+
+        return loss_value
+
+    @torch.no_grad()
+    def backward(self) -> None:
+        self.model.calculate_gradients(
+            self._batch,
+            self._target,
+            loss_fn=self.loss_fn,
+            skip_last_layer=self.store_grad_on_calc,
+        )
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self._batch = None
+        self._target = None
+
+
+__all__ = ["L2LLoss"]

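This also explains the `nn.MSELoss(reduction="sum")` choice in the README example: `L2LLoss.__call__` sums the per-microbatch losses and divides by the number of microbatches (`np.sum(losses) / num_steps_in_loss`). A standalone sketch of the value it reports, in plain PyTorch only:

```python
import torch
from torch import nn

preds = torch.rand(1_000, 40)
target = torch.rand(1_000, 40)
microbatch_size = 100

loss_fn = nn.MSELoss(reduction="sum")
losses = [
    loss_fn(p, t).item()
    for p, t in zip(preds.split(microbatch_size), target.split(microbatch_size))
]

num_steps_in_loss = preds.shape[0] // microbatch_size
loss_value = sum(losses) / num_steps_in_loss  # what L2LLoss.__call__ returns

# i.e. the summed loss per microbatch, averaged over the number of microbatches
# (not the element-wise mean that reduction="mean" would give).
```
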