Skip to content

Commit 605deb8

Browse files
forklady42, Claude, and pre-commit-ci[bot]
authored
Add gradient checkpointing (#33)
Adds gradient checkpointing, which trades compute for memory, to residual blocks and upsampling layers. --------- Co-authored-by: Claude <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a84076f commit 605deb8

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

src/electrai/lightning.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def __init__(self, cfg):
1818
K1=int(cfg.kernel_size1),
1919
K2=int(cfg.kernel_size2),
2020
normalize=cfg.normalize,
21+
use_checkpoint=getattr(cfg, "use_checkpoint", True),
2122
)
2223
self.loss_fn = NormMAE()
2324

src/electrai/model/srgan_layernorm_pbc.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66

77
import torch
88
import torch.nn as nn
9+
from torch.utils.checkpoint import checkpoint
910

1011

1112
class ResidualBlock(nn.Module):
12-
def __init__(self, in_features, K=3):
13+
def __init__(self, in_features, K=3, use_checkpoint=True):
1314
super().__init__()
15+
self.use_checkpoint = use_checkpoint
1416
self.conv_block = nn.Sequential(
1517
nn.Conv3d(
1618
in_features,
@@ -34,7 +36,11 @@ def __init__(self, in_features, K=3):
3436
)
3537

3638
def forward(self, x):
37-
return x + self.conv_block(x)
39+
if self.use_checkpoint and self.training:
40+
# Use gradient checkpointing to save memory during training
41+
return x + checkpoint(self.conv_block, x, use_reentrant=False)
42+
else:
43+
return x + self.conv_block(x)
3844

3945

4046
class PixelShuffle3d(nn.Module):
@@ -67,16 +73,19 @@ def __init__(
6773
K1=5,
6874
K2=3,
6975
normalize=True,
76+
use_checkpoint=True,
7077
):
7178
"""
7279
This net upscales each axis by 2**n_upscale_layers
7380
C = channel size in most of layers
7481
K1 = kernel size in the first and last layers
7582
K2 = kernel size in Res blocks
83+
use_checkpoint = enable gradient checkpointing to save memory
7684
"""
7785
super().__init__()
7886
self.n_upscale_layers = n_upscale_layers
7987
self.normalize = normalize
88+
self.use_checkpoint = use_checkpoint
8089

8190
# First layer
8291
self.conv1 = nn.Sequential(
@@ -92,7 +101,10 @@ def __init__(
92101
)
93102

94103
# Residual blocks
95-
res_blocks = [ResidualBlock(C, K=K2) for _ in range(n_residual_blocks)]
104+
res_blocks = [
105+
ResidualBlock(C, K=K2, use_checkpoint=use_checkpoint)
106+
for _ in range(n_residual_blocks)
107+
]
96108
self.res_blocks = nn.Sequential(*res_blocks)
97109

98110
# Second conv layer post residual blocks

0 commit comments

Comments (0)