adding seeded dropout

azrael417 · azrael417 · commit 9f8e318ce086 · 2025-11-03T08:53:13.000-08:00
diff --git a/makani/models/common/context.py b/makani/models/common/context.py
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import nn
+from typing import Optional
+from contextlib import contextmanager
+
+@contextmanager
+def rng_context(cpu_rng: torch.Generator, device_rng: Optional[torch.Generator] = None):
+    """
+    Context manager for temporarily setting CPU and device RNG states.
+    
+    This context manager allows you to temporarily set specific RNG states
+    for reproducibility, then automatically restore the original global states.
+    
+    Parameters
+    ----------
+    cpu_rng_state : torch.Tensor
+        CPU RNG state to set temporarily
+    device_rng_state : torch.Tensor, optional
+        Device (CUDA) RNG state to set temporarily. Uses current device.
+        
+    Examples
+    --------
+    >>> # Save current states
+    >>> cpu_state = torch.get_rng_state()
+    >>> device_state = torch.cuda.get_rng_state()
+    >>> 
+    >>> # Later, temporarily use those states
+    >>> with rng_context(cpu_state, device_state):
+    >>>     # Code here uses the provided RNG states
+    >>>     x = torch.randn(10)
+    >>> # Original RNG states are restored here
+    """
+    
+    # Backup and set CPU RNG state
+    cpu_backup = torch.get_rng_state()
+    torch.set_rng_state(cpu_rng.get_state())
+    
+    # Backup and set device RNG state if provided
+    device_backup = None
+    if device_rng is not None and torch.cuda.is_available():
+        device_backup = torch.cuda.get_rng_state()
+        torch.cuda.set_rng_state(device_rng.get_state())
+    try:
+        yield
+
+    finally:
+        # Restore states
+        cpu_rng.set_state(torch.get_rng_state())
+        torch.set_rng_state(cpu_backup)
+        if device_backup is not None:
+            device_rng.set_state(torch.cuda.get_rng_state())
+            torch.cuda.set_rng_state(device_backup)
diff --git a/makani/models/common/layers.py b/makani/models/common/layers.py
@@ -19,6 +19,8 @@
 from torch.utils.checkpoint import checkpoint, checkpoint_sequential
 import math
 
+from makani.models.common.context import rng_context
+
 
 @torch.compile
 def drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
@@ -49,6 +51,26 @@ def forward(self, x):
         return drop_path(x, self.drop_prob, self.training)
 
 
+class SeededDropout(nn.Module):
+    def __init__(self, drop_prob=0.0, seed=333):
+        super(SeededDropout, self).__init__()
+        self.drop_prob = drop_prob
+        self.seed = seed
+        self.drop = nn.Dropout(p=self.drop_prob)
+
+        # set RNG states
+        self.rng_cpu = torch.Generator(device=torch.device("cpu"))
+        self.rng_cpu.manual_seed(seed)
+        if torch.cuda.is_available():
+            self.rng_gpu = torch.Generator(device=torch.cuda.current_device())
+            self.rng_gpu.manual_seed(seed)
+
+    def forward(self, x):
+        with rng_context(self.rng_cpu, self.rng_gpu):
+            xdrop = self.drop(x)
+        return xdrop
+
+
 class LayerScale(nn.Module):
     def __init__(self, num_chans=3, init_value=0.1):
         super().__init__()