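Some changes to the model and losses: split auxiliary channels into dynamic and static groups in get_channel_groups, register EnergyScoreLoss, make the Adam/AdamW eps configurable, pass update_state through the stepper forward, add "sst" to the automatic channel weights, and clean up trailing whitespace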

bonevbs · bonevbs · commit db49a71075fa · 2025-10-30T08:26:13.000-07:00
diff --git a/makani/models/networks/fourcastnet3.py b/makani/models/networks/fourcastnet3.py
diff --git a/makani/models/networks/pangu.py b/makani/models/networks/pangu.py
@@ -452,7 +452,7 @@ def forward(self, x: torch.Tensor, mask=None):
             x: input features with shape of (B * num_lon, num_pl*num_lat, N, C)
             mask: (0/-inf) mask with shape of (num_lon, num_pl*num_lat, Wpl*Wlat*Wlon, Wpl*Wlat*Wlon)
         """
-        
+
         B_, nW_, N, C = x.shape
         qkv = (
             self.qkv(x)
@@ -478,18 +478,18 @@ def forward(self, x: torch.Tensor, mask=None):
             attn = self.attn_drop_fn(attn)
 
             x = self.apply_attention(attn, v, B_, nW_, N, C)
-        
+
         else:
             if mask is not None:
                 bias = mask.unsqueeze(1).unsqueeze(0) + earth_position_bias.unsqueeze(0).unsqueeze(0)
                 # squeeze the bias if needed in dim 2
                 #bias = bias.squeeze(2)
             else:
                 bias = earth_position_bias.unsqueeze(0)
-            
+
             # extract batch size for q,k,v
             nLon = self.num_lon
-            q = q.view(B_ // nLon, nLon, q.shape[1], q.shape[2], q.shape[3], q.shape[4]) 
+            q = q.view(B_ // nLon, nLon, q.shape[1], q.shape[2], q.shape[3], q.shape[4])
             k = k.view(B_ // nLon, nLon, k.shape[1], k.shape[2], k.shape[3], k.shape[4])
             v = v.view(B_ // nLon, nLon, v.shape[1], v.shape[2], v.shape[3], v.shape[4])
             ####
@@ -736,7 +736,7 @@ class Pangu(nn.Module):
     - https://arxiv.org/abs/2211.02556
     """
 
-    def __init__(self, 
+    def __init__(self,
         inp_shape=(721,1440),
         out_shape=(721,1440),
         grid_in="equiangular",
@@ -773,14 +773,14 @@ def __init__(self,
         self.checkpointing_level = checkpointing_level
 
         drop_path = np.linspace(0, drop_path_rate, 8).tolist()
-        
+
         # Add static channels to surface
         self.num_aux = len(self.aux_channel_names)
         N_total_surface = self.num_aux + self.num_surface
 
         # compute static permutations to extract
         self._precompute_channel_groups(self.channel_names, self.aux_channel_names)
-        
+
         # Patch embeddings are 2D or 3D convolutions, mapping the data to the required patches
         self.patchembed2d = PatchEmbed2D(
             img_size=self.inp_shape,
@@ -791,7 +791,7 @@ def __init__(self,
             flatten=False,
             norm_layer=None,
         )
-        
+
         self.patchembed3d = PatchEmbed3D(
             img_size=(num_levels, self.inp_shape[0], self.inp_shape[1]),
             patch_size=patch_size,
@@ -870,7 +870,7 @@ def __init__(self,
         self.patchrecovery3d = PatchRecovery3D(
             (num_levels, self.inp_shape[0], self.inp_shape[1]), patch_size, 2 * embed_dim, num_atmospheric
         )
-        
+
     def _precompute_channel_groups(
         self,
         channel_names=[],
@@ -901,7 +901,7 @@ def _precompute_channel_groups(
 
     def prepare_input(self, input):
         """
-        Prepares the input tensor for the Pangu model by splitting it into surface * static variables and atmospheric, 
+        Prepares the input tensor for the Pangu model by splitting it into surface * static variables and atmospheric,
         and reshaping the atmospheric variables into the required format.
         """
 
@@ -932,23 +932,23 @@ def prepare_output(self, output_surface, output_atmospheric):
         level_dict = {level: [idx for idx, value in enumerate(self.channel_names) if value[1:] == level] for level in levels}
         reordered_ids = [idx for level in levels for idx in level_dict[level]]
         check_reorder = [f'{level}_{idx}' for level in levels for idx in level_dict[level]]
-    
+
         # Flatten & reorder the output atmospheric to original order (doublechecked that this is working correctly!)
         flattened_atmospheric = output_atmospheric.reshape(output_atmospheric.shape[0], -1, output_atmospheric.shape[3], output_atmospheric.shape[4])
         reordered_atmospheric = torch.cat([torch.zeros_like(output_surface), torch.zeros_like(flattened_atmospheric)], dim=1)
         for i in range(len(reordered_ids)):
             reordered_atmospheric[:, reordered_ids[i], :, :] = flattened_atmospheric[:, i, :, :]
-        
+
         # Append the surface output, this has not been reordered.
         if output_surface is not None:
-            _, surf_chans, _, _ = features.get_channel_groups(self.channel_names, self.aux_channel_names)
+            _, surf_chans, _, _, _ = features.get_channel_groups(self.channel_names, self.aux_channel_names)
             reordered_atmospheric[:, surf_chans, :, :] = output_surface
             output = reordered_atmospheric
         else:
             output = reordered_atmospheric
 
         return output
-      
+
     def forward(self, input):
 
         # Prep the input by splitting into surface and atmospheric variables
@@ -959,7 +959,7 @@ def forward(self, input):
             surface = checkpoint(self.patchembed2d, surface_aux, use_reentrant=False)
             atmospheric = checkpoint(self.patchembed3d, atmospheric, use_reentrant=False)
         else:
-            surface = self.patchembed2d(surface_aux) 
+            surface = self.patchembed2d(surface_aux)
             atmospheric = self.patchembed3d(atmospheric)
 
         if surface.shape[1] == 0:
@@ -1011,11 +1011,5 @@ def forward(self, input):
                 output_atmospheric = self.patchrecovery3d(output_atmospheric)
 
         output = self.prepare_output(output_surface, output_atmospheric)
-        
-        return output
-
-        
-    
 
-    
-    
+        return output
diff --git a/makani/models/networks/pangu_onnx.py b/makani/models/networks/pangu_onnx.py
@@ -38,7 +38,7 @@ class PanguOnnx(OnnxWrapper):
         channel_order_PL: List containing the names of the pressure levels with the ordering that the ONNX model expects
         onnx_file: Path to the ONNX file containing the model
     '''
-    def __init__(self, 
+    def __init__(self,
         channel_names=[],
         aux_channel_names=[],
         onnx_file=None,
@@ -58,7 +58,7 @@ def _precompute_channel_groups(
         group the channels appropriately into atmospheric pressure levels and surface variables
         """
 
-        atmo_chans, surf_chans, _, pressure_lvls = get_channel_groups(channel_names, aux_channel_names)
+        atmo_chans, surf_chans, _, _, pressure_lvls = get_channel_groups(channel_names, aux_channel_names)
 
         # compute how many channel groups will be kept internally
         self.n_atmo_groups = len(pressure_lvls)
@@ -78,12 +78,12 @@ def prepare_input(self, input):
         B,V,Lat,Long=input.shape
 
         if B>1:
-            raise NotImplementedError("Not implemented yet for batch size greater than 1")   
+            raise NotImplementedError("Not implemented yet for batch size greater than 1")
 
         input=input.squeeze(0)
         surface_aux_inp=input[self.surf_channels]
         atmospheric_inp=input[self.atmo_channels].reshape(self.n_atmo_groups,self.n_atmo_chans,Lat,Long).transpose(1,0)
-        
+
         return surface_aux_inp, atmospheric_inp
 
     def prepare_output(self, output_surface, output_atmospheric):
@@ -99,15 +99,15 @@ def prepare_output(self, output_surface, output_atmospheric):
 
         return output.unsqueeze(0)
 
-      
+
     def forward(self, input):
-        
+
         surface, atmospheric = self.prepare_input(input)
 
 
         output,output_surface=self.onnx_session_run({'input':atmospheric,'input_surface':surface})
 
         output = self.prepare_output(output_surface, output)
 
-        
+
         return output
diff --git a/makani/models/stepper.py b/makani/models/stepper.py
@@ -153,11 +153,11 @@ def _forward_eval(self, inp, update_state=True, replace_state=True):
 
         return y
 
-    def forward(self, inp, replace_state=True):
+    def forward(self, inp, update_state=True, replace_state=True):
         # decide which routine to call
         if self.training:
-            y = self._forward_train(inp, update_state=True, replace_state=replace_state)
+            y = self._forward_train(inp, update_state=update_state, replace_state=replace_state)
         else:
-            y = self._forward_eval(inp, update_state=True, replace_state=replace_state)
+            y = self._forward_eval(inp, update_state=update_state, replace_state=replace_state)
 
         return y
diff --git a/makani/utils/driver.py b/makani/utils/driver.py
@@ -632,11 +632,11 @@ def get_optimizer(self, model, params):
         if params.optimizer_type == "Adam":
             if self.log_to_screen:
                 self.logger.info("using Adam optimizer")
-            optimizer = optim.Adam(all_parameters, betas=betas, lr=params.get("lr", 1e-3), weight_decay=params.get("weight_decay", 0), foreach=True)
+            optimizer = optim.Adam(all_parameters, lr=params.get("lr", 1e-3), betas=betas, eps=params.get("optimizer_eps", 1e-8), weight_decay=params.get("weight_decay", 0), foreach=True)
         elif params.optimizer_type == "AdamW":
             if self.log_to_screen:
                 self.logger.info("using AdamW optimizer")
-            optimizer = optim.AdamW(all_parameters, betas=betas, lr=params.get("lr", 1e-3), weight_decay=params.get("weight_decay", 0), foreach=True)
+            optimizer = optim.AdamW(all_parameters, lr=params.get("lr", 1e-3), betas=betas, eps=params.get("optimizer_eps", 1e-8), weight_decay=params.get("weight_decay", 0), foreach=True)
         elif params.optimizer_type == "SGD":
             if self.log_to_screen:
                 self.logger.info("using SGD optimizer")
diff --git a/makani/utils/features.py b/makani/utils/features.py
@@ -97,13 +97,15 @@ def get_wind_channels(channel_names):
 
 def get_channel_groups(channel_names, aux_channel_names=[]):
     """
-    Helper routine to extract indices of atmospheric, surface and auxiliary variables and group them into their respective groups
+    Helper routine to extract indices of atmospheric, surface and auxiliary variables and group them into their respective groups.
+    The resulting numbering does NOT respect history.
     """
 
     atmo_groups = OrderedDict()
     atmo_chans = []
     surf_chans = []
-    aux_chans = []
+    dyn_aux_chans = []
+    stat_aux_chans = []
 
     # parse channel names and group variables by pressure level/surface variables
     for idx, chn in enumerate(channel_names):
@@ -127,6 +129,10 @@ def get_channel_groups(channel_names, aux_channel_names=[]):
         atmo_chans += idx
 
     # append the auxiliary variable to the surface channels
-    aux_chans = [idx + len(channel_names) for idx in range(len(aux_channel_names))]
+    for idx, chn in enumerate(aux_channel_names):
+        if chn in ["xoro", "xlsml", "xlsms"]:
+            stat_aux_chans.append(idx + len(channel_names))
+        else:
+            dyn_aux_chans.append(idx + len(channel_names))
 
-    return atmo_chans, surf_chans, aux_chans, atmo_groups.keys()
+    return atmo_chans, surf_chans, dyn_aux_chans, stat_aux_chans, atmo_groups.keys()
diff --git a/makani/utils/loss.py b/makani/utils/loss.py
@@ -32,7 +32,7 @@
 from torch_harmonics.quadrature import clenshaw_curtiss_weights, legendre_gauss_weights
 
 from .losses import LossType, GeometricLpLoss, SpectralH1Loss, SpectralAMSELoss
-from .losses import EnsembleCRPSLoss, EnsembleSpectralCRPSLoss, EnsembleVortDivCRPSLoss
+from .losses import EnsembleCRPSLoss, EnsembleSpectralCRPSLoss, EnsembleVortDivCRPSLoss, EnergyScoreLoss
 from .losses import EnsembleNLLLoss, EnsembleMMDLoss
 from .losses import DriftRegularization, HydrostaticBalanceLoss
 
@@ -119,8 +119,6 @@ def __init__(self, params, track_running_stats: bool = False, seed: int = 0, eps
             )
 
             # append to dict and compile before:
-            # TODO: fix the compile issue
-            # self.loss_fn[loss_type] = torch.compile(loss_fn)
             self.loss_fn.append(loss_fn)
 
             # determine channel weighting
@@ -140,7 +138,8 @@ def __init__(self, params, track_running_stats: bool = False, seed: int = 0, eps
             # get channel weights either directly or through the compute routine
             if isinstance(channel_weight_type, List):
                 chw = torch.tensor(channel_weight_type, dtype=torch.float32)
-                chw = chw * time_diff_scale
+                if time_diff_scale is not None:
+                    chw = chw * time_diff_scale
                 assert chw.shape[1] == loss_fn.n_channels
             else:
                 chw = loss_fn.compute_channel_weighting(channel_weight_type, time_diff_scale=time_diff_scale)
@@ -228,6 +227,8 @@ def _parse_loss_type(self, loss_type: str):
             loss_handle = EnsembleNLLLoss
         elif "ensemble_mmd" in loss_type:
             loss_handle = EnsembleMMDLoss
+        elif "energy_score" in loss_type:
+            loss_handle = partial(EnergyScoreLoss)
         elif "drift_regularization" in loss_type:
             loss_handle = DriftRegularization
         else:
@@ -333,19 +334,23 @@ def forward(self, prd: torch.Tensor, tar: torch.Tensor, wgt: Optional[torch.Tens
                 loss_vals.append(lfn(prd, tar, wgt))
         all_losses = torch.cat(loss_vals, dim=-1)
 
+        # print(all_losses)
+
         if self.training and self.track_running_stats:
             self._update_running_stats(all_losses.clone())
 
         # process channel weights
         chw = self.channel_weights
         if self.uncertainty_weighting and self.training:
             var, _ = self.get_running_stats()
+            if self.num_batches_tracked.item() <= 100:
+                var = torch.ones_like(var)
             chw = chw / (torch.sqrt(2 * var) + self.eps)
         elif self.balanced_weighting and self.training:
             _, mean = self.get_running_stats()
             if self.num_batches_tracked.item() <= 100:
                 mean = torch.ones_like(mean)
-            chw = chw / mean
+            chw = chw / (mean + self.eps)
 
         if self.randomized_loss_weights:
             rmask = torch.zeros_like(chw)
diff --git a/makani/utils/losses/__init__.py b/makani/utils/losses/__init__.py
@@ -19,6 +19,7 @@
 from .amse_loss import SpectralAMSELoss
 from .hydrostatic_loss import HydrostaticBalanceLoss
 from .crps_loss import EnsembleCRPSLoss, EnsembleSpectralCRPSLoss, EnsembleVortDivCRPSLoss
+from .crps_loss import EnergyScoreLoss
 from .mmd_loss import EnsembleMMDLoss
 from .likelihood_loss import EnsembleNLLLoss
 from .drift_regularization import DriftRegularization
diff --git a/makani/utils/losses/base_loss.py b/makani/utils/losses/base_loss.py
@@ -45,7 +45,7 @@ def _compute_channel_weighting_helper(channel_names: List[str], channel_weight_t
     elif channel_weight_type == "auto":
 
         for c, chn in enumerate(channel_names):
-            if chn in ["u10m", "v10m", "u100m", "v100m", "tp", "sp", "msl", "tcwv"]:
+            if chn in ["u10m", "v10m", "u100m", "v100m", "tp", "sp", "msl", "tcwv", "sst"]:
                 channel_weights[c] = 0.1
             elif chn in ["t2m", "2d"]:
                 channel_weights[c] = 1.0
@@ -58,7 +58,7 @@ def _compute_channel_weighting_helper(channel_names: List[str], channel_weight_t
     elif channel_weight_type == "new auto":
 
         for c, chn in enumerate(channel_names):
-            if chn in ["u10m", "v10m", "u100m", "v100m", "tp", "sp", "msl", "tcwv"]:
+            if chn in ["u10m", "v10m", "u100m", "v100m", "tp", "sp", "msl", "tcwv", "sst"]:
                 channel_weights[c] = 0.1
             elif chn in ["t2m", "2d"]:
                 channel_weights[c] = 2.0
@@ -71,7 +71,7 @@ def _compute_channel_weighting_helper(channel_names: List[str], channel_weight_t
     elif channel_weight_type == "new auto 2":
 
         for c, chn in enumerate(channel_names):
-            if chn in ["u10m", "v10m", "u100m", "v100m", "tp", "sp", "msl", "tcwv"]:
+            if chn in ["u10m", "v10m", "u100m", "v100m", "tp", "sp", "msl", "tcwv", "sst"]:
                 channel_weights[c] = 0.1
             elif chn in ["t2m", "2d"]:
                 channel_weights[c] = 2.0
diff --git a/makani/utils/losses/crps_loss.py b/makani/utils/losses/crps_loss.py
diff --git a/makani/utils/metric.py b/makani/utils/metric.py
diff --git a/makani/utils/training/ensemble_trainer.py b/makani/utils/training/ensemble_trainer.py