@@ -13,6 +13,7 @@
 from xtuner.v1.model.base import BaseModel as XTunerBaseModel
 from xtuner.v1.model.base import ModelItem
 from xtuner.v1.module import LMHead, MHAConfig, MLAConfig, MultiHeadAttention, MultiLatentAttention
+from xtuner.v1.module.attention.gated_deltanet import FusedRMSNormGated
 from xtuner.v1.module.decoder_layer.dense_decoder_layer import DenseDecoderLayer
 from xtuner.v1.module.decoder_layer.moe_decoder_layer import MoEDecoderLayer
 from xtuner.v1.utils.device import get_device
@@ -37,6 +38,8 @@

 class InternalMetrics(TypedDict, total=False):
     weight_rms: dict[str, float]
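+    # global (cross-rank) extrema of monitored weights, keyed by module name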
+    weight_min: dict[str, float]
+    weight_max: dict[str, float]
     maxvio: dict[str, float]
     drop_ratio: dict[str, float]
     router_logits_max: dict[str, float]
@@ -90,6 +93,8 @@ def _init_metrics_dict(self) -> InternalMetrics:

         if self.internal_metrics_cfg.monitor_weights_rms_norm:
             metrics["weight_rms"] = {}
+            metrics["weight_min"] = {}
+            metrics["weight_max"] = {}

         if self.internal_metrics_cfg.monitor_attn_logits_stats:
             attn_cfg: MHAConfig | MLAConfig = self.model.config.attention  # type: ignore[attr-defined]
@@ -153,6 +158,44 @@ def calculate_module_weight_rms(self, module: nn.Module, layer_name: str, dtype:

         self.metrics["weight_rms"][layer_name] = param_rms.item()

+    @torch.no_grad()
+    def calculate_module_weight_min_max(self, module: nn.Module | nn.Parameter | torch.Tensor, layer_name: str):
+        """Record the global min and max of the module's (or parameter's) values."""
+        self._check_closed()
+
+        if "weight_min" not in self.metrics or "weight_max" not in self.metrics:
+            return
+
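+        # Accepts either a whole nn.Module (aggregated over its trainable
+        # parameters) or a single nn.Parameter / tensor (e.g. a raw A_log).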
+        if isinstance(module, nn.Module):
+            all_params = [param.data for param in module.parameters() if param.requires_grad]
+        else:
+            all_params = [module.data]
+
+        if not all_params:
+            return
+
+        # Handle DTensor - convert to local tensors
+        from torch.distributed.tensor import DTensor
+
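+        # Per-shard extrema followed by a MIN/MAX all-reduce match the extrema
+        # of the full tensor, for both sharded and replicated placements.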
+        local_params = []
+        for param in all_params:
+            if isinstance(param, DTensor):
+                local_params.append(param.to_local())
+            else:
+                local_params.append(param)
+
+        # Calculate local min/max
+        local_min = torch.min(torch.stack([p.min() for p in local_params]))
+        local_max = torch.max(torch.stack([p.max() for p in local_params]))
+
+        # All-reduce across ranks
+        if dist.is_initialized() and dist.get_world_size() > 1:
+            dist.all_reduce(local_min, op=dist.ReduceOp.MIN)
+            dist.all_reduce(local_max, op=dist.ReduceOp.MAX)
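+        # After the reduction, local_min/local_max hold the global extrema on every rank.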
+
+        self.metrics["weight_min"][layer_name] = local_min.item()
+        self.metrics["weight_max"][layer_name] = local_max.item()
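+        # e.g. self.metrics["weight_min"]["layers.0.norm"] -> float (key name illustrative)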
+
     def register_attn_output_hook(self, module: nn.Module):
         """Register attention output hook as a forward hook."""
         self._check_closed()
@@ -179,6 +222,14 @@ def pop_metrics(self, data_batches: list[ModelItem]):
             if self.internal_metrics_cfg.monitor_weights_rms_norm and isinstance(module, RMS_NORM_MONITOR_MODULES):
                 self.calculate_module_weight_rms(module, self._clean_module_name(name), dtype=torch.float32)

+            if self.internal_metrics_cfg.monitor_weights_rms_norm and isinstance(module, FusedRMSNormGated):
+                self.calculate_module_weight_min_max(module, self._clean_module_name(name))
+
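+            # A_log is typically the learnable log-decay parameter of linear-attention
+            # layers (e.g. GatedDeltaNet); min/max surfaces outliers an RMS average masks.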
+            if self.internal_metrics_cfg.monitor_weights_rms_norm and hasattr(module, "A_log"):
+                self.calculate_module_weight_min_max(module.A_log, f"{self._clean_module_name(name)}.A_log")
+
         additional_kwargs = {}
         if self.internal_metrics_cfg.monitor_moe_router_logits_stats and isinstance(self.model, MoE):
             # for MoE model, add additional kwargs to return necessary stats