[Fix][Temp] Avoid recompiling on globals and closure variables

nil0x9 · nil0x9 · commit 39f7de0048a6 · 2025-11-19T19:18:44.000+08:00
diff --git a/xtuner/v1/module/decoder_layer/dense_decoder_layer.py b/xtuner/v1/module/decoder_layer/dense_decoder_layer.py
@@ -62,6 +62,7 @@ def __init__(
             generate_config=generate_config,
             float8_cfg=float8_cfg,
         )
+        self.self_attn.name = f"layers.{layer_idx}.self_attn"
         self.mlp = DenseMLP(
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
diff --git a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
@@ -214,6 +214,7 @@ def __init__(
             layer_type=layer_type,
             float8_cfg=float8_cfg,
         )
+        self.self_attn.name = f"layers.{layer_idx}.self_attn"
         self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.shared_experts: MoEMLP | None
         self.layer_idx = layer_idx
diff --git a/xtuner/v1/train/trainer.py b/xtuner/v1/train/trainer.py
@@ -620,7 +620,10 @@ def _maybe_check_model_internal_metrics(self, data_batches: list[ModelItem]) ->
             return None
 
         with InternalMetricsRecorder(self._engine) as metrics_recorder:
-            return metrics_recorder.get_metrics(data_batches)
+            logger.info("Start calculating model internal metrics...")
+            metrics: InternalMetrics = metrics_recorder.get_metrics(data_batches)
+            logger.info("Calculating model internal metrics done.")
+            return metrics
 
     @property
     def world_size(self) -> int:
diff --git a/xtuner/v1/utils/compile.py b/xtuner/v1/utils/compile.py
@@ -70,7 +70,11 @@ def decorator(func):
                         func_compile_kwargs = {**compile_kwargs, **target_kwargs}
 
                 # Compile the function
-                self._compiled_funcs[func_id] = torch.compile(original_func, **func_compile_kwargs)
+                self._compiled_funcs[func_id] = torch.compile(
+                    original_func,
+                    options={"guard_filter_fn": torch.compiler.skip_guard_on_globals_unsafe},
+                    **func_compile_kwargs,
+                )
 
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
diff --git a/xtuner/v1/utils/internal_metrics.py b/xtuner/v1/utils/internal_metrics.py
@@ -70,24 +70,24 @@ def calculate_module_weight_rms(self, module: nn.Module, layer_name: str, dtype:
         param_rms = param_l2_norm / total_numel**0.5
         self.metrics["weight_rms"][layer_name] = param_rms.item()
 
-    def register_attn_extra_info_hook(self, module: nn.Module, layer_name: str):
+    def register_attn_extra_info_hook(self, module: nn.Module):
         """Register attention extra info hook as a forward hook"""
         def hook(module, input, output):
             extra_info = output[1]
             if extra_info.get("softmax_lse", None) is not None:
-                if layer_name not in ATTN_MAX_LSE:
+                if module.name not in ATTN_MAX_LSE:
                     # original shape: [n_head, seq]
-                    ATTN_MAX_LSE[layer_name] = extra_info["softmax_lse"].max()
+                    ATTN_MAX_LSE[module.name] = extra_info["softmax_lse"].max()
                 else:
-                    prev_lse_max = ATTN_MAX_LSE[layer_name]
-                    ATTN_MAX_LSE[layer_name] = max(prev_lse_max, extra_info["softmax_lse"].max())
+                    prev_lse_max = ATTN_MAX_LSE[module.name]
+                    ATTN_MAX_LSE[module.name] = max(prev_lse_max, extra_info["softmax_lse"].max())
             if extra_info.get("attn_logits", None) is not None:
-                if layer_name not in ATTN_MAX_LOGITS:
+                if module.name not in ATTN_MAX_LOGITS:
                     # original shape: [b, n_head, seq, seq]
-                    ATTN_MAX_LOGITS[layer_name] = extra_info["attn_logits"].max()
+                    ATTN_MAX_LOGITS[module.name] = extra_info["attn_logits"].max()
                 else:
-                    prev_logits_max = ATTN_MAX_LOGITS[layer_name]
-                    ATTN_MAX_LOGITS[layer_name] = max(prev_logits_max, extra_info["attn_logits"].max())
+                    prev_logits_max = ATTN_MAX_LOGITS[module.name]
+                    ATTN_MAX_LOGITS[module.name] = max(prev_logits_max, extra_info["attn_logits"].max())
 
         hook_handle: RemovableHandle = module.register_forward_hook(hook)
         self.hooks.append(hook_handle)
@@ -187,7 +187,7 @@ def get_metrics(self, data_batches: list[ModelItem]):
     def __enter__(self):
         for name, module in self.model.named_modules():
             if isinstance(module, ATTENTION_CLS):
-                self.register_attn_extra_info_hook(module, self._clean_module_name(name))
+                self.register_attn_extra_info_hook(module)
             if isinstance(module, RMS_NORM_MONITOR_MODULES):
                 self.calculate_module_weight_rms(module, self._clean_module_name(name), dtype=torch.float32)
         return self

Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,7 @@ def __init__(`
`62`	`62`	`generate_config=generate_config,`
`63`	`63`	`float8_cfg=float8_cfg,`
`64`	`64`	`)`
	`65`	`+ self.self_attn.name = f"layers.{layer_idx}.self_attn"`
`65`	`66`	`self.mlp = DenseMLP(`
`66`	`67`	`hidden_size=hidden_size,`
`67`	`68`	`intermediate_size=intermediate_size,`
Original file line number	Diff line number	Diff line change
`@@ -214,6 +214,7 @@ def __init__(`
`214`	`214`	`layer_type=layer_type,`
`215`	`215`	`float8_cfg=float8_cfg,`
`216`	`216`	`)`
	`217`	`+ self.self_attn.name = f"layers.{layer_idx}.self_attn"`
`217`	`218`	`self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)`
`218`	`219`	`self.shared_experts: MoEMLP \| None`
`219`	`220`	`self.layer_idx = layer_idx`