Commit 2831c3a

update blockedfp8 scale name (#3532)
* update blockedfp8 scale name
* fix lint
* fix deepseek _load_weight_attention
1 parent 421b113 commit 2831c3a

6 files changed: +44 −54

lmdeploy/pytorch/models/deepseek_v2.py  (+4 −7)

@@ -1273,8 +1273,8 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor):
             if name.endswith('.weight'):
                 weight_name = name
                 scale_name = name.replace('.weight', '.scale')
-            elif name.endswith('.scale'):
-                weight_name = name.replace('.scale', '.weight')
+            elif name.endswith('.weight_scale_inv'):
+                weight_name = name.replace('.weight_scale_inv', '.weight')
                 scale_name = name
             self._load_buffers[name] = loaded_weight
             if (weight_name in self._load_buffers and scale_name in self._load_buffers):
@@ -1288,7 +1288,7 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor):
         for (mod_name, head_dim, pe_dim_offset) in update_pe_mapping:
             if mod_name not in name:
                 continue
-            if name.endswith('.scale'):
+            if name.endswith('.weight_scale_inv'):
                 weight = loaded_weight
             else:
                 loaded_weight = loaded_weight.to(device)
@@ -1328,8 +1328,6 @@ def __skip_nextn(name, nextn_keys):
             ('.gate_up_proj', '.up_proj', 1),
         ]

-        scale_suffix = '.weight_scale_inv'
-
         config = self.config

         update_pe_mapping = []
@@ -1375,8 +1373,7 @@ def __skip_nextn(name, nextn_keys):
                 continue
             if self.config.tie_word_embeddings and 'lm_head.weight' in name:
                 continue
-            if name.endswith(scale_suffix):
-                name = name[:-len(scale_suffix)] + '.scale'
+
             if '.experts' in name:
                 self._load_weight_experts(name, loaded_weight, params_dict, expert_params_mapping=expert_params_mapping)
             elif '.self_attn' in name and getattr(config, 'use_mla', True):

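The first hunk above buffers each blocked-FP8 weight until its matching scale tensor arrives (and vice versa), keyed purely by the `.weight` / `.weight_scale_inv` suffixes. Below is a minimal standalone sketch of that pairing pattern; `pair_blocked_fp8` and the toy tensor names are illustrative, not part of lmdeploy.

import torch

def pair_blocked_fp8(weights):
    """Yield (weight_name, weight, scale) once both halves of a blocked-FP8
    pair ('.weight' and '.weight_scale_inv') have been seen, in any order."""
    buffers = {}
    for name, tensor in weights:
        if name.endswith('.weight'):
            weight_name = name
            scale_name = name.replace('.weight', '.weight_scale_inv')
        elif name.endswith('.weight_scale_inv'):
            weight_name = name.replace('.weight_scale_inv', '.weight')
            scale_name = name
        else:
            continue
        buffers[name] = tensor
        if weight_name in buffers and scale_name in buffers:
            yield weight_name, buffers.pop(weight_name), buffers.pop(scale_name)

# Toy usage: the scale arrives before its weight, as checkpoint iteration may do.
ckpt = [
    ('layers.0.self_attn.kv_b_proj.weight_scale_inv', torch.ones(4, 4)),
    ('layers.0.self_attn.kv_b_proj.weight', torch.zeros(512, 512)),
]
for wname, w, s in pair_blocked_fp8(ckpt):
    print(wname, tuple(w.shape), tuple(s.shape))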
lmdeploy/pytorch/models/internlm3.py  (−3)

@@ -403,7 +403,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             ('.gate_up_proj', '.up_proj', 1),
         ]

-        scale_suffix = '.weight_scale_inv'
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if 'rotary_emb.inv_freq' in name:
@@ -412,8 +411,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 continue
             if self.config.tie_word_embeddings and 'lm_head.weight' in name:
                 continue
-            if name.endswith(scale_suffix):
-                name = name[:-len(scale_suffix)] + '.scale'

             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:

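The lines removed here were a shim that rewrote checkpoint keys ending in `.weight_scale_inv` to the module's former `.scale` parameter name. With the parameter now registered as `weight_scale_inv` (see `lmdeploy/pytorch/nn/linear.py` below), checkpoint keys match `named_parameters()` directly. A toy sketch of that direct match; `BlockedFP8Linear` is a stand-in module, not lmdeploy's class, and real blocked-FP8 weights would use an fp8 dtype rather than the float32 used here to stay runnable anywhere.

import torch

class BlockedFP8Linear(torch.nn.Module):
    """Stand-in module whose parameter names mirror blocked-FP8 checkpoint keys."""

    def __init__(self, out_features=256, in_features=256, block_size=128):
        super().__init__()
        # float32 placeholders; the layout is what matters for name matching
        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features), requires_grad=False)
        self.weight_scale_inv = torch.nn.Parameter(
            torch.empty(out_features // block_size, in_features // block_size), requires_grad=False)

mod = BlockedFP8Linear()
params_dict = dict(mod.named_parameters())

# Blocked-FP8 checkpoints name their tensors '<prefix>.weight' and
# '<prefix>.weight_scale_inv'; both now hit params_dict without any rename.
for key in ('weight', 'weight_scale_inv'):
    assert key in params_dict
print(sorted(params_dict))  # ['weight', 'weight_scale_inv']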
lmdeploy/pytorch/models/qwen3.py  (−3)

@@ -403,7 +403,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             ('.gate_up_proj', '.up_proj', 1),
         ]

-        scale_suffix = '.weight_scale_inv'
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if 'rotary_emb.inv_freq' in name:
@@ -412,8 +411,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 continue
             if self.config.tie_word_embeddings and 'lm_head.weight' in name:
                 continue
-            if name.endswith(scale_suffix):
-                name = name[:-len(scale_suffix)] + '.scale'

             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:

lmdeploy/pytorch/models/qwen3_moe.py  (−3)

@@ -495,7 +495,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             down_param = ('.experts.down', f'.experts.{exp_id}.down_proj', exp_id, 'down')
             expert_params_mapping += [gate_param, up_param, down_param]

-        scale_suffix = '.weight_scale_inv'
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if 'rotary_emb.inv_freq' in name:
@@ -504,8 +503,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 continue
             if self.config.tie_word_embeddings and 'lm_head.weight' in name:
                 continue
-            if name.endswith(scale_suffix):
-                name = name[:-len(scale_suffix)] + '.scale'

             if '.experts' in name:
                 self._load_weight_experts(name, loaded_weight, params_dict, expert_params_mapping=expert_params_mapping)

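The context above builds `expert_params_mapping` tuples such as `('.experts.down', f'.experts.{exp_id}.down_proj', exp_id, 'down')`. Below is a hedged sketch of how such tuples can route per-expert checkpoint keys onto a fused parameter key; `route_expert_key` and the gate/up entry names are assumptions for illustration, not lmdeploy's `_load_weight_experts`.

def route_expert_key(name, expert_params_mapping):
    """Map a per-expert checkpoint key to (fused_param_key, expert_id, shard_id),
    or return None when the key is not an expert weight."""
    for fused_name, ckpt_name, expert_id, shard_id in expert_params_mapping:
        if ckpt_name in name:
            return name.replace(ckpt_name, fused_name), expert_id, shard_id
    return None

num_experts = 2
expert_params_mapping = []
for exp_id in range(num_experts):
    expert_params_mapping += [
        ('.experts.gate_up', f'.experts.{exp_id}.gate_proj', exp_id, 'gate'),
        ('.experts.gate_up', f'.experts.{exp_id}.up_proj', exp_id, 'up'),
        ('.experts.down', f'.experts.{exp_id}.down_proj', exp_id, 'down'),
    ]

print(route_expert_key('model.layers.0.mlp.experts.1.down_proj.weight', expert_params_mapping))
# ('model.layers.0.mlp.experts.down.weight', 1, 'down')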
lmdeploy/pytorch/nn/linear.py  (+18 −18)

@@ -241,16 +241,16 @@ def __init__(
         self.impl = impl_builder.build(in_features, out_features, block_size=128, bias=bias is not None, dtype=dtype)
         self.block_size = 128
         self.fp8_dtype = fp8_dtype
-        weight, scale, bias = self.create_weights(in_features, out_features, bias, dtype, device)
+        weight, weight_scale_inv, bias = self.create_weights(in_features, out_features, bias, dtype, device)
         weight = torch.nn.Parameter(weight, requires_grad=False)
         weight.weight_loader = self.weight_loader
-        scale = torch.nn.Parameter(scale, requires_grad=False)
-        scale.weight_loader = self.weight_loader
+        weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False)
+        weight_scale_inv.weight_loader = self.weight_loader
         if bias is not None:
             bias = torch.nn.Parameter(bias, requires_grad=False)
             bias.weight_loader = self.weight_loader
         self.register_parameter('weight', weight)
-        self.register_parameter('scale', scale)
+        self.register_parameter('weight_scale_inv', weight_scale_inv)
         self.register_parameter('bias', bias)

         self.in_features = in_features
@@ -302,27 +302,27 @@ def weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor):
     def create_weights(self, in_features: int, out_features: int, bias: bool, dtype: torch.dtype, device: torch.device):
         """create weights."""
         weight = torch.empty((out_features, in_features), dtype=self.fp8_dtype, device=device)
-        scale = torch.empty((div_up(out_features, self.block_size), div_up(in_features, self.block_size)),
-                            dtype=torch.float32,
-                            device=device)
+        weight_scale_inv = torch.empty((div_up(out_features, self.block_size), div_up(in_features, self.block_size)),
+                                       dtype=torch.float32,
+                                       device=device)
         if bias:
             bias = torch.empty((out_features, ), dtype=dtype, device=device)
         else:
             bias = None
-        return weight, scale, bias
+        return weight, weight_scale_inv, bias

     def update_weights(self):
         """update weights."""
-        weight, scale, bias = self.impl.update_weights(self.weight, self.scale, self.bias)
+        weight, weight_scale_inv, bias = self.impl.update_weights(self.weight, self.weight_scale_inv, self.bias)
         weight = torch.nn.Parameter(weight, requires_grad=False)
         self.weight.weight_loader = self.weight_loader
-        scale = torch.nn.Parameter(scale, requires_grad=False)
-        self.scale.weight_loader = self.weight_loader
+        weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False)
+        self.weight_scale_inv.weight_loader = self.weight_loader
         if bias is not None:
             bias = torch.nn.Parameter(bias, requires_grad=False)
             self.bias.weight_loader = self.weight_loader
         self.register_parameter('weight', weight)
-        self.register_parameter('scale', scale)
+        self.register_parameter('weight_scale_inv', weight_scale_inv)
         self.register_parameter('bias', bias)

     def forward(self, x):
@@ -340,11 +340,11 @@ def forward(self, x):
         if len(self.lora_adapters) == 0:
             if self.dp_scatter:
                 _, rank = get_tp_world_rank()
-                return self.impl.forward(x, self.weight, self.scale, self.bias, all_reduce, rank, tp_sizes)
+                return self.impl.forward(x, self.weight, self.weight_scale_inv, self.bias, all_reduce, rank, tp_sizes)
             else:
-                return self.impl.forward(x, self.weight, self.scale, self.bias, all_reduce)
+                return self.impl.forward(x, self.weight, self.weight_scale_inv, self.bias, all_reduce)

-        out = self.impl.forward(x, self.weight, self.scale, self.bias, False)
+        out = self.impl.forward(x, self.weight, self.weight_scale_inv, self.bias, False)
         for lora_adapter in self.lora_adapters.values():
             out = lora_adapter(x, out)
         if all_reduce:
@@ -394,10 +394,10 @@ def __init__(self,
                          dp_gather=dp_gather)
         self.weight.weight_loader = self.weight_loader
         self.weight._weight_type = 'qweight'
-        self.scale.weight_loader = self.weight_loader
-        self.scale._weight_type = 'scales'
+        self.weight_scale_inv.weight_loader = self.weight_loader
+        self.weight_scale_inv._weight_type = 'scales'
         self.weight.weight_spliter = self.weight_spliter
-        self.scale.weight_spliter = self.weight_spliter
+        self.weight_scale_inv.weight_spliter = self.weight_spliter
         if self.bias is not None:
             self.bias.weight_loader = self.weight_loader
             self.bias.weight_spliter = self.weight_spliter

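The renamed `weight_scale_inv` in `create_weights` keeps the same blocked layout: one float32 scale per block_size x block_size tile, shape `(div_up(out_features, 128), div_up(in_features, 128))`. Below is a reference sketch of that shape and of a plain-PyTorch blocked dequantization, assuming the DeepSeek-style convention that the stored tensor is the dequantization multiplier; the sizes are examples and `self.impl`'s kernels are not involved.

import torch

def div_up(a: int, b: int) -> int:
    """Ceiling division, as used for the blocked scale shape."""
    return (a + b - 1) // b

out_features, in_features, block_size = 4096, 11008, 128   # example sizes only
scale_shape = (div_up(out_features, block_size), div_up(in_features, block_size))
print(scale_shape)  # (32, 86)

# Reference dequantization: broadcast each tile's scale over its 128x128 block.
weight_q = torch.randn(out_features, in_features)           # stand-in for the fp8 payload
weight_scale_inv = torch.rand(*scale_shape, dtype=torch.float32) + 0.5
scales = weight_scale_inv.repeat_interleave(block_size, dim=0)[:out_features]
scales = scales.repeat_interleave(block_size, dim=1)[:, :in_features]
weight_fp32 = weight_q * scales
print(weight_fp32.shape)  # torch.Size([4096, 11008])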
lmdeploy/pytorch/nn/moe.py  (+22 −20)

@@ -421,25 +421,25 @@ def __init__(self,
                          ep=ep,
                          )
         self.block_size = block_size
-        scale = torch.empty((num_experts, div_up(out_features, block_size), div_up(in_features, block_size)),
-                            dtype=torch.float32,
-                            device=device)
-        scale = torch.nn.Parameter(scale, requires_grad=False)
-        self.register_parameter('scale', scale)
+        weight_scale_inv = torch.empty((num_experts, div_up(out_features, block_size), div_up(in_features, block_size)),
+                                       dtype=torch.float32,
+                                       device=device)
+        weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False)
+        self.register_parameter('weight_scale_inv', weight_scale_inv)

         if self.ep:
             self.expert_map = dict((eid, idx) for idx, eid in enumerate(expert_list))
-            self.scale.weight_loader = self.weight_loader_scale_ep
+            self.weight_scale_inv.weight_loader = self.weight_loader_scale_ep
         else:
-            self.scale.weight_loader = self.weight_loader_scale_tp
+            self.weight_scale_inv.weight_loader = self.weight_loader_scale_tp

-    def update_weight(self, weight: torch.Tensor, scale: torch.Tensor):
+    def update_weight(self, weight: torch.Tensor, weight_scale_inv: torch.Tensor):
         """update weight."""
         super().update_weight(weight=weight)
-        weight_loader = self.scale.weight_loader
-        scale = torch.nn.Parameter(scale, requires_grad=False)
-        scale.weight_loader = weight_loader
-        self.register_parameter('scale', scale)
+        weight_loader = self.weight_scale_inv.weight_loader
+        weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False)
+        weight_scale_inv.weight_loader = weight_loader
+        self.register_parameter('weight_scale_inv', weight_scale_inv)

     def weight_loader_scale_ep(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int,
                                shard_id: str):
@@ -545,8 +545,8 @@ def __init__(self,
     def update_weights(self):
         """update weights."""
         (gate_up_weights, down_weights, gate_up_scale,
-         down_scale) = self.impl.update_weights(self.gate_up.weight, self.down.weight, self.gate_up.scale,
-                                                self.down.scale)
+         down_scale) = self.impl.update_weights(self.gate_up.weight, self.down.weight, self.gate_up.weight_scale_inv,
+                                                self.down.weight_scale_inv)
         self.gate_up.update_weight(gate_up_weights, gate_up_scale)
         self.down.update_weight(down_weights, down_scale)

@@ -628,8 +628,9 @@ def gemm(self, state: Dict):
         if moe_type == MoeType.DSAsyncPrefill:
             if state['recv_hidden_states'].shape[0] > 0:
                 state['recv_hidden_states'] = state['fusedmoe'].fusedmoe_forward(state, self.gate_up.weight,
-                                                                                 self.gate_up.scale, self.down.weight,
-                                                                                 self.down.scale)
+                                                                                 self.gate_up.weight_scale_inv,
+                                                                                 self.down.weight,
+                                                                                 self.down.weight_scale_inv)
             gemm_state = {
                 'fusedmoe': state['fusedmoe'],
                 'hidden_states': state['recv_hidden_states'],
@@ -638,8 +639,9 @@ def gemm(self, state: Dict):
             }
         elif moe_type == MoeType.DSAsyncDecode:
             state['recv_hidden_states'] = state['fusedmoe'].fusedmoe_forward(state, self.gate_up.weight,
-                                                                             self.gate_up.scale, self.down.weight,
-                                                                             self.down.scale)
+                                                                             self.gate_up.weight_scale_inv,
+                                                                             self.down.weight,
+                                                                             self.down.weight_scale_inv)
             gemm_state = {
                 'fusedmoe': state['fusedmoe'],
                 'hidden_states': state['recv_hidden_states'],
@@ -650,8 +652,8 @@ def gemm(self, state: Dict):
             }
         else:  # MoeType.Default
             hidden_states = self.impl.forward(state['hidden_states'], state['topk_weights'], state['topk_idx'],
-                                              self.gate_up.weight, self.gate_up.scale, self.down.weight,
-                                              self.down.scale, self.expert_list)
+                                              self.gate_up.weight, self.gate_up.weight_scale_inv, self.down.weight,
+                                              self.down.weight_scale_inv, self.expert_list)
             gemm_state = {'hidden_states': hidden_states, 'moe_type': state['moe_type']}
         return gemm_state

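`update_weight` above wraps the new scale in a fresh `torch.nn.Parameter`, carries over the previous `weight_loader` hook, and re-registers it under the same `weight_scale_inv` name. Below is a minimal sketch of that re-registration pattern on a toy module; `ToyMoeScale` and its loader hook are illustrative only, not lmdeploy's classes.

import torch

class ToyMoeScale(torch.nn.Module):
    """Toy module that registers a blocked scale the way the diff above does."""

    def __init__(self):
        super().__init__()
        p = torch.nn.Parameter(torch.zeros(2, 2), requires_grad=False)
        p.weight_loader = lambda param, loaded: param.data.copy_(loaded)  # illustrative hook
        self.register_parameter('weight_scale_inv', p)

    def update_weight(self, new_scale: torch.Tensor):
        """Re-register the scale under the same name, keeping the loader hook."""
        weight_loader = self.weight_scale_inv.weight_loader
        weight_scale_inv = torch.nn.Parameter(new_scale, requires_grad=False)
        weight_scale_inv.weight_loader = weight_loader
        self.register_parameter('weight_scale_inv', weight_scale_inv)

mod = ToyMoeScale()
mod.update_weight(torch.ones(2, 2))
mod.weight_scale_inv.weight_loader(mod.weight_scale_inv, torch.full((2, 2), 3.0))
print(mod.weight_scale_inv.sum())  # tensor(12.)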