@@ -8590,12 +8590,12 @@ def patched_qwen3_5_moe_sparse_moe_block(self, hidden_states: torch.Tensor) -> t
85908590 hidden_expanded = hidden_states_reshaped .unsqueeze (0 ).expand (num_experts , - 1 , - 1 )
85918591
85928592 # Vectorized expert computation using pre-transposed weights
8593- gate_up = torch .bmm (hidden_expanded , self ._gate_up_projs_t )
8593+ gate_up = torch .bmm (hidden_expanded , self ._gate_up_projs_t . to ( hidden_expanded . dtype ) )
85948594 intermediate_size = self .experts .intermediate_dim
85958595 gate = gate_up [:, :, :intermediate_size ]
85968596 up = gate_up [:, :, intermediate_size :]
85978597 activated = self .experts .act_fn (gate ) * up
8598- next_states = torch .bmm (activated , self ._down_projs_t )
8598+ next_states = torch .bmm (activated , self ._down_projs_t . to ( activated . dtype ) )
85998599
86008600 # Weight by routing and sum over experts
86018601 next_states = next_states * new_routing_weights .T .unsqueeze (- 1 )
@@ -8914,6 +8914,26 @@ def __enter__(self):
89148914 patched_qwen3_5_moe_sparse_moe_block , sparse_moe_block
89158915 )
89168916
def post_make_16bit_traceable(self):
    """Release redundant fp32 expert weights left behind by __make_16bit_traceable.

    __make_16bit_traceable invokes ``module.float()`` on Qwen3_5MoeExperts
    modules, which materializes fp32 copies of ``gate_up_proj`` and
    ``down_proj``. The patcher already captured bf16 views of these weights
    (``_gate_up_projs_t`` / ``_down_projs_t``) for the patched forward, so the
    fp32 duplicates are dead weight — drop them to keep memory in check.
    """
    import gc

    import torch
    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock

    moe_mlps = (
        layer.mlp
        for layer in self._model.model.layers
        if isinstance(layer.mlp, Qwen3_5MoeSparseMoeBlock)
    )
    for mlp in moe_mlps:
        mlp.experts.gate_up_proj.data = torch.empty(0)
        mlp.experts.down_proj.data = torch.empty(0)
    gc.collect()
8936+
89178937 def __exit__ (self , exc_type , exc_value , traceback ):
89188938 from transformers .models .qwen3_5_moe .modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock
89198939
@@ -9025,6 +9045,8 @@ def has_previous_state(self):
90259045 layer_idx = self .linear_attn_mapping [self .last_linear_layer ]
90269046 return self .conv_states [layer_idx ] is not None
90279047
9048+ _lm_head_weight = model .lm_head .weight
9049+
90289050 def patched_forward (
90299051 inputs_embeds ,
90309052 attention_mask = None ,
@@ -9063,7 +9085,7 @@ def patched_forward(
90639085 use_cache = use_cache ,
90649086 )
90659087 hidden_states = outputs [0 ]
9066- logits = model . lm_head (hidden_states )
9088+ logits = torch . nn . functional . linear (hidden_states , _lm_head_weight . to ( hidden_states . dtype ) )
90679089
90689090 result = {"logits" : logits }
90699091
@@ -9178,6 +9200,8 @@ def has_previous_state(self):
91789200 layer_idx = self .linear_attn_mapping [self .last_linear_layer ]
91799201 return self .conv_states [layer_idx ] is not None
91809202
9203+ _lm_head_weight = model .lm_head .weight
9204+
91819205 def patched_forward (
91829206 inputs_embeds ,
91839207 attention_mask = None ,
@@ -9216,7 +9240,7 @@ def patched_forward(
92169240 use_cache = use_cache ,
92179241 )
92189242 hidden_states = outputs [0 ]
9219- logits = model . lm_head (hidden_states )
9243+ logits = torch . nn . functional . linear (hidden_states , _lm_head_weight . to ( hidden_states . dtype ) )
92209244
92219245 result = {"logits" : logits }
92229246
@@ -9271,6 +9295,26 @@ def __enter__(self):
92719295 patched_qwen3_5_moe_sparse_moe_block , sparse_moe_block
92729296 )
92739297
def post_make_16bit_traceable(self):
    """Release redundant fp32 expert weights left behind by __make_16bit_traceable.

    __make_16bit_traceable invokes ``module.float()`` on Qwen3_5MoeExperts
    modules, which materializes fp32 copies of ``gate_up_proj`` and
    ``down_proj``. The patcher already captured bf16 views of these weights
    (``_gate_up_projs_t`` / ``_down_projs_t``) for the patched forward, so the
    fp32 duplicates are dead weight — drop them to keep memory in check.
    """
    import gc

    import torch
    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock

    moe_mlps = (
        layer.mlp
        for layer in self._model.model.language_model.layers
        if isinstance(layer.mlp, Qwen3_5MoeSparseMoeBlock)
    )
    for mlp in moe_mlps:
        mlp.experts.gate_up_proj.data = torch.empty(0)
        mlp.experts.down_proj.data = torch.empty(0)
    gc.collect()
9317+
92749318 def __exit__ (self , exc_type , exc_value , traceback ):
92759319 from transformers .models .qwen3_5_moe .modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock
92769320
0 commit comments