File renamed without changes.
2 changes: 1 addition & 1 deletion docs/zh/processors.md → docs/zh/processors_zh.md
@@ -14,7 +14,7 @@
 
 ### 💻 Usage Example
 
-Below is an example showing how to load a `Processor` and process image/video data ([Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct).
+Below is an example showing how to load a `Processor` and process image/video data (using the [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) model as an example).
 
 - Example 1: Process images and text directly:
 
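To make the documented flow concrete, here is a minimal sketch of the loading step described above; it assumes the HF-style `from_pretrained` API that PaddleFormers processors follow, with the checkpoint ID taken from the link in the diff:

```python
from paddleformers.transformers import Qwen2_5_VLProcessor

# Load the processor for the checkpoint referenced above (assumed API surface).
processor = Qwen2_5_VLProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# Image/text inputs would then go through the processor's __call__,
# e.g. processor(text=[...], images=[...]) as in the doc's Example 1.
```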
File renamed without changes.
56 changes: 39 additions & 17 deletions paddleformers/generation/utils.py
@@ -16,7 +16,7 @@
 
 import copy
 import inspect
-from typing import Optional, Union
+from typing import Optional, Tuple, Union
 
 import paddle
 import paddle.distributed as dist
@@ -641,20 +641,16 @@ def get_decoder_start_token_id(self, decoder_start_token_id=None, bos_token_id=N
 
     def prepare_inputs_for_generation(
         self,
-        input_ids,
-        use_cache=True,
-        past_key_values=None,
-        inputs_embeds=None,
+        input_ids: paddle.Tensor,
+        past_key_values: Optional[Tuple[paddle.Tensor]] = None,
+        inputs_embeds: Optional[paddle.Tensor] = None,
         **kwargs,
     ):
         """Prepares model inputs for generation in PaddlePaddle models.
 
         Args:
             input_ids (paddle.Tensor):
                 The input token IDs with shape [batch_size, sequence_length].
-            use_cache (bool, optional):
-                Whether to use cached key-value states for faster generation.
-                Defaults to False.
             past_key_values (Optional[Tuple[paddle.Tensor]]):
                 Cached past key-value states from previous generation steps.
                 If provided, the input_ids will be truncated to only keep the last token.
@@ -675,26 +671,52 @@ def prepare_inputs_for_generation(
             - "return_dict": Always set to True for consistent output format
 
         """
+        model_inputs = {}
+        model_inputs["past_key_values"] = past_key_values
+        model_inputs["cache_position"] = kwargs.get("cache_position", None)
 
         if past_key_values:
             input_ids = input_ids[:, -1:]
 
-        attention_mask = kwargs.get("attention_mask", None)
+        use_cache = kwargs.get("use_cache", None)
+        if use_cache is None:
+            use_cache = getattr(self.config, "use_cache", False)
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids}
 
-        model_inputs.update(
-            {
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-                "return_dict": True,
-            }
-        )
+        attention_mask = kwargs.get("attention_mask", None)
+        if (
+            attention_mask is not None
+            and kwargs.get("position_ids") is None
+            and "position_ids" in set(inspect.signature(self.forward).parameters.keys())
+        ):
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            kwargs["position_ids"] = position_ids  # placed in kwargs for further processing (see below)
+
+        model_input = kwargs.get("position_ids")
+        if model_input is not None:
+            if past_key_values is not None or use_cache:
+                current_input_length = (
+                    model_inputs["inputs_embeds"].shape[1]
+                    if model_inputs.get("inputs_embeds") is not None
+                    else model_inputs["input_ids"].shape[1]
+                )
+                model_input = model_input[:, -current_input_length:]
+            model_inputs["position_ids"] = model_input
+
+        model_inputs["return_dict"] = kwargs.get("return_dict", True)
+
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+
+        # Remove unexpected `generate` inputs
+        model_inputs.pop("labels", None)
         return model_inputs

     def adjust_logits_during_generation(self, logits):
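The least obvious addition above is the derivation of `position_ids` from `attention_mask`. A minimal NumPy stand-in (illustration only, not the PR's Paddle code) shows what the `cumsum`/`masked_fill_` pair computes for a left-padded batch:

```python
import numpy as np

# One left-padded sequence (0 = padding) and one full sequence.
attention_mask = np.array([[0, 0, 1, 1, 1],
                           [1, 1, 1, 1, 1]])

# Real tokens get consecutive positions starting at 0 ...
position_ids = attention_mask.cumsum(-1) - 1
# ... and padded slots are then overwritten with the dummy value 1,
# mirroring masked_fill_(attention_mask == 0, 1).
position_ids[attention_mask == 0] = 1

print(position_ids)
# [[1 1 0 1 2]
#  [0 1 2 3 4]]
```

When a cache is in use, the resulting tensor is then sliced to its last `current_input_length` columns, matching the one-token `input_ids` kept once `past_key_values` is present.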
2 changes: 1 addition & 1 deletion paddleformers/nn/attention/flashmask_attention.py
@@ -44,7 +44,7 @@ def flashmask_attention_forward(
             key,
             value,
             startend_row_indices=attn_mask_startend_row_indices,
-            causal=True,
+            causal=is_causal if is_causal is not None else True,
         )
     else:
         out = sink_attention_forward(
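The behavioral contract of this one-line change is worth spelling out: `None` now means "fall back to the old default" rather than disabling causality. A tiny sketch of the resolution rule:

```python
def resolve_causal(is_causal):
    # None preserves the previous hard-coded behavior (causal=True);
    # an explicit True/False from the caller is now honored.
    return is_causal if is_causal is not None else True

assert resolve_causal(None) is True
assert resolve_causal(True) is True
assert resolve_causal(False) is False
```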
8 changes: 8 additions & 0 deletions paddleformers/transformers/__init__.py
@@ -301,6 +301,13 @@
     ],
     "qwen2.tokenizer": ["Qwen2Tokenizer"],
     "qwen2.tokenizer_fast": ["Qwen2TokenizerFast"],
+    "qwen2_5_vl.configuration": ["Qwen2_5_VLConfig", "Qwen2_5_VLTextConfig"],
+    "qwen2_5_vl.modeling": [
+        "Qwen2_5_VLForConditionalGeneration",
+        "Qwen2_5_VLModel",
+        "Qwen2_5_VLPretrainedModel",
+        "Qwen2_5_VLTextModel",
+    ],
     "qwen2_5_vl.processor": ["Qwen2_5_VLProcessor"],
     "qwen2_moe.configuration": ["Qwen2MoeConfig"],
     "qwen2_moe.modeling": [
@@ -343,6 +350,7 @@
     "ernie4_5": ["Ernie4_5DecoderLayer", "Ernie4_5Model", "Ernie4_5_ForCausalLM"],
     "ernie4_5_moe": ["Ernie4_5_MoeDecoderLayer", "Ernie4_5_MoeModel", "Ernie4_5_MoeForCausalLM"],
     "ernie4_5_moe_vl": [],
+    "qwen2_5_vl": [],
     "qwen2_moe": [],
     "qwen2_vl": [],
     "qwen3_moe": [],
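Assuming the package's lazy-import machinery resolves entries in this structure the same way it does for the neighboring `qwen2` modules, the new registrations should make top-level imports like these work:

```python
# Names taken verbatim from the import-structure entries added above.
from paddleformers.transformers import (
    Qwen2_5_VLConfig,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2_5_VLProcessor,
)
```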
15 changes: 15 additions & 0 deletions paddleformers/transformers/auto/configuration.py
@@ -42,6 +42,8 @@
         ("llama", "LlamaConfig"),
         ("qwen", "QWenConfig"),
         ("qwen2", "Qwen2Config"),
+        ("qwen2_5_vl", "Qwen2_5_VLConfig"),
+        ("qwen2_5_vl_text", "Qwen2_5_VLTextConfig"),
         ("qwen2_moe", "Qwen2MoeConfig"),
         ("qwen3", "Qwen3Config"),
         ("qwen3_moe", "Qwen3MoeConfig"),
@@ -63,6 +65,8 @@
         ("llama", "Llama"),
         ("qwen", "QWen"),
         ("qwen2", "Qwen2"),
+        ("qwen2_5_vl", "Qwen2_5_VL"),
+        ("qwen2_5_vl_text", "Qwen2_5_VL"),
         ("qwen2_moe", "Qwen2Moe"),
         ("qwen3", "Qwen3"),
         ("qwen3_moe", "Qwen3Moe"),
@@ -74,6 +78,12 @@
     []
 )
 
+SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
+    [
+        ("qwen2_5_vl_text", "qwen2_5_vl"),
+    ]
+)
+
 
 def config_class_to_model_type(config):
     """Converts a config class name to the corresponding model type"""
@@ -182,6 +192,11 @@ def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]:
 
 def model_type_to_module_name(key):
     """Converts a config key to the corresponding module."""
+    # Special treatment
+    if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
+        key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
+        return key
+
     key = key.replace("-", "_")
     return key
 
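A self-contained re-creation of the new lookup (a sketch using the names from the diff, not the module itself) shows why the special-case table exists: `qwen2_5_vl_text` has no module of its own and must resolve to the shared `qwen2_5_vl` module:

```python
from collections import OrderedDict

SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("qwen2_5_vl_text", "qwen2_5_vl")])

def model_type_to_module_name(key):
    # Special model types map onto a shared module; everything else
    # keeps the usual dash-to-underscore normalization.
    if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
        return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
    return key.replace("-", "_")

assert model_type_to_module_name("qwen2_5_vl_text") == "qwen2_5_vl"
assert model_type_to_module_name("qwen2_5_vl") == "qwen2_5_vl"
assert model_type_to_module_name("some-model") == "some_model"
```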
3 changes: 2 additions & 1 deletion paddleformers/transformers/auto/modeling.py
@@ -61,8 +61,9 @@
     ("Llama", "llama"),
     ("QWen", "qwen"),
     ("Qwen2", "qwen2"),
-    ("Qwen3", "qwen3"),
+    ("Qwen2_5_VL", "qwen2_5_vl"),
     ("Qwen2Moe", "qwen2_moe"),
+    ("Qwen3", "qwen3"),
     ("Qwen3Moe", "qwen3_moe"),
     ("Glm4Moe", "glm4_moe"),
     ("GptOss", "gpt_oss"),
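Taken together with the configuration registry above, these entries let the Auto classes resolve the new architecture. A hedged usage sketch, assuming the usual `from_pretrained` entry point:

```python
from paddleformers.transformers import AutoConfig  # assumed public entry point

# model_type "qwen2_5_vl" now resolves through the mappings added in this PR.
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
print(type(config).__name__)  # expected: Qwen2_5_VLConfig
```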