@@ -47,11 +47,12 @@ def prepare_fa2_from_position_ids(
     query = query.contiguous().view(-1, query.size(-2), query.size(-1))
     key = key.contiguous().view(-1, key.size(-2), key.size(-1))
     value = value.contiguous().view(-1, value.size(-2), value.size(-1))
+    tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
     position_ids = position_ids.view(-1)
     cu_seqlens = torch.cat(
         (
-            (position_ids == 0).nonzero().view(-1).to(torch.int32),
-            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
+            (position_ids == 0).nonzero().view(-1).to(**tensor_kwargs),
+            torch.tensor(position_ids.size(), **tensor_kwargs),
         )
     )
     max_length = cu_seqlens.diff().max()  # use cu_seqlens to infer max_length for qwen2vl mrope
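For readers new to the varlen layout: `(position_ids == 0).nonzero()` marks the start offset of every packed sequence, and appending the total token count turns those offsets into cumulative sequence lengths. A minimal CPU-only sketch of the same arithmetic on a toy tensor (values are illustrative, not taken from the repository):

```python
import torch

# two packed sequences of lengths 3 and 2, flattened as in prepare_fa2_from_position_ids
position_ids = torch.tensor([0, 1, 2, 0, 1])

tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
cu_seqlens = torch.cat(
    (
        (position_ids == 0).nonzero().view(-1).to(**tensor_kwargs),  # start offsets: [0, 3]
        torch.tensor(position_ids.size(), **tensor_kwargs),          # total tokens:  [5]
    )
)

print(cu_seqlens)               # tensor([0, 3, 5], dtype=torch.int32)
print(cu_seqlens.diff())        # per-sequence lengths: tensor([3, 2], dtype=torch.int32)
print(cu_seqlens.diff().max())  # max_length handed to the varlen kernel: 3
```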
@@ -90,12 +91,9 @@ def _custom_flash_attention_forward(
         query_states, key_states, value_states, target_dtype=torch.bfloat16
     )

-    if position_ids is not None:
-        assert position_ids.ndim == 2  # (batch_size, seq_length)
-
     sp_size = get_ulysses_sequence_parallel_world_size()
     if sp_size > 1:
-        # qkv: (batch_size, seq_length, num_head, head_size)
+        # qkv: (batch_size, seq_length / sp_size, num_head, head_size)
         query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2)
         key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2)
         value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2)
@@ -105,19 +103,17 @@ def _custom_flash_attention_forward(

     if position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
         batch_size = query_states.size(0)
-        query_states, key_states, value_states, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
+        q, k, v, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = prepare_fa2_from_position_ids(
             query_states, key_states, value_states, position_ids
         )
-        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
         attn_output = flash_attn_varlen_func(
-            query_states,
-            key_states,
-            value_states,
+            q,
+            k,
+            v,
             cu_seqlens_q=cu_seqlens_q,
             cu_seqlens_k=cu_seqlens_k,
-            max_seqlen_q=max_seqlen_in_batch_q,
-            max_seqlen_k=max_seqlen_in_batch_k,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
             dropout_p=kwargs.pop("dropout", 0.0),
             softmax_scale=kwargs.pop("softmax_scale", None),
             causal=is_causal,
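The new unpacking above implies `prepare_fa2_from_position_ids` now returns the flattened tensors plus two tuples, `(cu_seqlens_q, cu_seqlens_k)` and `(max_seqlen_q, max_seqlen_k)`. Below is a hedged, self-contained sketch of the whole varlen path, inlining the cu_seqlens construction from the first hunk instead of importing the helper. It assumes flash-attn and a CUDA device, and the final reshape back to the padded layout is an assumption about what the saved `batch_size` is used for; it is not shown in this hunk.

```python
import torch
from flash_attn import flash_attn_varlen_func  # requires a CUDA device

batch_size, seq_length, num_heads, head_dim = 2, 4, 8, 64
q = torch.randn(batch_size, seq_length, num_heads, head_dim, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
# position_ids restart at 0 where a new packed sequence begins (lengths 3, 1, 2, 2 here)
position_ids = torch.tensor([[0, 1, 2, 0], [0, 1, 0, 1]], device="cuda")

# flatten batch and sequence dims, mirroring prepare_fa2_from_position_ids
q, k, v = (t.contiguous().view(-1, num_heads, head_dim) for t in (q, k, v))
flat_pos = position_ids.view(-1)
tensor_kwargs = {"dtype": torch.int32, "device": flat_pos.device}
cu_seqlens = torch.cat(
    (
        (flat_pos == 0).nonzero().view(-1).to(**tensor_kwargs),
        torch.tensor(flat_pos.size(), **tensor_kwargs),
    )
)
max_seqlen = cu_seqlens.diff().max().item()

out = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu_seqlens,
    cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max_seqlen,
    max_seqlen_k=max_seqlen,
    causal=True,
)
# assumed follow-up: restore the padded (batch_size, seq_length, num_heads, head_dim) layout
out = out.view(batch_size, seq_length, num_heads, head_dim)
```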
@@ -132,14 +128,15 @@ def _custom_flash_attention_forward(
             attention_mask,
             query_length,
             is_causal=is_causal,
+            position_ids=position_ids,
             sliding_window=sliding_window,
             use_top_left_mask=use_top_left_mask,
             deterministic=deterministic,
             **kwargs,
-        )  # do not pass position_ids to old flash_attention_forward
+        )

     if sp_size > 1:
-        # (batch_size, seq_length, num_head, head_size)
+        # output: (batch_size, seq_length / sp_size, num_head, head_size)
         attn_output = gather_heads_scatter_seq(attn_output, head_dim=2, seq_dim=1)

     return attn_output
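The two shape comments touched in this diff describe the Ulysses sequence-parallel exchange: before attention each rank trades its sequence shard (holding all heads) for the full sequence with a shard of the heads, and after attention the exchange is reversed. A single-process illustration of that redistribution, using plain chunking in place of the distributed `gather_seq_scatter_heads` / `gather_heads_scatter_seq` helpers (shapes and `sp_size` are made up for this sketch):

```python
import torch

batch, seq_len, num_heads, head_dim, sp_size = 2, 8, 4, 16, 2
full = torch.randn(batch, seq_len, num_heads, head_dim)  # the logical, unsharded tensor

# before attention each rank holds: (batch, seq_len / sp_size, num_heads, head_dim)
seq_shards = full.chunk(sp_size, dim=1)

# gather_seq_scatter_heads(..., seq_dim=1, head_dim=2) leaves each rank with the full
# sequence but only 1/sp_size of the heads: (batch, seq_len, num_heads / sp_size, head_dim)
head_shards = full.chunk(sp_size, dim=2)

print(seq_shards[0].shape)   # torch.Size([2, 4, 4, 16])
print(head_shards[0].shape)  # torch.Size([2, 8, 2, 16])

# gather_heads_scatter_seq(..., head_dim=2, seq_dim=1) on the attention output reverses
# the exchange: both layouts reassemble into the same logical tensor
assert torch.equal(torch.cat(head_shards, dim=2), torch.cat(seq_shards, dim=1))
```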