Skip to content

Commit 61b9384

Browse files
authored
[misc] fix flops counter (#401)
1 parent 601a37c commit 61b9384

3 files changed

Lines changed: 16 additions & 11 deletions

File tree

verl/models/transformers/flash_attention_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ def flash_attention_forward(
171171
value,
172172
attention_mask,
173173
query_length=q_len,
174-
is_causal=True,
174+
is_causal=module.is_causal,
175175
dropout=dropout,
176176
softmax_scale=scaling,
177177
sliding_window=sliding_window,

verl/utils/flops_counter.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,6 @@
2121
from transformers.models.llama.configuration_llama import LlamaConfig
2222

2323

24-
VALID_MODLE_TYPE = {"llama", "qwen2", "qwen2_vl", "qwen2_5_vl", "qwen3"}
25-
26-
2724
def get_device_flops(unit: str = "T") -> float:
2825
def unit_convert(number: float, level: str):
2926
units = ["B", "K", "M", "G", "T", "P"]
@@ -51,6 +48,7 @@ def unit_convert(number: float, level: str):
5148
flops = 148e12
5249
elif "910B" in device_name:
5350
flops = 354e12
51+
5452
flops_unit = unit_convert(flops, unit)
5553
return flops_unit
5654

@@ -65,16 +63,19 @@ class FlopsCounter:
6563
"""
6664

6765
def __init__(self, config: "LlamaConfig"):
68-
if config.model_type not in VALID_MODLE_TYPE:
69-
print(f"Only support {VALID_MODLE_TYPE}, but got {config.model_type}. MFU will always be zero.")
70-
71-
self.estimate_func = {
66+
_ESTIMATE_FUNC = {
7267
"llama": self._estimate_llama_flops,
7368
"qwen2": self._estimate_llama_flops,
7469
"qwen2_vl": self._estimate_llama_flops,
7570
"qwen2_5_vl": self._estimate_llama_flops,
71+
"qwen3": self._estimate_llama_flops,
7672
}
73+
74+
if config.model_type not in _ESTIMATE_FUNC:
75+
print(f"Only support {_ESTIMATE_FUNC.keys()}, but got {config.model_type}. MFU will always be zero.")
76+
7777
self.config = config
78+
self._estimate_flops = _ESTIMATE_FUNC.get(config.model_type, self._estimate_unknown_flops)
7879

7980
def _estimate_unknown_flops(self, tokens_sum: int, batch_seqlens: List[int], delta_time: float) -> float:
8081
return 0
@@ -127,7 +128,6 @@ def estimate_flops(self, batch_seqlens: List[int], delta_time: float) -> Tuple[f
127128
promised_flops (float): The expected FLOPS of the current device.
128129
"""
129130
tokens_sum = sum(batch_seqlens)
130-
func = self.estimate_func.get(self.config.model_type, self._estimate_unknown_flops)
131-
estimated_flops = func(tokens_sum, batch_seqlens, delta_time)
131+
estimated_flops = self._estimate_flops(tokens_sum, batch_seqlens, delta_time)
132132
promised_flops = get_device_flops()
133133
return estimated_flops, promised_flops

verl/workers/actor/dp_actor.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,6 @@
2424
from ray.experimental.tqdm_ray import tqdm
2525
from torch import nn
2626
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
27-
from transformers.modeling_flash_attention_utils import index_first_axis, pad_input, unpad_input
2827

2928
from ...protocol import DataProto
3029
from ...trainer.core_algos import average_loss, compute_kl, compute_policy_loss
@@ -35,6 +34,12 @@
3534
from .config import ActorConfig
3635

3736

37+
try:
38+
from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input
39+
except ImportError:
40+
pass
41+
42+
3843
__all__ = ["DataParallelPPOActor"]
3944

4045

0 commit comments

Comments (0)