@@ -20,6 +20,7 @@
 
 import torch
 import torch.distributed as dist
+import torch.nn.functional as F
 from transformers.modeling_flash_attention_utils import _flash_attention_forward, fa_peft_integration_check
 from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10
 
@@ -43,19 +44,14 @@
 def prepare_fa2_from_position_ids(
     query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, position_ids: torch.Tensor
 ):
-    query = query.view(-1, query.size(-2), query.size(-1))
+    assert position_ids.ndim == 2  # (batch_size, seq_length)
+    query = query.contiguous().view(-1, query.size(-2), query.size(-1))
     key = key.contiguous().view(-1, key.size(-2), key.size(-1))
     value = value.contiguous().view(-1, value.size(-2), value.size(-1))
-    position_ids = position_ids.flatten()
-    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
-    cu_seqlens = torch.cat(
-        (
-            indices_q[position_ids == 0],
-            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
-        )
-    )
+    position_ids = position_ids.view(-1)
+    cu_seqlens = F.pad((position_ids == 0).nonzero().view(-1), (0, 1), value=position_ids.size(0))
     max_length = cu_seqlens.diff().max()  # use cu_seqlens to infer max_length for qwen2vl mrope
-    return (query, key, value, indices_q, (cu_seqlens, cu_seqlens), (max_length, max_length))
+    return (query, key, value, (cu_seqlens, cu_seqlens), (max_length, max_length))
 
 
 def _custom_flash_attention_forward(
@@ -102,7 +98,7 @@ def _custom_flash_attention_forward(
 
     if position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
         batch_size = query_states.size(0)
-        query_states, key_states, value_states, _, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
+        query_states, key_states, value_states, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
             query_states, key_states, value_states, position_ids
         )
         cu_seqlens_q, cu_seqlens_k = cu_seq_lens
@@ -162,16 +158,18 @@ def flash_attention_forward(
     key = key.transpose(1, 2)
     value = value.transpose(1, 2)
 
-    # FA2 always relies on the value set in the module, so remove it if present in kwargs to avoid passing it twice
-    kwargs.pop("is_causal", None)
+    # FA2 uses the kwargs value if explicitly passed, otherwise it uses the module attribute
+    is_causal = kwargs.pop("is_causal", None)
+    if is_causal is None:
+        is_causal = getattr(module, "is_causal", True)
 
     attn_output = _custom_flash_attention_forward(
         query,
         key,
         value,
         attention_mask,
         query_length=q_len,
-        is_causal=module.is_causal,
+        is_causal=is_causal,
         dropout=dropout,
         softmax_scale=scaling,
         sliding_window=sliding_window,
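For reference, a minimal sketch (not part of the patch) of the packed-sequence metadata the rewritten prepare_fa2_from_position_ids computes; the position_ids values below are made up for illustration:

    import torch
    import torch.nn.functional as F

    # Two packed sequences of lengths 3 and 2, flattened into one row of position ids.
    position_ids = torch.tensor([[0, 1, 2, 0, 1]]).view(-1)

    # Offsets where a new sequence starts (position resets to 0), padded with the total token count.
    cu_seqlens = F.pad((position_ids == 0).nonzero().view(-1), (0, 1), value=position_ids.size(0))
    print(cu_seqlens)               # tensor([0, 3, 5])
    print(cu_seqlens.diff().max())  # tensor(3), the max_length handed to the varlen FlashAttention kernel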