save

hiyouga · hiyouga · commit 449d9fce75ed · 2025-06-12T04:06:41.000Z
diff --git a/.gitignore b/.gitignore
@@ -175,3 +175,7 @@ outputs/
 checkpoints/
 wandb/
 tensorboard_log/
+
+# data
+images/
+images*
diff --git a/README.md b/README.md
@@ -156,7 +156,7 @@ See [baselines.md](assets/baselines.md).
 - **ViGoRL**: Grounded Reinforcement Learning for Visual Reasoning. [![[code]](https://img.shields.io/github/stars/Gabesarch/grounded-rl)](https://github.com/Gabesarch/grounded-rl) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.23678-blue)](https://arxiv.org/abs/2505.23678)
 - **Revisual-R1**: Advancing Multimodal Reasoning: From Optimized Cold Start to Staged Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CSfufu/Revisual-R1)](https://github.com/CSfufu/Revisual-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.04207-blue)](https://arxiv.org/abs/2506.04207)
 - **SophiaVL-R1**: Reinforcing MLLMs Reasoning with Thinking Reward. [![[code]](https://img.shields.io/github/stars/kxfan2002/SophiaVL-R1)](https://github.com/kxfan2002/SophiaVL-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.17018-blue)](https://arxiv.org/abs/2505.17018)
-  
+
 ## TODO
 
 - Support LoRA (high priority).
diff --git a/examples/config.yaml b/examples/config.yaml
@@ -4,6 +4,7 @@ data:
   prompt_key: problem
   answer_key: answer
   image_key: images
+  image_dir: null
   max_prompt_length: 2048
   max_response_length: 2048
   rollout_batch_size: 512
diff --git a/verl/protocol.py b/verl/protocol.py
@@ -384,11 +384,11 @@ def pop(
         meta_info_keys = meta_info_keys or []
 
         tensors = {}
-        for key in batch_keys:
+        for key in [k for k in batch_keys if k in self.batch.keys()]:  # skip keys absent from this batch
             tensors[key] = self.batch.pop(key)
 
         non_tensors = {}
-        for key in non_tensor_batch_keys:
+        for key in [k for k in non_tensor_batch_keys if k in self.non_tensor_batch]:  # skip absent keys
             non_tensors[key] = self.non_tensor_batch.pop(key)
 
         meta_info = {}
diff --git a/verl/trainer/config.py b/verl/trainer/config.py
@@ -38,6 +38,7 @@ class DataConfig:
     prompt_key: str = "prompt"
     answer_key: str = "answer"
     image_key: str = "images"
+    image_dir: Optional[str] = None
     max_prompt_length: int = 512
     max_response_length: int = 512
     rollout_batch_size: int = 512
@@ -51,10 +52,18 @@ class DataConfig:
     filter_overlong_prompts: bool = True
 
     def post_init(self):
+        if self.image_dir is not None:
+            if os.path.exists(self.image_dir):  # ray job uses absolute path
+                self.image_dir = os.path.abspath(self.image_dir)
+            else:
+                print(f"Image directory {self.image_dir} is not found.")
+                self.image_dir = None
+
         if self.format_prompt is not None:
             if os.path.exists(self.format_prompt):  # ray job uses absolute path
                 self.format_prompt = os.path.abspath(self.format_prompt)
             else:
+                print(f"Format prompt file {self.format_prompt} is not found.")
                 self.format_prompt = None
 
 
@@ -97,7 +106,11 @@ def post_init(self):
 
         self.save_checkpoint_path = os.path.abspath(self.save_checkpoint_path)  # ray job uses absolute path
         if self.load_checkpoint_path is not None:
-            self.load_checkpoint_path = os.path.abspath(self.load_checkpoint_path)
+            if os.path.exists(self.load_checkpoint_path):  # ray job uses absolute path
+                self.load_checkpoint_path = os.path.abspath(self.load_checkpoint_path)
+            else:
+                print(f"Model checkpoint {self.load_checkpoint_path} is not found.")
+                self.load_checkpoint_path = None
 
 
 @dataclass
diff --git a/verl/trainer/data_loader.py b/verl/trainer/data_loader.py
@@ -31,6 +31,7 @@ def create_dataloader(config: DataConfig, tokenizer: PreTrainedTokenizer, proces
         prompt_key=config.prompt_key,
         answer_key=config.answer_key,
         image_key=config.image_key,
+        image_dir=config.image_dir,
         max_prompt_length=config.max_prompt_length,
         truncation="right",
         format_prompt=config.format_prompt,
@@ -63,6 +64,7 @@ def create_dataloader(config: DataConfig, tokenizer: PreTrainedTokenizer, proces
         prompt_key=config.prompt_key,
         answer_key=config.answer_key,
         image_key=config.image_key,
+        image_dir=config.image_dir,
         max_prompt_length=config.max_prompt_length,
         truncation="right",
         format_prompt=config.format_prompt,
diff --git a/verl/trainer/ray_trainer.py b/verl/trainer/ray_trainer.py
@@ -283,22 +283,13 @@ def _validate(self) -> Dict[str, Any]:
             input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
             sample_inputs.extend(input_texts)
 
-            if "multi_modal_data" in test_batch.non_tensor_batch.keys():
-                test_gen_batch = test_batch.pop(
-                    batch_keys=["input_ids", "attention_mask", "position_ids"],
-                    non_tensor_batch_keys=["raw_prompt_ids", "multi_modal_data"],
-                )
-            else:
-                test_gen_batch = test_batch.pop(
-                    batch_keys=["input_ids", "attention_mask", "position_ids"],
-                    non_tensor_batch_keys=["raw_prompt_ids"],
-                )
-
+            test_gen_batch = test_batch.pop(
+                batch_keys=["input_ids", "attention_mask", "position_ids"],
+                non_tensor_batch_keys=["raw_prompt_ids", "multi_modal_data"],
+            )
             test_gen_batch.meta_info = self.config.worker.rollout.val_override_config
-            test_gen_batch.meta_info.update({
-                "min_pixels": self.config.data.min_pixels,
-                "max_pixels": self.config.data.max_pixels,
-            })
+            test_gen_batch.meta_info["min_pixels"] = self.config.data.min_pixels
+            test_gen_batch.meta_info["max_pixels"] = self.config.data.max_pixels
             test_gen_batch, pad_size = pad_dataproto_to_divisor(test_gen_batch, self.actor_rollout_wg.world_size)
             test_output_gen_batch = self.actor_rollout_wg.generate_sequences(test_gen_batch)
             test_output_gen_batch = unpad_dataproto(test_output_gen_batch, pad_size=pad_size)
@@ -485,23 +476,16 @@ def fit(self):
 
                 metrics, timing_raw = {}, {}
                 batch: DataProto = DataProto.from_single_dict(batch_dict)
+                batch.meta_info = {
+                    "min_pixels": self.config.data.min_pixels,
+                    "max_pixels": self.config.data.max_pixels,
+                }
 
                 # pop those keys for generation
-                if "multi_modal_data" in batch.non_tensor_batch.keys():
-                    gen_batch = batch.pop(
-                        batch_keys=["input_ids", "attention_mask", "position_ids"],
-                        non_tensor_batch_keys=["raw_prompt_ids", "multi_modal_data"],
-                    )
-                    gen_batch.meta_info.update({
-                        "min_pixels": self.config.data.min_pixels,
-                        "max_pixels": self.config.data.max_pixels,
-                    })
-                else:
-                    gen_batch = batch.pop(
-                        batch_keys=["input_ids", "attention_mask", "position_ids"],
-                        non_tensor_batch_keys=["raw_prompt_ids"],
-                    )
-
+                gen_batch = batch.pop(
+                    batch_keys=["input_ids", "attention_mask", "position_ids"],
+                    non_tensor_batch_keys=["raw_prompt_ids", "multi_modal_data"],
+                )
                 with timer("step", timing_raw):
                     # generate a batch
                     with timer("gen", timing_raw):  # wg: worker group
diff --git a/verl/utils/dataset.py b/verl/utils/dataset.py
@@ -50,7 +50,6 @@ def collate_fn(features: List[Dict[str, Any]]) -> Dict[str, Any]:
     return {**tensors, **non_tensors}
 
 
-
 def process_image(image: Union[Dict[str, Any], ImageObject, str], min_pixels: int, max_pixels: int) -> ImageObject:
     if isinstance(image, str):
         image = Image.open(image)
@@ -59,6 +58,7 @@ def process_image(image: Union[Dict[str, Any], ImageObject, str], min_pixels: in
     elif isinstance(image, bytes):
         image = Image.open(BytesIO(image))
 
+    image.load()  # avoid "Too many open files" errors
     if (image.width * image.height) > max_pixels:
         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
         width, height = int(image.width * resize_factor), int(image.height * resize_factor)
@@ -88,6 +88,7 @@ def __init__(
         prompt_key: str = "prompt",
         answer_key: str = "answer",
         image_key: str = "images",
+        image_dir: Optional[str] = None,
         max_prompt_length: int = 1024,
         truncation: str = "error",
         format_prompt: Optional[str] = None,
@@ -100,6 +101,7 @@ def __init__(
         self.prompt_key = prompt_key
         self.answer_key = answer_key
         self.image_key = image_key
+        self.image_dir = image_dir
         self.max_prompt_length = max_prompt_length
         self.truncation = truncation
         self.max_pixels = max_pixels
@@ -113,9 +115,11 @@ def __init__(
 
         if os.path.isdir(data_path):
             # when we use dataset builder, we should always refer to the train split
-            self.dataset = load_dataset("parquet", data_dir=data_path, split="train")
+            file_type = os.path.splitext(os.listdir(data_path)[0])[-1][1:].replace("jsonl", "json")
+            self.dataset = load_dataset(file_type, data_dir=data_path, split=data_split)
         elif os.path.isfile(data_path):
-            self.dataset = load_dataset("parquet", data_files=data_path, split="train")
+            file_type = os.path.splitext(data_path)[-1][1:].replace("jsonl", "json")
+            self.dataset = load_dataset(file_type, data_files=data_path, split=data_split)
         else:
             # load remote dataset from huggingface hub
             self.dataset = load_dataset(data_path, split=data_split)
@@ -164,22 +168,25 @@ def __getitem__(self, index):
 
         if self.image_key in example:
             prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-            raw_image_data = example.pop(self.image_key)
-            images = [
+            images = example.pop(self.image_key)
+            if self.image_dir is not None and len(images) != 0 and isinstance(images[0], str): # image paths
+                images = [os.path.join(self.image_dir, image) for image in images]
+
+            resized_images = [
                 process_image(image, min_pixels=self.min_pixels, max_pixels=self.max_pixels)
-                for image in raw_image_data
+                for image in images
             ]
-            model_inputs = self.processor(images, [prompt], add_special_tokens=False, return_tensors="pt")
+            model_inputs = self.processor(resized_images, [prompt], add_special_tokens=False, return_tensors="pt")
             input_ids = model_inputs.pop("input_ids")[0]
             attention_mask = model_inputs.pop("attention_mask")[0]
-            example["multi_modal_data"] = {"image": raw_image_data}
+            example["multi_modal_data"] = {"image": images}
         else:
             prompt = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
             model_inputs = self.tokenizer([prompt], add_special_tokens=False, return_tensors="pt")
             input_ids = model_inputs.pop("input_ids")[0]
             attention_mask = model_inputs.pop("attention_mask")[0]
 
-        if self.processor is not None and self.processor.image_processor.__class__.__name__ == "Qwen2VLImageProcessor":
+        if self.processor is not None and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__:
             # qwen2vl mrope
             position_ids = get_rope_index(
                 self.processor,
diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py
@@ -15,13 +15,13 @@
 The main entry point to run the PPO algorithm
 """
 
+from copy import deepcopy
 from typing import Literal, Optional, Union
 
 import numpy as np
 import psutil
 import torch
 import torch.distributed as dist
-from copy import deepcopy
 from accelerate import init_empty_weights
 from codetiming import Timer
 from torch.distributed.device_mesh import init_device_mesh
@@ -42,6 +42,7 @@
 from ..single_controller.base import Worker
 from ..single_controller.base.decorator import Dispatch, register
 from ..utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager
+from ..utils.dataset import process_image
 from ..utils.flops_counter import FlopsCounter
 from ..utils.fsdp_utils import (
     get_fsdp_wrap_policy,
@@ -51,7 +52,6 @@
     offload_fsdp_model,
     offload_fsdp_optimizer,
 )
-from ..utils.dataset import process_image
 from ..utils.model_utils import print_gpu_memory_usage, print_model_size
 from ..utils.tokenizer import get_processor, get_tokenizer
 from ..utils.torch_dtypes import PrecisionType
@@ -436,10 +436,9 @@ def preprocess_multi_modal_data(self, data: DataProto):
         processed_images = []
         for multi_modal_data in multi_modal_data_copy:
             processed_per_query_images = []
-            for image in multi_modal_data['image']:
-                processed_per_query_images.append(
-                    process_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
-                )
+            for image in multi_modal_data["image"]:
+                processed_per_query_images.append(process_image(image, min_pixels=min_pixels, max_pixels=max_pixels))
+
             processed_images.append(processed_per_query_images)
 
         # Note: Using the alternative (commented) code below to process images can lead to subtle resize issues:
@@ -454,17 +453,20 @@ def preprocess_multi_modal_data(self, data: DataProto):
         #     for j, image in enumerate(per_query_images):
         #         images[i][j] = process_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
 
-        multi_modal_inputs = np.array([
-            dict(self.processor.image_processor(images=per_query_images, videos=None))
-            for per_query_images in processed_images
-        ], dtype=object)
+        multi_modal_inputs = np.array(
+            [
+                dict(self.processor.image_processor(images=per_query_images, videos=None))
+                for per_query_images in processed_images
+            ],
+            dtype=object,
+        )
         data.non_tensor_batch["multi_modal_inputs"] = multi_modal_inputs
 
     @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
     def update_actor(self, data: DataProto):
         assert self._is_actor
         if "multi_modal_inputs" in self._cache:
-            data.non_tensor_batch['multi_modal_inputs'] = deepcopy(self._cache['multi_modal_inputs'])
+            data.non_tensor_batch["multi_modal_inputs"] = deepcopy(self._cache["multi_modal_inputs"])
         elif "multi_modal_data" in data.non_tensor_batch:
             self.preprocess_multi_modal_data(data)
 
@@ -545,12 +547,14 @@ def generate_sequences(self, prompts: DataProto):
             cached_multi_modal_data = None
             if "multi_modal_data" in prompts.non_tensor_batch:
                 cached_multi_modal_data = deepcopy(prompts.non_tensor_batch["multi_modal_data"])
-                min_pixels = prompts.meta_info['min_pixels']
-                max_pixels = prompts.meta_info['max_pixels']
+                min_pixels = prompts.meta_info["min_pixels"]
+                max_pixels = prompts.meta_info["max_pixels"]
                 processed_images = []
                 for i, multi_modal_data in enumerate(prompts.non_tensor_batch["multi_modal_data"]):
                     for j, image in enumerate(multi_modal_data["image"]):
-                        multi_modal_data['image'][j] = process_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
+                        multi_modal_data["image"][j] = process_image(
+                            image, min_pixels=min_pixels, max_pixels=max_pixels
+                        )
                     processed_images.append(multi_modal_data)
                 prompts.non_tensor_batch["multi_modal_data"] = processed_images
 
@@ -562,7 +566,9 @@ def generate_sequences(self, prompts: DataProto):
                 output.non_tensor_batch["multi_modal_data"] = cached_multi_modal_data
                 if sampling_n > 1:
                     output.non_tensor_batch["multi_modal_data"] = np.repeat(
-                        output.non_tensor_batch["multi_modal_data"], repeats=sampling_n, axis=0,
+                        output.non_tensor_batch["multi_modal_data"],
+                        repeats=sampling_n,
+                        axis=0,
                     )
 
             output = self.rollout_sharding_manager.postprocess_data(output)
@@ -577,7 +583,7 @@ def compute_log_probs(self, data: DataProto):
         if "multi_modal_data" in data.non_tensor_batch:
             self.preprocess_multi_modal_data(data)
             # create cache for multi_modal_inputs
-            self._cache['multi_modal_inputs'] = deepcopy(data.non_tensor_batch['multi_modal_inputs'])
+            self._cache["multi_modal_inputs"] = deepcopy(data.non_tensor_batch["multi_modal_inputs"])
 
         data = data.to(torch.cuda.current_device())
         if self._use_param_offload:
@@ -611,7 +617,7 @@ def compute_ref_log_probs(self, data: DataProto):
         # not in the ref_policy's or critic's caches.
         assert self._is_ref
         if "multi_modal_inputs" in self._cache:
-            data.non_tensor_batch['multi_modal_inputs'] = deepcopy(self._cache['multi_modal_inputs'])
+            data.non_tensor_batch["multi_modal_inputs"] = deepcopy(self._cache["multi_modal_inputs"])
         elif "multi_modal_data" in data.non_tensor_batch:
             self.preprocess_multi_modal_data(data)
 
@@ -643,7 +649,7 @@ def compute_values(self, data: DataProto):
         # The `self._cache` is empty here since cached `multi_modal_inputs` is only saved in the actor's _cache,
         # not in the ref_policy's or critic's caches.
         if "multi_modal_inputs" in self._cache:
-            data.non_tensor_batch['multi_modal_inputs'] = deepcopy(self._cache['multi_modal_inputs'])
+            data.non_tensor_batch["multi_modal_inputs"] = deepcopy(self._cache["multi_modal_inputs"])
         elif "multi_modal_data" in data.non_tensor_batch:
             self.preprocess_multi_modal_data(data)
 
@@ -668,7 +674,7 @@ def update_critic(self, data: DataProto):
         # The `self._cache` is empty here since cached `multi_modal_inputs` is only saved in the actor's _cache,
         # not in the ref_policy's or critic's caches.
         if "multi_modal_inputs" in self._cache:
-            data.non_tensor_batch['multi_modal_inputs'] = deepcopy(self._cache['multi_modal_inputs'])
+            data.non_tensor_batch["multi_modal_inputs"] = deepcopy(self._cache["multi_modal_inputs"])
         elif "multi_modal_data" in data.non_tensor_batch:
             self.preprocess_multi_modal_data(data)
 
diff --git a/verl/workers/rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout_spmd.py
@@ -201,5 +201,5 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         return DataProto(
             batch=batch,
             non_tensor_batch=non_tensor_batch,
-            meta_info=prompts.meta_info.copy(),
+            meta_info=prompts.meta_info.copy()
         )

Original file line number	Diff line number	Diff line change
`@@ -201,5 +201,5 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:`
`201`	`201`	`return DataProto(`
`202`	`202`	`batch=batch,`
`203`	`203`	`non_tensor_batch=non_tensor_batch,`
`204`		`- meta_info=prompts.meta_info.copy(),`
	`204`	`+ meta_info=prompts.meta_info.copy()`
`205`	`205`	`)`