# fix: add support for video data in Agent Loop and Qwen3 VL (#4727)
```diff
@@ -459,6 +459,7 @@ async def generate(
         sampling_params: dict[str, Any],
         request_id: str,
         image_data: Optional[list[Any]] = None,
+        video_data: Optional[list[Any]] = None,
     ) -> TokenOutput:
         """Generate sequence with token-in-token-out."""
         # TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready.
@@ -476,10 +477,16 @@ async def generate(
         sampling_params["logprobs"] = 0 if sampling_params.pop("logprobs", False) else None
         sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0))
         sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params)
-        prompt_ids = _qwen2_5_vl_dedup_image_tokens(prompt_ids, self.model_config.processor)
-        prompt = TokensPrompt(
-            prompt_token_ids=prompt_ids, multi_modal_data={"image": image_data} if image_data else None
-        )
+        if "Qwen3VLProcessor" in self.model_config.processor.__class__.__name__:
+            prompt_ids = _qwen3_vl_dedup_vision_tokens(prompt_ids, self.model_config.processor, video_data)
+        else:
+            prompt_ids = _qwen2_5_vl_dedup_vision_tokens(prompt_ids, self.model_config.processor)
+        multi_modal_data = {}
+        if image_data is not None:
+            multi_modal_data["image"] = image_data
+        if video_data is not None:
+            multi_modal_data["video"] = video_data
+        prompt = TokensPrompt(prompt_token_ids=prompt_ids, multi_modal_data=multi_modal_data)

         # Add lora request
         lora_request = None
```
**Contributor:** The previous implementation passed `multi_modal_data={"image": image_data} if image_data else None`, which leaves no way to forward video inputs; building `multi_modal_data` one modality at a time lets image and video data be passed together.
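For illustration, a call into the extended `generate` might look like the sketch below. `server`, `prompt_ids`, `images`, and `videos` are placeholders; only the keyword names visible in the diff come from the PR.

```python
# Hypothetical call site (illustrative only, not part of the PR).
output = await server.generate(
    prompt_ids=prompt_ids,                  # token ids from the chat template
    sampling_params={"temperature": 0.8, "top_p": 0.95},
    request_id="req-42",
    image_data=images if images else None,  # e.g. list of PIL images
    video_data=videos if videos else None,  # e.g. list of per-video frame tensors
)
```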
````diff
@@ -792,15 +799,19 @@ async def abort_request(self, request_id: str) -> dict[str, Any]:
         return {"aborted": False, "request_id": request_id, "error": "Request not found on any server"}


-def _qwen2_5_vl_dedup_image_tokens(prompt_ids: list[int], processor):
-    """Deduplicate consecutive image tokens in prompt_ids for Qwen2.5-VL, since vLLM will replicate the
-    <|image_pad|> token by image_data.
+def _qwen2_5_vl_dedup_vision_tokens(prompt_ids: list[int], processor):
+    """Deduplicate consecutive vision tokens (image or video) in prompt_ids for Qwen2.5-VL,
+    since vLLM will replicate the padding tokens by vision data.

     For example,
     ```
     <|vision_start|><|image_pad|><|image_pad|>...<|image_pad|><|vision_end|>
     =>
     <|vision_start|><|image_pad|><|vision_end|>
+
+    <|vision_start|><|video_pad|>...<|vision_end|>
+    =>
+    <|vision_start|><|video_pad|><|vision_end|>
     ```
     """
     if processor is not None and "Qwen2VLImageProcessor" in processor.image_processor.__class__.__name__:
@@ -810,11 +821,55 @@ def _qwen2_5_vl_dedup_image_tokens(prompt_ids: list[int], processor):
         mask = np.ones(len(prompt_ids), dtype=bool)

         # Find where the array equals the value
-        is_value = prompt_ids == processor.image_token_id
+        is_value = (prompt_ids == processor.image_token_id) | (prompt_ids == processor.video_token_id)
````
**Contributor:** Accessing `processor.video_token_id` unconditionally will raise an `AttributeError` on processors that do not define that attribute. A safer implementation would be:

```python
is_value = prompt_ids == processor.image_token_id
if hasattr(processor, "video_token_id"):
    is_value |= (prompt_ids == processor.video_token_id)
```
```diff
         # Find consecutive duplicates by checking if previous element is also the value
         mask[1:] &= ~(is_value[1:] & is_value[:-1])

         return prompt_ids[mask].tolist()
     else:
         return prompt_ids
```
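As an aside, the consecutive-duplicate mask can be exercised in isolation. In the toy run below the token ids are made up (151655/151656 merely stand in for `processor.image_token_id`/`processor.video_token_id`):

```python
import numpy as np

# Illustrative ids standing in for <|image_pad|> / <|video_pad|>.
IMAGE_PAD, VIDEO_PAD = 151655, 151656
prompt_ids = np.array([9906, IMAGE_PAD, IMAGE_PAD, IMAGE_PAD, 11, VIDEO_PAD, VIDEO_PAD, 0])

mask = np.ones(len(prompt_ids), dtype=bool)
is_value = (prompt_ids == IMAGE_PAD) | (prompt_ids == VIDEO_PAD)
# Drop any pad token whose immediate predecessor is also a pad token.
mask[1:] &= ~(is_value[1:] & is_value[:-1])

print(prompt_ids[mask].tolist())  # [9906, 151655, 11, 151656, 0]
```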
````diff
+
+
+def _qwen3_vl_dedup_vision_tokens(prompt_ids: list[int], processor, video_data: Optional[list[Any]] = None):
+    """Deduplicate consecutive vision tokens (image or video) in prompt_ids for Qwen3-VL,
+    since vLLM will replicate the padding tokens by vision data.
+
+    For example,
+    ```
+    <|vision_start|><|image_pad|><|image_pad|>...<|image_pad|><|vision_end|>
+    =>
+    <|vision_start|><|image_pad|><|vision_end|>
+
+    <0.1 seconds><|vision_start|><|video_pad|>...<|vision_end|>
+    ...<11.3 seconds><|vision_start|><|video_pad|>...<|vision_end|>
+    =>
+    <|vision_start|><|video_pad|><|vision_end|>
+    ```
+    """
+    # dedup video placeholder
+    video_frames = []
+    if video_data is not None:
+        for video in video_data:
+            frame = video[0].shape[0] // 2
+            video_frames.append(frame)
+
+    import re
+
+    single_frame_pattern = r"<[\d.]+ seconds><\|vision_start\|>(?:<\|video_pad\|>)+<\|vision_end\|>"
+    prompt = processor.tokenizer.decode(prompt_ids)
+    current_prompt = prompt
+    for num_frames in video_frames:
+        # Match exactly num_frames repetitions of the single frame pattern
+        video_sequence_pattern = f"(?:{single_frame_pattern}){{{num_frames}}}"
+
+        current_prompt, count = re.subn(
+            video_sequence_pattern, "<|vision_start|><|video_pad|><|vision_end|>", current_prompt, count=1
+        )
+        if count != 1:
+            logger.warning(f"Expected to deduplicate {num_frames} frames, but found {count} matches.")
+
+    prompt_ids = processor.tokenizer.encode(current_prompt)
+
+    return _qwen2_5_vl_dedup_vision_tokens(prompt_ids, processor)
````
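To make the regex behavior concrete, here is a self-contained toy run. The prompt string, frame count, and pad counts are fabricated; only `single_frame_pattern` and the replacement string are taken from the PR.

```python
import re

# Pattern copied from the PR; everything else here is fabricated test data.
single_frame_pattern = r"<[\d.]+ seconds><\|vision_start\|>(?:<\|video_pad\|>)+<\|vision_end\|>"

prompt = (
    "Describe the clip."
    "<0.1 seconds><|vision_start|><|video_pad|><|video_pad|><|video_pad|><|vision_end|>"
    "<0.6 seconds><|vision_start|><|video_pad|><|video_pad|><|video_pad|><|vision_end|>"
)

num_frames = 2  # in the PR this comes from video[0].shape[0] // 2
video_sequence_pattern = f"(?:{single_frame_pattern}){{{num_frames}}}"

deduped, count = re.subn(
    video_sequence_pattern, "<|vision_start|><|video_pad|><|vision_end|>", prompt, count=1
)
print(count)    # 1
print(deduped)  # Describe the clip.<|vision_start|><|video_pad|><|vision_end|>
```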
**Reviewer:** This block of code for processing video data (deriving `video_frames` from `video_data`) is nearly identical to the one in `verl/experimental/agent_loop/tool_agent_loop.py` at lines 223-228. Duplicating this logic increases maintenance overhead and the risk of introducing inconsistencies if one is updated and the other is not. Consider refactoring it into a shared helper function to promote code reuse and simplify future modifications. For example, a function like `_prepare_video_kwargs(videos)` could encapsulate this logic.
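A minimal sketch of that helper, assuming (as the PR's inline loop does) that `video[0]` is a frame-major tensor and that frames merge in pairs; the name `_prepare_video_kwargs` comes from the review comment, not the codebase:

```python
from typing import Any, Optional


def _prepare_video_kwargs(videos: Optional[list[Any]]) -> list[int]:
    """Return the post-merge frame count for each video.

    Assumes video[0] is a frame-major tensor and that frames merge in
    pairs (temporal factor 2), mirroring the PR's inline loop.
    """
    if videos is None:
        return []
    return [video[0].shape[0] // 2 for video in videos]
```

Both `_qwen3_vl_dedup_vision_tokens` and the call site in `tool_agent_loop.py` could then share this single helper.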