Too many temporal ROIs in <glue> tags #44

@davidluciolu

Description

Thanks for your work!

When I tried your code on long-video understanding, I found that the model outputs too many (start, end) temporal ROIs; sometimes these ROIs even exceed the actual duration of the video.

This may make the "zoom-in" step in the next iteration's perception meaningless, and it also increases inference time.
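
As a possible workaround I am considering clamping and capping the parsed <glue> intervals between iterations, before they are fed back as key_time in the script below, along these lines (video_duration and max_rois are placeholders I would have to fill in, and I am not sure whether capping the list this way defeats the purpose of the iterative zoom-in):

def sanitize_glue(pred_glue, video_duration, max_rois=5):
    """Clamp predicted (start, end) ROIs to the video length and cap their number."""
    if not pred_glue:
        return None
    cleaned = []
    for start, end in pred_glue:
        # clip both endpoints to [0, video_duration] and drop empty intervals
        start = max(0.0, min(float(start), video_duration))
        end = max(0.0, min(float(end), video_duration))
        if end > start:
            cleaned.append((start, end))
    # keep at most max_rois intervals, preferring the longest ones
    cleaned.sort(key=lambda se: se[1] - se[0], reverse=True)
    return cleaned[:max_rois] or None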

My code, which follows the example in your HF README, is listed below together with the results for Video-MME question 601-2.

Could you please help point out any possible mistakes in my implementation or provide some explanation?

Thanks again!

from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import sys
sys.path.append("Videochat-R1.5")
from src_eval.my_vision_process import process_vision_info
import torch
import re
import ast
import json

model_path = "OpenGVLab/VideoChat-R1_5-7B"
# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="cuda:0",
    attn_implementation="flash_attention_2",
    local_files_only=True
)
# default processor
processor = AutoProcessor.from_pretrained(model_path, local_files_only=True)

def inference(video_path, prompt, model, processor, max_new_tokens=2048, device="cuda:0", client = None, pred_glue=None):
    messages = [
        {"role": "user", "content": [
                {"type": "video", 
                "video": video_path,
                'key_time':pred_glue,
                "total_pixels": 128*12 * 28 * 28, 
                "min_pixels": 128 * 28 * 28,
                },
                {"type": "text", "text": prompt},
            ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True, client = client)
    fps_inputs = video_kwargs['fps']

    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to(device)

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    generated_ids = [output_ids[i][len(inputs.input_ids[i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

num_perceptions = 3

QA_THINK_GLUE = """Answer the question: "{QUESTION}" according to the content of the video. 

Output your think process within the  <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer to this question in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

QA_THINK = """Answer the question: "{QUESTION}" according to the content of the video.

Output your think process within the  <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

video_path = 'video_path.mp4'
question = '''
What is the video mainly about? Options:
A. Planes invented by the Wright Brothers.
B. The structural difference between the planes created by Whitehead and planes created by the Wright Brothers.
C. Who invented the first plane.
D. How Whitehead and the Wright Brothers cooperated to invent the first motorized flight.
'''

answers = []
pred_glue = None
for perception in range(num_perceptions):
    if perception == num_perceptions - 1:
        example_prompt = QA_THINK.format(QUESTION=question)
    else:
        example_prompt = QA_THINK_GLUE.format(QUESTION=question)

    
    ans = inference(video_path, example_prompt, model, processor, pred_glue=pred_glue)

    pattern_glue = r'<glue>(.*?)</glue>'
    match_glue = re.search(pattern_glue, ans, re.DOTALL)
    # print(f'ann:{ans}')
    answers.append(ans)

    try:
        if match_glue:
            glue = match_glue.group(1)
            pred_glue = ast.literal_eval(glue)
            print(pred_glue)
    except Exception as e:
        pred_glue = None
print(answers)

Here are the printed answers:

[screenshot of the printed answers attached]
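
For reference, the duration I compare the predicted intervals against is read roughly like this (decord is just what I used for the check, not part of the pipeline above):

from decord import VideoReader

vr = VideoReader(video_path)
video_duration = len(vr) / vr.get_avg_fps()  # length of the clip in seconds
print(video_duration, pred_glue)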
