Description
I used VideoRefer-7B for inference, but it doesn't output natural language properly; instead, it produces garbled text like:
! I!!!!!!!!!!!!!!!!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...
I followed the benchmark files when writing this test; the platform is a V100 GPU.
import sys
sys.path.append('./')
from videorefer import model_init, mm_infer
from videorefer.utils import disable_torch_init
import os
def inference():
    """Reproduce the garbled-output issue: run VideoRefer-7B over an
    extracted frame sequence plus one reference image, then print the
    model's answer to a multiple-choice question.

    Side effects: loads the model from disk and prints the generated text.
    """
    disable_torch_init()

    # Video inference. Frames live in a directory and are ordered
    # numerically by the index embedded in the filename (e.g. "frame_12.jpg"
    # -> 12); a single reference image is appended at the end.
    modal = 'video'
    # modal = 'text'
    # modal_path = '/cache/test_video/1.mp4'
    frame_dir = "/cache/test_video/frames"
    modal_path = sorted(
        [os.path.join(frame_dir, item) for item in os.listdir(frame_dir)],
        key=lambda x: int(x.split('_')[-1].split('.')[0]),
    ) + ["/cache/test_video/vp2.jpg"]

    instruct = 'How many English letters did the cloud with red frame columns eventually form? \nA. 4\nB. 100\nC. 3\nD. 8\n'

    # NOTE(review): every other path here is absolute ('/cache/...') but this
    # one is relative — confirm the missing leading slash is intentional.
    model_path = 'cache/model/VideoRefer-7B'
    model, processor, tokenizer = model_init(model_path)

    # Attach the tokenizer to every submodule; some VideoRefer code paths
    # appear to look up `module.tokenizer` — presumably required here,
    # verify against the library.
    for m in model.modules():
        m.tokenizer = tokenizer

    output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)
    # output = mm_infer(None, instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)
    print(output)
# BUG: the original read `if name == "main":` — markdown stripped the
# double underscores, and as written it raises NameError at import time.
if __name__ == "__main__":
    inference()
Activity