Skip to content

Minicpm_o-4-5进行视频推理和多模混合推理时报错 (MiniCPM-o 4.5 raises errors during video inference and mixed multimodal inference) #1081

@KivenJoo

Description

@KivenJoo

File "/workdir/user_repository/llm-eval-public/inference/local_deploy_minicpm_o_4_5.py", line 115, in generate
answer = self.model.chat(msgs=messages, max_new_tokens=32768,max_inp_length=32000,max_slice_nums=1, omni_mode=True, use_tts_template=False,
File "/usr/local/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/home/hadoop-aipnlp/.cache/huggingface/modules/transformers_modules/main/modeling_minicpmo.py", line 1202, in chat
res, outputs = self.generate(
File "/usr/local/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/home/hadoop-aipnlp/.cache/huggingface/modules/transformers_modules/main/modeling_minicpmo.py", line 892, in generate
model_inputs["inputs_embeds"] = self.get_omni_embedding(
File "/home/hadoop-aipnlp/.cache/huggingface/modules/transformers_modules/main/modeling_minicpmo.py", line 771, in get_omni_embedding
input_embeddings[i, bound[0] : bound[1]] = audio_embs[
RuntimeError: The expanded size of the tensor (21) must match the existing size (20) at non-singleton dimension 0. Target sizes: [21, 4096]. Tensor sizes: [20, 4096]

视频推理的报错 (error raised during video inference):
File "/workdir/user_repository/llm-eval-public/inference/local_deploy_minicpm_o_4_5.py", line 115, in generate
answer = self.model.chat(msgs=messages, max_new_tokens=32768,max_inp_length=32000,max_slice_nums=1, omni_mode=True, use_tts_template=False,
File "/usr/local/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/home/hadoop-aipnlp/.cache/huggingface/modules/transformers_modules/main/modeling_minicpmo.py", line 1146, in chat
content = normalize_content(content)
File "/home/hadoop-aipnlp/.cache/huggingface/modules/transformers_modules/main/utils.py", line 2400, in normalize_content
normalized = normalize_content_item(item)
File "/home/hadoop-aipnlp/.cache/huggingface/modules/transformers_modules/main/utils.py", line 2338, in normalize_content_item
video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(
File "/home/hadoop-aipnlp/.local/lib/python3.10/site-packages/minicpmo/utils.py", line 392, in get_video_frame_audio_segments
video_segments, timestamps = _extract_frames_by_timestamps(timestamps, is_long_video)
File "/home/hadoop-aipnlp/.local/lib/python3.10/site-packages/minicpmo/utils.py", line 357, in _extract_frames_by_timestamps
video = _vr.get_batch(frame_idx).asnumpy()
File "/usr/local/conda/lib/python3.10/site-packages/decord/video_reader.py", line 175, in get_batch
arr = _CAPI_VideoReaderGetBatch(self._handle, indices)
File "/usr/local/conda/lib/python3.10/site-packages/decord/_ffi/_ctypes/function.py", line 173, in call
check_call(_LIB.DECORDFuncCall(
File "/usr/local/conda/lib/python3.10/site-packages/decord/_ffi/base.py", line 78, in check_call
raise DECORDError(err_str)
decord.ffi.base.DECORDError: [11:20:58] /github/workspace/src/video/ffmpeg/threaded_decoder.cc:104: Check failed: run.load()
请问下这两个有什么好的修复建议吗 (Could you suggest good fixes for these two errors?)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions