-
Notifications
You must be signed in to change notification settings - Fork 31.4k
Open
Description
Hi,
When I run the PaddlePaddle/PaddleOCR-VL model with the code below, the output is inaccurate: digits are dropped from the decoded text, while the surrounding letters and punctuation are preserved.
For example, the model outputs:
“(e.g., Gemini-.-Pro gains only +.%, while DeepSeek-R declines by -.%)”
whereas the original text in the image is:
“(e.g., Gemini-2.5-Pro gains only +1.8%, while DeepSeek-R1 declines by -0.1%)”.
Am I missing some configuration or preprocessing step?
from PIL import Image
import torch
from transformers import PaddleOCRVLProcessor, PaddleOCRVLForConditionalGeneration
from transformers import AutoProcessor, AutoModelForImageTextToText
# Minimal reproduction: run PaddleOCR-VL on a single image with one task
# prompt and print the decoded model output.

# ---- Settings ----
model_path = "PaddlePaddle/PaddleOCR-VL"
image_path = "../test.png"
task = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
# ------------------

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Task name -> text prompt sent to the model together with the image.
PROMPTS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
}

# Load the image once and reuse it in the chat message below. The context
# manager closes the underlying file; `convert` returns an independent copy.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    # trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
).to(dtype=torch.bfloat16, device=DEVICE).eval()

# NOTE(review): the model is loaded without trust_remote_code while the
# processor is loaded with trust_remote_code=True and use_fast=False. A
# processor/tokenizer that does not match the model class is one candidate
# cause for the dropped digits -- confirm by loading both the same way.
processor = AutoProcessor.from_pretrained(model_path, use_fast=False, trust_remote_code=True)

# Single-turn conversation: one image plus the prompt for the selected task.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": PROMPTS[task]},
        ],
    }
]

inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(DEVICE)

with torch.inference_mode():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        # do_sample=False,
        # use_cache=True
    )

# Slice off the first len(in_ids) positions of each generated sequence, i.e.
# the prompt-length prefix, so only the remainder is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(output_text)
Metadata
Assignees
Labels
No labels