From 009514b60f301ad1a9de4015be951fb98f0016b0 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 19 Jun 2026 11:07:23 +0200 Subject: [PATCH 1/3] test: add percentage-based tolerance for image size verification Signed-off-by: Cesar Berrospi Ramis --- tests/test_verify_utils.py | 67 +++++++++++++++++++++++++++++++++++--- tests/verify_utils.py | 48 +++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 11 deletions(-) diff --git a/tests/test_verify_utils.py b/tests/test_verify_utils.py index ed837ae969..979b200303 100644 --- a/tests/test_verify_utils.py +++ b/tests/test_verify_utils.py @@ -102,14 +102,73 @@ def test_verify_docitems_rejects_picture_count_mismatch(): ) -def test_verify_docitems_uses_predicted_picture_image(): - doc_true = _make_doc_with_picture(image_size=(2, 2)) - doc_pred = _make_doc_with_picture(image_size=(3, 2)) +def test_verify_docitems_uses_predicted_picture_image() -> None: + """Test that image size mismatches are detected.""" + doc_true = _make_doc_with_picture(image_size=(10, 10)) + doc_pred = _make_doc_with_picture(image_size=(15, 10)) # 5 pixel difference - with pytest.raises(AssertionError): + with pytest.raises(AssertionError, match="Image width mismatch"): verify_docitems( doc_pred=doc_pred, doc_true=doc_true, fuzzy=False, pdf_filename="fixture.json", ) + + +@pytest.mark.parametrize( + "true_size,pred_size,fuzzy,should_pass,expected_error", + [ + # Strict mode (fuzzy=False): tolerance is 1.5% of image dimension + # For 254x267 image: width_tol=3.8px, height_tol=4px + ( + (254, 267), + (251, 267), + False, + True, + None, + ), # 3px width diff: passes (within 1.5%) + ((254, 267), (250, 267), False, False, "Image width mismatch"), # 4px: fails + ( + (254, 267), + (254, 263), + False, + True, + None, + ), # 4px height diff: passes (within 1.5%) + ((254, 267), (254, 262), False, False, "Image height mismatch"), # 5px: fails + # Fuzzy mode (fuzzy=True): tolerance is 5% of image dimension + # For 254x267 image: 
width_tol=12px, height_tol=13px + ((254, 267), (242, 254), True, True, None), # 12-13px diff: passes (within 5%) + ((254, 267), (241, 267), True, False, "Image width mismatch"), # 13px: fails + ((254, 267), (254, 253), True, False, "Image height mismatch"), # 14px: fails + # Small images should have at least 1 pixel tolerance + ((10, 10), (9, 9), False, True, None), # 1px diff on small image: passes + ], +) +def test_verify_docitems_image_size_fuzziness( + true_size: tuple(int, int), + pred_size: tuple(int, int), + fuzzy: bool, + should_pass: bool, + expected_error: str | None, +) -> None: + """Test image size verification with percentage-based tolerance in strict and fuzzy modes.""" + doc_true = _make_doc_with_picture(image_size=true_size) + doc_pred = _make_doc_with_picture(image_size=pred_size) + + if should_pass: + verify_docitems( + doc_pred=doc_pred, + doc_true=doc_true, + fuzzy=fuzzy, + pdf_filename="fixture.json", + ) + else: + with pytest.raises(AssertionError, match=expected_error): + verify_docitems( + doc_pred=doc_pred, + doc_true=doc_true, + fuzzy=fuzzy, + pdf_filename="fixture.json", + ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index ad2e1f6c14..0dfdc492ad 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -26,6 +26,8 @@ FUZZY_BBOX_TOL_RATIO = ( 0.005 # OCR/image output varies more, but gross shifts should fail ) +STRICT_IMAGE_SIZE_TOL_RATIO = 0.015 # allow ~1.5% cross-platform image size variance +FUZZY_IMAGE_SIZE_TOL_RATIO = 0.05 # OCR/image output varies more, allow ~5% class _TestPagesMeta(BaseModel): @@ -171,13 +173,45 @@ def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool): def verify_picture_image_v2( - true_image: PILImage.Image, pred_item: Optional[PILImage.Image] -): + true_image: PILImage.Image, pred_item: Optional[PILImage.Image], fuzzy: bool = False +) -> None: + """Compare image properties with optional fuzziness for cross-platform variance. 
+ + Args: + true_image: Ground truth image + pred_item: Predicted image + fuzzy: If True, allow larger size differences (e.g., OCR/image processing variance) + + Note: + We don't compare image bytes as they can vary significantly across platforms even for visually identical images + """ assert pred_item is not None, "predicted image is None" - assert true_image.size == pred_item.size - assert true_image.mode == pred_item.mode - # assert true_image.tobytes() == pred_item.tobytes() - return True + + # Check image mode (should be exact) + assert true_image.mode == pred_item.mode, ( + f"Image mode mismatch: {true_image.mode} vs {pred_item.mode}" + ) + + # Check image size with percentage-based tolerance + tol_ratio = FUZZY_IMAGE_SIZE_TOL_RATIO if fuzzy else STRICT_IMAGE_SIZE_TOL_RATIO + true_width, true_height = true_image.size + pred_width, pred_height = pred_item.size + + # Calculate tolerance based on the true image dimensions + width_tol = max(1, int(true_width * tol_ratio)) + height_tol = max(1, int(true_height * tol_ratio)) + + width_diff = abs(true_width - pred_width) + height_diff = abs(true_height - pred_height) + + assert width_diff <= width_tol, ( + f"Image width mismatch: {true_width} vs {pred_width} " + f"(diff: {width_diff}, tol: {width_tol} [{tol_ratio:.1%}])" + ) + assert height_diff <= height_tol, ( + f"Image height mismatch: {true_height} vs {pred_height} " + f"(diff: {height_diff}, tol: {height_tol} [{tol_ratio:.1%}])" + ) def verify_docitems( @@ -285,7 +319,7 @@ def verify_docitems( true_image = true_item.get_image(doc=doc_true) pred_image = pred_item.get_image(doc=doc_pred) if true_image is not None: - assert verify_picture_image_v2(true_image, pred_image), ( + assert verify_picture_image_v2(true_image, pred_image, fuzzy=fuzzy), ( f"[{pdf_filename}] Picture image mismatch" ) # TODO: check picture annotations From fa51337b44bd53e241d1f928e2ac9052c9d8a4f8 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 19 Jun 2026 11:09:02 +0200 
Subject: [PATCH 2/3] test(docx): set image verification fuzziness to False The images in docx should pass the verification with the small (strict) tolerance, which already allows for cross-platform variance. Signed-off-by: Cesar Berrospi Ramis --- tests/test_backend_msword.py | 2 +- tests/verify_utils.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 9f57455877..b1cba97132 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -107,7 +107,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument ), f"export to indented-text failed on {docx_path}" assert verify_document( - doc, str(docx_path) + ".json", generate=GENERATE, fuzzy=True + doc, str(docx_path) + ".json", generate=GENERATE, fuzzy=False ), f"DoclingDocument verification failed on {docx_path}" if docx_path.name in {"word_tables.docx", "docx_rich_cells.docx"}: diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 0dfdc492ad..67d0257e00 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -174,7 +174,7 @@ def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool): def verify_picture_image_v2( true_image: PILImage.Image, pred_item: Optional[PILImage.Image], fuzzy: bool = False -) -> None: +) -> bool: """Compare image properties with optional fuzziness for cross-platform variance. 
Args: @@ -213,6 +213,8 @@ def verify_picture_image_v2( f"(diff: {height_diff}, tol: {height_tol} [{tol_ratio:.1%}])" ) + return True + def verify_docitems( *, From 28c9f3908e128cbee804ca730d032ccb5a0410b9 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis Date: Fri, 19 Jun 2026 11:59:58 +0200 Subject: [PATCH 3/3] test: change the precision of the image tolerance Signed-off-by: Cesar Berrospi Ramis --- tests/test_verify_utils.py | 62 ++++++++++++++++++++++++++------------ tests/verify_utils.py | 16 +++++----- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/tests/test_verify_utils.py b/tests/test_verify_utils.py index 979b200303..e2c4a44eb8 100644 --- a/tests/test_verify_utils.py +++ b/tests/test_verify_utils.py @@ -103,9 +103,8 @@ def test_verify_docitems_rejects_picture_count_mismatch(): def test_verify_docitems_uses_predicted_picture_image() -> None: - """Test that image size mismatches are detected.""" - doc_true = _make_doc_with_picture(image_size=(10, 10)) - doc_pred = _make_doc_with_picture(image_size=(15, 10)) # 5 pixel difference + doc_true = _make_doc_with_picture(image_size=(2, 2)) + doc_pred = _make_doc_with_picture(image_size=(3, 2)) with pytest.raises(AssertionError, match="Image width mismatch"): verify_docitems( @@ -120,35 +119,60 @@ def test_verify_docitems_uses_predicted_picture_image() -> None: "true_size,pred_size,fuzzy,should_pass,expected_error", [ # Strict mode (fuzzy=False): tolerance is 1.5% of image dimension - # For 254x267 image: width_tol=3.8px, height_tol=4px + # For 254x267 image: 3px = 1.18% width, 4px = 1.50% height + ((254, 267), (251, 267), False, True, None), # 3px = 1.18% width: passes ( (254, 267), - (251, 267), + (250, 267), False, - True, - None, - ), # 3px width diff: passes (within 1.5%) - ((254, 267), (250, 267), False, False, "Image width mismatch"), # 4px: fails + False, + "Image width mismatch", + ), # 4px = 1.57%: fails ( (254, 267), (254, 263), False, True, None, - ), # 4px height diff: passes (within 
1.5%) - ((254, 267), (254, 262), False, False, "Image height mismatch"), # 5px: fails + ), # 4px = 1.50% height: passes (at boundary) + ( + (254, 267), + (254, 262), + False, + False, + "Image height mismatch", + ), # 5px = 1.87%: fails # Fuzzy mode (fuzzy=True): tolerance is 5% of image dimension - # For 254x267 image: width_tol=12px, height_tol=13px - ((254, 267), (242, 254), True, True, None), # 12-13px diff: passes (within 5%) - ((254, 267), (241, 267), True, False, "Image width mismatch"), # 13px: fails - ((254, 267), (254, 253), True, False, "Image height mismatch"), # 14px: fails - # Small images should have at least 1 pixel tolerance - ((10, 10), (9, 9), False, True, None), # 1px diff on small image: passes + # For 254x267 image: 12px = 4.72% width, 13px = 4.87% height + ((254, 267), (242, 254), True, True, None), # 12-13px = ~4.7-4.9%: passes + ( + (254, 267), + (241, 267), + True, + False, + "Image width mismatch", + ), # 13px = 5.12%: fails + ( + (254, 267), + (254, 253), + True, + False, + "Image height mismatch", + ), # 14px = 5.24%: fails + # Small images: percentage-based tolerance is precise + ( + (10, 10), + (9, 9), + False, + False, + "Image width mismatch", + ), # 1px = 10%: fails (>> 1.5%) + ((100, 100), (99, 99), False, True, None), # 1px = 1%: passes (< 1.5%) ], ) def test_verify_docitems_image_size_fuzziness( - true_size: tuple(int, int), - pred_size: tuple(int, int), + true_size: tuple[int, int], + pred_size: tuple[int, int], fuzzy: bool, should_pass: bool, expected_error: str | None, diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 67d0257e00..8af8ca8170 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -197,20 +197,20 @@ def verify_picture_image_v2( true_width, true_height = true_image.size pred_width, pred_height = pred_item.size - # Calculate tolerance based on the true image dimensions - width_tol = max(1, int(true_width * tol_ratio)) - height_tol = max(1, int(true_height * tol_ratio)) - width_diff = 
abs(true_width - pred_width) height_diff = abs(true_height - pred_height) - assert width_diff <= width_tol, ( + # Calculate actual percentage differences + width_diff_ratio = width_diff / true_width if true_width > 0 else 0 + height_diff_ratio = height_diff / true_height if true_height > 0 else 0 + + assert width_diff_ratio <= tol_ratio, ( f"Image width mismatch: {true_width} vs {pred_width} " - f"(diff: {width_diff}, tol: {width_tol} [{tol_ratio:.1%}])" + f"(diff: {width_diff} pixels, {width_diff_ratio:.1%} vs tolerance {tol_ratio:.1%})" ) - assert height_diff <= height_tol, ( + assert height_diff_ratio <= tol_ratio, ( f"Image height mismatch: {true_height} vs {pred_height} " - f"(diff: {height_diff}, tol: {height_tol} [{tol_ratio:.1%}])" + f"(diff: {height_diff} pixels, {height_diff_ratio:.1%} vs tolerance {tol_ratio:.1%})" ) return True