From 009514b60f301ad1a9de4015be951fb98f0016b0 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 19 Jun 2026 11:07:23 +0200
Subject: [PATCH 1/3] test: add percentage-based tolerance for image size
 verification

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 tests/test_verify_utils.py | 67 +++++++++++++++++++++++++++++++++++---
 tests/verify_utils.py      | 48 +++++++++++++++++++++++----
 2 files changed, 104 insertions(+), 11 deletions(-)

diff --git a/tests/test_verify_utils.py b/tests/test_verify_utils.py
index ed837ae969..979b200303 100644
--- a/tests/test_verify_utils.py
+++ b/tests/test_verify_utils.py
@@ -102,14 +102,73 @@ def test_verify_docitems_rejects_picture_count_mismatch():
         )
 
 
-def test_verify_docitems_uses_predicted_picture_image():
-    doc_true = _make_doc_with_picture(image_size=(2, 2))
-    doc_pred = _make_doc_with_picture(image_size=(3, 2))
+def test_verify_docitems_uses_predicted_picture_image() -> None:
+    """Test that image size mismatches are detected."""
+    doc_true = _make_doc_with_picture(image_size=(10, 10))
+    doc_pred = _make_doc_with_picture(image_size=(15, 10))  # 5 pixel difference
 
-    with pytest.raises(AssertionError):
+    with pytest.raises(AssertionError, match="Image width mismatch"):
         verify_docitems(
             doc_pred=doc_pred,
             doc_true=doc_true,
             fuzzy=False,
             pdf_filename="fixture.json",
         )
+
+
+@pytest.mark.parametrize(
+    "true_size,pred_size,fuzzy,should_pass,expected_error",
+    [
+        # Strict mode (fuzzy=False): tolerance is 1.5% of image dimension
+        # For 254x267 image: width_tol=3.8px, height_tol=4px
+        (
+            (254, 267),
+            (251, 267),
+            False,
+            True,
+            None,
+        ),  # 3px width diff: passes (within 1.5%)
+        ((254, 267), (250, 267), False, False, "Image width mismatch"),  # 4px: fails
+        (
+            (254, 267),
+            (254, 263),
+            False,
+            True,
+            None,
+        ),  # 4px height diff: passes (within 1.5%)
+        ((254, 267), (254, 262), False, False, "Image height mismatch"),  # 5px: fails
+        # Fuzzy mode (fuzzy=True): tolerance is 5% of image dimension
+        # For 254x267 image: width_tol=12px, height_tol=13px
+        ((254, 267), (242, 254), True, True, None),  # 12-13px diff: passes (within 5%)
+        ((254, 267), (241, 267), True, False, "Image width mismatch"),  # 13px: fails
+        ((254, 267), (254, 253), True, False, "Image height mismatch"),  # 14px: fails
+        # Small images should have at least 1 pixel tolerance
+        ((10, 10), (9, 9), False, True, None),  # 1px diff on small image: passes
+    ],
+)
+def test_verify_docitems_image_size_fuzziness(
+    true_size: tuple(int, int),
+    pred_size: tuple(int, int),
+    fuzzy: bool,
+    should_pass: bool,
+    expected_error: str | None,
+) -> None:
+    """Test image size verification with percentage-based tolerance in strict and fuzzy modes."""
+    doc_true = _make_doc_with_picture(image_size=true_size)
+    doc_pred = _make_doc_with_picture(image_size=pred_size)
+
+    if should_pass:
+        verify_docitems(
+            doc_pred=doc_pred,
+            doc_true=doc_true,
+            fuzzy=fuzzy,
+            pdf_filename="fixture.json",
+        )
+    else:
+        with pytest.raises(AssertionError, match=expected_error):
+            verify_docitems(
+                doc_pred=doc_pred,
+                doc_true=doc_true,
+                fuzzy=fuzzy,
+                pdf_filename="fixture.json",
+            )
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index ad2e1f6c14..0dfdc492ad 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -26,6 +26,8 @@
 FUZZY_BBOX_TOL_RATIO = (
     0.005  # OCR/image output varies more, but gross shifts should fail
 )
+STRICT_IMAGE_SIZE_TOL_RATIO = 0.015  # allow ~1.5% cross-platform image size variance
+FUZZY_IMAGE_SIZE_TOL_RATIO = 0.05  # OCR/image output varies more, allow ~5%
 
 
 class _TestPagesMeta(BaseModel):
@@ -171,13 +173,45 @@ def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool):
 
 
 def verify_picture_image_v2(
-    true_image: PILImage.Image, pred_item: Optional[PILImage.Image]
-):
+    true_image: PILImage.Image, pred_item: Optional[PILImage.Image], fuzzy: bool = False
+) -> None:
+    """Compare image properties with optional fuzziness for cross-platform variance.
+
+    Args:
+        true_image: Ground truth image
+        pred_item: Predicted image
+        fuzzy: If True, allow larger size differences (e.g., OCR/image processing variance)
+
+    Note:
+        We don't compare image bytes as they can vary significantly across platforms even for visually identical images
+    """
     assert pred_item is not None, "predicted image is None"
-    assert true_image.size == pred_item.size
-    assert true_image.mode == pred_item.mode
-    # assert true_image.tobytes() == pred_item.tobytes()
-    return True
+
+    # Check image mode (should be exact)
+    assert true_image.mode == pred_item.mode, (
+        f"Image mode mismatch: {true_image.mode} vs {pred_item.mode}"
+    )
+
+    # Check image size with percentage-based tolerance
+    tol_ratio = FUZZY_IMAGE_SIZE_TOL_RATIO if fuzzy else STRICT_IMAGE_SIZE_TOL_RATIO
+    true_width, true_height = true_image.size
+    pred_width, pred_height = pred_item.size
+
+    # Calculate tolerance based on the true image dimensions
+    width_tol = max(1, int(true_width * tol_ratio))
+    height_tol = max(1, int(true_height * tol_ratio))
+
+    width_diff = abs(true_width - pred_width)
+    height_diff = abs(true_height - pred_height)
+
+    assert width_diff <= width_tol, (
+        f"Image width mismatch: {true_width} vs {pred_width} "
+        f"(diff: {width_diff}, tol: {width_tol} [{tol_ratio:.1%}])"
+    )
+    assert height_diff <= height_tol, (
+        f"Image height mismatch: {true_height} vs {pred_height} "
+        f"(diff: {height_diff}, tol: {height_tol} [{tol_ratio:.1%}])"
+    )
 
 
 def verify_docitems(
@@ -285,7 +319,7 @@ def verify_docitems(
             true_image = true_item.get_image(doc=doc_true)
             pred_image = pred_item.get_image(doc=doc_pred)
             if true_image is not None:
-                assert verify_picture_image_v2(true_image, pred_image), (
+                assert verify_picture_image_v2(true_image, pred_image, fuzzy=fuzzy), (
                     f"[{pdf_filename}] Picture image mismatch"
                 )
         # TODO: check picture annotations

From fa51337b44bd53e241d1f928e2ac9052c9d8a4f8 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 19 Jun 2026 11:09:02 +0200
Subject: [PATCH 2/3] test(docx): set image verification fuzziness to False

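The images in docx should pass the verification with the small (strict)
tolerance, which only allows for cross-platform image size variance.

Also restore the `True` return value of `verify_picture_image_v2`, because
`verify_docitems` asserts on its result.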

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 tests/test_backend_msword.py | 2 +-
 tests/verify_utils.py        | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
index 9f57455877..b1cba97132 100644
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -107,7 +107,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument
         ), f"export to indented-text failed on {docx_path}"
 
         assert verify_document(
-            doc, str(docx_path) + ".json", generate=GENERATE, fuzzy=True
+            doc, str(docx_path) + ".json", generate=GENERATE, fuzzy=False
         ), f"DoclingDocument verification failed on {docx_path}"
 
         if docx_path.name in {"word_tables.docx", "docx_rich_cells.docx"}:
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 0dfdc492ad..67d0257e00 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -174,7 +174,7 @@ def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool):
 
 def verify_picture_image_v2(
     true_image: PILImage.Image, pred_item: Optional[PILImage.Image], fuzzy: bool = False
-) -> None:
+) -> bool:
     """Compare image properties with optional fuzziness for cross-platform variance.
 
     Args:
@@ -213,6 +213,8 @@ def verify_picture_image_v2(
         f"(diff: {height_diff}, tol: {height_tol} [{tol_ratio:.1%}])"
     )
 
+    return True
+
 
 def verify_docitems(
     *,

From 28c9f3908e128cbee804ca730d032ccb5a0410b9 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 19 Jun 2026 11:59:58 +0200
Subject: [PATCH 3/3] test: change the precision of the image tolerance

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 tests/test_verify_utils.py | 62 ++++++++++++++++++++++++++------------
 tests/verify_utils.py      | 16 +++++-----
 2 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/tests/test_verify_utils.py b/tests/test_verify_utils.py
index 979b200303..e2c4a44eb8 100644
--- a/tests/test_verify_utils.py
+++ b/tests/test_verify_utils.py
@@ -103,9 +103,8 @@ def test_verify_docitems_rejects_picture_count_mismatch():
 
 
 def test_verify_docitems_uses_predicted_picture_image() -> None:
-    """Test that image size mismatches are detected."""
-    doc_true = _make_doc_with_picture(image_size=(10, 10))
-    doc_pred = _make_doc_with_picture(image_size=(15, 10))  # 5 pixel difference
+    doc_true = _make_doc_with_picture(image_size=(2, 2))
+    doc_pred = _make_doc_with_picture(image_size=(3, 2))
 
     with pytest.raises(AssertionError, match="Image width mismatch"):
         verify_docitems(
@@ -120,35 +119,60 @@ def test_verify_docitems_uses_predicted_picture_image() -> None:
     "true_size,pred_size,fuzzy,should_pass,expected_error",
     [
         # Strict mode (fuzzy=False): tolerance is 1.5% of image dimension
-        # For 254x267 image: width_tol=3.8px, height_tol=4px
+        # For 254x267 image: 3px = 1.18% width, 4px = 1.50% height
+        ((254, 267), (251, 267), False, True, None),  # 3px = 1.18% width: passes
         (
             (254, 267),
-            (251, 267),
+            (250, 267),
             False,
-            True,
-            None,
-        ),  # 3px width diff: passes (within 1.5%)
-        ((254, 267), (250, 267), False, False, "Image width mismatch"),  # 4px: fails
+            False,
+            "Image width mismatch",
+        ),  # 4px = 1.57%: fails
         (
             (254, 267),
             (254, 263),
             False,
             True,
             None,
-        ),  # 4px height diff: passes (within 1.5%)
-        ((254, 267), (254, 262), False, False, "Image height mismatch"),  # 5px: fails
+        ),  # 4px = 1.50% height: passes (at boundary)
+        (
+            (254, 267),
+            (254, 262),
+            False,
+            False,
+            "Image height mismatch",
+        ),  # 5px = 1.87%: fails
         # Fuzzy mode (fuzzy=True): tolerance is 5% of image dimension
-        # For 254x267 image: width_tol=12px, height_tol=13px
-        ((254, 267), (242, 254), True, True, None),  # 12-13px diff: passes (within 5%)
-        ((254, 267), (241, 267), True, False, "Image width mismatch"),  # 13px: fails
-        ((254, 267), (254, 253), True, False, "Image height mismatch"),  # 14px: fails
-        # Small images should have at least 1 pixel tolerance
-        ((10, 10), (9, 9), False, True, None),  # 1px diff on small image: passes
+        # For 254x267 image: 12px = 4.72% width, 13px = 4.87% height
+        ((254, 267), (242, 254), True, True, None),  # 12-13px = ~4.7-4.9%: passes
+        (
+            (254, 267),
+            (241, 267),
+            True,
+            False,
+            "Image width mismatch",
+        ),  # 13px = 5.12%: fails
+        (
+            (254, 267),
+            (254, 253),
+            True,
+            False,
+            "Image height mismatch",
+        ),  # 14px = 5.24%: fails
+        # Small images: percentage-based tolerance is precise
+        (
+            (10, 10),
+            (9, 9),
+            False,
+            False,
+            "Image width mismatch",
+        ),  # 1px = 10%: fails (>> 1.5%)
+        ((100, 100), (99, 99), False, True, None),  # 1px = 1%: passes (< 1.5%)
     ],
 )
 def test_verify_docitems_image_size_fuzziness(
-    true_size: tuple(int, int),
-    pred_size: tuple(int, int),
+    true_size: tuple[int, int],
+    pred_size: tuple[int, int],
     fuzzy: bool,
     should_pass: bool,
     expected_error: str | None,
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 67d0257e00..8af8ca8170 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -197,20 +197,20 @@ def verify_picture_image_v2(
     true_width, true_height = true_image.size
     pred_width, pred_height = pred_item.size
 
-    # Calculate tolerance based on the true image dimensions
-    width_tol = max(1, int(true_width * tol_ratio))
-    height_tol = max(1, int(true_height * tol_ratio))
-
     width_diff = abs(true_width - pred_width)
     height_diff = abs(true_height - pred_height)
 
-    assert width_diff <= width_tol, (
+    # Calculate actual percentage differences
+    width_diff_ratio = width_diff / true_width if true_width > 0 else 0
+    height_diff_ratio = height_diff / true_height if true_height > 0 else 0
+
+    assert width_diff_ratio <= tol_ratio, (
         f"Image width mismatch: {true_width} vs {pred_width} "
-        f"(diff: {width_diff}, tol: {width_tol} [{tol_ratio:.1%}])"
+        f"(diff: {width_diff} pixels, {width_diff_ratio:.1%} vs tolerance {tol_ratio:.1%})"
     )
-    assert height_diff <= height_tol, (
+    assert height_diff_ratio <= tol_ratio, (
         f"Image height mismatch: {true_height} vs {pred_height} "
-        f"(diff: {height_diff}, tol: {height_tol} [{tol_ratio:.1%}])"
+        f"(diff: {height_diff} pixels, {height_diff_ratio:.1%} vs tolerance {tol_ratio:.1%})"
     )
 
     return True