
Commit c305d10

Fix PDFMiner bug (#253)
Issue: In some installations PDFMiner identifies an image document as a full page, while in others it does not, and it is difficult to determine when PDFMiner behaves one way or the other. In both cases tested the version was `pdfminer.six v20221105`. The solution is to ignore any annotation coming from Chipper when the full-page clearing code is activated. It is not clear whether this is relevant to other models.

Co-authored-by: Antonio Jimeno Yepes <[email protected]>
1 parent: 2493089

File tree

4 files changed (+9, -2 lines)

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,7 @@
-## 0.7.4-dev0
+## 0.7.4-dev1
 
+* Fixed bug when PDFMiner predicts that an image text occupies the full page and removes annotations by Chipper.
+* Added random seed to Chipper text generation to avoid differences between calls to Chipper.
 * Allows user to use super-gradients model if they have a callback predict function, a yaml file with names field corresponding to classes and a path to the model weights
 
 ## 0.7.3
unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "0.7.4-dev0" # pragma: no cover
+__version__ = "0.7.4-dev1" # pragma: no cover

unstructured_inference/inference/layoutelement.py

Lines changed: 3 additions & 0 deletions

@@ -125,6 +125,9 @@ def merge_inferred_layout_with_extracted_layout(
             continue
         region_matched = False
         for inferred_region in inferred_layout:
+            if inferred_region.source in (Source.CHIPPER, Source.CHIPPERV1):
+                continue
+
             if inferred_region.bbox.intersects(extracted_region.bbox):
                 same_bbox = region_bounding_boxes_are_almost_the_same(
                     inferred_region.bbox,
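For readers outside the codebase, here is a minimal, self-contained sketch of the skip logic added above. The `Source` enum, `Region` dataclass, and `mergeable_regions` helper below are hypothetical stand-ins, not the library's real classes (in the actual code the check runs inline in `merge_inferred_layout_with_extracted_layout`); only the membership test on `source` mirrors the diff.

```python
from dataclasses import dataclass
from enum import Enum


class Source(Enum):  # hypothetical stand-in for the library's Source enum
    CHIPPER = "chipper"
    CHIPPERV1 = "chipperv1"
    DETECTRON2 = "detectron2"


@dataclass
class Region:  # hypothetical, simplified inferred region
    label: str
    source: Source


def mergeable_regions(inferred_layout):
    """Yield only the inferred regions the extracted-layout merge may touch.

    Chipper annotations are skipped, so a PDFMiner region that covers the
    full page can no longer clear them during the merge.
    """
    for region in inferred_layout:
        if region.source in (Source.CHIPPER, Source.CHIPPERV1):
            continue
        yield region


layout = [Region("Title", Source.CHIPPER), Region("Text", Source.DETECTRON2)]
print([r.label for r in mergeable_regions(layout)])  # ['Text']
```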

unstructured_inference/models/chipper.py

Lines changed: 2 additions & 0 deletions

@@ -5,6 +5,7 @@
 import cv2
 import numpy as np
 import torch
+import transformers
 from huggingface_hub import hf_hub_download
 from PIL.Image import Image
 from transformers import DonutProcessor, VisionEncoderDecoderModel
@@ -134,6 +135,7 @@ def predict_tokens(
         image: Image,
     ) -> Tuple[List[int], Sequence[Sequence[torch.Tensor]]]:
         """Predict tokens from image."""
+        transformers.set_seed(42)
         with torch.no_grad():
             outputs = self.model.generate(
                 self.processor(
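A brief usage note on the seed line: `transformers.set_seed` seeds Python's `random`, NumPy, and PyTorch in one call, which is what makes repeated Chipper generations produce the same tokens. A minimal sketch of that behaviour, independent of Chipper (the value 42 simply mirrors the diff; any fixed seed works):

```python
import torch
import transformers

transformers.set_seed(42)   # seeds random, numpy, and torch in one call
first = torch.rand(3)

transformers.set_seed(42)   # re-seeding reproduces the exact same draw
second = torch.rand(3)

assert torch.equal(first, second)
print(first)
```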
