Skip to content

Commit 9d58b34

Browse files
authored
Fix/fix table id checking logic (#3898)
- There is a bug in deciding whether a page has tables before performing table extraction. The logic checks whether the id associated with the Table element type is truthy; however, it should check whether the id is `None`, because the id can be 0 (when Table is the first type of element in the page). - The fix updates the logic accordingly. - Adds a unit test for this specific case.
1 parent a368aac commit 9d58b34

File tree

4 files changed

+29
-2
lines changed

4 files changed

+29
-2
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Features
66

77
### Fixes
8+
- **Fix a bug where table extraction is skipped when it shouldn't be**. Pages whose only content is a single table, or that start with a table, missed table extraction. The routing logic is now fixed.
89

910
## 0.16.18-dev1
1011

Diff for: example-docs/pdf/single_table.pdf

77.3 KB
Binary file not shown.

Diff for: test_unstructured/partition/pdf_image/test_ocr.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@
1616
LayoutElements,
1717
)
1818

19+
from test_unstructured.unit_utils import example_doc_path
1920
from unstructured.documents.elements import ElementType
2021
from unstructured.partition.pdf_image import ocr
21-
from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes
22+
from unstructured.partition.pdf_image.pdf_image_utils import (
23+
convert_pdf_to_images,
24+
pad_element_bboxes,
25+
)
2226
from unstructured.partition.utils.config import env_config
2327
from unstructured.partition.utils.constants import (
2428
Source,
@@ -436,6 +440,28 @@ def mock_ocr_layout():
436440
)
437441

438442

443+
def test_supplement_element_with_table_extraction():
444+
from unstructured_inference.models import tables
445+
446+
tables.load_agent()
447+
448+
image = next(convert_pdf_to_images(example_doc_path("pdf/single_table.pdf")))
449+
elements = LayoutElements(
450+
element_coords=np.array([[215.00109863, 731.89996338, 1470.07739258, 972.83129883]]),
451+
texts=np.array(["foo"]),
452+
sources=np.array(["yolox_sg"]),
453+
element_class_ids=np.array([0]),
454+
element_class_id_map={0: "Table"},
455+
)
456+
supplemented = ocr.supplement_element_with_table_extraction(
457+
elements=elements,
458+
image=image,
459+
tables_agent=tables.tables_agent,
460+
ocr_agent=ocr.OCRAgent.get_agent(language="eng"),
461+
)
462+
assert supplemented.text_as_html[0].startswith("<table>")
463+
464+
439465
def test_get_table_tokens(mock_ocr_layout):
440466
with patch.object(OCRAgentTesseract, "get_layout_from_image", return_value=mock_ocr_layout):
441467
ocr_agent = OCRAgent.get_agent(language="eng")

Diff for: unstructured/partition/pdf_image/ocr.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ def supplement_element_with_table_extraction(
276276
from unstructured_inference.models.tables import cells_to_html
277277

278278
table_id = {v: k for k, v in elements.element_class_id_map.items()}.get(ElementType.TABLE)
279-
if not table_id:
279+
if table_id is None:
280280
# no table found in this page
281281
return elements
282282

0 commit comments

Comments
 (0)