Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Features

### Fixes
- **fix a bug where table extraction was skipped when it shouldn't have been**. Pages whose only content is a single table, or that start with a table, were missing table extraction. The routing logic is now fixed.

## 0.16.18-dev1

Expand Down
Binary file added example-docs/pdf/single_table.pdf
Binary file not shown.
28 changes: 27 additions & 1 deletion test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@
LayoutElements,
)

from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image import ocr
from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes
from unstructured.partition.pdf_image.pdf_image_utils import (
convert_pdf_to_images,
pad_element_bboxes,
)
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import (
Source,
Expand Down Expand Up @@ -436,6 +440,28 @@ def mock_ocr_layout():
)


def test_supplement_element_with_table_extraction():
    """Check that a lone table element receives HTML from table extraction.

    The layout holds a single element whose class id is 0, mapped to "Table".
    Because 0 is falsy, a truthiness check on the table class id would treat
    this page as having no table; the assertion below only passes when the
    element's ``text_as_html`` is actually populated.
    """
    from unstructured_inference.models import tables

    # Load the tables agent before handing `tables.tables_agent` to the
    # function under test.
    tables.load_agent()

    # Only the first rendered page of the example PDF is needed.
    pdf_path = example_doc_path("pdf/single_table.pdf")
    page_images = convert_pdf_to_images(pdf_path)
    first_page = next(page_images)

    # One bounding box, passed as a single-row coordinate array.
    table_bbox = [215.00109863, 731.89996338, 1470.07739258, 972.83129883]
    layout = LayoutElements(
        element_coords=np.array([table_bbox]),
        texts=np.array(["foo"]),
        sources=np.array(["yolox_sg"]),
        element_class_ids=np.array([0]),
        element_class_id_map={0: "Table"},
    )

    result = ocr.supplement_element_with_table_extraction(
        elements=layout,
        image=first_page,
        tables_agent=tables.tables_agent,
        ocr_agent=ocr.OCRAgent.get_agent(language="eng"),
    )

    first_html = result.text_as_html[0]
    assert first_html.startswith("<table>")


def test_get_table_tokens(mock_ocr_layout):
with patch.object(OCRAgentTesseract, "get_layout_from_image", return_value=mock_ocr_layout):
ocr_agent = OCRAgent.get_agent(language="eng")
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def supplement_element_with_table_extraction(
from unstructured_inference.models.tables import cells_to_html

table_id = {v: k for k, v in elements.element_class_id_map.items()}.get(ElementType.TABLE)
if not table_id:
if table_id is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤦

# no table found in this page
return elements

Expand Down
Loading