Skip to content

Commit 87a88a3

Browse files
feat: improve pdfminer element processing (#3618)
This PR implements splitting of `pdfminer` elements (`groups of text chunks`) into smaller bounding boxes (`text lines`). This implementation prevents loss of information from the object detection model and facilitates more effective removal of duplicated `pdfminer` text. This PR also addresses #3430. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: christinestraub <[email protected]>
1 parent 639ca59 commit 87a88a3

19 files changed

+4688
-689
lines changed

Diff for: CHANGELOG.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
## 0.15.12-dev1
1+
## 0.15.12-dev2
22

33
### Enhancements
44

5+
* **Improve `pdfminer` element processing** Implemented splitting of `pdfminer` elements (groups of text chunks) into smaller bounding boxes (text lines). This prevents loss of information from the object detection model and facilitates more effective removal of duplicated `pdfminer` text.
6+
57
### Features
68

79
### Fixes
@@ -22,7 +24,6 @@
2224
* **Enhance `pdfminer` element cleanup** Expand removal of `pdfminer` elements to include those inside all `non-pdfminer` elements, not just `tables`.
2325
* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as JSON files. The drawers now accept dict (dump) objects instead of internal class instances.
2426
* **Vectorize pdfminer elements deduplication computation**. Use `numpy` operations to compute IOU and sub-region membership instead of using a simple loop. This improves the speed of deduplicating elements for pages with a lot of elements.
25-
* **Add deprecation warning to embed code**
2627

2728
### Features
2829

Diff for: test_unstructured/partition/pdf_image/test_pdf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def test_partition_pdf_local_raises_with_no_filename():
159159
[
160160
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
161161
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
162-
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
162+
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer", "ocr_tesseract"}),
163163
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
164164
],
165165
)
@@ -552,7 +552,7 @@ def test_partition_pdf_with_copy_protection():
552552
filename = example_doc_path("pdf/copy-protected.pdf")
553553
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
554554
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
555-
idx = 2
555+
idx = 22
556556
assert elements[idx].text == title
557557
assert {element.metadata.page_number for element in elements} == {1, 2}
558558
assert elements[idx].metadata.detection_class_prob is not None

Diff for: test_unstructured/partition/pdf_image/test_pdfminer_processing.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pytest
33
from PIL import Image
44
from unstructured_inference.constants import Source as InferenceSource
5-
from unstructured_inference.inference.elements import Rectangle, TextRegion
5+
from unstructured_inference.inference.elements import EmbeddedTextRegion, Rectangle, TextRegion
66
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
77

88
from unstructured.partition.pdf_image.pdfminer_processing import (
@@ -11,6 +11,7 @@
1111
boxes_self_iou,
1212
clean_pdfminer_duplicate_image_elements,
1313
clean_pdfminer_inner_elements,
14+
remove_duplicate_embedded_text,
1415
)
1516
from unstructured.partition.utils.constants import Source
1617

@@ -209,3 +210,22 @@ def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
209210
def test_boxes_self_iou(coords, threshold, expected):
210211
bboxes = [Rectangle(*row) for row in coords]
211212
np.testing.assert_array_equal(boxes_self_iou(bboxes, threshold), expected)
213+
214+
215+
def test_remove_duplicate_embedded_text():
216+
sample_elements = [
217+
EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
218+
EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
219+
EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
220+
]
221+
222+
result = remove_duplicate_embedded_text(sample_elements)
223+
224+
# Check that duplicates were removed and only 2 unique elements remain
225+
assert len(result) == 2
226+
assert result[0].text == "Text 2"
227+
assert result[1].text == "Text 3"
228+
229+
# Ensure the duplicate was removed by checking that result contains no redundant bboxes
230+
assert result[0].bbox == Rectangle(0, 0, 10, 10)
231+
assert result[1].bbox == Rectangle(20, 20, 30, 30)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from unittest.mock import MagicMock
2+
3+
from pdfminer.layout import LTContainer, LTTextLine
4+
5+
from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
6+
7+
8+
def test_extract_text_objects_nested_containers():
9+
"""Test extract_text_objects with nested LTContainers."""
10+
# Mock LTTextLine objects
11+
mock_text_line1 = MagicMock(spec=LTTextLine)
12+
mock_text_line2 = MagicMock(spec=LTTextLine)
13+
14+
# Mock inner container containing one LTTextLine
15+
mock_inner_container = MagicMock(spec=LTContainer)
16+
mock_inner_container.__iter__.return_value = [mock_text_line2]
17+
18+
# Mock outer container containing another LTTextLine and the inner container
19+
mock_outer_container = MagicMock(spec=LTContainer)
20+
mock_outer_container.__iter__.return_value = [mock_text_line1, mock_inner_container]
21+
22+
# Call the function with the outer container
23+
result = extract_text_objects(mock_outer_container)
24+
25+
# Assert both text line objects are extracted, even from nested containers
26+
assert len(result) == 2
27+
assert mock_text_line1 in result
28+
assert mock_text_line2 in result

Diff for: test_unstructured/partition/test_auto.py

+10-19
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import json
77
import os
88
import pathlib
9-
import sys
109
import tempfile
1110
import warnings
1211
from importlib import import_module
@@ -505,7 +504,7 @@ def test_auto_partition_org_from_file():
505504
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
506505
)
507506
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
508-
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
507+
file_path = example_doc_path("pdf/chevron-page.pdf")
509508
metadata_filename = file_path if pass_metadata_filename else None
510509

511510
elements = partition(
@@ -515,28 +514,23 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_
515514
strategy=PartitionStrategy.HI_RES,
516515
)
517516

518-
# NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
519-
# (on Mac) than it does in CI. Basically the first element when partitioning locally is split
520-
# in two when partitioning on CI. Other than that split the text is exactly the same.
521-
idx = 2 if sys.platform == "darwin" else 3
522-
523-
e = elements[idx]
517+
e = elements[0]
524518
assert isinstance(e, Title)
525-
assert e.text.startswith("LayoutParser")
519+
assert e.text.startswith("eastern mediterranean")
526520
assert e.metadata.filename == os.path.basename(file_path)
527521
assert e.metadata.file_directory == os.path.split(file_path)[0]
528522

529-
e = elements[idx + 1]
523+
e = elements[1]
530524
assert isinstance(e, NarrativeText)
531-
assert e.text.startswith("Zejiang Shen")
525+
assert e.text.startswith("We’re investing")
532526

533527

534528
@pytest.mark.parametrize(
535529
("pass_metadata_filename", "content_type"),
536530
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
537531
)
538532
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
539-
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
533+
file_path = example_doc_path("pdf/chevron-page.pdf")
540534
metadata_filename = file_path if pass_metadata_filename else None
541535

542536
with open(file_path, "rb") as f:
@@ -547,16 +541,13 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
547541
strategy=PartitionStrategy.HI_RES,
548542
)
549543

550-
# NOTE(scanny): see "from_filename" version of this test above for more on this oddness
551-
idx = 2 if sys.platform == "darwin" else 3
552-
553-
e = elements[idx]
544+
e = elements[0]
554545
assert isinstance(e, Title)
555-
assert e.text.startswith("LayoutParser")
546+
assert e.text.startswith("eastern mediterranean")
556547

557-
e = elements[idx + 1]
548+
e = elements[1]
558549
assert isinstance(e, NarrativeText)
559-
assert e.text.startswith("Zejiang Shen")
550+
assert e.text.startswith("We’re investing")
560551

561552

562553
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):

Diff for: test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json

+56-12
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,32 @@
307307
}
308308
}
309309
},
310+
{
311+
"type": "Header",
312+
"element_id": "9aa82368657b60536f152fd413aec316",
313+
"text": "Core Skills for Biomedical Data Scientists",
314+
"metadata": {
315+
"filetype": "application/pdf",
316+
"languages": [
317+
"eng"
318+
],
319+
"page_number": 2,
320+
"data_source": {
321+
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
322+
"version": "237960874052008560436652606947751982249",
323+
"record_locator": {
324+
"protocol": "abfs",
325+
"remote_file_path": "abfs://container1/"
326+
},
327+
"date_created": "1678440764.0",
328+
"date_modified": "1678440764.0"
329+
}
330+
}
331+
},
310332
{
311333
"type": "UncategorizedText",
312-
"element_id": "b810a8721369c3551c942aab9011b7d1",
313-
"text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________",
334+
"element_id": "4f2dbe3656a9ebc60c7e3426ad3cb3e3",
335+
"text": "_____________________________________________________________________________________________",
314336
"metadata": {
315337
"filetype": "application/pdf",
316338
"languages": [
@@ -331,7 +353,7 @@
331353
},
332354
{
333355
"type": "NarrativeText",
334-
"element_id": "c8fdefac1ae82fa42caeceff04853415",
356+
"element_id": "cd359ae8c49885ead47318021438eead",
335357
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
336358
"metadata": {
337359
"filetype": "application/pdf",
@@ -353,7 +375,7 @@
353375
},
354376
{
355377
"type": "Title",
356-
"element_id": "b5b7392d0a946f5016bfa8ad0c248a9b",
378+
"element_id": "bf8321a34edb7103ec4209f3e4a8a8da",
357379
"text": "Methodology",
358380
"metadata": {
359381
"filetype": "application/pdf",
@@ -375,7 +397,7 @@
375397
},
376398
{
377399
"type": "NarrativeText",
378-
"element_id": "d9d8e38d221ae621c0ddbcabaa4a28b4",
400+
"element_id": "1e1d3d1a5c1397fc588393568d829bc8",
379401
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:",
380402
"metadata": {
381403
"filetype": "application/pdf",
@@ -397,7 +419,7 @@
397419
},
398420
{
399421
"type": "ListItem",
400-
"element_id": "ba70aa3bc3ad0dec6a62939c94c5a20c",
422+
"element_id": "45d7ff56632d66a2ab2d4dd2716d4d2e",
401423
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
402424
"metadata": {
403425
"filetype": "application/pdf",
@@ -419,7 +441,7 @@
419441
},
420442
{
421443
"type": "ListItem",
422-
"element_id": "24724b1f0d20a6575f2782fd525c562f",
444+
"element_id": "bf452aac5123fcedda30dd6ed179f41c",
423445
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
424446
"metadata": {
425447
"filetype": "application/pdf",
@@ -441,7 +463,7 @@
441463
},
442464
{
443465
"type": "ListItem",
444-
"element_id": "5e6c73154a1e5f74780c69afbc9bc084",
466+
"element_id": "ca176cbef532792b1f11830ff7520587",
445467
"text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
446468
"metadata": {
447469
"filetype": "application/pdf",
@@ -463,7 +485,7 @@
463485
},
464486
{
465487
"type": "NarrativeText",
466-
"element_id": "249f6c76b2c99dadbefb8b8811b0d4cd",
488+
"element_id": "11b170fedd889c3b895bbd28acd811ca",
467489
"text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.",
468490
"metadata": {
469491
"filetype": "application/pdf",
@@ -485,8 +507,30 @@
485507
},
486508
{
487509
"type": "NarrativeText",
488-
"element_id": "6543ce4e447de8fb3db98ceb06a50c28",
489-
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
510+
"element_id": "2665aadf75bca259f1f5b4c91a53a301",
511+
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com",
512+
"metadata": {
513+
"filetype": "application/pdf",
514+
"languages": [
515+
"eng"
516+
],
517+
"page_number": 2,
518+
"data_source": {
519+
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
520+
"version": "237960874052008560436652606947751982249",
521+
"record_locator": {
522+
"protocol": "abfs",
523+
"remote_file_path": "abfs://container1/"
524+
},
525+
"date_created": "1678440764.0",
526+
"date_modified": "1678440764.0"
527+
}
528+
}
529+
},
530+
{
531+
"type": "NarrativeText",
532+
"element_id": "8bbfe1c3e6bca9a33226d20d69b2297a",
533+
"text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
490534
"metadata": {
491535
"filetype": "application/pdf",
492536
"languages": [
@@ -507,7 +551,7 @@
507551
},
508552
{
509553
"type": "UncategorizedText",
510-
"element_id": "1a6ff96d028f18331a9d9c9748b49321",
554+
"element_id": "dd4a661e1a3c898a5cf6328ba56b924d",
511555
"text": "2",
512556
"metadata": {
513557
"filetype": "application/pdf",

0 commit comments

Comments
 (0)