Skip to content

Commit 344202f

Browse files
feat: detect language for PDFs (#4051)
The `@apply_metadata` decorator already contains logic to detect the language of the element text (at either the document or the element level). Update PDFs, and later images, to use this decorator so that accurate element language results are output.

Test

```
from unstructured.partition.auto import partition

def test_partition_pdf():
    pdf_path = "example-docs/language-docs/fr_olap.pdf"
    elements = partition(pdf_path)  # optionally set `detect_language_per_element=True`
    print(f"Number of elements partitioned: {len(elements)}")

    # Check if elements are returned
    assert len(elements) > 0, "No elements were partitioned from the PDF."

    # check language outputted for each element
    for element in elements:
        print(element)
        print(element.metadata.languages)
        print("-------------------------------")

test_partition_pdf()
```

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: shreyanid <[email protected]>
1 parent 2ffaf6f commit 344202f

File tree

13 files changed

+129
-63
lines changed

13 files changed

+129
-63
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.7
2+
3+
### Enhancements
4+
5+
### Features
6+
- **Add language detection for PDFs** Add document-level and element-level language detection to PDFs.
7+
8+
### Fixes
9+
110
## 0.18.6
211

312
### Enhancements
424 KB
Binary file not shown.

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,49 @@ def test_partition_pdf_strategies_keep_languages_metadata(strategy):
569569
assert elements[0].metadata.languages == ["kor"]
570570

571571

572+
@pytest.mark.parametrize(
573+
"strategy",
574+
[
575+
PartitionStrategy.FAST,
576+
PartitionStrategy.HI_RES,
577+
PartitionStrategy.OCR_ONLY,
578+
],
579+
)
580+
def test_partition_pdf_detects_document_language(strategy):
581+
filename = example_doc_path("language-docs/fr_olap.pdf")
582+
elements = pdf.partition_pdf(
583+
filename=filename,
584+
url=None,
585+
strategy=strategy,
586+
)
587+
588+
assert len(elements) > 0
589+
assert elements[0].metadata.languages == ["fra"]
590+
assert elements[-1].metadata.languages == ["fra"]
591+
592+
593+
@pytest.mark.parametrize(
594+
"strategy",
595+
[
596+
PartitionStrategy.FAST,
597+
PartitionStrategy.HI_RES,
598+
PartitionStrategy.OCR_ONLY,
599+
],
600+
)
601+
def test_partition_pdf_detects_language_per_element(strategy):
602+
filename = example_doc_path("language-docs/fr_olap.pdf")
603+
elements = pdf.partition_pdf(
604+
filename=filename,
605+
url=None,
606+
strategy=strategy,
607+
detect_language_per_element=True,
608+
)
609+
610+
assert len(elements) > 0
611+
assert elements[0].metadata.languages == ["fra"]
612+
assert elements[-1].metadata.languages == ["eng"]
613+
614+
572615
@pytest.mark.parametrize(
573616
"ocr_mode",
574617
[

test_unstructured/partition/test_auto.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,7 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
561561
strategy=PartitionStrategy.FAST,
562562
languages=None,
563563
metadata_filename=None,
564+
detect_language_per_element=False,
564565
infer_table_structure=False,
565566
extract_images_in_pdf=False,
566567
extract_image_block_types=None,
@@ -1301,6 +1302,27 @@ def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
13011302
assert all(e.metadata.languages == ["eng"] for e in elements)
13021303

13031304

1305+
@pytest.mark.parametrize(
1306+
"strategy",
1307+
[
1308+
PartitionStrategy.FAST,
1309+
PartitionStrategy.HI_RES,
1310+
PartitionStrategy.OCR_ONLY,
1311+
],
1312+
)
1313+
def test_auto_partition_detects_pdf_language_per_element(strategy):
1314+
filename = example_doc_path("language-docs/fr_olap.pdf")
1315+
elements = partition(
1316+
filename=filename,
1317+
strategy=strategy,
1318+
detect_language_per_element=True,
1319+
)
1320+
1321+
assert len(elements) > 0
1322+
assert elements[0].metadata.languages == ["fra"]
1323+
assert elements[-1].metadata.languages == ["eng"]
1324+
1325+
13041326
def test_auto_partition_languages_argument_default_to_None_when_omitted():
13051327
elements = partition(example_doc_path("handbook-1p.docx"), detect_language_per_element=True)
13061328
# -- PageBreak and any other element with no text is assigned `None` --
@@ -1309,17 +1331,17 @@ def test_auto_partition_languages_argument_default_to_None_when_omitted():
13091331

13101332
def test_auto_partition_default_does_not_overwrite_other_defaults():
13111333
"""`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
1312-
# the default for `languages` is ["auto"] in partiton_text
1334+
# the default for `languages` is ["auto"] in partition_text
13131335
from unstructured.partition.text import partition_text
13141336

13151337
# Use a document that is primarily in a language other than English
13161338
file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")
13171339
text_elements = partition_text(file_path)
1318-
assert text_elements[0].metadata.languages != ["eng"]
1340+
assert text_elements[13].metadata.languages != ["eng"]
13191341

13201342
auto_elements = partition(file_path)
1321-
assert auto_elements[0].metadata.languages != ["eng"]
1322-
assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
1343+
assert auto_elements[13].metadata.languages != ["eng"]
1344+
assert auto_elements[13].metadata.languages == text_elements[13].metadata.languages
13231345

13241346

13251347
# ================================================================================================

test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,7 @@
55
"text": "My First Heading",
66
"metadata": {
77
"languages": [
8-
"por",
9-
"cat",
10-
"eng",
11-
"vie"
8+
"eng"
129
],
1310
"filetype": "text/html",
1411
"data_source": {
@@ -29,10 +26,7 @@
2926
"text": "My first paragraph.",
3027
"metadata": {
3128
"languages": [
32-
"por",
33-
"cat",
34-
"eng",
35-
"vie"
29+
"eng"
3630
],
3731
"filetype": "text/html",
3832
"data_source": {
@@ -53,10 +47,7 @@
5347
"text": "Some CP1252-specific characters:",
5448
"metadata": {
5549
"languages": [
56-
"por",
57-
"cat",
58-
"eng",
59-
"vie"
50+
"eng"
6051
],
6152
"filetype": "text/html",
6253
"data_source": {
@@ -77,9 +68,6 @@
7768
"text": " ¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯\n°\t±\t²\t³\t´\tµ\t\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿\nÀ\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ\nÐ\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß\nà\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï\nð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ",
7869
"metadata": {
7970
"languages": [
80-
"por",
81-
"cat",
82-
"eng",
8371
"vie"
8472
],
8573
"filetype": "text/html",

0 commit comments

Comments
 (0)