Skip to content

Commit 0001a33

Browse files
fix: pass extract image args to all partitioners (#3950)
This is needed so that the user can specify whether to extract the base64 data for images, which are now parsed by the HTML partitioner.

## Testing

Adds a test that validates this by calling the auto-partitioner, with the appropriate arguments, on an HTML file containing a base64-embedded image.
1 parent c0457c1 commit 0001a33

File tree

7 files changed

+28
-3
lines changed

7 files changed

+28
-3
lines changed

Diff for: CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.26-dev1
1+
## 0.16.26-dev2
22

33
### Enhancements
44

Diff for: example-docs/html-with-base64-image.html

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<div>
2+
<p>Test page</p>
3+
<img src="
4+
5+
" alt="Unstructured Logo" />
6+
</div>

Diff for: test_unstructured/partition/test_auto.py

+13
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,19 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b
627627
)
628628

629629

630+
def test_auto_partition_html_element_extraction():
631+
extract_image_block_types = ["Image"]
632+
633+
with tempfile.TemporaryDirectory() as tmpdir:
634+
elements = partition(
635+
example_doc_path("html-with-base64-image.html"),
636+
extract_image_block_types=extract_image_block_types,
637+
extract_image_block_to_payload=True,
638+
)
639+
640+
assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
641+
642+
630643
def test_partition_pdf_does_not_raise_warning():
631644
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
632645
# per the pytest docs.

Diff for: test_unstructured_ingest/src/local.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
3232
--input-path example-docs \
3333
--work-dir "$WORK_DIR"
3434

35-
"$SCRIPT_DIR"/check-num-files-output.sh 13 $OUTPUT_FOLDER_NAME
35+
"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME
3636

3737
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.26-dev1" # pragma: no cover
1+
__version__ = "0.16.26-dev2" # pragma: no cover

Diff for: unstructured/partition/auto.py

+2
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,8 @@ def augment_metadata(elements: list[Element]) -> list[Element]:
283283
partitioning_kwargs["languages"] = languages
284284
partitioning_kwargs["starting_page_number"] = starting_page_number
285285
partitioning_kwargs["strategy"] = strategy
286+
partitioning_kwargs["extract_image_block_types"] = extract_image_block_types
287+
partitioning_kwargs["extract_image_block_to_payload"] = extract_image_block_to_payload
286288

287289
partition = partitioner_loader.get(file_type)
288290
elements = partition(filename=filename, file=file, **partitioning_kwargs)

Diff for: unstructured/partition/html/partition.py

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ def partition_html(
3737
detection_origin: Optional[str] = None,
3838
html_parser_version: Literal["v1", "v2"] = "v1",
3939
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
40+
extract_image_block_to_payload: bool = False,
41+
extract_image_block_types: Optional[list[str]] = None,
4042
**kwargs: Any,
4143
) -> list[Element]:
4244
"""Partitions an HTML document into its constituent elements.
@@ -86,6 +88,8 @@ def partition_html(
8688
detection_origin=detection_origin,
8789
html_parser_version=html_parser_version,
8890
image_alt_mode=image_alt_mode,
91+
extract_image_block_types=extract_image_block_types,
92+
extract_image_block_to_payload=extract_image_block_to_payload,
8993
)
9094

9195
return list(_HtmlPartitioner.iter_elements(opts))

0 commit comments

Comments
 (0)