Skip to content

Commit 014585e

Browse files
authored
fix: preserve the order of shapes in partition_pptx output (#193)
* order the shapes top to bottom and left to right * added tests for ordering * update change log and bump version * more tests * don't need enumerate * n -> on
1 parent a7ca58e commit 014585e

File tree

4 files changed

+66
-5
lines changed

4 files changed

+66
-5
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.5-dev5
1+
## 0.4.6
22

33
* Loosen the default cap threshold to `0.5`.
44
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@@ -13,6 +13,7 @@
1313
* Checks that titles and narrative text are at least 50% alpha characters.
1414
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
1515
environment variable for controlling the max number of words in a title.
16+
* Updated `partition_pptx` to order the elements on each slide from top to bottom and left to right
1617

1718
## 0.4.4
1819

test_unstructured/partition/test_pptx.py

+47-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import pathlib
33
import pytest
44

5+
import pptx
6+
57
from unstructured.partition.pptx import partition_pptx
6-
from unstructured.documents.elements import ListItem, NarrativeText, Title
8+
from unstructured.documents.elements import ListItem, NarrativeText, Text, Title
79

810
DIRECTORY = pathlib.Path(__file__).parent.resolve()
911
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@@ -41,3 +43,47 @@ def test_partition_pptx_raises_with_both_specified():
4143
def test_partition_pptx_raises_with_neither():
4244
with pytest.raises(ValueError):
4345
partition_pptx()
46+
47+
48+
def test_partition_pptx_orders_elements(tmpdir):
    """Checks that partition_pptx returns slide text from top to bottom and left to right.

    A single-slide deck is built with text boxes added out of reading order, plus one box
    positioned at negative offsets and one box with empty text. Only the three visible,
    non-empty boxes are expected back, in reading order.
    """
    inches = pptx.util.Inches
    # (left, top, size, text) for every text box, in the order they are added to the slide.
    # Each box is square, so a single size is used for both width and height.
    text_boxes = [
        (inches(2), inches(2), inches(2), "This is lower and should come second"),
        (inches(-10), inches(-10), inches(1), "This is off the page and shouldn't appear"),
        (inches(2), inches(2), inches(2), ""),
        (inches(1), inches(1), inches(1), "This is higher and should come first"),
        (inches(0.5), inches(1), inches(1), "-------------TOP-------------"),
    ]

    deck = pptx.Presentation()
    # Layout index 6 is used as the blank slide layout.
    slide = deck.slides.add_slide(deck.slide_layouts[6])
    for left, top, size, text in text_boxes:
        box = slide.shapes.add_textbox(left, top, size, size)
        box.text_frame.text = text

    output_path = os.path.join(tmpdir, "test-ordering.pptx")
    deck.save(output_path)

    expected = [
        Text("-------------TOP-------------"),
        NarrativeText("This is higher and should come first"),
        NarrativeText("This is lower and should come second"),
    ]
    assert partition_pptx(filename=output_path) == expected

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.5-dev5" # pragma: no cover
1+
__version__ = "0.4.6" # pragma: no cover

unstructured/partition/pptx.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import pptx
44

5-
from unstructured.documents.elements import Element, ListItem, NarrativeText, Title
5+
from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
66
from unstructured.partition.text_type import (
77
is_possible_narrative_text,
88
is_possible_title,
@@ -35,9 +35,16 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
3535

3636
elements: List[Element] = list()
3737
for slide in presentation.slides:
38-
for shape in slide.shapes:
38+
for shape in _order_shapes(slide.shapes):
39+
# NOTE(robinson) - we don't deal with tables yet, but so future humans can find
40+
# it again, here are docs on how to deal with tables. The check for tables should
41+
# be `if shape.has_table`
42+
# ref: https://python-pptx.readthedocs.io/en/latest/user/table.html#adding-a-table
3943
if not shape.has_text_frame:
4044
continue
45+
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
46+
if shape.top < 0 or shape.left < 0:
47+
continue
4148
for paragraph in shape.text_frame.paragraphs:
4249
text = paragraph.text
4350
if text.strip() == "":
@@ -48,10 +55,17 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
4855
elements.append(NarrativeText(text=text))
4956
elif is_possible_title(text):
5057
elements.append(Title(text=text))
58+
else:
59+
elements.append(Text(text=text))
5160

5261
return elements
5362

5463

64+
def _order_shapes(shapes):
65+
"""Orders the shapes from top to bottom and left to right."""
66+
return sorted(shapes, key=lambda x: (x.top, x.left))
67+
68+
5569
def _is_bulleted_paragraph(paragraph) -> bool:
5670
"""Determines if the paragraph is bulleted by looking for a bullet character prefix. Bullet
5771
characters in the openxml schema are represented by buChar"""

0 commit comments

Comments
 (0)