Skip to content

Commit 3c3b75a

Browse files
authored
feat/add include_slide_notes parameter (#455)
## Description

* Added the `include_slide_notes` parameter; the default is `True`. It applies to files with the `.ppt` and `.pptx` extensions.
* Added two new files in `sample-docs`: `sample-docs/notes.ppt` and `sample-docs/notes.pptx`, both of which include notes on their slides. They make the functionality easy to test, as no existing PowerPoint sample files include slide notes.

## Testing

```
# using default value (True) returns additional NarrativeText element that contains notes
curl -X 'POST' 'http://localhost:8000/general/v0/general' -H 'accept: application/json' -H 'Content-Type: multipart/form-data' -F 'files=@sample-docs/notes.pptx' -F 'output_format="text/csv"'

# explicit include_slide_notes=True returns additional NarrativeText element that contains notes
curl -X 'POST' 'http://localhost:8000/general/v0/general' -H 'accept: application/json' -H 'Content-Type: multipart/form-data' -F 'files=@sample-docs/notes.pptx' -F 'output_format="text/csv"' -F 'include_slide_notes=True'

# explicit include_slide_notes=False returns no NarrativeText element
curl -X 'POST' 'http://localhost:8000/general/v0/general' -H 'accept: application/json' -H 'Content-Type: multipart/form-data' -F 'files=@sample-docs/notes.pptx' -F 'output_format="text/csv"' -F 'include_slide_notes=False'
```

The same commands work with the file `notes.ppt`.
1 parent 843d68a commit 3c3b75a

File tree

8 files changed

+63
-4
lines changed

8 files changed

+63
-4
lines changed

Diff for: CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.80-dev0
2+
3+
* Add `include_slide_notes` parameter, indicating whether slide notes in `ppt` and `pptx` files should be partitioned. Default is `True`. Now, when slide notes are present in the file, they will be included alongside other elements, which may shift the index numbers of non-note elements.
4+
15
## 0.0.79
26

37
* Bump to `unstructured` 0.15.7

Diff for: prepline_general/api/app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
app = FastAPI(
1414
title="Unstructured Pipeline API",
1515
summary="Partition documents with the Unstructured library",
16-
version="0.0.79",
16+
version="0.0.80",
1717
docs_url="/general/docs",
1818
openapi_url="/general/openapi.json",
1919
servers=[

Diff for: prepline_general/api/general.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ def pipeline_api(
273273
extract_image_block_types: Optional[List[str]] = None,
274274
unique_element_ids: Optional[bool] = False,
275275
starting_page_number: Optional[int] = None,
276+
include_slide_notes: Optional[bool] = True,
276277
) -> List[Dict[str, Any]] | str:
277278
if filename.endswith(".msg"):
278279
# Note(yuming): convert file type for msg files
@@ -316,6 +317,7 @@ def pipeline_api(
316317
"overlap": overlap,
317318
"overlap_all": overlap_all,
318319
"starting_page_number": starting_page_number,
320+
"include_slide_notes": include_slide_notes,
319321
},
320322
default=str,
321323
)
@@ -373,6 +375,7 @@ def pipeline_api(
373375
"extract_image_block_types": extract_image_block_types,
374376
"extract_image_block_to_payload": extract_image_block_to_payload,
375377
"unique_element_ids": unique_element_ids,
378+
"include_slide_notes": include_slide_notes,
376379
},
377380
default=str,
378381
)
@@ -403,6 +406,7 @@ def pipeline_api(
403406
"extract_image_block_to_payload": extract_image_block_to_payload,
404407
"unique_element_ids": unique_element_ids,
405408
"starting_page_number": starting_page_number,
409+
"include_slide_notes": include_slide_notes,
406410
}
407411

408412
if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -649,7 +653,7 @@ def return_content_type(filename: str):
649653

650654

651655
@router.get("/general/v0/general", include_in_schema=False)
652-
@router.get("/general/v0.0.79/general", include_in_schema=False)
656+
@router.get("/general/v0.0.80/general", include_in_schema=False)
653657
async def handle_invalid_get_request():
654658
raise HTTPException(
655659
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -664,7 +668,7 @@ async def handle_invalid_get_request():
664668
description="Description",
665669
operation_id="partition_parameters",
666670
)
667-
@router.post("/general/v0.0.79/general", include_in_schema=False)
671+
@router.post("/general/v0.0.80/general", include_in_schema=False)
668672
def general_partition(
669673
request: Request,
670674
# cannot use annotated type here because of a bug described here:
@@ -747,6 +751,7 @@ def response_generator(is_multipart: bool):
747751
overlap=form_params.overlap,
748752
overlap_all=form_params.overlap_all,
749753
starting_page_number=form_params.starting_page_number,
754+
include_slide_notes=form_params.include_slide_notes,
750755
)
751756

752757
yield (

Diff for: prepline_general/api/models/form_params.py

+13
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class GeneralFormParams(BaseModel):
3636
overlap: int
3737
overlap_all: bool
3838
starting_page_number: Optional[int] = None
39+
include_slide_notes: bool
3940

4041
@classmethod
4142
def as_form(
@@ -246,6 +247,17 @@ def as_form(
246247
example=3,
247248
),
248249
] = None,
250+
include_slide_notes: Annotated[
251+
bool,
252+
Form(
253+
title="include_slide_notes",
254+
description=(
255+
"When `True`, slide notes from .ppt and .pptx files"
256+
" will be included in the response. Default: `True`"
257+
),
258+
example=False,
259+
),
260+
] = True,
249261
) -> "GeneralFormParams":
250262
return cls(
251263
xml_keep_tags=xml_keep_tags,
@@ -273,4 +285,5 @@ def as_form(
273285
overlap_all=overlap_all,
274286
unique_element_ids=unique_element_ids,
275287
starting_page_number=starting_page_number,
288+
include_slide_notes=include_slide_notes,
276289
)

Diff for: preprocessing-pipeline-family.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.79
2+
version: 0.0.80

Diff for: sample-docs/notes.ppt

594 KB
Binary file not shown.

Diff for: sample-docs/notes.pptx

12.3 KB
Binary file not shown.

Diff for: test_general/api/test_app.py

+37
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,7 @@ def test_parallel_mode_passes_params(monkeypatch):
701701
"new_after_n_chars": "1501",
702702
"overlap": "25",
703703
"overlap_all": "true",
704+
"include_slide_notes": "true",
704705
},
705706
)
706707

@@ -733,6 +734,7 @@ def test_parallel_mode_passes_params(monkeypatch):
733734
new_after_n_chars=1501,
734735
overlap=25,
735736
overlap_all=True,
737+
include_slide_notes=True,
736738
)
737739

738740

@@ -1147,3 +1149,38 @@ def test__set_pdf_infer_table_structure(
11471149
)
11481150
is expected
11491151
)
1152+
1153+
1154+
@pytest.mark.parametrize(
    ("test_default", "include_slide_notes", "test_file"),
    [
        (True, None, Path("sample-docs") / "notes.ppt"),
        (True, None, Path("sample-docs") / "notes.pptx"),
        (False, True, Path("sample-docs") / "notes.ppt"),
        (False, True, Path("sample-docs") / "notes.pptx"),
        (False, False, Path("sample-docs") / "notes.ppt"),
        (False, False, Path("sample-docs") / "notes.pptx"),
    ],
)
def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, test_file):
    """Check that `include_slide_notes` decides whether slide notes lead the CSV output.

    Each case posts one sample presentation to ``MAIN_API_ROUTE`` with
    ``output_format="text/csv"``:

    * ``test_default=True``: the parameter is omitted from the form data, so the
      API default applies; the first row's text must be the slide note.
    * ``include_slide_notes=True``: sent explicitly as ``"True"``; the first row's
      text must be the slide note.
    * ``include_slide_notes=False``: sent explicitly as ``"False"``; the first
      row's text must not be the slide note.
    """
    client = TestClient(app)

    data = {"output_format": "text/csv"}
    if not test_default:
        # Form fields travel as strings, so the bool is sent as "True"/"False".
        data["include_slide_notes"] = str(include_slide_notes)

    # Open the sample inside a context manager so the handle is closed once the
    # request completes instead of leaking one descriptor per parametrized case.
    with open(test_file, "rb") as sample:
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (str(test_file), sample))],
            data=data,
        )

    # Fail here with a clear signal rather than later while parsing an error body as CSV.
    assert response.status_code == 200

    df = pd.read_csv(io.StringIO(response.text))
    first_text = df["text"][0]

    if include_slide_notes or test_default:
        assert first_text == "Here are important notes"
    else:
        assert first_text != "Here are important notes"

0 commit comments

Comments
 (0)