Skip to content

Commit 1ca7f2a

Browse files
committed
tests: add tests for partitioned parse
1 parent a9d18b0 commit 1ca7f2a

File tree

3 files changed

+78
-12
lines changed

3 files changed

+78
-12
lines changed

tests/parse/test_llama_parse.py

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import os
22
import pytest
33
import shutil
4+
from typing import Optional, cast
45
from fsspec.implementations.local import LocalFileSystem
56
from httpx import AsyncClient
67

@@ -20,11 +21,15 @@ def test_simple_page_text() -> None:
2021
assert len(result[0].text) > 0
2122

2223

23-
@pytest.fixture
24-
def markdown_parser() -> LlamaParse:
24+
@pytest.fixture(params=[None, 2])
25+
def markdown_parser(request: pytest.FixtureRequest) -> LlamaParse:
2526
if os.environ.get("LLAMA_CLOUD_API_KEY", "") == "":
2627
pytest.skip("LLAMA_CLOUD_API_KEY not set")
27-
return LlamaParse(result_type="markdown", ignore_errors=False)
28+
return LlamaParse(
29+
result_type="markdown",
30+
ignore_errors=False,
31+
partition_pages=cast(Optional[int], request.param),
32+
)
2833

2934

3035
def test_simple_page_markdown(markdown_parser: LlamaParse) -> None:
@@ -35,8 +40,6 @@ def test_simple_page_markdown(markdown_parser: LlamaParse) -> None:
3540

3641

3742
def test_simple_page_markdown_bytes(markdown_parser: LlamaParse) -> None:
38-
markdown_parser = LlamaParse(result_type="markdown", ignore_errors=False)
39-
4043
filepath = "tests/test_files/attention_is_all_you_need.pdf"
4144
with open(filepath, "rb") as f:
4245
file_bytes = f.read()
@@ -51,8 +54,6 @@ def test_simple_page_markdown_bytes(markdown_parser: LlamaParse) -> None:
5154

5255

5356
def test_simple_page_markdown_buffer(markdown_parser: LlamaParse) -> None:
54-
markdown_parser = LlamaParse(result_type="markdown", ignore_errors=False)
55-
5657
filepath = "tests/test_files/attention_is_all_you_need.pdf"
5758
with open(filepath, "rb") as f:
5859
# client must provide extra_info with file_name
@@ -161,9 +162,12 @@ async def test_mixing_input_types() -> None:
161162
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
162163
reason="LLAMA_CLOUD_API_KEY not set",
163164
)
165+
@pytest.mark.parametrize("partition_pages", [None, 2])
164166
@pytest.mark.asyncio
165-
async def test_download_images() -> None:
166-
parser = LlamaParse(result_type="markdown", take_screenshot=True)
167+
async def test_download_images(partition_pages: Optional[int]) -> None:
168+
parser = LlamaParse(
169+
result_type="markdown", take_screenshot=True, partition_pages=partition_pages
170+
)
167171
filepath = "tests/test_files/attention_is_all_you_need.pdf"
168172
json_result = await parser.aget_json([filepath])
169173

@@ -175,3 +179,17 @@ async def test_download_images() -> None:
175179

176180
await parser.aget_images(json_result, download_path)
177181
assert len(os.listdir(download_path)) == len(json_result[0]["pages"][0]["images"])
182+
183+
184+
@pytest.mark.asyncio
@pytest.mark.parametrize("split_by_page,expected", [(True, 4), (False, 1)])
async def test_multiple_page_markdown(
    markdown_parser: LlamaParse,
    split_by_page: bool,
    expected: int,
) -> None:
    """Check how many documents come back for a multi-page PDF.

    The ``markdown_parser`` fixture supplies the parser; its
    ``split_by_page`` attribute is overridden here before loading
    ``tests/test_files/TOS.pdf``.

    Args:
        markdown_parser: Parser instance provided by the fixture.
        split_by_page: Value assigned to the parser's ``split_by_page``.
        expected: Number of documents ``aload_data`` must return
            (4 when splitting by page, 1 otherwise).
    """
    markdown_parser.split_by_page = split_by_page

    documents = await markdown_parser.aload_data("tests/test_files/TOS.pdf")

    assert len(documents) == expected
    # Every returned document must carry some text.
    for document in documents:
        assert len(document.text) > 0

tests/parse/test_llama_parse_result.py

Lines changed: 51 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import tempfile
22
import os
33
import pytest
4+
from typing import Optional
45
from llama_cloud_services import LlamaParse
56
from llama_cloud_services.parse.types import JobResult
67

@@ -15,16 +16,23 @@ def chart_file_path() -> str:
1516
return "tests/test_files/attention_is_all_you_need_chart.pdf"
1617

1718

19+
@pytest.fixture
def multiple_page_path() -> str:
    """Return the repository-relative path of the multi-page test PDF."""
    pdf_path = "tests/test_files/TOS.pdf"
    return pdf_path
22+
23+
1824
@pytest.mark.asyncio
1925
@pytest.mark.skipif(
2026
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
2127
reason="LLAMA_CLOUD_API_KEY not set",
2228
)
23-
async def test_basic_parse_result(file_path: str):
29+
@pytest.mark.parametrize("partition_pages", [None, 2])
30+
async def test_basic_parse_result(file_path: str, partition_pages: Optional[int]):
2431
parser = LlamaParse(
2532
take_screenshot=True,
2633
auto_mode=True,
2734
fast_mode=False,
35+
partition_pages=partition_pages,
2836
)
2937
result = await parser.aparse(file_path)
3038

@@ -142,8 +150,11 @@ async def test_parse_layout(file_path: str):
142150
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
143151
reason="LLAMA_CLOUD_API_KEY not set",
144152
)
145-
def test_parse_multiple_files(file_path: str, chart_file_path: str):
146-
parser = LlamaParse()
153+
@pytest.mark.parametrize("partition_pages", [None, 2])
154+
def test_parse_multiple_files(
155+
file_path: str, chart_file_path: str, partition_pages: Optional[int]
156+
):
157+
parser = LlamaParse(partition_pages=partition_pages)
147158
result = parser.parse([file_path, chart_file_path])
148159

149160
assert isinstance(result, list)
@@ -152,3 +163,40 @@ def test_parse_multiple_files(file_path: str, chart_file_path: str):
152163
assert isinstance(result[1], JobResult)
153164
assert result[0].file_name == file_path
154165
assert result[1].file_name == chart_file_path
166+
167+
168+
@pytest.mark.asyncio
@pytest.mark.skipif(
    os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
    reason="LLAMA_CLOUD_API_KEY not set",
)
@pytest.mark.parametrize("partition_pages", [None, 2])
async def test_multiple_page_parse_result(
    multiple_page_path: str, partition_pages: Optional[int]
):
    """Parse a multi-page PDF with and without page partitioning.

    Without ``partition_pages`` the call to ``aparse`` must yield a single
    ``JobResult``; with it, a list. Each resulting ``JobResult`` is then
    checked for a job id, the original file name, at least one page, and
    non-empty, mutually different ``text`` and ``md`` on its first page.

    Args:
        multiple_page_path: Path of the PDF to parse (fixture).
        partition_pages: Value forwarded to ``LlamaParse``; ``None`` or 2.
    """
    parser = LlamaParse(
        take_screenshot=True,
        auto_mode=True,
        fast_mode=False,
        partition_pages=partition_pages,
    )
    parsed = await parser.aparse(multiple_page_path)

    # Normalise both return shapes to a list so the checks below are shared.
    if partition_pages is not None:
        assert isinstance(parsed, list)
        job_results = parsed
    else:
        assert isinstance(parsed, JobResult)
        job_results = [parsed]

    for job_result in job_results:
        assert isinstance(job_result, JobResult)
        assert job_result.job_id is not None
        assert job_result.file_name == multiple_page_path
        assert len(job_result.pages) > 0

        first_page = job_result.pages[0]

        assert first_page.text is not None
        assert len(first_page.text) > 0

        assert first_page.md is not None
        assert len(first_page.md) > 0

        assert first_page.md != first_page.text

tests/test_files/TOS.pdf

182 KB
Binary file not shown.

0 commit comments

Comments
 (0)