1
1
import os
2
2
import pytest
3
3
import shutil
4
+ from typing import Optional , cast
4
5
from fsspec .implementations .local import LocalFileSystem
5
6
from httpx import AsyncClient
6
7
@@ -20,11 +21,15 @@ def test_simple_page_text() -> None:
20
21
assert len (result [0 ].text ) > 0
21
22
22
23
23
@pytest.fixture(params=[None, 2])
def markdown_parser(request: pytest.FixtureRequest) -> LlamaParse:
    """Build a markdown-mode ``LlamaParse`` for each ``partition_pages`` value.

    The fixture is parametrized over ``None`` and ``2``, so a test that
    requests it runs once per value. The requesting test is skipped when
    the ``LLAMA_CLOUD_API_KEY`` environment variable is unset or empty.
    """
    api_key = os.environ.get("LLAMA_CLOUD_API_KEY", "")
    if api_key == "":
        pytest.skip("LLAMA_CLOUD_API_KEY not set")

    # request.param is untyped; narrow it to the declared parameter values.
    partition_pages = cast(Optional[int], request.param)
    return LlamaParse(
        result_type="markdown",
        ignore_errors=False,
        partition_pages=partition_pages,
    )
28
33
29
34
30
35
def test_simple_page_markdown (markdown_parser : LlamaParse ) -> None :
@@ -35,8 +40,6 @@ def test_simple_page_markdown(markdown_parser: LlamaParse) -> None:
35
40
36
41
37
42
def test_simple_page_markdown_bytes (markdown_parser : LlamaParse ) -> None :
38
- markdown_parser = LlamaParse (result_type = "markdown" , ignore_errors = False )
39
-
40
43
filepath = "tests/test_files/attention_is_all_you_need.pdf"
41
44
with open (filepath , "rb" ) as f :
42
45
file_bytes = f .read ()
@@ -51,8 +54,6 @@ def test_simple_page_markdown_bytes(markdown_parser: LlamaParse) -> None:
51
54
52
55
53
56
def test_simple_page_markdown_buffer (markdown_parser : LlamaParse ) -> None :
54
- markdown_parser = LlamaParse (result_type = "markdown" , ignore_errors = False )
55
-
56
57
filepath = "tests/test_files/attention_is_all_you_need.pdf"
57
58
with open (filepath , "rb" ) as f :
58
59
# client must provide extra_info with file_name
@@ -161,9 +162,12 @@ async def test_mixing_input_types() -> None:
161
162
os .environ .get ("LLAMA_CLOUD_API_KEY" , "" ) == "" ,
162
163
reason = "LLAMA_CLOUD_API_KEY not set" ,
163
164
)
165
+ @pytest .mark .parametrize ("partition_pages" , [None , 2 ])
164
166
@pytest .mark .asyncio
165
- async def test_download_images () -> None :
166
- parser = LlamaParse (result_type = "markdown" , take_screenshot = True )
167
+ async def test_download_images (partition_pages : Optional [int ]) -> None :
168
+ parser = LlamaParse (
169
+ result_type = "markdown" , take_screenshot = True , partition_pages = partition_pages
170
+ )
167
171
filepath = "tests/test_files/attention_is_all_you_need.pdf"
168
172
json_result = await parser .aget_json ([filepath ])
169
173
@@ -175,3 +179,17 @@ async def test_download_images() -> None:
175
179
176
180
await parser .aget_images (json_result , download_path )
177
181
assert len (os .listdir (download_path )) == len (json_result [0 ]["pages" ][0 ]["images" ])
182
+
183
+
184
@pytest.mark.asyncio
@pytest.mark.parametrize("split_by_page,expected", [(True, 4), (False, 1)])
async def test_multiple_page_markdown(
    markdown_parser: LlamaParse,
    split_by_page: bool,
    expected: int,
) -> None:
    """Check how many documents are returned for the TOS PDF.

    Four documents are expected when ``split_by_page`` is enabled and one
    when it is disabled. Every returned document must have non-empty text.
    """
    markdown_parser.split_by_page = split_by_page

    documents = await markdown_parser.aload_data("tests/test_files/TOS.pdf")

    assert len(documents) == expected
    for document in documents:
        assert len(document.text) > 0
0 commit comments