@@ -22,7 +22,7 @@ def partition_pdf(
22
22
token : Optional [str ] = None ,
23
23
include_page_breaks : bool = False ,
24
24
strategy : str = "hi_res" ,
25
- extract_tables : bool = False ,
25
+ infer_table_structure : bool = False ,
26
26
encoding : str = "utf-8" ,
27
27
ocr_languages : str = "eng" ,
28
28
) -> List [Element ]:
@@ -45,12 +45,13 @@ def partition_pdf(
45
45
The strategy to use for partitioning the PDF. Uses a layout detection model if set
46
46
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
47
47
and processes it.
48
- extract_tables
49
- If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
50
- is True or False, the partitioning process will attempt to identify any tables in the
51
- document. This parameter indicates that the partitioning process will attempt to extract the
52
- structure of any identified tables. The table structure and cell contents will be stored as
53
- HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
48
+ infer_table_structure
49
+ Only applicable if `strategy="hi_res"`.
50
+ If True, any Table elements that are extracted will also have a metadata field
51
+ named "text_as_html" where the table's text content is rendered into an HTML string.
52
+ That is, the table's row and cell structure is preserved in the HTML.
53
+ Whether True or False, the "text" field is always present in any Table element
54
+ and is the text content of the table (no structure).
54
55
encoding
55
56
The encoding method used to decode the text input. If None, utf-8 will be used.
56
57
ocr_languages
@@ -66,7 +67,7 @@ def partition_pdf(
66
67
token = token ,
67
68
include_page_breaks = include_page_breaks ,
68
69
strategy = strategy ,
69
- extract_tables = extract_tables ,
70
+ infer_table_structure = infer_table_structure ,
70
71
encoding = encoding ,
71
72
ocr_languages = ocr_languages ,
72
73
)
@@ -81,7 +82,7 @@ def partition_pdf_or_image(
81
82
is_image : bool = False ,
82
83
include_page_breaks : bool = False ,
83
84
strategy : str = "hi_res" ,
84
- extract_tables : bool = False ,
85
+ infer_table_structure : bool = False ,
85
86
encoding : str = "utf-8" ,
86
87
ocr_languages : str = "eng" ,
87
88
) -> List [Element ]:
@@ -117,7 +118,7 @@ def partition_pdf_or_image(
117
118
file = file ,
118
119
template = out_template ,
119
120
is_image = is_image ,
120
- extract_tables = extract_tables ,
121
+ infer_table_structure = infer_table_structure ,
121
122
include_page_breaks = True ,
122
123
ocr_languages = ocr_languages ,
123
124
)
@@ -128,7 +129,7 @@ def partition_pdf_or_image(
128
129
"detectron2 is not installed. Cannot use the hi_res partitioning "
129
130
"strategy. Falling back to partitioning with the fast strategy." ,
130
131
)
131
- if extract_tables :
132
+ if infer_table_structure :
132
133
logger .warning (
133
134
"Table extraction was selected, but is being ignored while using the fast "
134
135
"strategy." ,
@@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
173
174
file : Optional [bytes ] = None ,
174
175
template : Optional [str ] = None ,
175
176
is_image : bool = False ,
176
- extract_tables : bool = False ,
177
+ infer_table_structure : bool = False ,
177
178
include_page_breaks : bool = False ,
178
179
ocr_languages : str = "eng" ,
179
180
) -> List [Element ]:
@@ -204,15 +205,15 @@ def _partition_pdf_or_image_local(
204
205
template ,
205
206
is_image = is_image ,
206
207
ocr_languages = ocr_languages ,
207
- extract_tables = extract_tables ,
208
+ extract_tables = infer_table_structure ,
208
209
)
209
210
else :
210
211
layout = process_data_with_model (
211
212
file ,
212
213
template ,
213
214
is_image = is_image ,
214
215
ocr_languages = ocr_languages ,
215
- extract_tables = extract_tables ,
216
+ extract_tables = infer_table_structure ,
216
217
)
217
218
218
219
return document_to_element_list (layout , include_page_breaks = include_page_breaks )
0 commit comments