Skip to content

Commit f13258d

Browse files
paulamib123DevinTDHaPaulami Bhattacharya
committed
Documentation for SparkNLP Readers and Partition class (#14581)
* Update conda meta.yaml for 6.0.1 [skip test] * added documentation to file readers * updated docs for partition class in scala and python * fixed typos in sparkNLPReader and added documentation for Partition class * added parameters to Partition class and updated read function docs in Readers * updated readers documentation with ipynb path * updated partition description * fixed errors in email readers * fixed errors in email readers * added docs for partition transformer and pdf reader * added docs for python partition transformer and pdf reader * added docs for python partition transformer and pdf reader * updated docs to render partition and reader * reverted changes in init.py * reverted changes in imports * updated formatting docs for pdf reader * updated formatting of docs for spark nlp reader * updated formatting of docs for partition * updated formatting of docs for partition_transformer * updating links to notebooks and partition transformer description --------- Co-authored-by: Devin Ha <devin@trungducha.de> Co-authored-by: Paulami Bhattacharya <paulamibhattacharya@Paulamis-MacBook-Pro.local>
1 parent 77608b4 commit f13258d

16 files changed

Lines changed: 953 additions & 64 deletions

python/docs/reference/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@ Modules
1717
sparknlp.training
1818
sparknlp.pretrained
1919
sparknlp.logging
20+
sparknlp.partition
21+
sparknlp.reader

python/sparknlp/partition/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
"""Module to read various types of documents into chunks"""
1415
from sparknlp.partition.partition import *
1516
from sparknlp.partition.partition_transformer import *

python/sparknlp/partition/partition.py

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,108 @@
1414
import sparknlp
1515
from sparknlp.internal import ExtendedJavaWrapper
1616

17+
1718
class Partition(ExtendedJavaWrapper):
19+
"""
20+
A unified interface for extracting structured content from various document types
21+
using Spark NLP readers.
22+
23+
This class supports reading from files, URLs, in-memory strings, or byte arrays,
24+
and returns parsed output as a structured Spark DataFrame.
25+
26+
Supported formats include:
27+
- Plain text
28+
- HTML
29+
- Word (.doc/.docx)
30+
- Excel (.xls/.xlsx)
31+
- PowerPoint (.ppt/.pptx)
32+
- Email files (.eml, .msg)
33+
- PDFs
34+
35+
Parameters
36+
----------
37+
**kwargs : dict, optional
38+
Configuration parameters passed as keyword arguments, including:
39+
40+
- content_type : str
41+
Override automatic file type detection.
42+
- store_content : bool
43+
Include raw file content in the output DataFrame.
44+
- timeout : int
45+
Timeout for fetching HTML content.
46+
- title_font_size : int
47+
Font size used to identify titles.
48+
- include_page_breaks : bool
49+
Tag content with page break metadata.
50+
- group_broken_paragraphs : bool
51+
Merge broken lines into full paragraphs.
52+
- title_length_size : int
53+
Max character length to qualify as title.
54+
- paragraph_split : str
55+
Regex to detect paragraph boundaries.
56+
- short_line_word_threshold : int
57+
Max words in a line to be considered short.
58+
- threshold : float
59+
Ratio of empty lines for switching grouping.
60+
- max_line_count : int
61+
Max lines evaluated in paragraph analysis.
62+
- include_slide_notes : bool
63+
Include speaker notes in output.
64+
- infer_table_structure : bool
65+
Generate HTML table structure.
66+
- append_cells : bool
67+
Merge Excel rows into one block.
68+
- cell_separator : str
69+
Separator used to join cell values in a row.
70+
- add_attachment_content : bool
71+
Include text of plain-text attachments.
72+
- headers : dict
73+
Request headers when using URLs.
74+
75+
Examples
76+
--------
77+
78+
Reading Text Files
79+
80+
>>> txt_directory = "/content/txtfiles/reader/txt"
81+
>>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
82+
>>> partition_df.show()
90+
+--------------------+--------------------+
91+
| path| txt|
92+
+--------------------+--------------------+
93+
|file:/content/txt...|[{Title, BIG DATA...|
94+
+--------------------+--------------------+
95+
96+
Reading Email Files
97+
98+
>>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
99+
>>> partition_df.show()
100+
+--------------------+--------------------+
101+
| path| email|
102+
+--------------------+--------------------+
103+
|file:/content/ema...|[{Title, Test Sev...|
104+
+--------------------+--------------------+
105+
106+
Reading Webpages
107+
108+
>>> partition_df = Partition().partition("https://www.wikipedia.com", headers={"Accept-Language": "es-ES"})
109+
>>> partition_df.show()
110+
+--------------------+--------------------+
111+
| url| html|
112+
+--------------------+--------------------+
113+
|https://www.wikip...|[{Title, Wikipedi...|
114+
+--------------------+--------------------+
18115
116+
For more examples, refer to:
117+
`examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
118+
"""
19119
def __init__(self, **kwargs):
20120
self.spark = sparknlp.start()
21121
params = {}
@@ -27,21 +127,117 @@ def __init__(self, **kwargs):
27127

28128
super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
29129

130+
30131
def partition(self, path, headers=None):
132+
"""
133+
Reads and parses content from a URL, file, or directory path.
134+
135+
Parameters
136+
----------
137+
path : str
138+
Path to a file or directory. URLs and distributed file systems (DFS) are also supported.
139+
headers : dict, optional
140+
Headers for URL requests.
141+
142+
Returns
143+
-------
144+
pyspark.sql.DataFrame
145+
DataFrame with parsed content.
146+
"""
31147
if headers is None:
32148
headers = {}
33149
jdf = self._java_obj.partition(path, headers)
34150
dataframe = self.getDataFrame(self.spark, jdf)
35151
return dataframe
36152

153+
37154
def partition_urls(self, path, headers=None):
155+
"""
156+
Reads and parses content from multiple URLs.
157+
158+
Parameters
159+
----------
160+
path : list[str]
161+
List of URLs.
162+
headers : dict, optional
163+
Request headers for URLs.
164+
165+
Returns
166+
-------
167+
pyspark.sql.DataFrame
168+
DataFrame with parsed URL content.
169+
170+
Examples
171+
--------
172+
>>> urls_df = Partition().partition_urls([
173+
... "https://www.wikipedia.org", "https://example.com/"
174+
... ])
175+
>>> urls_df.show()
176+
+--------------------+--------------------+
177+
| url| html|
178+
+--------------------+--------------------+
179+
|https://www.wikip...|[{Title, Wikipedi...|
180+
|https://example.com/|[{Title, Example ...|
181+
+--------------------+--------------------+
182+
183+
>>> urls_df.printSchema()
184+
root
185+
|-- url: string (nullable = true)
186+
|-- html: array (nullable = true)
187+
| |-- element: struct (containsNull = true)
188+
| | |-- elementType: string (nullable = true)
189+
| | |-- content: string (nullable = true)
190+
| | |-- metadata: map (nullable = true)
191+
| | | |-- key: string
192+
| | | |-- value: string (valueContainsNull = true)
193+
"""
38194
if headers is None:
39195
headers = {}
40196
jdf = self._java_obj.partitionUrlsJava(path, headers)
41197
dataframe = self.getDataFrame(self.spark, jdf)
42198
return dataframe
43199

200+
44201
def partition_text(self, text):
202+
"""
203+
Parses content from a raw text string.
204+
205+
Parameters
206+
----------
207+
text : str
208+
Raw text input.
209+
210+
Returns
211+
-------
212+
pyspark.sql.DataFrame
213+
DataFrame with parsed text.
214+
215+
Examples
216+
--------
217+
>>> raw_text = (
218+
... "The big brown fox\n"
219+
... "was walking down the lane.\n"
220+
... "\n"
221+
... "At the end of the lane,\n"
222+
... "the fox met a bear."
223+
... )
224+
>>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
225+
>>> text_df.show()
226+
+--------------------------------------+
227+
|txt |
228+
+--------------------------------------+
229+
|[{NarrativeText, The big brown fox was|
230+
+--------------------------------------+
231+
>>> text_df.printSchema()
232+
root
233+
|-- txt: array (nullable = true)
234+
| |-- element: struct (containsNull = true)
235+
| | |-- elementType: string (nullable = true)
236+
| | |-- content: string (nullable = true)
237+
| | |-- metadata: map (nullable = true)
238+
| | | |-- key: string
239+
| | | |-- value: string (valueContainsNull = true)
240+
"""
45241
jdf = self._java_obj.partitionText(text)
46242
dataframe = self.getDataFrame(self.spark, jdf)
47243
return dataframe

python/sparknlp/partition/partition_transformer.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from sparknlp.common import *
1515
from sparknlp.partition.partition_properties import *
1616

17-
1817
class PartitionTransformer(
1918
AnnotatorModel,
2019
HasEmailReaderProperties,
@@ -24,8 +23,58 @@ class PartitionTransformer(
2423
HasTextReaderProperties
2524
):
2625
"""
27-
PartitionTransformer is a class that provides methods for partitioning data into smaller chunks.
28-
It is designed to work with various file formats and content types.
26+
The PartitionTransformer annotator allows you to use the Partition feature more smoothly
27+
within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
28+
29+
It supports reading from files, URLs, in-memory strings, or byte arrays, and works
30+
within a Spark NLP pipeline.
31+
32+
Supported formats include:
33+
- Plain text
34+
- HTML
35+
- Word (.doc/.docx)
36+
- Excel (.xls/.xlsx)
37+
- PowerPoint (.ppt/.pptx)
38+
- Email files (.eml, .msg)
39+
- PDFs
40+
41+
Parameters
42+
----------
43+
inputCols : list of str
44+
Names of input columns (typically from DocumentAssembler).
45+
outputCol : str
46+
Name of the column to store the output.
47+
contentType : str
48+
The type of content: e.g., "text", "url", "file", etc.
49+
headers : dict, optional
50+
Headers to be used if content type is a URL.
51+
52+
Examples
53+
--------
54+
>>> dataset = spark.createDataFrame([
55+
... ("https://www.blizzard.com",), ("https://www.google.com",),
56+
... ], ["text"])
57+
58+
>>> documentAssembler = DocumentAssembler() \
59+
... .setInputCol("text") \
60+
... .setOutputCol("document")
61+
62+
>>> partition = PartitionTransformer() \
63+
... .setInputCols(["document"]) \
64+
... .setOutputCol("partition") \
65+
... .setContentType("url") \
66+
... .setHeaders({"Accept-Language": "es-ES"})
67+
68+
>>> pipeline = Pipeline(stages=[documentAssembler, partition])
69+
>>> pipelineModel = pipeline.fit(dataset)
70+
>>> resultDf = pipelineModel.transform(dataset)
71+
>>> resultDf.show()
72+
+--------------------+--------------------+--------------------+
73+
| text| document| partition|
74+
+--------------------+--------------------+--------------------+
75+
|https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
76+
|https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
77+
+--------------------+--------------------+--------------------+
2978
"""
3079

3180
name = "PartitionTransformer"

python/sparknlp/reader/pdf_to_text.py

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,56 @@
1010
class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
1111
JavaMLReadable, JavaMLWritable):
1212
"""
13-
Extract text from Pdf document to single string or to several strings per each page.
14-
Input is a column with binary representation of PDF document.
15-
As output generate column with text and page number.
16-
Explode each page as separate row if split to page enabled.
13+
Extract text from PDF documents as either a single string or multiple strings per page.
14+
Input is a column with binary content of PDF files. Output is a column with extracted text,
15+
with options to include page numbers or split pages.
16+
17+
Parameters
18+
----------
19+
pageNumCol : str, optional
20+
Page number output column name.
21+
partitionNum : int, optional
22+
Number of partitions (default is 0).
23+
storeSplittedPdf : bool, optional
24+
Whether to store content of split PDFs (default is False).
25+
splitPage : bool, optional
26+
Enable/disable splitting per page (default is True).
27+
onlyPageNum : bool, optional
28+
Whether to extract only page numbers (default is False).
29+
textStripper : str or TextStripperType, optional
30+
Defines layout and formatting type.
31+
sort : bool, optional
32+
Enable/disable sorting content per page (default is False).
33+
34+
Examples
35+
--------
36+
>>> import sparknlp
37+
>>> from sparknlp.reader import *
38+
>>> from pyspark.ml import Pipeline
39+
>>> pdf_path = "Documents/files/pdf"
40+
>>> data_frame = spark.read.format("binaryFile").load(pdf_path)
41+
>>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
42+
>>> pipeline = Pipeline(stages=[pdf_to_text])
43+
>>> pipeline_model = pipeline.fit(data_frame)
44+
>>> pdf_df = pipeline_model.transform(data_frame)
45+
>>> pdf_df.show()
46+
+--------------------+--------------------+
47+
| path| modificationTime|
48+
+--------------------+--------------------+
49+
|file:/Users/paula...|2025-05-15 11:33:...|
50+
|file:/Users/paula...|2025-05-15 11:33:...|
51+
+--------------------+--------------------+
52+
>>> pdf_df.printSchema()
53+
root
54+
|-- path: string (nullable = true)
55+
|-- modificationTime: timestamp (nullable = true)
56+
|-- length: long (nullable = true)
57+
|-- text: string (nullable = true)
58+
|-- height_dimension: integer (nullable = true)
59+
|-- width_dimension: integer (nullable = true)
60+
|-- content: binary (nullable = true)
61+
|-- exception: string (nullable = true)
62+
|-- pagenum: integer (nullable = true)
1763
"""
1864
pageNumCol = Param(Params._dummy(), "pageNumCol",
1965
"Page number output column name.",

0 commit comments

Comments
 (0)