1414import sparknlp
1515from sparknlp .internal import ExtendedJavaWrapper
1616
17+
1718class Partition (ExtendedJavaWrapper ):
19+ """
20+ A unified interface for extracting structured content from various document types
21+ using Spark NLP readers.
22+
23+ This class supports reading from files, URLs, in-memory strings, or byte arrays,
24+ and returns parsed output as a structured Spark DataFrame.
25+
26+ Supported formats include:
27+ - Plain text
28+ - HTML
29+ - Word (.doc/.docx)
30+ - Excel (.xls/.xlsx)
31+ - PowerPoint (.ppt/.pptx)
32+ - Email files (.eml, .msg)
33+ - PDFs
34+
35+ Parameters
36+ ----------
37+ params : dict, optional
38+ Configuration parameters, including:
39+
40+ - content_type : str
41+ Override automatic file type detection.
42+ - store_content : bool
43+ Include raw file content in the output DataFrame.
44+ - timeout : int
45+ Timeout for fetching HTML content.
46+ - title_font_size : int
47+ Font size used to identify titles.
48+ - include_page_breaks : bool
49+ Tag content with page break metadata.
50+ - group_broken_paragraphs : bool
51+ Merge broken lines into full paragraphs.
52+ - title_length_size : int
53+ Max character length to qualify as title.
54+ - paragraph_split : str
55+ Regex to detect paragraph boundaries.
56+ - short_line_word_threshold : int
57+ Max words in a line to be considered short.
58+ - threshold : float
59+ Ratio of empty lines for switching grouping.
60+ - max_line_count : int
61+ Max lines evaluated in paragraph analysis.
62+ - include_slide_notes : bool
63+ Include speaker notes in output.
64+ - infer_table_structure : bool
65+ Generate HTML table structure.
66+ - append_cells : bool
67+ Merge Excel rows into one block.
68+ - cell_separator : str
69+ Join cell values in a row.
70+ - add_attachment_content : bool
71+ Include text of plain-text attachments.
72+ - headers : dict
73+ Request headers when using URLs.
74+
75+ Examples
76+ ---------
77+
78+ Reading Text Files
79+
    >>> txt_directory = "/content/txtfiles/reader/txt"
    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|                 txt|
    +--------------------+--------------------+
    |file:/content/txt...|[{Title, BIG DATA...|
    +--------------------+--------------------+
95+
96+ Reading Email Files
97+
98+ >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
99+ >>> partition_df.show()
100+ +--------------------+--------------------+
101+ | path| email|
102+ +--------------------+--------------------+
103+ |file:/content/ema...|[{Title, Test Sev...|
104+ +--------------------+--------------------+
105+
106+ Reading Webpages
107+
108+ >>> partition_df = Partition().partition("https://www.wikipedia.com", headers = {"Accept-Language": "es-ES"})
109+ >>> partition_df.show()
110+ +--------------------+--------------------+
111+ | url| html|
112+ +--------------------+--------------------+
113+ |https://www.wikip...|[{Title, Wikipedi...|
114+ +--------------------+--------------------+
18115
116+ For more examples, refer to:
117+ `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
118+ """
19119 def __init__ (self , ** kwargs ):
20120 self .spark = sparknlp .start ()
21121 params = {}
@@ -27,21 +127,117 @@ def __init__(self, **kwargs):
27127
28128 super (Partition , self ).__init__ ("com.johnsnowlabs.partition.Partition" , params )
29129
130+
def partition(self, path, headers=None):
    """
    Parse the content found at ``path`` into a Spark DataFrame.

    Parameters
    ----------
    path : str
        Location of a file or directory. URLs and DFS paths are
        accepted as well.
    headers : dict, optional
        Headers used for URL requests. When omitted, an empty dict is
        forwarded to the Java side.

    Returns
    -------
    pyspark.sql.DataFrame
        DataFrame holding the parsed content.
    """
    # The Java method always receives a headers mapping, never None.
    request_headers = {} if headers is None else headers
    java_frame = self._java_obj.partition(path, request_headers)
    return self.getDataFrame(self.spark, java_frame)
36152
153+
def partition_urls(self, path, headers=None):
    """
    Parse the content of several URLs into a single Spark DataFrame.

    Parameters
    ----------
    path : list[str]
        The URLs to read.
    headers : dict, optional
        Request headers used for the URLs. When omitted, an empty dict
        is forwarded to the Java side.

    Returns
    -------
    pyspark.sql.DataFrame
        DataFrame holding the parsed URL content.

    Examples
    --------
    >>> urls_df = Partition().partition_urls([
    ...     "https://www.wikipedia.org", "https://example.com/"
    ... ])
    >>> urls_df.show()
    +--------------------+--------------------+
    |                 url|                html|
    +--------------------+--------------------+
    |https://www.wikip...|[{Title, Wikipedi...|
    |https://example.com/|[{Title, Example ...|
    +--------------------+--------------------+

    >>> urls_df.printSchema()
    root
     |-- url: string (nullable = true)
     |-- html: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- elementType: string (nullable = true)
     |    |    |-- content: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
    """
    # The Java method always receives a headers mapping, never None.
    request_headers = {} if headers is None else headers
    java_frame = self._java_obj.partitionUrlsJava(path, request_headers)
    return self.getDataFrame(self.spark, java_frame)
43199
200+
def partition_text(self, text):
    r"""
    Parse a raw text string into a Spark DataFrame.

    Parameters
    ----------
    text : str
        The raw text to parse.

    Returns
    -------
    pyspark.sql.DataFrame
        DataFrame holding the parsed text.

    Examples
    --------
    >>> raw_text = (
    ...     "The big brown fox\n"
    ...     "was walking down the lane.\n"
    ...     "\n"
    ...     "At the end of the lane,\n"
    ...     "the fox met a bear."
    ... )
    >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
    >>> text_df.show()
    +--------------------------------------+
    |txt                                   |
    +--------------------------------------+
    |[{NarrativeText, The big brown fox was|
    +--------------------------------------+
    >>> text_df.printSchema()
    root
     |-- txt: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- elementType: string (nullable = true)
     |    |    |-- content: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
    """
    java_frame = self._java_obj.partitionText(text)
    return self.getDataFrame(self.spark, java_frame)
0 commit comments