Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
11001bf
[SPARKNLP-1113] Adding Partition feature
danilojsl Mar 17, 2025
cebc516
[SPARKNLP-1118] Adding headers, ssl-verify, request timeout, page bre…
danilojsl Mar 31, 2025
1faa91d
[SPARKNLP-1116] Adding groupBrokenParagraphs option
danilojsl Apr 4, 2025
e98dd26
[SPARKNLP-1116] Adding includeSlideNotes option
danilojsl Apr 7, 2025
4dec816
[SPARKNLP-1116] Adding findSubtable option
danilojsl Apr 8, 2025
edfa581
[SPARKNLP-1116] Adding findSubtable option in SparkNLPReader
danilojsl Apr 8, 2025
38bae06
[SPARKNLP-1116] Renaming findSubtable to appendCells option in SparkN…
danilojsl Apr 8, 2025
4e9cd76
[SPARKNLP-1116] Handling headers null issue in SparkNLPReader for Pyt…
danilojsl Apr 10, 2025
f1ea42f
[SPARKNLP-1116] Refactoring parameters spark-nlp reader getters to ma…
danilojsl Apr 11, 2025
c4ab336
[SPARKNLP-1116] Adding Partitioning demo notebook
danilojsl Apr 30, 2025
058a8a5
[SPARKNLP-1174] Adding PartitionTransformer
danilojsl May 5, 2025
48cf2b3
[SPARKNLP-1174] Adding missing unit tests in readers
danilojsl May 14, 2025
5b0c581
[SPARKNLP-1174] Moving PDF parameters to HasPdfProperties
danilojsl May 14, 2025
ce4b9fb
[SPARKNLP-1174] Adding validation for partition URL content
danilojsl May 16, 2025
fcbf30f
[SPARKNLP-1174] Formatting modified files
danilojsl May 16, 2025
953e03d
[SPARKNLP-1174] Fix reading as text file content
danilojsl May 24, 2025
aaa342b
[SPARKNLP-1174] Adding PartitionTransformer demo notebook [skip test]
danilojsl May 24, 2025
367504f
[SPARKNLP-1174] Updates PartitionTransformer demo notebook [skip test]
danilojsl May 24, 2025
3846eaf
[SPARKNLP-1174] Updates PartitionTransformer file link [skip test]
danilojsl May 24, 2025
37e6e85
Documentation for SparkNLP Readers and Partition class (#14581)
paulamib123 May 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,267 changes: 1,267 additions & 0 deletions examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions python/sparknlp/partition/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sparknlp.partition.partition import *
from sparknlp.partition.partition_transformer import *
47 changes: 47 additions & 0 deletions python/sparknlp/partition/partition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sparknlp
from sparknlp.internal import ExtendedJavaWrapper

class Partition(ExtendedJavaWrapper):
    """Python front end for the JVM class ``com.johnsnowlabs.partition.Partition``.

    Keyword arguments given to the constructor are converted to strings and
    handed to the JVM constructor as a single options dictionary. Each
    ``partition*`` method delegates to the wrapped JVM object and converts the
    Java DataFrame it returns through ``getDataFrame``.
    """

    def __init__(self, **kwargs):
        """Obtain a Spark session and instantiate the JVM ``Partition`` object.

        Parameters
        ----------
        **kwargs
            Partitioner options. Every value is passed through ``str()``
            before being forwarded to the JVM constructor.

        Raises
        ------
        ValueError
            If ``str()`` fails for one of the supplied values. The original
            exception is attached as ``__cause__``.
        """
        # The session is created before the JVM object is built.
        self.spark = sparknlp.start()

        params = {}
        for key, value in kwargs.items():
            try:
                params[key] = str(value)
            except Exception as e:
                # Chain explicitly so the traceback marks the failing __str__
                # as the direct cause of this ValueError.
                raise ValueError(
                    f"Invalid value for key '{key}': Cannot cast {type(value)} to string. Original error: {e}"
                ) from e

        super().__init__("com.johnsnowlabs.partition.Partition", params)

    def partition(self, path, headers=None):
        """Call the JVM ``partition`` method for ``path``.

        Parameters
        ----------
        path
            Forwarded unchanged to the JVM ``partition`` method
            (presumably a file, directory or URL location; confirm
            against the Scala API).
        headers : dict, optional
            Forwarded to the JVM side. ``None`` is replaced by an empty
            dict so the JVM call never receives a null value.

        Returns
        -------
        The value produced by ``getDataFrame`` for the Java DataFrame
        returned by the JVM call.
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partition(path, headers)
        return self.getDataFrame(self.spark, jdf)

    def partition_urls(self, path, headers=None):
        """Call the JVM ``partitionUrlsJava`` method for ``path``.

        Parameters
        ----------
        path
            Forwarded unchanged to ``partitionUrlsJava`` (presumably a
            collection of URLs; confirm against the Scala API).
        headers : dict, optional
            Forwarded to the JVM side. ``None`` is replaced by an empty
            dict so the JVM call never receives a null value.

        Returns
        -------
        The value produced by ``getDataFrame`` for the Java DataFrame
        returned by the JVM call.
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partitionUrlsJava(path, headers)
        return self.getDataFrame(self.spark, jdf)

    def partition_text(self, text):
        """Call the JVM ``partitionText`` method for ``text``.

        Parameters
        ----------
        text
            Forwarded unchanged to the JVM ``partitionText`` method.

        Returns
        -------
        The value produced by ``getDataFrame`` for the Java DataFrame
        returned by the JVM call.
        """
        jdf = self._java_obj.partitionText(text)
        return self.getDataFrame(self.spark, jdf)
256 changes: 256 additions & 0 deletions python/sparknlp/partition/partition_properties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict

from pyspark.ml.param import TypeConverters, Params, Param


class HasEmailReaderProperties(Params):
    """Mixin declaring the ``addAttachmentContent`` param and its accessors."""

    # Boolean flag; values are converted with TypeConverters.toBoolean.
    addAttachmentContent = Param(
        parent=Params._dummy(),
        name="addAttachmentContent",
        doc="Whether to extract and include the textual content of plain-text attachments in the output",
        typeConverter=TypeConverters.toBoolean,
    )

    def setAddAttachmentContent(self, value):
        """
        Stores ``value`` in the ``addAttachmentContent`` param.

        Parameters
        ----------
        value : bool
            Whether the text of plain-text attachments should be included
            in the output.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(addAttachmentContent=value)

    def getAddAttachmentContent(self):
        """
        Reads the ``addAttachmentContent`` param through ``getOrDefault``.

        Returns
        -------
        bool
            Whether the text of plain-text attachments is included in the
            output.
        """
        return self.getOrDefault(self.addAttachmentContent)


class HasExcelReaderProperties(Params):
    """Mixin declaring the ``cellSeparator`` and ``appendCells`` params."""

    # String param; values are converted with TypeConverters.toString.
    cellSeparator = Param(
        parent=Params._dummy(),
        name="cellSeparator",
        doc="String used to join cell values in a row when assembling textual output.",
        typeConverter=TypeConverters.toString,
    )

    # Boolean flag; values are converted with TypeConverters.toBoolean.
    appendCells = Param(
        parent=Params._dummy(),
        name="appendCells",
        doc="Whether to append all rows into a single content block instead of creating separate elements per row.",
        typeConverter=TypeConverters.toBoolean,
    )

    def setCellSeparator(self, value):
        """
        Stores ``value`` in the ``cellSeparator`` param.

        Parameters
        ----------
        value : str
            String placed between the cell values of a row when the
            textual output is assembled.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(cellSeparator=value)

    def getCellSeparator(self):
        """
        Reads the ``cellSeparator`` param through ``getOrDefault``.

        Returns
        -------
        str
            String placed between the cell values of a row.
        """
        return self.getOrDefault(self.cellSeparator)

    def setAppendCells(self, value):
        """
        Stores ``value`` in the ``appendCells`` param.

        Parameters
        ----------
        value : bool
            True to append all rows into a single content block, False to
            create a separate element per row.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(appendCells=value)

    def getAppendCells(self):
        """
        Reads the ``appendCells`` param through ``getOrDefault``.

        Returns
        -------
        bool
            True when rows are appended into a single content block, False
            when each row becomes its own element.
        """
        return self.getOrDefault(self.appendCells)

class HasHTMLReaderProperties(Params):
    """Mixin declaring the ``timeout`` param and a JVM-backed headers setter."""

    # Integer param; values are converted with TypeConverters.toInt.
    timeout = Param(
        parent=Params._dummy(),
        name="timeout",
        doc="Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
        typeConverter=TypeConverters.toInt,
    )

    def setTimeout(self, value):
        """
        Stores ``value`` in the ``timeout`` param.

        Parameters
        ----------
        value : int
            Number of seconds allowed for reading remote HTML resources.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(timeout=value)

    def getTimeout(self):
        """
        Reads the ``timeout`` param through ``getOrDefault``.

        Returns
        -------
        int
            Timeout in seconds for reading remote HTML resources.
        """
        return self.getOrDefault(self.timeout)

    def setHeaders(self, headers: Dict[str, str]):
        """
        Forwards ``headers`` to the JVM-side ``setHeadersPython`` method.

        Unlike the other setters in this class, the value is not stored in a
        Python ``Param``; it is sent through ``self._call_java``. That helper
        is not defined in this mixin, so the concrete class is expected to
        supply it.

        Parameters
        ----------
        headers : Dict[str, str]
            Mapping of header names to header values.

        Returns
        -------
        This instance, so that calls can be chained.
        """
        self._call_java("setHeadersPython", headers)
        return self


class HasPowerPointProperties(Params):
    """Mixin declaring the ``includeSlideNotes`` param and its accessors."""

    # Boolean flag; values are converted with TypeConverters.toBoolean.
    includeSlideNotes = Param(
        parent=Params._dummy(),
        name="includeSlideNotes",
        doc="Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
        typeConverter=TypeConverters.toBoolean,
    )

    def setIncludeSlideNotes(self, value):
        """
        Stores ``value`` in the ``includeSlideNotes`` param.

        Parameters
        ----------
        value : bool
            True to extract speaker notes and include them as narrative
            text elements.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(includeSlideNotes=value)

    def getIncludeSlideNotes(self):
        """
        Reads the ``includeSlideNotes`` param through ``getOrDefault``.

        Returns
        -------
        bool
            True when speaker notes are included as narrative text
            elements.
        """
        return self.getOrDefault(self.includeSlideNotes)

class HasTextReaderProperties(Params):
    """Mixin declaring the params that control plain-text parsing.

    The params cover title detection (``titleLengthSize``) and the grouping
    of broken paragraphs (``groupBrokenParagraphs``, ``paragraphSplit``,
    ``shortLineWordThreshold``, ``maxLineCount``, ``threshold``).
    """

    # --- Param declarations -------------------------------------------------

    # Integer param; values are converted with TypeConverters.toInt.
    titleLengthSize = Param(
        parent=Params._dummy(),
        name="titleLengthSize",
        doc="Maximum character length used to determine if a text block qualifies as a title during parsing.",
        typeConverter=TypeConverters.toInt,
    )

    # Boolean flag; values are converted with TypeConverters.toBoolean.
    groupBrokenParagraphs = Param(
        parent=Params._dummy(),
        name="groupBrokenParagraphs",
        doc="Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
        typeConverter=TypeConverters.toBoolean,
    )

    # String param holding a regex; values are converted with TypeConverters.toString.
    paragraphSplit = Param(
        parent=Params._dummy(),
        name="paragraphSplit",
        doc="Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
        typeConverter=TypeConverters.toString,
    )

    # Integer param; values are converted with TypeConverters.toInt.
    shortLineWordThreshold = Param(
        parent=Params._dummy(),
        name="shortLineWordThreshold",
        doc="Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
        typeConverter=TypeConverters.toInt,
    )

    # Integer param; values are converted with TypeConverters.toInt.
    maxLineCount = Param(
        parent=Params._dummy(),
        name="maxLineCount",
        doc="Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
        typeConverter=TypeConverters.toInt,
    )

    # Float param; values are converted with TypeConverters.toFloat.
    threshold = Param(
        parent=Params._dummy(),
        name="threshold",
        doc="Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
        typeConverter=TypeConverters.toFloat,
    )

    # --- Accessors ----------------------------------------------------------

    def setTitleLengthSize(self, value):
        """
        Stores ``value`` in the ``titleLengthSize`` param.

        Parameters
        ----------
        value : int
            Maximum character length for a text block to qualify as a title.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(titleLengthSize=value)

    def getTitleLengthSize(self):
        """
        Reads the ``titleLengthSize`` param through ``getOrDefault``.

        Returns
        -------
        int
            Maximum character length for a text block to qualify as a title.
        """
        return self.getOrDefault(self.titleLengthSize)

    def setGroupBrokenParagraphs(self, value):
        """
        Stores ``value`` in the ``groupBrokenParagraphs`` param.

        Parameters
        ----------
        value : bool
            True to merge fragmented lines into coherent paragraphs.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(groupBrokenParagraphs=value)

    def getGroupBrokenParagraphs(self):
        """
        Reads the ``groupBrokenParagraphs`` param through ``getOrDefault``.

        Returns
        -------
        bool
            True when fragmented lines are merged into coherent paragraphs.
        """
        return self.getOrDefault(self.groupBrokenParagraphs)

    def setParagraphSplit(self, value):
        """
        Stores ``value`` in the ``paragraphSplit`` param.

        Parameters
        ----------
        value : str
            Regex pattern used to detect paragraph boundaries.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(paragraphSplit=value)

    def getParagraphSplit(self):
        """
        Reads the ``paragraphSplit`` param through ``getOrDefault``.

        Returns
        -------
        str
            Regex pattern used to detect paragraph boundaries.
        """
        return self.getOrDefault(self.paragraphSplit)

    def setShortLineWordThreshold(self, value):
        """
        Stores ``value`` in the ``shortLineWordThreshold`` param.

        Parameters
        ----------
        value : int
            Maximum word count for a line to be considered 'short'.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(shortLineWordThreshold=value)

    def getShortLineWordThreshold(self):
        """
        Reads the ``shortLineWordThreshold`` param through ``getOrDefault``.

        Returns
        -------
        int
            Maximum word count for a line to be considered 'short'.
        """
        return self.getOrDefault(self.shortLineWordThreshold)

    def setMaxLineCount(self, value):
        """
        Stores ``value`` in the ``maxLineCount`` param.

        Parameters
        ----------
        value : int
            Maximum number of lines evaluated when estimating paragraph
            layout characteristics.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(maxLineCount=value)

    def getMaxLineCount(self):
        """
        Reads the ``maxLineCount`` param through ``getOrDefault``.

        Returns
        -------
        int
            Maximum number of lines evaluated when estimating paragraph
            layout characteristics.
        """
        return self.getOrDefault(self.maxLineCount)

    def setThreshold(self, value):
        """
        Stores ``value`` in the ``threshold`` param.

        Parameters
        ----------
        value : float
            Ratio of empty lines used to choose between new line-based and
            broken-paragraph grouping.

        Returns
        -------
        The result of ``self._set``, which allows chained setter calls.
        """
        return self._set(threshold=value)

    def getThreshold(self):
        """
        Reads the ``threshold`` param through ``getOrDefault``.

        Returns
        -------
        float
            Ratio of empty lines used to choose between new line-based and
            broken-paragraph grouping.
        """
        return self.getOrDefault(self.threshold)
Loading