JohnSnowLabs
diff --git a/‎python/docs/reference/index.rst‎
Lines changed: 2 additions & 0 deletions b/‎python/docs/reference/index.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎python/sparknlp/partition/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎python/sparknlp/partition/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/sparknlp/partition/partition.py‎
Lines changed: 196 additions & 0 deletions b/‎python/sparknlp/partition/partition.py‎
Lines changed: 196 additions & 0 deletions
diff --git a/‎python/sparknlp/partition/partition_transformer.py‎
Lines changed: 52 additions & 3 deletions b/‎python/sparknlp/partition/partition_transformer.py‎
Lines changed: 52 additions & 3 deletions
diff --git a/‎python/sparknlp/reader/pdf_to_text.py‎
Lines changed: 50 additions & 4 deletions b/‎python/sparknlp/reader/pdf_to_text.py‎
Lines changed: 50 additions & 4 deletions
@@ -17,3 +17,5 @@ Modules
    sparknlp.training
    sparknlp.pretrained
    sparknlp.logging
+   sparknlp.partition
+   sparknlp.reader
@@ -11,5 +11,6 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+"""Module to read various types of documents into chunks"""
 from sparknlp.partition.partition import *
 from sparknlp.partition.partition_transformer import *
@@ -14,8 +14,108 @@
 import sparknlp
 from sparknlp.internal import ExtendedJavaWrapper
 
+
 class Partition(ExtendedJavaWrapper):
+    """
+    A unified interface for extracting structured content from various document types
+    using Spark NLP readers.
+
+    This class supports reading from files, URLs, in-memory strings, or byte arrays,
+    and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    **kwargs : dict, optional
+        Configuration parameters passed as keyword arguments, including:
+
+        - content_type : str
+            Override automatic file type detection.
+        - store_content : bool
+            Include raw file content in the output DataFrame.
+        - timeout : int
+            Timeout for fetching HTML content.
+        - title_font_size : int
+            Font size used to identify titles.
+        - include_page_breaks : bool
+            Tag content with page break metadata.
+        - group_broken_paragraphs : bool
+            Merge broken lines into full paragraphs.
+        - title_length_size : int
+            Max character length to qualify as title.
+        - paragraph_split : str
+            Regex to detect paragraph boundaries.
+        - short_line_word_threshold : int
+            Max words in a line to be considered short.
+        - threshold : float
+            Ratio of empty lines for switching grouping.
+        - max_line_count : int
+            Max lines evaluated in paragraph analysis.
+        - include_slide_notes : bool
+            Include speaker notes in output.
+        - infer_table_structure : bool
+            Generate HTML table structure.
+        - append_cells : bool
+            Merge Excel rows into one block.
+        - cell_separator : str
+            Join cell values in a row.
+        - add_attachment_content : bool
+            Include text of plain-text attachments.
+        - headers : dict
+            Request headers when using URLs.
+
+    Examples
+    --------
+
+    Reading Text Files
+
+    >>> txt_directory = "/content/txtfiles/reader/txt"
+    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                path|                 txt|
+    +--------------------+--------------------+
+    |file:/content/txt...|[{Title, BIG DATA...|
+    +--------------------+--------------------+
+
+    Reading Email Files
+
+    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                path|               email|
+    +--------------------+--------------------+
+    |file:/content/ema...|[{Title, Test Sev...|
+    +--------------------+--------------------+
+
+    Reading Webpages
+
+    >>> partition_df = Partition().partition("https://www.wikipedia.com", headers = {"Accept-Language": "es-ES"})
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                 url|                html|
+    +--------------------+--------------------+
+    |https://www.wikip...|[{Title, Wikipedi...|
+    +--------------------+--------------------+
 
+    For more examples, refer to:
+    `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
+    """
     def  __init__(self, **kwargs):
         self.spark = sparknlp.start()
         params = {}
@@ -27,21 +127,117 @@ def  __init__(self, **kwargs):
 
         super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
 
+
     def partition(self, path, headers=None):
+        """
+        Reads and parses content from a URL, file, or directory path.
+
+        Parameters
+        ----------
+        path : str
+            Path to a file or directory. URLs and DFS paths are also supported.
+        headers : dict, optional
+            Headers for URL requests. An empty dict is used when omitted.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed content.
+        """
         if headers is None:
             headers = {}
         jdf = self._java_obj.partition(path, headers)
         dataframe = self.getDataFrame(self.spark, jdf)
         return dataframe
 
+
     def partition_urls(self, path, headers=None):
+        """
+        Reads and parses content from multiple URLs.
+
+        Parameters
+        ----------
+        path : list[str]
+            List of URLs.
+        headers : dict, optional
+            Request headers for URLs. An empty dict is used when omitted.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed URL content.
+
+        Examples
+        --------
+        >>> urls_df = Partition().partition_urls([
+        ...     "https://www.wikipedia.org", "https://example.com/"
+        ... ])
+        >>> urls_df.show()
+        +--------------------+--------------------+
+        |                 url|                html|
+        +--------------------+--------------------+
+        |https://www.wikip...|[{Title, Wikipedi...|
+        |https://example.com/|[{Title, Example ...|
+        +--------------------+--------------------+
+
+        >>> urls_df.printSchema()
+        root
+         |-- url: string (nullable = true)
+         |-- html: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
         if headers is None:
             headers = {}
         jdf = self._java_obj.partitionUrlsJava(path, headers)
         dataframe = self.getDataFrame(self.spark, jdf)
         return dataframe
 
+
     def partition_text(self, text):
+        """
+        Parses content from an in-memory raw text string.
+
+        Parameters
+        ----------
+        text : str
+            Raw text input.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed text.
+
+        Examples
+        --------
+        >>> raw_text = (
+        ...     "The big brown fox\\n"
+        ...     "was walking down the lane.\\n"
+        ...     "\\n"
+        ...     "At the end of the lane,\\n"
+        ...     "the fox met a bear."
+        ... )
+        >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
+        >>> text_df.show()
+        +--------------------------------------+
+        |txt                                   |
+        +--------------------------------------+
+        |[{NarrativeText, The big brown fox was|
+        +--------------------------------------+
+        >>> text_df.printSchema()
+        root
+         |-- txt: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
         jdf = self._java_obj.partitionText(text)
         dataframe = self.getDataFrame(self.spark, jdf)
         return dataframe
@@ -14,7 +14,6 @@
 from sparknlp.common import *
 from sparknlp.partition.partition_properties import *
 
-
 class PartitionTransformer(
     AnnotatorModel,
     HasEmailReaderProperties,
@@ -24,8 +23,58 @@ class PartitionTransformer(
     HasTextReaderProperties
 ):
     """
-    PartitionTransformer is a class that provides methods for partitioning data into smaller chunks.
-    It is designed to work with various file formats and content types.
+    The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+    within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+    within a Spark NLP pipeline.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    inputCols : list of str
+        Names of input columns (typically from DocumentAssembler).
+    outputCol : str
+        Name of the column to store the output.
+    contentType : str
+        The type of content: e.g., "text", "url", "file", etc.
+    headers : dict, optional
+        Headers to be used if content type is a URL.
+
+    Examples
+    --------
+    >>> dataset = spark.createDataFrame([
+    ...     ("https://www.blizzard.com",), ("https://www.google.com",),
+    ... ], ["text"])
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+
+    >>> partition = PartitionTransformer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("partition") \\
+    ...     .setContentType("url") \\
+    ...     .setHeaders({"Accept-Language": "es-ES"})
+
+    >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+    >>> pipelineModel = pipeline.fit(dataset)
+    >>> resultDf = pipelineModel.transform(dataset)
+    >>> resultDf.show()
+    +--------------------+--------------------+--------------------+
+    |                text|            document|           partition|
+    +--------------------+--------------------+--------------------+
+    |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+    |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+    +--------------------+--------------------+--------------------+
     """
 
     name = "PartitionTransformer"
 
@@ -10,10 +10,56 @@
 class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                 JavaMLReadable, JavaMLWritable):
     """
-    Extract text from Pdf document to single string or to several strings per each page.
-    Input is a column with binary representation of PDF document.
-    As output generate column with text and page number.
-    Explode each page as separate row if split to page enabled.
+    Extract text from PDF documents as either a single string or multiple strings per page.
+    Input is a column with binary content of PDF files. Output is a column with extracted text,
+    with options to include page numbers or split pages.
+
+    Parameters
+    ----------
+    pageNumCol : str, optional
+        Page number output column name.
+    partitionNum : int, optional
+        Number of partitions (default is 0).
+    storeSplittedPdf : bool, optional
+        Whether to store content of split PDFs (default is False).
+    splitPage : bool, optional
+        Enable/disable splitting per page (default is True).
+    onlyPageNum : bool, optional
+        Whether to extract only page numbers (default is False).
+    textStripper : str or TextStripperType, optional
+        Defines layout and formatting type.
+    sort : bool, optional
+        Enable/disable sorting content per page (default is False).
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.reader import *
+    >>> from pyspark.ml import Pipeline
+    >>> pdf_path = "Documents/files/pdf"
+    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+    >>> pipeline = Pipeline(stages=[pdf_to_text])
+    >>> pipeline_model = pipeline.fit(data_frame)
+    >>> pdf_df = pipeline_model.transform(data_frame)
+    >>> pdf_df.show()
+    +--------------------+--------------------+
+    |                path|    modificationTime|
+    +--------------------+--------------------+
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    +--------------------+--------------------+
+    >>> pdf_df.printSchema()
+    root
+     |-- path: string (nullable = true)
+     |-- modificationTime: timestamp (nullable = true)
+     |-- length: long (nullable = true)
+     |-- text: string (nullable = true)
+     |-- height_dimension: integer (nullable = true)
+     |-- width_dimension: integer (nullable = true)
+     |-- content: binary (nullable = true)
+     |-- exception: string (nullable = true)
+     |-- pagenum: integer (nullable = true)
     """
     pageNumCol = Param(Params._dummy(), "pageNumCol",
                        "Page number output column name.",