rfctr(chunk): prep for adding TableSplitter (#3510)

scanny · web-flow · commit cbe1b3562190 · 2024-08-12T18:04:49.000Z
**Summary**
Mechanical refactoring in preparation for adding a (pre-chunk)
`TableSplitter` in a PR stacked on this one.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.2-dev6
+## 0.15.2-dev7
 
 ### Enhancements
 
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
diff --git a/test_unstructured/common/__init__.py b/test_unstructured/common/__init__.py
diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py
@@ -0,0 +1,33 @@
+"""Unit-test suite for the `unstructured.common.html_table` module."""
+
+from __future__ import annotations
+
+from unstructured.common.html_table import htmlify_matrix_of_cell_texts
+
+
+class Describe_htmlify_matrix_of_cell_texts:
+    """Unit-test suite for `unstructured.common.html_table.htmlify_matrix_of_cell_texts()`."""
+
+    def test_htmlify_matrix_handles_empty_cells(self):
+        assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
+            "<table>"
+            "<tr><td>cell1</td><td></td><td>cell3</td></tr>"
+            "<tr><td></td><td>cell5</td><td></td></tr>"
+            "</table>"
+        )
+
+    def test_htmlify_matrix_handles_special_characters(self):
+        assert htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
+            "<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
+        )
+
+    def test_htmlify_matrix_handles_multiple_rows_and_cells(self):
+        assert htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
+            "<table>"
+            "<tr><td>cell1</td><td>cell2</td></tr>"
+            "<tr><td>cell3</td><td>cell4</td></tr>"
+            "</table>"
+        )
+
+    def test_htmlify_matrix_handles_empty_matrix(self):
+        assert htmlify_matrix_of_cell_texts([]) == ""
diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py
@@ -339,30 +339,6 @@ def test_validate_date_args_raises_for_invalid_formats(date):
         assert utils.validate_date_args(date)
 
 
-def test_htmlify_matrix_handles_empty_cells():
-    assert utils.htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
-        "<table><tr><td>cell1</td><td></td><td>cell3</td></tr>"
-        "<tr><td></td><td>cell5</td><td></td></tr></table>"
-    )
-
-
-def test_htmlify_matrix_handles_special_characters():
-    assert utils.htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
-        "<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
-    )
-
-
-def test_htmlify_matrix_handles_multiple_rows_and_cells():
-    assert utils.htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
-        "<table><tr><td>cell1</td><td>cell2</td></tr>"
-        "<tr><td>cell3</td><td>cell4</td></tr></table>"
-    )
-
-
-def test_htmlify_matrix_handles_empty_matrix():
-    assert utils.htmlify_matrix_of_cell_texts([]) == ""
-
-
 def test_only_returns_singleton_iterable():
     singleton_iterable = [42]
     result = utils.only(singleton_iterable)
diff --git a/typings/lxml/_types.pyi b/typings/lxml/_types.pyi
@@ -6,7 +6,7 @@ from typing import Any, Callable, Collection, Protocol, TypeVar
 
 from typing_extensions import TypeAlias
 
-from .etree import QName, _Element, _ElementTree
+from .etree import HTMLParser, QName, XMLParser, _Element, _ElementTree
 
 _ET = TypeVar("_ET", bound=_Element, default=_Element)
 _ET_co = TypeVar("_ET_co", bound=_Element, default=_Element, covariant=True)
@@ -30,5 +30,8 @@ _TextArg: TypeAlias = str | bytes | QName
 
 _XPathObject = Any
 
+# The basic parsers bundled in lxml.etree
+_DefEtreeParsers = XMLParser[_ET_co] | HTMLParser[_ET_co]
+
 class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
     def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...
diff --git a/typings/lxml/etree/_element.pyi b/typings/lxml/etree/_element.pyi
@@ -2,11 +2,12 @@
 
 from __future__ import annotations
 
-from typing import Collection, Generic, Iterator, TypeVar, overload
+from typing import Collection, Generic, Iterable, Iterator, TypeVar, overload
 
 from typing_extensions import Self
 
 from .. import _types as _t
+from ._module_misc import CDATA, QName
 
 _T = TypeVar("_T")
 
@@ -23,6 +24,12 @@ class _Element:
     def get(self, key: _t._AttrName) -> str | None: ...
     @overload
     def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
+    @overload
+    def iter(self, *tags: _t._TagSelector) -> Iterator[Self]: ...
+    @overload
+    def iter(
+        self, *, tag: _t._TagSelector | Iterable[_t._TagSelector] | None = None
+    ) -> Iterator[Self]: ...
     def iterancestors(
         self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
     ) -> Iterator[Self]: ...
@@ -39,8 +46,12 @@ class _Element:
     def tag(self) -> str: ...
     @property
     def tail(self) -> str | None: ...
+    @tail.setter
+    def tail(self, value: str | CDATA | None) -> None: ...
     @property
     def text(self) -> str | None: ...
+    @text.setter
+    def text(self, value: str | QName | CDATA | None) -> None: ...
     def xpath(
         self,
         _path: str,
diff --git a/typings/lxml/etree/_module_misc.pyi b/typings/lxml/etree/_module_misc.pyi
@@ -2,4 +2,7 @@
 
 from __future__ import annotations
 
+class CDATA:
+    def __init__(self, data: str) -> None: ...
+
 class QName: ...
diff --git a/typings/lxml/etree/_parser.pyi b/typings/lxml/etree/_parser.pyi
@@ -1,8 +1,16 @@
+# pyright: reportPrivateUsage=false
+
 from __future__ import annotations
 
+from typing import Generic
+
+from .._types import _ET_co
 from ._classlookup import ElementClassLookup
 
-class HTMLParser:
+# Includes most stuff in _BaseParser
+class _FeedParser(Generic[_ET_co]): ...
+
+class HTMLParser(_FeedParser[_ET_co]):
     def __init__(
         self,
         *,
@@ -20,7 +28,7 @@ class HTMLParser:
     ) -> None: ...
     def set_element_class_lookup(self, lookup: ElementClassLookup | None = None) -> None: ...
 
-class XMLParser:
+class XMLParser(_FeedParser[_ET_co]):
     def __init__(
         self,
         *,
diff --git a/typings/lxml/html/__init__.pyi b/typings/lxml/html/__init__.pyi
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from ._element import (
+    HtmlElement as HtmlElement,
+)
+from ._parse import (
+    fragment_fromstring as fragment_fromstring,
+)
diff --git a/typings/lxml/html/_parse.pyi b/typings/lxml/html/_parse.pyi
@@ -0,0 +1,20 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .._types import _DefEtreeParsers
+from ._element import HtmlElement
+
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+_HtmlElemParser: TypeAlias = _DefEtreeParsers[HtmlElement]
+
+def fragment_fromstring(
+    html: str,
+    create_parent: bool = False,
+    base_url: str | None = None,
+    parser: _HtmlElemParser | None = None,
+) -> HtmlElement: ...
diff --git a/typings/pandas/__init__.pyi b/typings/pandas/__init__.pyi
@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-from pandas.core.api import (
-    DataFrame as DataFrame,
-)
-from pandas.io.api import (
-    read_csv as read_csv,
-)
+from pandas.core.api import DataFrame as DataFrame
+from pandas.io.api import read_csv as read_csv
+from pandas.io.api import read_excel as read_excel
diff --git a/typings/pandas/_typing.pyi b/typings/pandas/_typing.pyi
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from os import PathLike
+from typing import Protocol, TypeVar
+
+from typing_extensions import TypeAlias
+
+AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
+FilePath: TypeAlias = str | PathLike[str]
+S1 = TypeVar("S1")
+
+class BaseBuffer(Protocol):
+    @property
+    def mode(self) -> str: ...
+    def seek(self, __offset: int, __whence: int = ...) -> int: ...
+    def seekable(self) -> bool: ...
+    def tell(self) -> int: ...
+
+class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]):
+    def read(self, __n: int = ...) -> AnyStr_cov: ...
diff --git a/typings/pandas/core/frame.pyi b/typings/pandas/core/frame.pyi
@@ -1,9 +1,29 @@
+# pyright: reportPrivateUsage=false
+
 from __future__ import annotations
 
+from typing import Any, Hashable, Iterable
+
+from pandas.core.indexing import _iLocIndexer
+from pandas.core.series import Series
+
 class DataFrame:
+    def __getitem__(self, key: Iterable[Hashable] | slice) -> DataFrame: ...
+    def __len__(self) -> int: ...
+    @property
+    def T(self) -> DataFrame: ...
+    @property
+    def iloc(self) -> _iLocIndexerFrame: ...
+    def isna(self) -> DataFrame: ...
+    def iterrows(self) -> Iterable[tuple[Hashable, Series[Any]]]: ...
+    @property
+    def shape(self) -> tuple[int, int]: ...
     def to_html(
         self,
         index: bool = ...,
         header: bool = ...,
         na_rep: str = ...,
     ) -> str: ...
+
+class _iLocIndexerFrame(_iLocIndexer):
+    def __getitem__(self, idx: Any) -> DataFrame: ...
diff --git a/typings/pandas/io/api.pyi b/typings/pandas/io/api.pyi
@@ -1,5 +1,4 @@
 from __future__ import annotations
 
-from pandas.io.parsers import (
-    read_csv as read_csv,
-)
+from pandas.io.excel import read_excel as read_excel
+from pandas.io.parsers import read_csv as read_csv
diff --git a/typings/pandas/io/excel/__init__.pyi b/typings/pandas/io/excel/__init__.pyi
@@ -0,0 +1 @@
+from pandas.io.excel._base import read_excel as read_excel
diff --git a/typings/pandas/io/excel/_base.pyi b/typings/pandas/io/excel/_base.pyi
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from typing import Sequence
+
+from pandas._typing import FilePath, ReadBuffer
+from pandas.core.frame import DataFrame
+
+def read_excel(
+    io: FilePath | ReadBuffer[bytes],
+    sheet_name: None,
+    *,
+    header: int | Sequence[int] | None = ...,
+) -> dict[str, DataFrame]: ...
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.2-dev6"  # pragma: no cover
+__version__ = "0.15.2-dev7"  # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
diff --git a/unstructured/common/__init__.py b/unstructured/common/__init__.py
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
diff --git a/unstructured/utils.py b/unstructured/utils.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-## 0.15.2-dev6`
	`1`	`+## 0.15.2-dev7`
`2`	`2`
`3`	`3`	`### Enhancements`
`4`	`4`