Skip to content

Commit cbe1b35

Browse files
authored
rfctr(chunk): prep for adding TableSplitter (#3510)
**Summary** Mechanical refactoring in preparation for adding (pre-chunk) `TableSplitter` in a PR stacked on this one.
1 parent d99b399 commit cbe1b35

24 files changed: +671 -561 lines changed

Diff for: CHANGELOG.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
## 0.15.2-dev6
1+
## 0.15.2-dev7
22

33
### Enhancements
44

Diff for: test_unstructured/chunking/test_base.py

+249-244
Large diffs are not rendered by default.

Diff for: test_unstructured/common/__init__.py

Whitespace-only changes.

Diff for: test_unstructured/common/test_html_table.py

+33
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
1+
"""Unit-test suite for the `unstructured.common.html_table` module."""
2+
3+
from __future__ import annotations
4+
5+
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
6+
7+
8+
class Describe_htmlify_matrix_of_cell_texts:
9+
"""Unit-test suite for `unstructured.common.html_table.htmlify_matrix_of_cell_texts()`."""
10+
11+
def test_htmlify_matrix_handles_empty_cells(self):
12+
assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
13+
"<table>"
14+
"<tr><td>cell1</td><td></td><td>cell3</td></tr>"
15+
"<tr><td></td><td>cell5</td><td></td></tr>"
16+
"</table>"
17+
)
18+
19+
def test_htmlify_matrix_handles_special_characters(self):
20+
assert htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
21+
"<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
22+
)
23+
24+
def test_htmlify_matrix_handles_multiple_rows_and_cells(self):
25+
assert htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
26+
"<table>"
27+
"<tr><td>cell1</td><td>cell2</td></tr>"
28+
"<tr><td>cell3</td><td>cell4</td></tr>"
29+
"</table>"
30+
)
31+
32+
def test_htmlify_matrix_handles_empty_matrix(self):
33+
assert htmlify_matrix_of_cell_texts([]) == ""

Diff for: test_unstructured/test_utils.py

-24
Original file line number | Diff line number | Diff line change
@@ -339,30 +339,6 @@ def test_validate_date_args_raises_for_invalid_formats(date):
339339
assert utils.validate_date_args(date)
340340

341341

342-
def test_htmlify_matrix_handles_empty_cells():
343-
assert utils.htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
344-
"<table><tr><td>cell1</td><td></td><td>cell3</td></tr>"
345-
"<tr><td></td><td>cell5</td><td></td></tr></table>"
346-
)
347-
348-
349-
def test_htmlify_matrix_handles_special_characters():
350-
assert utils.htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
351-
"<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
352-
)
353-
354-
355-
def test_htmlify_matrix_handles_multiple_rows_and_cells():
356-
assert utils.htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
357-
"<table><tr><td>cell1</td><td>cell2</td></tr>"
358-
"<tr><td>cell3</td><td>cell4</td></tr></table>"
359-
)
360-
361-
362-
def test_htmlify_matrix_handles_empty_matrix():
363-
assert utils.htmlify_matrix_of_cell_texts([]) == ""
364-
365-
366342
def test_only_returns_singleton_iterable():
367343
singleton_iterable = [42]
368344
result = utils.only(singleton_iterable)

Diff for: typings/lxml/_types.pyi

+4-1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ from typing import Any, Callable, Collection, Protocol, TypeVar
66

77
from typing_extensions import TypeAlias
88

9-
from .etree import QName, _Element, _ElementTree
9+
from .etree import HTMLParser, QName, XMLParser, _Element, _ElementTree
1010

1111
_ET = TypeVar("_ET", bound=_Element, default=_Element)
1212
_ET_co = TypeVar("_ET_co", bound=_Element, default=_Element, covariant=True)
@@ -30,5 +30,8 @@ _TextArg: TypeAlias = str | bytes | QName
3030

3131
_XPathObject = Any
3232

33+
# The basic parsers bundled in lxml.etree
34+
_DefEtreeParsers = XMLParser[_ET_co] | HTMLParser[_ET_co]
35+
3336
class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
3437
def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...

Diff for: typings/lxml/etree/_element.pyi

+12-1
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,12 @@
22

33
from __future__ import annotations
44

5-
from typing import Collection, Generic, Iterator, TypeVar, overload
5+
from typing import Collection, Generic, Iterable, Iterator, TypeVar, overload
66

77
from typing_extensions import Self
88

99
from .. import _types as _t
10+
from ._module_misc import CDATA, QName
1011

1112
_T = TypeVar("_T")
1213

@@ -23,6 +24,12 @@ class _Element:
2324
def get(self, key: _t._AttrName) -> str | None: ...
2425
@overload
2526
def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
27+
@overload
28+
def iter(self, *tags: _t._TagSelector) -> Iterator[Self]: ...
29+
@overload
30+
def iter(
31+
self, *, tag: _t._TagSelector | Iterable[_t._TagSelector] | None = None
32+
) -> Iterator[Self]: ...
2633
def iterancestors(
2734
self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
2835
) -> Iterator[Self]: ...
@@ -39,8 +46,12 @@ class _Element:
3946
def tag(self) -> str: ...
4047
@property
4148
def tail(self) -> str | None: ...
49+
@tail.setter
50+
def tail(self, value: str | CDATA | None) -> None: ...
4251
@property
4352
def text(self) -> str | None: ...
53+
@text.setter
54+
def text(self, value: str | QName | CDATA | None) -> None: ...
4455
def xpath(
4556
self,
4657
_path: str,

Diff for: typings/lxml/etree/_module_misc.pyi

+3
Original file line number | Diff line number | Diff line change
@@ -2,4 +2,7 @@
22

33
from __future__ import annotations
44

5+
class CDATA:
6+
def __init__(self, data: str) -> None: ...
7+
58
class QName: ...

Diff for: typings/lxml/etree/_parser.pyi

+10-2
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,16 @@
1+
# pyright: reportPrivateUsage=false
2+
13
from __future__ import annotations
24

5+
from typing import Generic
6+
7+
from .._types import _ET_co
38
from ._classlookup import ElementClassLookup
49

5-
class HTMLParser:
10+
# Includes most stuff in _BaseParser
11+
class _FeedParser(Generic[_ET_co]): ...
12+
13+
class HTMLParser(_FeedParser[_ET_co]):
614
def __init__(
715
self,
816
*,
@@ -20,7 +28,7 @@ class HTMLParser:
2028
) -> None: ...
2129
def set_element_class_lookup(self, lookup: ElementClassLookup | None = None) -> None: ...
2230

23-
class XMLParser:
31+
class XMLParser(_FeedParser[_ET_co]):
2432
def __init__(
2533
self,
2634
*,

Diff for: typings/lxml/html/__init__.pyi

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from __future__ import annotations
2+
3+
from ._element import (
4+
HtmlElement as HtmlElement,
5+
)
6+
from ._parse import (
7+
fragment_fromstring as fragment_fromstring,
8+
)

Diff for: typings/lxml/html/_parse.pyi

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# pyright: reportPrivateUsage=false
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from .._types import _DefEtreeParsers
8+
from ._element import HtmlElement
9+
10+
if TYPE_CHECKING:
11+
from typing_extensions import TypeAlias
12+
13+
_HtmlElemParser: TypeAlias = _DefEtreeParsers[HtmlElement]
14+
15+
def fragment_fromstring(
16+
html: str,
17+
create_parent: bool = False,
18+
base_url: str | None = None,
19+
parser: _HtmlElemParser | None = None,
20+
) -> HtmlElement: ...

Diff for: typings/pandas/__init__.pyi

+3-6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
from __future__ import annotations
22

3-
from pandas.core.api import (
4-
DataFrame as DataFrame,
5-
)
6-
from pandas.io.api import (
7-
read_csv as read_csv,
8-
)
3+
from pandas.core.api import DataFrame as DataFrame
4+
from pandas.io.api import read_csv as read_csv
5+
from pandas.io.api import read_excel as read_excel

Diff for: typings/pandas/_typing.pyi

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from __future__ import annotations
2+
3+
from os import PathLike
4+
from typing import Protocol, TypeVar
5+
6+
from typing_extensions import TypeAlias
7+
8+
AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
9+
FilePath: TypeAlias = str | PathLike[str]
10+
S1 = TypeVar("S1")
11+
12+
class BaseBuffer(Protocol):
13+
@property
14+
def mode(self) -> str: ...
15+
def seek(self, __offset: int, __whence: int = ...) -> int: ...
16+
def seekable(self) -> bool: ...
17+
def tell(self) -> int: ...
18+
19+
class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]):
20+
def read(self, __n: int = ...) -> AnyStr_cov: ...

Diff for: typings/pandas/core/frame.pyi

+20
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,29 @@
1+
# pyright: reportPrivateUsage=false
2+
13
from __future__ import annotations
24

5+
from typing import Any, Hashable, Iterable
6+
7+
from pandas.core.indexing import _iLocIndexer
8+
from pandas.core.series import Series
9+
310
class DataFrame:
11+
def __getitem__(self, key: Iterable[Hashable] | slice) -> DataFrame: ...
12+
def __len__(self) -> int: ...
13+
@property
14+
def T(self) -> DataFrame: ...
15+
@property
16+
def iloc(self) -> _iLocIndexerFrame: ...
17+
def isna(self) -> DataFrame: ...
18+
def iterrows(self) -> Iterable[tuple[Hashable, Series[Any]]]: ...
19+
@property
20+
def shape(self) -> tuple[int, int]: ...
421
def to_html(
522
self,
623
index: bool = ...,
724
header: bool = ...,
825
na_rep: str = ...,
926
) -> str: ...
27+
28+
class _iLocIndexerFrame(_iLocIndexer):
29+
def __getitem__(self, idx: Any) -> DataFrame: ...

Diff for: typings/pandas/io/api.pyi

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from __future__ import annotations
22

3-
from pandas.io.parsers import (
4-
read_csv as read_csv,
5-
)
3+
from pandas.io.excel import read_excel as read_excel
4+
from pandas.io.parsers import read_csv as read_csv

Diff for: typings/pandas/io/excel/__init__.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from pandas.io.excel._base import read_excel as read_excel

Diff for: typings/pandas/io/excel/_base.pyi

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from __future__ import annotations
2+
3+
from typing import Sequence
4+
5+
from pandas._typing import FilePath, ReadBuffer
6+
from pandas.core.frame import DataFrame
7+
8+
def read_excel(
9+
io: FilePath | ReadBuffer[bytes],
10+
sheet_name: None,
11+
*,
12+
header: int | Sequence[int] | None = ...,
13+
) -> dict[str, DataFrame]: ...

Diff for: unstructured/__version__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.15.2-dev6" # pragma: no cover
1+
__version__ = "0.15.2-dev7" # pragma: no cover

0 commit comments

Comments
 (0)