Skip to content

Commit cbe1b35

Browse files
authored
rfctr(chunk): prep for adding TableSplitter (#3510)
**Summary** Mechanical refactoring in preparation for adding (pre-chunk) `TableSplitter` in a PR stacked on this one.
1 parent d99b399 commit cbe1b35

File tree

24 files changed

+671
-561
lines changed

24 files changed

+671
-561
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.15.2-dev6
1+
## 0.15.2-dev7
22

33
### Enhancements
44

test_unstructured/chunking/test_base.py

Lines changed: 249 additions & 244 deletions
Large diffs are not rendered by default.

test_unstructured/common/__init__.py

Whitespace-only changes.
test_unstructured/common/test_html_table.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""Unit-test suite for the `unstructured.common.html_table` module."""
2+
3+
from __future__ import annotations
4+
5+
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
6+
7+
8+
class Describe_htmlify_matrix_of_cell_texts:
9+
"""Unit-test suite for `unstructured.common.html_table.htmlify_matrix_of_cell_texts()`."""
10+
11+
def test_htmlify_matrix_handles_empty_cells(self):
12+
assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
13+
"<table>"
14+
"<tr><td>cell1</td><td></td><td>cell3</td></tr>"
15+
"<tr><td></td><td>cell5</td><td></td></tr>"
16+
"</table>"
17+
)
18+
19+
def test_htmlify_matrix_handles_special_characters(self):
20+
assert htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
21+
"<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
22+
)
23+
24+
def test_htmlify_matrix_handles_multiple_rows_and_cells(self):
25+
assert htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
26+
"<table>"
27+
"<tr><td>cell1</td><td>cell2</td></tr>"
28+
"<tr><td>cell3</td><td>cell4</td></tr>"
29+
"</table>"
30+
)
31+
32+
def test_htmlify_matrix_handles_empty_matrix(self):
33+
assert htmlify_matrix_of_cell_texts([]) == ""

test_unstructured/test_utils.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -339,30 +339,6 @@ def test_validate_date_args_raises_for_invalid_formats(date):
339339
assert utils.validate_date_args(date)
340340

341341

342-
def test_htmlify_matrix_handles_empty_cells():
343-
assert utils.htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
344-
"<table><tr><td>cell1</td><td></td><td>cell3</td></tr>"
345-
"<tr><td></td><td>cell5</td><td></td></tr></table>"
346-
)
347-
348-
349-
def test_htmlify_matrix_handles_special_characters():
350-
assert utils.htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
351-
"<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
352-
)
353-
354-
355-
def test_htmlify_matrix_handles_multiple_rows_and_cells():
356-
assert utils.htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
357-
"<table><tr><td>cell1</td><td>cell2</td></tr>"
358-
"<tr><td>cell3</td><td>cell4</td></tr></table>"
359-
)
360-
361-
362-
def test_htmlify_matrix_handles_empty_matrix():
363-
assert utils.htmlify_matrix_of_cell_texts([]) == ""
364-
365-
366342
def test_only_returns_singleton_iterable():
367343
singleton_iterable = [42]
368344
result = utils.only(singleton_iterable)

typings/lxml/_types.pyi

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ from typing import Any, Callable, Collection, Protocol, TypeVar
66

77
from typing_extensions import TypeAlias
88

9-
from .etree import QName, _Element, _ElementTree
9+
from .etree import HTMLParser, QName, XMLParser, _Element, _ElementTree
1010

1111
_ET = TypeVar("_ET", bound=_Element, default=_Element)
1212
_ET_co = TypeVar("_ET_co", bound=_Element, default=_Element, covariant=True)
@@ -30,5 +30,8 @@ _TextArg: TypeAlias = str | bytes | QName
3030

3131
_XPathObject = Any
3232

33+
# The basic parsers bundled in lxml.etree
34+
_DefEtreeParsers = XMLParser[_ET_co] | HTMLParser[_ET_co]
35+
3336
class SupportsLaxedItems(Protocol[_KT_co, _VT_co]):
3437
def items(self) -> Collection[tuple[_KT_co, _VT_co]]: ...

typings/lxml/etree/_element.pyi

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
from __future__ import annotations
44

5-
from typing import Collection, Generic, Iterator, TypeVar, overload
5+
from typing import Collection, Generic, Iterable, Iterator, TypeVar, overload
66

77
from typing_extensions import Self
88

99
from .. import _types as _t
10+
from ._module_misc import CDATA, QName
1011

1112
_T = TypeVar("_T")
1213

@@ -23,6 +24,12 @@ class _Element:
2324
def get(self, key: _t._AttrName) -> str | None: ...
2425
@overload
2526
def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
27+
@overload
28+
def iter(self, *tags: _t._TagSelector) -> Iterator[Self]: ...
29+
@overload
30+
def iter(
31+
self, *, tag: _t._TagSelector | Iterable[_t._TagSelector] | None = None
32+
) -> Iterator[Self]: ...
2633
def iterancestors(
2734
self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
2835
) -> Iterator[Self]: ...
@@ -39,8 +46,12 @@ class _Element:
3946
def tag(self) -> str: ...
4047
@property
4148
def tail(self) -> str | None: ...
49+
@tail.setter
50+
def tail(self, value: str | CDATA | None) -> None: ...
4251
@property
4352
def text(self) -> str | None: ...
53+
@text.setter
54+
def text(self, value: str | QName | CDATA | None) -> None: ...
4455
def xpath(
4556
self,
4657
_path: str,

typings/lxml/etree/_module_misc.pyi

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,7 @@
22

33
from __future__ import annotations
44

5+
class CDATA:
6+
def __init__(self, data: str) -> None: ...
7+
58
class QName: ...

typings/lxml/etree/_parser.pyi

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
1+
# pyright: reportPrivateUsage=false
2+
13
from __future__ import annotations
24

5+
from typing import Generic
6+
7+
from .._types import _ET_co
38
from ._classlookup import ElementClassLookup
49

5-
class HTMLParser:
10+
# Includes most stuff in _BaseParser
11+
class _FeedParser(Generic[_ET_co]): ...
12+
13+
class HTMLParser(_FeedParser[_ET_co]):
614
def __init__(
715
self,
816
*,
@@ -20,7 +28,7 @@ class HTMLParser:
2028
) -> None: ...
2129
def set_element_class_lookup(self, lookup: ElementClassLookup | None = None) -> None: ...
2230

23-
class XMLParser:
31+
class XMLParser(_FeedParser[_ET_co]):
2432
def __init__(
2533
self,
2634
*,

typings/lxml/html/__init__.pyi

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from __future__ import annotations
2+
3+
from ._element import (
4+
HtmlElement as HtmlElement,
5+
)
6+
from ._parse import (
7+
fragment_fromstring as fragment_fromstring,
8+
)

0 commit comments

Comments
 (0)