Skip to content

Commit 54c113e

Browse files
committed
Full typing.
1 parent 82801c1 commit 54c113e

17 files changed

+109
-74
lines changed

clear_html/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
__version__ = "0.4.1"
2-
from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text # noqa: F401
2+
from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text
3+
4+
__all__ = ["clean_node", "cleaned_node_to_html", "cleaned_node_to_text"]

clear_html/body_annotations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def load(cls, path: Path) -> BodyAnnotations:
3737
)
3838
return cls({})
3939

40-
def save(self, path: Path):
40+
def save(self, path: Path) -> None:
4141
as_dict = {id_: attr.asdict(ann) for id_, ann in self.items()}
4242
path.write_text(
4343
json.dumps(as_dict, sort_keys=True, ensure_ascii=False, indent=4),

clear_html/clean.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
from __future__ import annotations
22

33
import copy
4-
from typing import Callable
4+
from typing import TYPE_CHECKING, Callable
55

66
import html_text
77
from lxml.html import HtmlElement, tostring
88

99
from clear_html.formatted_text import clean_doc
1010
from clear_html.html_embeddings import integrate_embeddings
1111

12+
if TYPE_CHECKING:
13+
from collections.abc import Set as AbstractSet
14+
1215

1316
def cleaned_node_to_text(
14-
node: HtmlElement, text_extractor: Callable | None = None
17+
node: HtmlElement, text_extractor: Callable[[HtmlElement], str] | None = None
1518
) -> str | None:
1619
"""Format the given html tree as plain text, applying particular exclusions
1720
only applied to plain text (i.e. remove figure captions).
@@ -65,21 +68,21 @@ def clean_node(node: HtmlElement, url: str | None = None) -> HtmlElement:
6568
return clean_doc(node, url, nodes_whitelist)
6669

6770

68-
def apply_text_exclusions(node: HtmlElement):
71+
def apply_text_exclusions(node: HtmlElement) -> None:
6972
"""Apply some additional exclusions that are needed to export the
7073
body as text. Modify given node."""
7174
exclude_figcaption(node)
7275

7376

74-
def exclude_figcaption(node: HtmlElement):
77+
def exclude_figcaption(node: HtmlElement) -> None:
7578
# Exclude html figcaption tag
7679
to_exclude = set(node.xpath(".//figcaption"))
7780
# Never exclude the node itself
7881
to_exclude -= {node}
7982
_drop_trees(to_exclude)
8083

8184

82-
def _drop_trees(to_exclude):
85+
def _drop_trees(to_exclude: AbstractSet[HtmlElement]) -> None:
8386
for el in to_exclude:
8487
if el.getparent() is not None: # Parent cannot be removed
8588
el.drop_tree()

clear_html/formatted_text/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,6 @@
4141
- Finally, the html is formatted a little bit so that it looks good.
4242
"""
4343

44-
from clear_html.formatted_text.main import clean_doc # noqa: F401
44+
from clear_html.formatted_text.main import clean_doc
45+
46+
__all__ = ["clean_doc"]

clear_html/formatted_text/cleaner.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
3+
from typing import TYPE_CHECKING, Any
44

55
from lxml import etree
66
from lxml.html import HtmlElement, defs
@@ -38,8 +38,8 @@ def __init__(
3838
self,
3939
nodes_whitelist: AbstractSet[HtmlElement] | None = None,
4040
allow_data_attrs: bool = True,
41-
allow_tags=None,
42-
**kw,
41+
allow_tags: AbstractSet[str] | None = None,
42+
**kw: Any,
4343
):
4444
# Short-circuit the safe_attrs to be able to provide a smarter filtering
4545
self._body_safe_attrs = kw.pop("safe_attrs", defs.safe_attrs)
@@ -55,7 +55,7 @@ def __init__(
5555
kw[option] = False
5656
super().__init__(**kw)
5757

58-
def __call__(self, doc: HtmlElement): # type: ignore[override]
58+
def __call__(self, doc: HtmlElement) -> None: # type: ignore[override]
5959
super().__call__(doc)
6060
if self._body_safe_attrs_only:
6161
safe_attrs = self._body_safe_attrs
@@ -90,7 +90,7 @@ def __call__(self, doc: HtmlElement): # type: ignore[override]
9090
el = to_remove.pop()
9191
drop_tag_preserve_spacing(el)
9292

93-
def allow_element(self, el):
93+
def allow_element(self, el: HtmlElement) -> bool:
9494
if el in self._nodes_whitelist:
9595
return True
9696
return super().allow_element(el)

clear_html/formatted_text/figures.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def _get_figure_caption_cleaner() -> Cleaner:
5555
)
5656

5757

58-
def enclose_media_within_figure(doc: HtmlElement):
58+
def enclose_media_within_figure(doc: HtmlElement) -> None:
5959
"""Ensures all media (images, videos, etc) are enclosed within figures.
6060
If possible, images with
6161
a link also includes the link within the figure element."""
@@ -69,7 +69,7 @@ def enclose_media_within_figure(doc: HtmlElement):
6969

7070
def top_level_media_within_figure(
7171
doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
72-
):
72+
) -> None:
7373
"""Enclose top level isolated multimedia into figures. In other words,
7474
paragraphs containing only a single media element are replaced by a figure.
7575
Nodes in the white list are ignored.
@@ -90,7 +90,7 @@ def top_level_media_within_figure(
9090
'<div><figure><audio><source></source></audio></figure></div>'
9191
"""
9292

93-
def is_single_tag(el: HtmlElement):
93+
def is_single_tag(el: HtmlElement) -> bool:
9494
return len(el) == 1 and not has_text(el) and not has_tail(el[0])
9595

9696
for child in doc:
@@ -105,7 +105,7 @@ def is_single_tag(el: HtmlElement):
105105
single_p.tag = "figure"
106106

107107

108-
def infer_img_url_from_data_src_attr(doc: HtmlElement):
108+
def infer_img_url_from_data_src_attr(doc: HtmlElement) -> None:
109109
"""Fills src attribute from data-src for img tags.
110110
It is common to see img tags without src attribute but with data-src
111111
@@ -119,7 +119,7 @@ def infer_img_url_from_data_src_attr(doc: HtmlElement):
119119
el.attrib["src"] = cast("str", el.get("data-src"))
120120

121121

122-
def create_figures_from_isolated_figcaptions(node: HtmlElement):
122+
def create_figures_from_isolated_figcaptions(node: HtmlElement) -> None:
123123
"""Wraps isolated figcaptions with the content above and form a new figure.
124124
Mutates node.
125125
@@ -207,7 +207,7 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
207207
fuse_figcaptions(new_figure)
208208

209209

210-
def fuse_figcaptions(figure: HtmlElement):
210+
def fuse_figcaptions(figure: HtmlElement) -> None:
211211
"""Fuses first block of consecutive figcaptions and remove the rest found.
212212
213213
>>> fuse = _test_fn(fuse_figcaptions)
@@ -244,7 +244,7 @@ def fuse_figcaptions(figure: HtmlElement):
244244
drop_tag_preserve_spacing(child)
245245

246246

247-
def clean_figcaptions_html(node: HtmlElement):
247+
def clean_figcaptions_html(node: HtmlElement) -> None:
248248
"""Simplifies figcaption html
249249
>>> html = fromstring("<div><figcaption><table><p><strong>hey</strong></p></table></figcaption></div>")
250250
>>> clean_figcaptions_html(html)
@@ -256,7 +256,7 @@ def clean_figcaptions_html(node: HtmlElement):
256256
clean(caption)
257257

258258

259-
def remove_figures_without_content(doc: HtmlElement):
259+
def remove_figures_without_content(doc: HtmlElement) -> None:
260260
"""Removes figures that have no content apart from the figure caption. This
261261
can happen for some pages that inject the content with JS
262262
@@ -292,7 +292,7 @@ def remove_figures_without_content(doc: HtmlElement):
292292
drop_tag_preserve_spacing(figure, preserve_content=False)
293293

294294

295-
def clean_double_br_above_figcaption(doc: HtmlElement):
295+
def clean_double_br_above_figcaption(doc: HtmlElement) -> None:
296296
"""Some weird cases like when figure is implemented with tables
297297
we can end up having a double br before figcaptions. For example
298298
in this case

clear_html/formatted_text/headings.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@
1313

1414

1515
def headings_nodes(doc: HtmlElement) -> list[HtmlElement]:
16-
return doc.xpath(
17-
".//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]"
16+
return cast(
17+
"list[HtmlElement]",
18+
doc.xpath(
19+
".//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]"
20+
),
1821
)
1922

2023

@@ -25,7 +28,7 @@ def min_heading(doc: HtmlElement) -> int:
2528

2629
def normalize_headings_level(
2730
doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
28-
):
31+
) -> None:
2932
"""Normalizes headings in the doc so that the lowest level is always 2.
3033
If six levels document is found, the last level is replaced by
3134
``<p><strong></strong></p>``

clear_html/formatted_text/main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def _get_default_cleaner(
111111
)
112112

113113

114-
def paragraphy(doc: HtmlElement):
114+
def paragraphy(doc: HtmlElement) -> None:
115115
"""Ensures all textual content is inside a paragraph for first level.
116116
Removes sequences of consecutive br tags enclosing surroundings into
117117
paragraphs. Note that these kind of double
@@ -153,7 +153,7 @@ def paragraphy(doc: HtmlElement):
153153
last_inline_chunk: list[HtmlElement] = []
154154
include_root_text = True
155155

156-
def push_accumulated_content_as_p(idx):
156+
def push_accumulated_content_as_p(idx: int) -> None:
157157
# Pushes content in last_inline_chunk in
158158
# a new paragraph.
159159
nonlocal include_root_text, doc, children, last_inline_chunk
@@ -188,7 +188,7 @@ def push_accumulated_content_as_p(idx):
188188
push_accumulated_content_as_p(n_children)
189189

190190

191-
def almost_pretty_format(doc: HtmlElement, url: str | None = None):
191+
def almost_pretty_format(doc: HtmlElement, url: str | None = None) -> None:
192192
"""Format doc so that it looks good when serialized as html.
193193
Only modifying first level of the body which is safe (formatting
194194
inner elements is not that safe). One line of separation for first
@@ -234,7 +234,7 @@ def almost_pretty_format(doc: HtmlElement, url: str | None = None):
234234
child.text = child.text.rstrip()
235235

236236

237-
def make_links_absolute(doc: HtmlElement, base_url: str):
237+
def make_links_absolute(doc: HtmlElement, base_url: str) -> None:
238238
"""Like doc.make_links_absolute which ignores errors,
239239
but also does not fail on urls with escape chars, skipping them instead.
240240
"""

clear_html/formatted_text/utils.py

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,15 @@
2525
from collections.abc import Mapping
2626
from collections.abc import Set as AbstractSet
2727

28+
from lxml.etree import QName
2829

29-
def translate_tags(doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()):
30+
# from lxml-stubs
31+
_TagName = str | bytes | bytearray | QName
32+
33+
34+
def translate_tags(
35+
doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
36+
) -> None:
3037
"""Translate tag names (i.e. b -> strong). Mutates the doc.
3138
Nodes in the white list are ignored.
3239
@@ -68,11 +75,11 @@ def set_article_tag_as_root(doc: HtmlElement) -> HtmlElement:
6875

6976
def wrap_tags(
7077
doc: HtmlElement,
71-
to_be_enclosed_tags: AbstractSet,
78+
to_be_enclosed_tags: AbstractSet[_TagName],
7279
enclosing_tag: str,
7380
node_check: Callable[[HtmlElement], bool] = lambda x: True,
74-
transparent_tags: AbstractSet = set(),
75-
):
81+
transparent_tags: AbstractSet[_TagName] = set(),
82+
) -> None:
7683
"""Enclose the elements with tag `to_be_enclosed_tags` within a tag
7784
`enclosing_tag` if they are not already enclosed, that is, if `enclosing_tag`
7885
is not already an ancestor. All transparent tags without more content
@@ -93,12 +100,12 @@ def wrap_tags(
93100

94101
def _wrap_tags_with(
95102
doc: HtmlElement,
96-
to_be_enclosed_tags: AbstractSet,
103+
to_be_enclosed_tags: AbstractSet[_TagName],
97104
enclosing_tag: str,
98-
ancestors_tags: AbstractSet = set(),
105+
ancestors_tags: AbstractSet[_TagName] = set(),
99106
node_check: Callable[[HtmlElement], bool] = lambda x: True,
100-
transparent_tags: AbstractSet = set(),
101-
):
107+
transparent_tags: AbstractSet[_TagName] = set(),
108+
) -> None:
102109
ancestors_tags = ancestors_tags | {doc.tag}
103110
if (
104111
(enclosing_tag not in ancestors_tags)
@@ -128,8 +135,8 @@ def _wrap_tags_with(
128135

129136

130137
def remove_empty_tags(
131-
doc: HtmlElement, white_list: AbstractSet[str] = set(), _root=True
132-
):
138+
doc: HtmlElement, white_list: AbstractSet[str] = set(), _root: bool = True
139+
) -> None:
133140
"""Removes empty tags, but skipping the `white_list` ones
134141
135142
>>> html = fromstring("<article><p><em></em></p></article>")
@@ -148,7 +155,7 @@ def remove_empty_tags(
148155
doc.drop_tag()
149156

150157

151-
def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
158+
def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content: bool = True) -> None:
152159
"""Drops a tag keeping its content. If element to be removed
153160
is a block element, leading or trailing double br tags would
154161
be introduced to preserve spacing. If preserve_content is
@@ -198,7 +205,7 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
198205
doc.drop_tree()
199206

200207

201-
def double_br(doc: HtmlElement | None):
208+
def double_br(doc: HtmlElement | None) -> bool:
202209
"""True if doc and next element are "br" tags without text in between."""
203210
if doc is None or doc.tag != "br":
204211
return False
@@ -238,7 +245,7 @@ def has_no_content(doc: HtmlElement) -> bool:
238245

239246
def is_empty(
240247
doc: HtmlElement, tags_with_content_even_if_empty: AbstractSet[str] = set()
241-
):
248+
) -> bool:
242249
"""Checks if given doc is an empty tag or tag formed with empty tags.
243250
``tags_with_content_even_if_empty`` tags are considered as having content
244251
even if empty.
@@ -270,7 +277,7 @@ def is_empty(
270277
)
271278

272279

273-
def is_phrasing_content(doc: HtmlElement):
280+
def is_phrasing_content(doc: HtmlElement) -> bool:
274281
"""'Phrasing content is the text of the document, as well as elements that
275282
mark up that text at the intra-paragraph level'
276283
(see https://html.spec.whatwg.org/#phrasing-content). This method return
@@ -350,8 +357,8 @@ def find_previous_non_empty_sibling(doc: HtmlElement) -> int | None:
350357
return None
351358

352359

353-
def _test_fn(fn):
354-
def func(doc):
360+
def _test_fn(fn: Callable[[HtmlElement], None]) -> Callable[[str], str]:
361+
def func(doc: str) -> str:
355362
html = fromstring(doc)
356363
fn(html)
357364
return tostring(html).decode()
@@ -362,9 +369,9 @@ def func(doc):
362369
def clean_incomplete_structures(
363370
doc: HtmlElement,
364371
rules: Mapping[str, AbstractSet[str]],
365-
preserve_content=True,
372+
preserve_content: bool = True,
366373
white_list: AbstractSet[HtmlElement] = set(),
367-
):
374+
) -> None:
368375
"""Drop tags (keeping content) of incomplete structures.
369376
For example, removes a td element if not belonging to any table.
370377
Never clean the base element. If preserve_content is false then nodes
@@ -401,10 +408,10 @@ def clean_incomplete_structures(
401408
def _clean_incomplete_structures(
402409
doc: HtmlElement,
403410
rules: Mapping[str, AbstractSet[str]],
404-
ancestors_tags: AbstractSet = set(),
405-
preserve_content=True,
411+
ancestors_tags: AbstractSet[_TagName] = set(),
412+
preserve_content: bool = True,
406413
white_list: AbstractSet[HtmlElement] = set(),
407-
):
414+
) -> None:
408415
ancestors_tags = ancestors_tags | {doc.tag}
409416
for child in doc:
410417
_clean_incomplete_structures(child, rules, ancestors_tags, preserve_content)
@@ -417,7 +424,7 @@ def _clean_incomplete_structures(
417424
drop_tag_preserve_spacing(doc, preserve_content)
418425

419426

420-
def kill_tag_content(doc: HtmlElement, tag: str):
427+
def kill_tag_content(doc: HtmlElement, tag: str) -> None:
421428
"""Removes the content of all these tags found in the doc
422429
423430
>>> def kill(html):

0 commit comments

Comments
 (0)