zytedata
diff --git a/clear_html/__init__.py b/clear_html/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/clear_html/body_annotations.py b/clear_html/body_annotations.py
Lines changed: 1 addition & 1 deletion
diff --git a/clear_html/clean.py b/clear_html/clean.py
Lines changed: 8 additions & 5 deletions
diff --git a/clear_html/formatted_text/__init__.py b/clear_html/formatted_text/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/clear_html/formatted_text/cleaner.py b/clear_html/formatted_text/cleaner.py
Lines changed: 5 additions & 5 deletions
diff --git a/clear_html/formatted_text/figures.py b/clear_html/formatted_text/figures.py
Lines changed: 9 additions & 9 deletions
diff --git a/clear_html/formatted_text/headings.py b/clear_html/formatted_text/headings.py
Lines changed: 6 additions & 3 deletions
diff --git a/clear_html/formatted_text/main.py b/clear_html/formatted_text/main.py
Lines changed: 4 additions & 4 deletions
diff --git a/clear_html/formatted_text/utils.py b/clear_html/formatted_text/utils.py
Lines changed: 29 additions & 22 deletions
@@ -1,2 +1,4 @@
 __version__ = "0.4.1"
-from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text  # noqa: F401
+from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text
+
+__all__ = ["clean_node", "cleaned_node_to_html", "cleaned_node_to_text"]
@@ -37,7 +37,7 @@ def load(cls, path: Path) -> BodyAnnotations:
         )
         return cls({})
 
-    def save(self, path: Path):
+    def save(self, path: Path) -> None:
         as_dict = {id_: attr.asdict(ann) for id_, ann in self.items()}
         path.write_text(
             json.dumps(as_dict, sort_keys=True, ensure_ascii=False, indent=4),
 
@@ -1,17 +1,20 @@
 from __future__ import annotations
 
 import copy
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import html_text
 from lxml.html import HtmlElement, tostring
 
 from clear_html.formatted_text import clean_doc
 from clear_html.html_embeddings import integrate_embeddings
 
+if TYPE_CHECKING:
+    from collections.abc import Set as AbstractSet
+
 
 def cleaned_node_to_text(
-    node: HtmlElement, text_extractor: Callable | None = None
+    node: HtmlElement, text_extractor: Callable[[HtmlElement], str] | None = None
 ) -> str | None:
     """Format the given html tree as plain text, applying particular exclusions
     only applied to plain text (i.e. remove figure captions).
@@ -65,21 +68,21 @@ def clean_node(node: HtmlElement, url: str | None = None) -> HtmlElement:
     return clean_doc(node, url, nodes_whitelist)
 
 
-def apply_text_exclusions(node: HtmlElement):
+def apply_text_exclusions(node: HtmlElement) -> None:
     """Apply some additional exclusions that are needed to export the
     body as text. Modify given node."""
     exclude_figcaption(node)
 
 
-def exclude_figcaption(node: HtmlElement):
+def exclude_figcaption(node: HtmlElement) -> None:
     # Exclude html figcaption tag
     to_exclude = set(node.xpath(".//figcaption"))
     # Never exclude the node itself
     to_exclude -= {node}
     _drop_trees(to_exclude)
 
 
-def _drop_trees(to_exclude):
+def _drop_trees(to_exclude: AbstractSet[HtmlElement]) -> None:
     for el in to_exclude:
         if el.getparent() is not None:  # Parent cannot be removed
             el.drop_tree()
@@ -41,4 +41,6 @@
   - Finally html is formatted a little bit to have a good looking.
 """
 
-from clear_html.formatted_text.main import clean_doc  # noqa: F401
+from clear_html.formatted_text.main import clean_doc
+
+__all__ = ["clean_doc"]
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 from lxml import etree
 from lxml.html import HtmlElement, defs
@@ -38,8 +38,8 @@ def __init__(
         self,
         nodes_whitelist: AbstractSet[HtmlElement] | None = None,
         allow_data_attrs: bool = True,
-        allow_tags=None,
-        **kw,
+        allow_tags: AbstractSet[str] | None = None,
+        **kw: Any,
     ):
         # Short-circuit the safe_attrs to be able to provide a smarter filtering
         self._body_safe_attrs = kw.pop("safe_attrs", defs.safe_attrs)
@@ -55,7 +55,7 @@ def __init__(
             kw[option] = False
         super().__init__(**kw)
 
-    def __call__(self, doc: HtmlElement):  # type: ignore[override]
+    def __call__(self, doc: HtmlElement) -> None:  # type: ignore[override]
         super().__call__(doc)
         if self._body_safe_attrs_only:
             safe_attrs = self._body_safe_attrs
@@ -90,7 +90,7 @@ def __call__(self, doc: HtmlElement):  # type: ignore[override]
                     el = to_remove.pop()
                     drop_tag_preserve_spacing(el)
 
-    def allow_element(self, el):
+    def allow_element(self, el: HtmlElement) -> bool:
         if el in self._nodes_whitelist:
             return True
         return super().allow_element(el)
@@ -55,7 +55,7 @@ def _get_figure_caption_cleaner() -> Cleaner:
     )
 
 
-def enclose_media_within_figure(doc: HtmlElement):
+def enclose_media_within_figure(doc: HtmlElement) -> None:
     """Ensures all media (images, videos, etc) are enclosed within figures.
     If possible, images with
     a link also includes the link within the figure element."""
@@ -69,7 +69,7 @@ def enclose_media_within_figure(doc: HtmlElement):
 
 def top_level_media_within_figure(
     doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
-):
+) -> None:
     """Enclose top level isolated multimedia into figures. In other words,
     paragraphs containing only a single media element are replaced by a figure.
     Nodes in the white list are ignored.
@@ -90,7 +90,7 @@ def top_level_media_within_figure(
     '<div><figure><audio><source></source></audio></figure></div>'
     """
 
-    def is_single_tag(el: HtmlElement):
+    def is_single_tag(el: HtmlElement) -> bool:
         return len(el) == 1 and not has_text(el) and not has_tail(el[0])
 
     for child in doc:
@@ -105,7 +105,7 @@ def is_single_tag(el: HtmlElement):
                 single_p.tag = "figure"
 
 
-def infer_img_url_from_data_src_attr(doc: HtmlElement):
+def infer_img_url_from_data_src_attr(doc: HtmlElement) -> None:
     """Fills src attribute from data-src for img tags.
     It is common to see img tags without src attribute but with data-src
 
@@ -119,7 +119,7 @@ def infer_img_url_from_data_src_attr(doc: HtmlElement):
             el.attrib["src"] = cast("str", el.get("data-src"))
 
 
-def create_figures_from_isolated_figcaptions(node: HtmlElement):
+def create_figures_from_isolated_figcaptions(node: HtmlElement) -> None:
     """Wraps isolated figcaptions with the content above and form a new figure.
     Mutates node.
 
@@ -207,7 +207,7 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
                 fuse_figcaptions(new_figure)
 
 
-def fuse_figcaptions(figure: HtmlElement):
+def fuse_figcaptions(figure: HtmlElement) -> None:
     """Fuses first block of consecutive figcaptions and remove the rest found.
 
     >>> fuse = _test_fn(fuse_figcaptions)
@@ -244,7 +244,7 @@ def fuse_figcaptions(figure: HtmlElement):
             drop_tag_preserve_spacing(child)
 
 
-def clean_figcaptions_html(node: HtmlElement):
+def clean_figcaptions_html(node: HtmlElement) -> None:
     """Simplifies figcapion html
     >>> html = fromstring("<div><figcaption><table><p><strong>hey</strong></p></table></figcaption></div>")
     >>> clean_figcaptions_html(html)
@@ -256,7 +256,7 @@ def clean_figcaptions_html(node: HtmlElement):
         clean(caption)
 
 
-def remove_figures_without_content(doc: HtmlElement):
+def remove_figures_without_content(doc: HtmlElement) -> None:
     """Removes figures that has no content apart of the figure caption. This
     can happen for some pages that inject the content with JS
 
@@ -292,7 +292,7 @@ def remove_figures_without_content(doc: HtmlElement):
             drop_tag_preserve_spacing(figure, preserve_content=False)
 
 
-def clean_double_br_above_figcaption(doc: HtmlElement):
+def clean_double_br_above_figcaption(doc: HtmlElement) -> None:
     """Some weird cases like when figure is implemented with tables
     we can end having a double br before figcaptions. For example
     in this case
 
@@ -13,8 +13,11 @@
 
 
 def headings_nodes(doc: HtmlElement) -> list[HtmlElement]:
-    return doc.xpath(
-        ".//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]"
+    return cast(
+        "list[HtmlElement]",
+        doc.xpath(
+            ".//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]"
+        ),
     )
 
 
@@ -25,7 +28,7 @@ def min_heading(doc: HtmlElement) -> int:
 
 def normalize_headings_level(
     doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
-):
+) -> None:
     """Normalizes headings in the doc so that the lowest level is always 2.
     If six levels document is found, the last level is replaced by
     ``<p><strong></strong></p>``
 
@@ -111,7 +111,7 @@ def _get_default_cleaner(
     )
 
 
-def paragraphy(doc: HtmlElement):
+def paragraphy(doc: HtmlElement) -> None:
     """Ensures all textual content is inside a paragraph for first level.
     Removes sequences of consecutive br tags enclosing surroundings into
     paragraphs. Note that these kind of double
@@ -153,7 +153,7 @@ def paragraphy(doc: HtmlElement):
     last_inline_chunk: list[HtmlElement] = []
     include_root_text = True
 
-    def push_accumulated_content_as_p(idx):
+    def push_accumulated_content_as_p(idx: int) -> None:
         # Pushes content in last_inline_chunk in
         # a new paragraph.
         nonlocal include_root_text, doc, children, last_inline_chunk
@@ -188,7 +188,7 @@ def push_accumulated_content_as_p(idx):
     push_accumulated_content_as_p(n_children)
 
 
-def almost_pretty_format(doc: HtmlElement, url: str | None = None):
+def almost_pretty_format(doc: HtmlElement, url: str | None = None) -> None:
     """Format doc to have a good looking when serialized as html.
     Only modifying first level of the body which is safe (formatting
     inner elements is not that safe). One line of separation for first
@@ -234,7 +234,7 @@ def almost_pretty_format(doc: HtmlElement, url: str | None = None):
                 child.text = child.text.rstrip()
 
 
-def make_links_absolute(doc: HtmlElement, base_url: str):
+def make_links_absolute(doc: HtmlElement, base_url: str) -> None:
     """Like doc.make_links_absolute which ignores errors,
     but also does not fail on urls with escape chars, skipping them instead.
     """
 
@@ -25,8 +25,15 @@
     from collections.abc import Mapping
     from collections.abc import Set as AbstractSet
 
+    from lxml.etree import QName
 
-def translate_tags(doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()):
+    # from lxml-stubs
+    _TagName = str | bytes | bytearray | QName
+
+
+def translate_tags(
+    doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
+) -> None:
     """Translate tag names (i.e. b -> strong). Mutates the doc.
     Nodes in the white list are ignored.
 
@@ -68,11 +75,11 @@ def set_article_tag_as_root(doc: HtmlElement) -> HtmlElement:
 
 def wrap_tags(
     doc: HtmlElement,
-    to_be_enclosed_tags: AbstractSet,
+    to_be_enclosed_tags: AbstractSet[_TagName],
     enclosing_tag: str,
     node_check: Callable[[HtmlElement], bool] = lambda x: True,
-    transparent_tags: AbstractSet = set(),
-):
+    transparent_tags: AbstractSet[_TagName] = set(),
+) -> None:
     """Enclose the elements with tag `to_be_enclosed_tags` within a tag
     `enclosing_tag` if they are not already enclosed, that is, if `enclosing_tag`
     is not already an ancestor. All transparent tags without more content
@@ -93,12 +100,12 @@ def wrap_tags(
 
 def _wrap_tags_with(
     doc: HtmlElement,
-    to_be_enclosed_tags: AbstractSet,
+    to_be_enclosed_tags: AbstractSet[_TagName],
     enclosing_tag: str,
-    ancestors_tags: AbstractSet = set(),
+    ancestors_tags: AbstractSet[_TagName] = set(),
     node_check: Callable[[HtmlElement], bool] = lambda x: True,
-    transparent_tags: AbstractSet = set(),
-):
+    transparent_tags: AbstractSet[_TagName] = set(),
+) -> None:
     ancestors_tags = ancestors_tags | {doc.tag}
     if (
         (enclosing_tag not in ancestors_tags)
@@ -128,8 +135,8 @@ def _wrap_tags_with(
 
 
 def remove_empty_tags(
-    doc: HtmlElement, white_list: AbstractSet[str] = set(), _root=True
-):
+    doc: HtmlElement, white_list: AbstractSet[str] = set(), _root: bool = True
+) -> None:
     """Removes empty tags, but skipping the `white_list` ones
 
     >>> html = fromstring("<article><p><em></em></p></article>")
@@ -148,7 +155,7 @@ def remove_empty_tags(
         doc.drop_tag()
 
 
-def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
+def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content: bool = True) -> None:
     """Drops a tag keeping its content. If element to be removed
     is a block element, leading or trailing double br tags would
     be introduced to preserve spacing. If preserve_content is
@@ -198,7 +205,7 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
         doc.drop_tree()
 
 
-def double_br(doc: HtmlElement | None):
+def double_br(doc: HtmlElement | None) -> bool:
     """True if doc and next element are "br" tags without text in between."""
     if doc is None or doc.tag != "br":
         return False
@@ -238,7 +245,7 @@ def has_no_content(doc: HtmlElement) -> bool:
 
 def is_empty(
     doc: HtmlElement, tags_with_content_even_if_empty: AbstractSet[str] = set()
-):
+) -> bool:
     """Checks if given doc is an empty tag or tag formed with empty tags.
     ``tags_with_content_even_if_empty`` tags are considered as having content
     even if empty.
@@ -270,7 +277,7 @@ def is_empty(
     )
 
 
-def is_phrasing_content(doc: HtmlElement):
+def is_phrasing_content(doc: HtmlElement) -> bool:
     """'Phrasing content is the text of the document, as well as elements that
     mark up that text at the intra-paragraph level'
     (see https://html.spec.whatwg.org/#phrasing-content). This method return
@@ -350,8 +357,8 @@ def find_previous_non_empty_sibling(doc: HtmlElement) -> int | None:
     return None
 
 
-def _test_fn(fn):
-    def func(doc):
+def _test_fn(fn: Callable[[HtmlElement], None]) -> Callable[[str], str]:
+    def func(doc: str) -> str:
         html = fromstring(doc)
         fn(html)
         return tostring(html).decode()
@@ -362,9 +369,9 @@ def func(doc):
 def clean_incomplete_structures(
     doc: HtmlElement,
     rules: Mapping[str, AbstractSet[str]],
-    preserve_content=True,
+    preserve_content: bool = True,
     white_list: AbstractSet[HtmlElement] = set(),
-):
+) -> None:
     """Drop tags (keeping content) of incomplete structures.
     For example, removes a td element if not belonging to any table.
     Never clean the base element. If preserve_content is false then nodes
@@ -401,10 +408,10 @@ def clean_incomplete_structures(
 def _clean_incomplete_structures(
     doc: HtmlElement,
     rules: Mapping[str, AbstractSet[str]],
-    ancestors_tags: AbstractSet = set(),
-    preserve_content=True,
+    ancestors_tags: AbstractSet[_TagName] = set(),
+    preserve_content: bool = True,
     white_list: AbstractSet[HtmlElement] = set(),
-):
+) -> None:
     ancestors_tags = ancestors_tags | {doc.tag}
     for child in doc:
         _clean_incomplete_structures(child, rules, ancestors_tags, preserve_content)
@@ -417,7 +424,7 @@ def _clean_incomplete_structures(
         drop_tag_preserve_spacing(doc, preserve_content)
 
 
-def kill_tag_content(doc: HtmlElement, tag: str):
+def kill_tag_content(doc: HtmlElement, tag: str) -> None:
     """Removes the content of all these tags found in the doc
 
     >>> def kill(html):
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ def load(cls, path: Path) -> BodyAnnotations:`
`37`	`37`	`)`
`38`	`38`	`return cls({})`
`39`	`39`
`40`		`- def save(self, path: Path):`
	`40`	`+ def save(self, path: Path) -> None:`
`41`	`41`	`as_dict = {id_: attr.asdict(ann) for id_, ann in self.items()}`
`42`	`42`	`path.write_text(`
`43`	`43`	`json.dumps(as_dict, sort_keys=True, ensure_ascii=False, indent=4),`