zytedata
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 8 additions & 5 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 7 additions & 2 deletions
diff --git a/clear_html/__init__.py b/clear_html/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/clear_html/body_annotations.py b/clear_html/body_annotations.py
Lines changed: 6 additions & 3 deletions
diff --git a/clear_html/clean.py b/clear_html/clean.py
Lines changed: 8 additions & 5 deletions
diff --git a/clear_html/formatted_text/__init__.py b/clear_html/formatted_text/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/clear_html/formatted_text/cleaner.py b/clear_html/formatted_text/cleaner.py
Lines changed: 5 additions & 5 deletions
diff --git a/clear_html/formatted_text/figures.py b/clear_html/formatted_text/figures.py
Lines changed: 17 additions & 16 deletions
@@ -13,9 +13,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v5
     - name: Set up Python
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@v6
       with:
         python-version: '3.13'
     - name: Install dependencies
 
@@ -13,15 +13,18 @@ jobs:
       matrix:
         include:
         - python-version: "3.9"
+        - python-version: "3.9"
+          env:
+            TOXENV: pinned
         - python-version: "3.10"
         - python-version: "3.11"
         - python-version: "3.12"
         - python-version: "3.13"
-
+        - python-version: "3.14.0-rc.3"
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v5
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
@@ -45,9 +48,9 @@ jobs:
         tox-job: ["lint", "mypy", "twinecheck"]
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v5
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
 
@@ -10,6 +10,7 @@
 
 # IDE
 /.idea/
+/.vscode/
 
 .mypy_cache/
 .cache/
@@ -22,4 +23,4 @@ htmlcov/
 # Test and coverage
 test-results
 coverage-html
-
+coverage.xml
@@ -1,7 +1,12 @@
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.5
+  rev: v0.13.2
   hooks:
-    - id: ruff
+    - id: ruff-check
       args: [ --fix ]
     - id: ruff-format
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v6.0.0
+  hooks:
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
@@ -1,2 +1,4 @@
 __version__ = "0.4.1"
-from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text  # noqa: F401
+from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text
+
+__all__ = ["clean_node", "cleaned_node_to_html", "cleaned_node_to_text"]
@@ -10,6 +10,9 @@
     from pathlib import Path
 
 
+logger = logging.getLogger(__name__)
+
+
 @attr.s(auto_attribs=True)
 class BodyAnnotation:
     url: str
@@ -29,13 +32,13 @@ def load(cls, path: Path) -> BodyAnnotations:
             with path.open("rt", encoding="utf8") as f:
                 pages = json.load(f)
                 return cls((k, BodyAnnotation(**v)) for k, v in pages.items())
-        logging.info(
+        logger.info(
             f"Body annotations file does not exist in {path}. Loading empty annotations"
         )
         return cls({})
 
-    def save(self, path: Path):
-        as_dict = {id: attr.asdict(ann) for id, ann in self.items()}
+    def save(self, path: Path) -> None:
+        as_dict = {id_: attr.asdict(ann) for id_, ann in self.items()}
         path.write_text(
             json.dumps(as_dict, sort_keys=True, ensure_ascii=False, indent=4),
             encoding="utf8",
 
@@ -1,17 +1,20 @@
 from __future__ import annotations
 
 import copy
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import html_text
 from lxml.html import HtmlElement, tostring
 
 from clear_html.formatted_text import clean_doc
 from clear_html.html_embeddings import integrate_embeddings
 
+if TYPE_CHECKING:
+    from collections.abc import Set as AbstractSet
+
 
 def cleaned_node_to_text(
-    node: HtmlElement, text_extractor: Callable | None = None
+    node: HtmlElement, text_extractor: Callable[[HtmlElement], str] | None = None
 ) -> str | None:
     """Format the given html tree as plain text, applying particular exclusions
     only applied to plain text (i.e. remove figure captions).
@@ -65,21 +68,21 @@ def clean_node(node: HtmlElement, url: str | None = None) -> HtmlElement:
     return clean_doc(node, url, nodes_whitelist)
 
 
-def apply_text_exclusions(node: HtmlElement):
+def apply_text_exclusions(node: HtmlElement) -> None:
     """Apply some additional exclusions that are needed to export the
     body as text. Modify given node."""
     exclude_figcaption(node)
 
 
-def exclude_figcaption(node: HtmlElement):
+def exclude_figcaption(node: HtmlElement) -> None:
     # Exclude html figcaption tag
     to_exclude = set(node.xpath(".//figcaption"))
     # Never exclude the node itself
     to_exclude -= {node}
     _drop_trees(to_exclude)
 
 
-def _drop_trees(to_exclude):
+def _drop_trees(to_exclude: AbstractSet[HtmlElement]) -> None:
     for el in to_exclude:
         if el.getparent() is not None:  # Parent cannot be removed
             el.drop_tree()
@@ -41,4 +41,6 @@
   - Finally html is formatted a little bit to have a good looking.
 """
 
-from clear_html.formatted_text.main import clean_doc  # noqa: F401
+from clear_html.formatted_text.main import clean_doc
+
+__all__ = ["clean_doc"]
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 from lxml import etree
 from lxml.html import HtmlElement, defs
@@ -38,8 +38,8 @@ def __init__(
         self,
         nodes_whitelist: AbstractSet[HtmlElement] | None = None,
         allow_data_attrs: bool = True,
-        allow_tags=None,
-        **kw,
+        allow_tags: AbstractSet[str] | None = None,
+        **kw: Any,
     ):
         # Short-circuit the safe_attrs to be able to provide a smarter filtering
         self._body_safe_attrs = kw.pop("safe_attrs", defs.safe_attrs)
@@ -55,7 +55,7 @@ def __init__(
             kw[option] = False
         super().__init__(**kw)
 
-    def __call__(self, doc: HtmlElement):  # type: ignore[override]
+    def __call__(self, doc: HtmlElement) -> None:  # type: ignore[override]
         super().__call__(doc)
         if self._body_safe_attrs_only:
             safe_attrs = self._body_safe_attrs
@@ -90,7 +90,7 @@ def __call__(self, doc: HtmlElement):  # type: ignore[override]
                     el = to_remove.pop()
                     drop_tag_preserve_spacing(el)
 
-    def allow_element(self, el):
+    def allow_element(self, el: HtmlElement) -> bool:
         if el in self._nodes_whitelist:
             return True
         return super().allow_element(el)
@@ -55,7 +55,7 @@ def _get_figure_caption_cleaner() -> Cleaner:
     )
 
 
-def enclose_media_within_figure(doc: HtmlElement):
+def enclose_media_within_figure(doc: HtmlElement) -> None:
     """Ensures all media (images, videos, etc) are enclosed within figures.
     If possible, images with
     a link also includes the link within the figure element."""
@@ -69,7 +69,7 @@ def enclose_media_within_figure(doc: HtmlElement):
 
 def top_level_media_within_figure(
     doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
-):
+) -> None:
     """Enclose top level isolated multimedia into figures. In other words,
     paragraphs containing only a single media element are replaced by a figure.
     Nodes in the white list are ignored.
@@ -90,7 +90,7 @@ def top_level_media_within_figure(
     '<div><figure><audio><source></source></audio></figure></div>'
     """
 
-    def is_single_tag(el: HtmlElement):
+    def is_single_tag(el: HtmlElement) -> bool:
         return len(el) == 1 and not has_text(el) and not has_tail(el[0])
 
     for child in doc:
@@ -105,7 +105,7 @@ def is_single_tag(el: HtmlElement):
                 single_p.tag = "figure"
 
 
-def infer_img_url_from_data_src_attr(doc: HtmlElement):
+def infer_img_url_from_data_src_attr(doc: HtmlElement) -> None:
     """Fills src attribute from data-src for img tags.
     It is common to see img tags without src attribute but with data-src
 
@@ -116,10 +116,10 @@ def infer_img_url_from_data_src_attr(doc: HtmlElement):
     """
     for el in doc.iterfind(".//img"):
         if not el.get("src") and el.get("data-src"):
-            el.attrib["src"] = cast(str, el.get("data-src"))
+            el.attrib["src"] = cast("str", el.get("data-src"))
 
 
-def create_figures_from_isolated_figcaptions(node: HtmlElement):
+def create_figures_from_isolated_figcaptions(node: HtmlElement) -> None:
     """Wraps isolated figcaptions with the content above and form a new figure.
     Mutates node.
 
@@ -157,8 +157,8 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
     '<article><figure><img href="link1"><br><br><figcaption>caption1</figcaption></figure></article>'
     """
     for caption in node.xpath(".//figcaption"):
-        slice = group_with_previous_content_block(caption)
-        if slice:
+        slice_ = group_with_previous_content_block(caption)
+        if slice_:
             anctrs = ancestors(caption, stop_at=node)
             ancestors_tags = [n.tag for n in anctrs]
             # Avoiding creating the figure if previous selected content is
@@ -169,14 +169,14 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
             # finally a figure was formed with a the paragraph before, which
             # is wrong. It is safe then not to form the figure and so the caption
             # will be just removed.
-            prev_content_node = slice.node[slice.start]
+            prev_content_node = slice_.node[slice_.start]
             prev_content_is_paragraph = (
                 prev_content_node.tag == "p"
                 and not FIGURE_CONTENT_TAGS
-                & {n.tag for n in descendants(prev_content_node)}
+                & {cast("str", n.tag) for n in descendants(prev_content_node)}
             )
             if "figure" not in ancestors_tags and not prev_content_is_paragraph:
-                if slice.node.tag in [
+                if slice_.node.tag in [
                     "table",
                     "tbody",
                     "thead",
@@ -194,19 +194,20 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
                     # structure.
                     for ancestor in anctrs:
                         if ancestor.tag in MUST_ANCESTORS_FOR_KEEP_CONTENT_REVERSED:
+                            assert isinstance(ancestor.tag, str)
                             ancestor.tag = MUST_ANCESTORS_FOR_KEEP_CONTENT_REVERSED[
                                 ancestor.tag
                             ]
                             break
-                new_figure = wrap_children_slice(slice, "figure")
+                new_figure = wrap_children_slice(slice_, "figure")
                 # Case when figure was at the same level that caption.
                 # This avoids having figures inside figures in this case.
                 for inner_figure in new_figure.xpath(".//figure"):
                     drop_tag_preserve_spacing(inner_figure)
                 fuse_figcaptions(new_figure)
 
 
-def fuse_figcaptions(figure: HtmlElement):
+def fuse_figcaptions(figure: HtmlElement) -> None:
     """Fuses first block of consecutive figcaptions and remove the rest found.
 
     >>> fuse = _test_fn(fuse_figcaptions)
@@ -243,7 +244,7 @@ def fuse_figcaptions(figure: HtmlElement):
             drop_tag_preserve_spacing(child)
 
 
-def clean_figcaptions_html(node: HtmlElement):
+def clean_figcaptions_html(node: HtmlElement) -> None:
     """Simplifies figcapion html
     >>> html = fromstring("<div><figcaption><table><p><strong>hey</strong></p></table></figcaption></div>")
     >>> clean_figcaptions_html(html)
@@ -255,7 +256,7 @@ def clean_figcaptions_html(node: HtmlElement):
         clean(caption)
 
 
-def remove_figures_without_content(doc: HtmlElement):
+def remove_figures_without_content(doc: HtmlElement) -> None:
     """Removes figures that has no content apart of the figure caption. This
     can happen for some pages that inject the content with JS
 
@@ -291,7 +292,7 @@ def remove_figures_without_content(doc: HtmlElement):
             drop_tag_preserve_spacing(figure, preserve_content=False)
 
 
-def clean_double_br_above_figcaption(doc: HtmlElement):
+def clean_double_br_above_figcaption(doc: HtmlElement) -> None:
     """Some weird cases like when figure is implemented with tables
     we can end having a double br before figcaptions. For example
     in this case