Skip to content

Commit 05ce5cb

Browse files
authored
Modernize (#11)
1 parent 5a10dfe commit 05ce5cb

22 files changed

+201
-135
lines changed

.github/workflows/publish.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ jobs:
1313
runs-on: ubuntu-latest
1414

1515
steps:
16-
- uses: actions/checkout@v4
16+
- uses: actions/checkout@v5
1717
- name: Set up Python
18-
uses: actions/setup-python@v5
18+
uses: actions/setup-python@v6
1919
with:
2020
python-version: '3.13'
2121
- name: Install dependencies

.github/workflows/test.yml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,18 @@ jobs:
1313
matrix:
1414
include:
1515
- python-version: "3.9"
16+
- python-version: "3.9"
17+
env:
18+
TOXENV: pinned
1619
- python-version: "3.10"
1720
- python-version: "3.11"
1821
- python-version: "3.12"
1922
- python-version: "3.13"
20-
23+
- python-version: "3.14.0-rc.3"
2124
steps:
22-
- uses: actions/checkout@v4
25+
- uses: actions/checkout@v5
2326
- name: Set up Python ${{ matrix.python-version }}
24-
uses: actions/setup-python@v5
27+
uses: actions/setup-python@v6
2528
with:
2629
python-version: ${{ matrix.python-version }}
2730
- name: Install dependencies
@@ -45,9 +48,9 @@ jobs:
4548
tox-job: ["lint", "mypy", "twinecheck"]
4649

4750
steps:
48-
- uses: actions/checkout@v4
51+
- uses: actions/checkout@v5
4952
- name: Set up Python ${{ matrix.python-version }}
50-
uses: actions/setup-python@v5
53+
uses: actions/setup-python@v6
5154
with:
5255
python-version: ${{ matrix.python-version }}
5356
- name: Install dependencies

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
# IDE
1212
/.idea/
13+
/.vscode/
1314

1415
.mypy_cache/
1516
.cache/
@@ -22,4 +23,4 @@ htmlcov/
2223
# Test and coverage
2324
test-results
2425
coverage-html
25-
26+
coverage.xml

.pre-commit-config.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
repos:
22
- repo: https://github.com/astral-sh/ruff-pre-commit
3-
rev: v0.9.5
3+
rev: v0.13.2
44
hooks:
5-
- id: ruff
5+
- id: ruff-check
66
args: [ --fix ]
77
- id: ruff-format
8+
- repo: https://github.com/pre-commit/pre-commit-hooks
9+
rev: v6.0.0
10+
hooks:
11+
- id: end-of-file-fixer
12+
- id: trailing-whitespace

clear_html/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
__version__ = "0.4.1"
2-
from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text # noqa: F401
2+
from .clean import clean_node, cleaned_node_to_html, cleaned_node_to_text
3+
4+
__all__ = ["clean_node", "cleaned_node_to_html", "cleaned_node_to_text"]

clear_html/body_annotations.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
from pathlib import Path
1111

1212

13+
logger = logging.getLogger(__name__)
14+
15+
1316
@attr.s(auto_attribs=True)
1417
class BodyAnnotation:
1518
url: str
@@ -29,13 +32,13 @@ def load(cls, path: Path) -> BodyAnnotations:
2932
with path.open("rt", encoding="utf8") as f:
3033
pages = json.load(f)
3134
return cls((k, BodyAnnotation(**v)) for k, v in pages.items())
32-
logging.info(
35+
logger.info(
3336
f"Body annotations file does not exist in {path}. Loading empty annotations"
3437
)
3538
return cls({})
3639

37-
def save(self, path: Path):
38-
as_dict = {id: attr.asdict(ann) for id, ann in self.items()}
40+
def save(self, path: Path) -> None:
41+
as_dict = {id_: attr.asdict(ann) for id_, ann in self.items()}
3942
path.write_text(
4043
json.dumps(as_dict, sort_keys=True, ensure_ascii=False, indent=4),
4144
encoding="utf8",

clear_html/clean.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
from __future__ import annotations
22

33
import copy
4-
from typing import Callable
4+
from typing import TYPE_CHECKING, Callable
55

66
import html_text
77
from lxml.html import HtmlElement, tostring
88

99
from clear_html.formatted_text import clean_doc
1010
from clear_html.html_embeddings import integrate_embeddings
1111

12+
if TYPE_CHECKING:
13+
from collections.abc import Set as AbstractSet
14+
1215

1316
def cleaned_node_to_text(
14-
node: HtmlElement, text_extractor: Callable | None = None
17+
node: HtmlElement, text_extractor: Callable[[HtmlElement], str] | None = None
1518
) -> str | None:
1619
"""Format the given html tree as plain text, applying particular exclusions
1720
only applied to plain text (i.e. remove figure captions).
@@ -65,21 +68,21 @@ def clean_node(node: HtmlElement, url: str | None = None) -> HtmlElement:
6568
return clean_doc(node, url, nodes_whitelist)
6669

6770

68-
def apply_text_exclusions(node: HtmlElement):
71+
def apply_text_exclusions(node: HtmlElement) -> None:
6972
"""Apply some additional exclusions that are needed to export the
7073
body as text. Modify given node."""
7174
exclude_figcaption(node)
7275

7376

74-
def exclude_figcaption(node: HtmlElement):
77+
def exclude_figcaption(node: HtmlElement) -> None:
7578
# Exclude html figcaption tag
7679
to_exclude = set(node.xpath(".//figcaption"))
7780
# Never exclude the node itself
7881
to_exclude -= {node}
7982
_drop_trees(to_exclude)
8083

8184

82-
def _drop_trees(to_exclude):
85+
def _drop_trees(to_exclude: AbstractSet[HtmlElement]) -> None:
8386
for el in to_exclude:
8487
if el.getparent() is not None: # Parent cannot be removed
8588
el.drop_tree()

clear_html/formatted_text/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,6 @@
4141
- Finally, the html is lightly formatted so that it looks good.
4242
"""
4343

44-
from clear_html.formatted_text.main import clean_doc # noqa: F401
44+
from clear_html.formatted_text.main import clean_doc
45+
46+
__all__ = ["clean_doc"]

clear_html/formatted_text/cleaner.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
3+
from typing import TYPE_CHECKING, Any
44

55
from lxml import etree
66
from lxml.html import HtmlElement, defs
@@ -38,8 +38,8 @@ def __init__(
3838
self,
3939
nodes_whitelist: AbstractSet[HtmlElement] | None = None,
4040
allow_data_attrs: bool = True,
41-
allow_tags=None,
42-
**kw,
41+
allow_tags: AbstractSet[str] | None = None,
42+
**kw: Any,
4343
):
4444
# Short-circuit the safe_attrs to be able to provide a smarter filtering
4545
self._body_safe_attrs = kw.pop("safe_attrs", defs.safe_attrs)
@@ -55,7 +55,7 @@ def __init__(
5555
kw[option] = False
5656
super().__init__(**kw)
5757

58-
def __call__(self, doc: HtmlElement): # type: ignore[override]
58+
def __call__(self, doc: HtmlElement) -> None: # type: ignore[override]
5959
super().__call__(doc)
6060
if self._body_safe_attrs_only:
6161
safe_attrs = self._body_safe_attrs
@@ -90,7 +90,7 @@ def __call__(self, doc: HtmlElement): # type: ignore[override]
9090
el = to_remove.pop()
9191
drop_tag_preserve_spacing(el)
9292

93-
def allow_element(self, el):
93+
def allow_element(self, el: HtmlElement) -> bool:
9494
if el in self._nodes_whitelist:
9595
return True
9696
return super().allow_element(el)

clear_html/formatted_text/figures.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def _get_figure_caption_cleaner() -> Cleaner:
5555
)
5656

5757

58-
def enclose_media_within_figure(doc: HtmlElement):
58+
def enclose_media_within_figure(doc: HtmlElement) -> None:
5959
"""Ensures all media (images, videos, etc) are enclosed within figures.
6060
If possible, images with
6161
a link also include the link within the figure element."""
@@ -69,7 +69,7 @@ def enclose_media_within_figure(doc: HtmlElement):
6969

7070
def top_level_media_within_figure(
7171
doc: HtmlElement, white_list: AbstractSet[HtmlElement] = set()
72-
):
72+
) -> None:
7373
"""Enclose top level isolated multimedia into figures. In other words,
7474
paragraphs containing only a single media element are replaced by a figure.
7575
Nodes in the white list are ignored.
@@ -90,7 +90,7 @@ def top_level_media_within_figure(
9090
'<div><figure><audio><source></source></audio></figure></div>'
9191
"""
9292

93-
def is_single_tag(el: HtmlElement):
93+
def is_single_tag(el: HtmlElement) -> bool:
9494
return len(el) == 1 and not has_text(el) and not has_tail(el[0])
9595

9696
for child in doc:
@@ -105,7 +105,7 @@ def is_single_tag(el: HtmlElement):
105105
single_p.tag = "figure"
106106

107107

108-
def infer_img_url_from_data_src_attr(doc: HtmlElement):
108+
def infer_img_url_from_data_src_attr(doc: HtmlElement) -> None:
109109
"""Fills src attribute from data-src for img tags.
110110
It is common to see img tags without src attribute but with data-src
111111
@@ -116,10 +116,10 @@ def infer_img_url_from_data_src_attr(doc: HtmlElement):
116116
"""
117117
for el in doc.iterfind(".//img"):
118118
if not el.get("src") and el.get("data-src"):
119-
el.attrib["src"] = cast(str, el.get("data-src"))
119+
el.attrib["src"] = cast("str", el.get("data-src"))
120120

121121

122-
def create_figures_from_isolated_figcaptions(node: HtmlElement):
122+
def create_figures_from_isolated_figcaptions(node: HtmlElement) -> None:
123123
"""Wraps isolated figcaptions with the content above and form a new figure.
124124
Mutates node.
125125
@@ -157,8 +157,8 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
157157
'<article><figure><img href="link1"><br><br><figcaption>caption1</figcaption></figure></article>'
158158
"""
159159
for caption in node.xpath(".//figcaption"):
160-
slice = group_with_previous_content_block(caption)
161-
if slice:
160+
slice_ = group_with_previous_content_block(caption)
161+
if slice_:
162162
anctrs = ancestors(caption, stop_at=node)
163163
ancestors_tags = [n.tag for n in anctrs]
164164
# Avoiding creating the figure if previous selected content is
@@ -169,14 +169,14 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
169169
# finally a figure was formed with the paragraph before, which
170170
# is wrong. It is safe then not to form the figure and so the caption
171171
# will be just removed.
172-
prev_content_node = slice.node[slice.start]
172+
prev_content_node = slice_.node[slice_.start]
173173
prev_content_is_paragraph = (
174174
prev_content_node.tag == "p"
175175
and not FIGURE_CONTENT_TAGS
176-
& {n.tag for n in descendants(prev_content_node)}
176+
& {cast("str", n.tag) for n in descendants(prev_content_node)}
177177
)
178178
if "figure" not in ancestors_tags and not prev_content_is_paragraph:
179-
if slice.node.tag in [
179+
if slice_.node.tag in [
180180
"table",
181181
"tbody",
182182
"thead",
@@ -194,19 +194,20 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
194194
# structure.
195195
for ancestor in anctrs:
196196
if ancestor.tag in MUST_ANCESTORS_FOR_KEEP_CONTENT_REVERSED:
197+
assert isinstance(ancestor.tag, str)
197198
ancestor.tag = MUST_ANCESTORS_FOR_KEEP_CONTENT_REVERSED[
198199
ancestor.tag
199200
]
200201
break
201-
new_figure = wrap_children_slice(slice, "figure")
202+
new_figure = wrap_children_slice(slice_, "figure")
202203
# Case when figure was at the same level that caption.
203204
# This avoids having figures inside figures in this case.
204205
for inner_figure in new_figure.xpath(".//figure"):
205206
drop_tag_preserve_spacing(inner_figure)
206207
fuse_figcaptions(new_figure)
207208

208209

209-
def fuse_figcaptions(figure: HtmlElement):
210+
def fuse_figcaptions(figure: HtmlElement) -> None:
210211
"""Fuses first block of consecutive figcaptions and remove the rest found.
211212
212213
>>> fuse = _test_fn(fuse_figcaptions)
@@ -243,7 +244,7 @@ def fuse_figcaptions(figure: HtmlElement):
243244
drop_tag_preserve_spacing(child)
244245

245246

246-
def clean_figcaptions_html(node: HtmlElement):
247+
def clean_figcaptions_html(node: HtmlElement) -> None:
247248
"""Simplifies figcaption html
248249
>>> html = fromstring("<div><figcaption><table><p><strong>hey</strong></p></table></figcaption></div>")
249250
>>> clean_figcaptions_html(html)
@@ -255,7 +256,7 @@ def clean_figcaptions_html(node: HtmlElement):
255256
clean(caption)
256257

257258

258-
def remove_figures_without_content(doc: HtmlElement):
259+
def remove_figures_without_content(doc: HtmlElement) -> None:
259260
"""Removes figures that have no content apart from the figure caption. This
260261
can happen for some pages that inject the content with JS
261262
@@ -291,7 +292,7 @@ def remove_figures_without_content(doc: HtmlElement):
291292
drop_tag_preserve_spacing(figure, preserve_content=False)
292293

293294

294-
def clean_double_br_above_figcaption(doc: HtmlElement):
295+
def clean_double_br_above_figcaption(doc: HtmlElement) -> None:
295296
"""In some weird cases, like when a figure is implemented with tables,
296297
we can end up having a double br before figcaptions. For example
297298
in this case

0 commit comments

Comments
 (0)