2525 from collections .abc import Mapping
2626 from collections .abc import Set as AbstractSet
2727
28+ from lxml .etree import QName
2829
29- def translate_tags (doc : HtmlElement , white_list : AbstractSet [HtmlElement ] = set ()):
30+ # from lxml-stubs
31+ _TagName = str | bytes | bytearray | QName
32+
33+
34+ def translate_tags (
35+ doc : HtmlElement , white_list : AbstractSet [HtmlElement ] = set ()
36+ ) -> None :
3037 """Translate tag names (i.e. b -> strong). Mutates the doc.
3138 Nodes in the white list are ignored.
3239
@@ -68,11 +75,11 @@ def set_article_tag_as_root(doc: HtmlElement) -> HtmlElement:
6875
6976def wrap_tags (
7077 doc : HtmlElement ,
71- to_be_enclosed_tags : AbstractSet ,
78+ to_be_enclosed_tags : AbstractSet [ _TagName ] ,
7279 enclosing_tag : str ,
7380 node_check : Callable [[HtmlElement ], bool ] = lambda x : True ,
74- transparent_tags : AbstractSet = set (),
75- ):
81+ transparent_tags : AbstractSet [ _TagName ] = set (),
82+ ) -> None :
7683 """Enclose the elements with tag `to_be_enclosed_tags` within a tag
7784 `enclosing_tag` if they are not already enclosed, that is, if `enclosing_tag`
7885 is not already an ancestor. All transparent tags without more content
@@ -93,12 +100,12 @@ def wrap_tags(
93100
94101def _wrap_tags_with (
95102 doc : HtmlElement ,
96- to_be_enclosed_tags : AbstractSet ,
103+ to_be_enclosed_tags : AbstractSet [ _TagName ] ,
97104 enclosing_tag : str ,
98- ancestors_tags : AbstractSet = set (),
105+ ancestors_tags : AbstractSet [ _TagName ] = set (),
99106 node_check : Callable [[HtmlElement ], bool ] = lambda x : True ,
100- transparent_tags : AbstractSet = set (),
101- ):
107+ transparent_tags : AbstractSet [ _TagName ] = set (),
108+ ) -> None :
102109 ancestors_tags = ancestors_tags | {doc .tag }
103110 if (
104111 (enclosing_tag not in ancestors_tags )
@@ -128,8 +135,8 @@ def _wrap_tags_with(
128135
129136
130137def remove_empty_tags (
131- doc : HtmlElement , white_list : AbstractSet [str ] = set (), _root = True
132- ):
138+ doc : HtmlElement , white_list : AbstractSet [str ] = set (), _root : bool = True
139+ ) -> None :
133140 """Removes empty tags, but skipping the `white_list` ones
134141
135142 >>> html = fromstring("<article><p><em></em></p></article>")
@@ -148,7 +155,7 @@ def remove_empty_tags(
148155 doc .drop_tag ()
149156
150157
151- def drop_tag_preserve_spacing (doc : HtmlElement , preserve_content = True ):
158+ def drop_tag_preserve_spacing (doc : HtmlElement , preserve_content : bool = True ) -> None :
152159 """Drops a tag keeping its content. If element to be removed
153160 is a block element, leading or trailing double br tags would
154161 be introduced to preserve spacing. If preserve_content is
@@ -198,7 +205,7 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True):
198205 doc .drop_tree ()
199206
200207
201- def double_br (doc : HtmlElement | None ):
208+ def double_br (doc : HtmlElement | None ) -> bool :
202209 """True if doc and next element are "br" tags without text in between."""
203210 if doc is None or doc .tag != "br" :
204211 return False
@@ -238,7 +245,7 @@ def has_no_content(doc: HtmlElement) -> bool:
238245
239246def is_empty (
240247 doc : HtmlElement , tags_with_content_even_if_empty : AbstractSet [str ] = set ()
241- ):
248+ ) -> bool :
242249 """Checks if given doc is an empty tag or tag formed with empty tags.
243250 ``tags_with_content_even_if_empty`` tags are considered as having content
244251 even if empty.
@@ -270,7 +277,7 @@ def is_empty(
270277 )
271278
272279
273- def is_phrasing_content (doc : HtmlElement ):
280+ def is_phrasing_content (doc : HtmlElement ) -> bool :
274281 """'Phrasing content is the text of the document, as well as elements that
275282 mark up that text at the intra-paragraph level'
276283 (see https://html.spec.whatwg.org/#phrasing-content). This method return
@@ -350,8 +357,8 @@ def find_previous_non_empty_sibling(doc: HtmlElement) -> int | None:
350357 return None
351358
352359
353- def _test_fn (fn ) :
354- def func (doc ) :
360+ def _test_fn (fn : Callable [[ HtmlElement ], None ]) -> Callable [[ str ], str ] :
361+ def func (doc : str ) -> str :
355362 html = fromstring (doc )
356363 fn (html )
357364 return tostring (html ).decode ()
@@ -362,9 +369,9 @@ def func(doc):
362369def clean_incomplete_structures (
363370 doc : HtmlElement ,
364371 rules : Mapping [str , AbstractSet [str ]],
365- preserve_content = True ,
372+ preserve_content : bool = True ,
366373 white_list : AbstractSet [HtmlElement ] = set (),
367- ):
374+ ) -> None :
368375 """Drop tags (keeping content) of incomplete structures.
369376 For example, removes a td element if not belonging to any table.
370377 Never clean the base element. If preserve_content is false then nodes
@@ -401,10 +408,10 @@ def clean_incomplete_structures(
401408def _clean_incomplete_structures (
402409 doc : HtmlElement ,
403410 rules : Mapping [str , AbstractSet [str ]],
404- ancestors_tags : AbstractSet = set (),
405- preserve_content = True ,
411+ ancestors_tags : AbstractSet [ _TagName ] = set (),
412+ preserve_content : bool = True ,
406413 white_list : AbstractSet [HtmlElement ] = set (),
407- ):
414+ ) -> None :
408415 ancestors_tags = ancestors_tags | {doc .tag }
409416 for child in doc :
410417 _clean_incomplete_structures (child , rules , ancestors_tags , preserve_content )
@@ -417,7 +424,7 @@ def _clean_incomplete_structures(
417424 drop_tag_preserve_spacing (doc , preserve_content )
418425
419426
420- def kill_tag_content (doc : HtmlElement , tag : str ):
427+ def kill_tag_content (doc : HtmlElement , tag : str ) -> None :
421428 """Removes the content of all these tags found in the doc
422429
423430 >>> def kill(html):
0 commit comments