@@ -55,7 +55,7 @@ def _get_figure_caption_cleaner() -> Cleaner:
5555 )
5656
5757
58- def enclose_media_within_figure (doc : HtmlElement ):
58+ def enclose_media_within_figure (doc : HtmlElement ) -> None :
5959 """Ensures all media (images, videos, etc) are enclosed within figures.
6060 If possible, images with
6161 a link also include the link within the figure element."""
@@ -69,7 +69,7 @@ def enclose_media_within_figure(doc: HtmlElement):
6969
7070def top_level_media_within_figure (
7171 doc : HtmlElement , white_list : AbstractSet [HtmlElement ] = set ()
72- ):
72+ ) -> None :
7373 """Enclose top level isolated multimedia into figures. In other words,
7474 paragraphs containing only a single media element are replaced by a figure.
7575 Nodes in the white list are ignored.
@@ -90,7 +90,7 @@ def top_level_media_within_figure(
9090 '<div><figure><audio><source></source></audio></figure></div>'
9191 """
9292
93- def is_single_tag (el : HtmlElement ):
93+ def is_single_tag (el : HtmlElement ) -> bool :
9494 return len (el ) == 1 and not has_text (el ) and not has_tail (el [0 ])
9595
9696 for child in doc :
@@ -105,7 +105,7 @@ def is_single_tag(el: HtmlElement):
105105 single_p .tag = "figure"
106106
107107
108- def infer_img_url_from_data_src_attr (doc : HtmlElement ):
108+ def infer_img_url_from_data_src_attr (doc : HtmlElement ) -> None :
109109 """Fills src attribute from data-src for img tags.
110110 It is common to see img tags without src attribute but with data-src
111111
@@ -116,10 +116,10 @@ def infer_img_url_from_data_src_attr(doc: HtmlElement):
116116 """
117117 for el in doc .iterfind (".//img" ):
118118 if not el .get ("src" ) and el .get ("data-src" ):
119- el .attrib ["src" ] = cast (str , el .get ("data-src" ))
119+ el .attrib ["src" ] = cast (" str" , el .get ("data-src" ))
120120
121121
122- def create_figures_from_isolated_figcaptions (node : HtmlElement ):
122+ def create_figures_from_isolated_figcaptions (node : HtmlElement ) -> None :
123123 """Wraps isolated figcaptions with the content above and form a new figure.
124124 Mutates node.
125125
@@ -157,8 +157,8 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
157157 '<article><figure><img href="link1"><br><br><figcaption>caption1</figcaption></figure></article>'
158158 """
159159 for caption in node .xpath (".//figcaption" ):
160- slice = group_with_previous_content_block (caption )
161- if slice :
160+ slice_ = group_with_previous_content_block (caption )
161+ if slice_ :
162162 anctrs = ancestors (caption , stop_at = node )
163163 ancestors_tags = [n .tag for n in anctrs ]
164164 # Avoiding creating the figure if previous selected content is
@@ -169,14 +169,14 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
169169 # finally a figure was formed with the paragraph before, which
170170 # is wrong. It is safe then not to form the figure and so the caption
171171 # will be just removed.
172- prev_content_node = slice .node [slice .start ]
172+ prev_content_node = slice_ .node [slice_ .start ]
173173 prev_content_is_paragraph = (
174174 prev_content_node .tag == "p"
175175 and not FIGURE_CONTENT_TAGS
176- & {n .tag for n in descendants (prev_content_node )}
176+ & {cast ( "str" , n .tag ) for n in descendants (prev_content_node )}
177177 )
178178 if "figure" not in ancestors_tags and not prev_content_is_paragraph :
179- if slice .node .tag in [
179+ if slice_ .node .tag in [
180180 "table" ,
181181 "tbody" ,
182182 "thead" ,
@@ -194,19 +194,20 @@ def create_figures_from_isolated_figcaptions(node: HtmlElement):
194194 # structure.
195195 for ancestor in anctrs :
196196 if ancestor .tag in MUST_ANCESTORS_FOR_KEEP_CONTENT_REVERSED :
197+ assert isinstance (ancestor .tag , str )
197198 ancestor .tag = MUST_ANCESTORS_FOR_KEEP_CONTENT_REVERSED [
198199 ancestor .tag
199200 ]
200201 break
201- new_figure = wrap_children_slice (slice , "figure" )
202+ new_figure = wrap_children_slice (slice_ , "figure" )
202203 # Case when figure was at the same level that caption.
203204 # This avoids having figures inside figures in this case.
204205 for inner_figure in new_figure .xpath (".//figure" ):
205206 drop_tag_preserve_spacing (inner_figure )
206207 fuse_figcaptions (new_figure )
207208
208209
209- def fuse_figcaptions (figure : HtmlElement ):
210+ def fuse_figcaptions (figure : HtmlElement ) -> None :
210211 """Fuses first block of consecutive figcaptions and remove the rest found.
211212
212213 >>> fuse = _test_fn(fuse_figcaptions)
@@ -243,7 +244,7 @@ def fuse_figcaptions(figure: HtmlElement):
243244 drop_tag_preserve_spacing (child )
244245
245246
246- def clean_figcaptions_html (node : HtmlElement ):
247+ def clean_figcaptions_html (node : HtmlElement ) -> None :
247248 """Simplifies figcaption html
248249 >>> html = fromstring("<div><figcaption><table><p><strong>hey</strong></p></table></figcaption></div>")
249250 >>> clean_figcaptions_html(html)
@@ -255,7 +256,7 @@ def clean_figcaptions_html(node: HtmlElement):
255256 clean (caption )
256257
257258
258- def remove_figures_without_content (doc : HtmlElement ):
259+ def remove_figures_without_content (doc : HtmlElement ) -> None :
259260 """Removes figures that have no content apart from the figure caption. This
260261 can happen for some pages that inject the content with JS
261262
@@ -291,7 +292,7 @@ def remove_figures_without_content(doc: HtmlElement):
291292 drop_tag_preserve_spacing (figure , preserve_content = False )
292293
293294
294- def clean_double_br_above_figcaption (doc : HtmlElement ):
295+ def clean_double_br_above_figcaption (doc : HtmlElement ) -> None :
295296 """Some weird cases like when figure is implemented with tables
296297 we can end up having a double br before figcaptions. For example
297298 in this case
0 commit comments