@@ -189,8 +189,8 @@ async def _get_page_content(url: str) -> tuple[str, BeautifulSoup, str | None]:
189
189
)
190
190
if (
191
191
(http_equiv_refresh := soup .find ("meta" , attrs = {"http-equiv" : "refresh" }))
192
- and (http_equiv_refresh_value := http_equiv_refresh .get ("content" ))
193
- and (http_equiv_refresh_url := _parse_http_equiv_refresh (http_equiv_refresh_value ))
192
+ and (http_equiv_refresh_value := http_equiv_refresh .get ("content" )) # type: ignore[union-attr]
193
+ and (http_equiv_refresh_url := _parse_http_equiv_refresh (http_equiv_refresh_value )) # type: ignore[arg-type]
194
194
):
195
195
url = http_equiv_refresh_url
196
196
continue
@@ -228,7 +228,7 @@ def _build_article_data_from_soup(
228
228
external_article_id = "" ,
229
229
source_title = _get_site_title (fetched_url , soup ),
230
230
title = forced_title or _get_title (soup ),
231
- summary = _get_summary (soup , content ),
231
+ summary = _get_summary (soup ),
232
232
content = content ,
233
233
authors = tuple (_get_authors (soup )),
234
234
contributors = (),
@@ -242,7 +242,7 @@ def _build_article_data_from_soup(
242
242
)
243
243
244
244
245
- def _get_title (soup : BeautifulSoup ) -> str :
245
+ def _get_title (soup ) -> str :
246
246
title = ""
247
247
if (og_title := soup .find ("meta" , attrs = {"property" : "og:title" })) and og_title .get ("content" ):
248
248
title = og_title .get ("content" )
@@ -262,7 +262,7 @@ def _get_title(soup: BeautifulSoup) -> str:
262
262
return title
263
263
264
264
265
- def _get_site_title (fetched_url : str , soup : BeautifulSoup ) -> str :
265
+ def _get_site_title (fetched_url : str , soup ) -> str :
266
266
site_title = urlparse (fetched_url ).netloc
267
267
if (og_site_name := soup .find ("meta" , attrs = {"property" : "og:site_name" })) and og_site_name .get (
268
268
"content"
@@ -274,7 +274,7 @@ def _get_site_title(fetched_url: str, soup: BeautifulSoup) -> str:
274
274
return site_title
275
275
276
276
277
- def _get_summary (soup : BeautifulSoup , content : str ) -> str :
277
+ def _get_summary (soup ) -> str :
278
278
summary = ""
279
279
if (
280
280
og_description := soup .find ("meta" , attrs = {"property" : "og:description" })
@@ -302,7 +302,7 @@ def _get_fallback_summary_from_content(content: str) -> str:
302
302
)
303
303
304
304
305
- def _get_content (soup : BeautifulSoup ) -> str :
305
+ def _get_content (soup ) -> str :
306
306
articles = soup .find_all ("article" )
307
307
article_content = None
308
308
if len (articles ) > 1 :
@@ -323,7 +323,7 @@ def _get_content(soup: BeautifulSoup) -> str:
323
323
return str (article_content )
324
324
325
325
326
- def _parse_multiple_articles (soup : BeautifulSoup ):
326
+ def _parse_multiple_articles (soup ):
327
327
for article in soup .find_all (["article" , "section" ]):
328
328
attrs = set ()
329
329
if article_id := article .get ("id" ):
@@ -350,20 +350,20 @@ def _parse_multiple_articles(soup: BeautifulSoup):
350
350
return soup .find ("article" )
351
351
352
352
353
- def _extract_tag_from_content (soup : BeautifulSoup , tag_name : str ):
353
+ def _extract_tag_from_content (soup , tag_name : str ):
354
354
for tag in soup .find_all (tag_name ):
355
355
tag .extract ()
356
356
357
357
358
- def _get_authors (soup : BeautifulSoup ) -> list [str ]:
358
+ def _get_authors (soup ) -> list [str ]:
359
359
authors = []
360
360
if (meta_author := soup .find ("meta" , {"name" : "author" })) and meta_author .get ("content" ):
361
361
authors = [meta_author .get ("content" )]
362
362
363
363
return authors
364
364
365
365
366
- def _get_tags (soup : BeautifulSoup ) -> list [str ]:
366
+ def _get_tags (soup ) -> list [str ]:
367
367
tags = set ()
368
368
369
369
if article_tags := soup .find_all ("meta" , attrs = {"property" : "article:tag" }):
@@ -388,7 +388,7 @@ def parse_tags_list(tags_str: str) -> set[str]:
388
388
return parsed_tags
389
389
390
390
391
- def _get_link (fetched_url : str , soup : BeautifulSoup ) -> str :
391
+ def _get_link (fetched_url : str , soup ) -> str :
392
392
link = fetched_url
393
393
if (canonical_link := soup .find ("link" , {"rel" : "canonical" })) and canonical_link .get ("href" ):
394
394
link = canonical_link .get ("href" )
@@ -399,7 +399,7 @@ def _get_link(fetched_url: str, soup: BeautifulSoup) -> str:
399
399
return fetched_url
400
400
401
401
402
- def _get_preview_picture_url (fetched_url , soup : BeautifulSoup ) -> str :
402
+ def _get_preview_picture_url (fetched_url , soup ) -> str :
403
403
preview_picture_url = ""
404
404
405
405
if (og_image := soup .find ("meta" , attrs = {"property" : "og:image" })) and (
@@ -432,7 +432,7 @@ def _get_preview_picture_url(fetched_url, soup: BeautifulSoup) -> str:
432
432
return preview_picture_url
433
433
434
434
435
- def _get_published_at (soup : BeautifulSoup ) -> datetime | None :
435
+ def _get_published_at (soup ) -> datetime | None :
436
436
published_at = None
437
437
if (
438
438
article_published_time := soup .find ("meta" , attrs = {"property" : "article:published_time" })
@@ -442,7 +442,7 @@ def _get_published_at(soup: BeautifulSoup) -> datetime | None:
442
442
return published_at
443
443
444
444
445
- def _get_updated_at (soup : BeautifulSoup ) -> datetime | None :
445
+ def _get_updated_at (soup ) -> datetime | None :
446
446
updated_at = None
447
447
if (
448
448
article_modified_time := soup .find ("meta" , attrs = {"property" : "article:modified_time" })
@@ -452,11 +452,11 @@ def _get_updated_at(soup: BeautifulSoup) -> datetime | None:
452
452
return updated_at
453
453
454
454
455
- def _get_lang (soup : BeautifulSoup , content_language : str | None ) -> str :
455
+ def _get_lang (soup , content_language : str | None ) -> str :
456
456
if not soup .find ("html" ) or (language := soup .find ("html" ).get ("lang" )) is None :
457
457
language = content_language
458
458
459
- return language
459
+ return str ( language or "" )
460
460
461
461
462
462
def _build_table_of_content (content : str ) -> tuple [str , list [TableOfContentTopItem ]]:
0 commit comments