1+ import logging
2+ import os
13from typing import List
24from urllib .parse import urlparse
35
810 lines_to_url ,
911)
1012from welearn_datastack .data .url_collector import URLCollector
13+ from welearn_datastack .modules .xml_extractor import XMLExtractor
1114from welearn_datastack .utils_ .http_client_utils import get_new_https_session
1215
1316url_illegal_characters = ['"' , "<" , ">" ]
2023 "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0" ,
2124}
2225
# Module-level logging configuration, driven by environment variables:
#   LOG_LEVEL  - name of a logging level known to the `logging` module
#                (defaults to "INFO").
#   LOG_FORMAT - format string applied to every log record.
_log_level_name: str = os.getenv("LOG_LEVEL", "INFO")
log_format: str = os.getenv(
    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
)

# logging.getLevelName maps a known level name to its integer value; for an
# unknown name it returns a placeholder string ("Level <name>") instead of
# raising, so the result type is what tells us whether the name was valid.
_resolved_log_level = logging.getLevelName(_log_level_name)
if not isinstance(_resolved_log_level, int):
    # Exceptions do not perform %-interpolation on their arguments, so the
    # message is formatted explicitly, and it reports the raw value the user
    # supplied rather than the "Level ..." placeholder.
    raise ValueError(f"Log level is not recognized : '{_log_level_name}'")

log_level: int = _resolved_log_level

# The numeric level is passed straight through; converting it back to a name
# first is unnecessary because basicConfig accepts integer levels.
logging.basicConfig(
    level=log_level,
    format=log_format,
)
logger = logging.getLogger(__name__)
39+
2340
2441class AtomURLCollector (URLCollector ):
2542 def __init__ (
@@ -35,16 +52,27 @@ def collect(self) -> List[WeLearnDocument]:
3552 client = get_new_https_session ()
3653 res = client .get (url = self .feed_url , headers = headers )
3754 content = res .content .decode ("utf-8" )
55+ link_lines = []
56+
57+ entries = XMLExtractor (content ).extract_content (tag = "entry" )
58+ for entry in entries :
59+ links = XMLExtractor (entry .content ).extract_content_attribute_filter (
60+ tag = "link" , attribute_name = "rel" , attribute_value = "alternate"
61+ )
62+ if not links :
63+ logger .warning (
64+ "No link found for entry, skipping entry. Entry content: %s" ,
65+ entry .content ,
66+ )
67+ continue
68+
69+ if len (links ) > 1 :
70+ logger .warning (
71+ "Multiple rel='alternate' links found for entry; using the first. Entry content: %s" ,
72+ entry .content ,
73+ )
3874
39- flag = False
40- link_lines : List [str ] = []
41- for line in content .split ("\n " ):
42- # If we are in the entry section and we find a link
43- # The definition, especially "rel" part is empirical
44- if flag and line .strip ().startswith ('<link rel="alternate"' ):
45- link_lines .append (line .strip ())
46- if line .strip ().startswith ("<entry>" ):
47- flag = True
75+ link_lines .append (links [0 ].attributes .get ("href" , "" ))
4876
4977 urls = lines_to_url (domain , link_lines )
5078
0 commit comments