Merge pull request #137 from CyberCRI/Refacto/atom-collector-use-xml-extractor

lpi-tn · web-flow · commit 7a9a22587435 · 2026-06-02T16:10:27.000+02:00
feat: integrate XMLExtractor for link extraction and add logging configuration
diff --git a/tests/url_collector/test_feed_helpers.py b/tests/url_collector/test_feed_helpers.py
@@ -0,0 +1,34 @@
+from unittest import TestCase
+
+from welearn_datastack.collectors.helpers.feed_helpers import (
+    lines_to_url,
+    remove_illegal_character,
+)
+
+
+class TestFeedHelpers(TestCase):
+    """Check URL cleaning and domain filtering done by the feed helpers."""
+
+    def test_remove_illegal_character_no_modif(self):
+        text = "https://www.example.com/article1"
+        self.assertEqual(remove_illegal_character(text), text)
+
+    def test_remove_illegal_character_illegal_characters(self):
+        text = 'https://www.example.com/article1</"link'
+        awaited_result = "https://www.example.com/article1"
+        self.assertEqual(remove_illegal_character(text), awaited_result)
+
+    def test_line_to_url_correct(self):
+        line = "https://www.example.com/article1"
+        res = lines_to_url(domain="https://example.com", link_lines=[line])
+        self.assertEqual(res, ["https://www.example.com/article1"])
+
+    def test_line_to_url_domain_invalid(self):
+        line = "https://www.example.com/article1"
+        res = lines_to_url(domain="https://example.org", link_lines=[line])
+        self.assertEqual(res, [])
+
+    def test_line_to_url_valid_domain_but_unsecure_http(self):
+        line = "http://www.example.com/article1"
+        res = lines_to_url(domain="https://example.com", link_lines=[line])
+        self.assertEqual(res, ["https://www.example.com/article1"])
diff --git a/welearn_datastack/collectors/atom_collector.py b/welearn_datastack/collectors/atom_collector.py
@@ -1,3 +1,5 @@
+import logging
+import os
 from typing import List
 from urllib.parse import urlparse
 
@@ -8,6 +10,7 @@
     lines_to_url,
 )
 from welearn_datastack.data.url_collector import URLCollector
+from welearn_datastack.modules.xml_extractor import XMLExtractor
 from welearn_datastack.utils_.http_client_utils import get_new_https_session
 
 url_illegal_characters = ['"', "<", ">"]
@@ -20,6 +23,20 @@
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
 }
 
+# getLevelName maps a known level name to its int, else returns "Level <name>".
+log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
+log_format: str = os.getenv(
+    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
+)
+
+if not isinstance(log_level, int):
+    # Interpolate the value: ValueError does not apply %-style arguments.
+    raise ValueError(f"Log level is not recognized : '{log_level}'")
+
+# Pass the numeric level directly; no need to map it back to its name.
+logging.basicConfig(level=log_level, format=log_format)
+logger = logging.getLogger(__name__)
+
 
 class AtomURLCollector(URLCollector):
     def __init__(
@@ -35,16 +52,27 @@ def collect(self) -> List[WeLearnDocument]:
         client = get_new_https_session()
         res = client.get(url=self.feed_url, headers=headers)
         content = res.content.decode("utf-8")
+        link_lines = []
+
+        entries = XMLExtractor(content).extract_content(tag="entry")
+        for entry in entries:
+            links = XMLExtractor(entry.content).extract_content_attribute_filter(
+                tag="link", attribute_name="rel", attribute_value="alternate"
+            )
+            if not links:
+                logger.warning(
+                    "No link found for entry, skipping entry. Entry content: %s",
+                    entry.content,
+                )
+                continue
+
+            if len(links) > 1:
+                logger.warning(
+                    "Multiple rel='alternate' links found for entry; using the first. Entry content: %s",
+                    entry.content,
+                )
 
-        flag = False
-        link_lines: List[str] = []
-        for line in content.split("\n"):
-            # If we are in the entry section and we find a link
-            # The definition, especially "rel" part is empirical
-            if flag and line.strip().startswith('<link rel="alternate"'):
-                link_lines.append(line.strip())
-            if line.strip().startswith("<entry>"):
-                flag = True
+            link_lines.append(links[0].attributes.get("href", ""))
 
         urls = lines_to_url(domain, link_lines)
 
diff --git a/welearn_datastack/collectors/helpers/feed_helpers.py b/welearn_datastack/collectors/helpers/feed_helpers.py
@@ -1,4 +1,5 @@
 from typing import List
+from urllib.parse import urlparse, urlunparse
 
 from welearn_database.data.models import Corpus, WeLearnDocument
 
@@ -13,23 +14,42 @@ def lines_to_url(domain: str, link_lines: List[str]) -> List[str]:
     :return: The list of URL
     """
     urls: List[str] = []
+    scheme = "https"
     # Refine lines to get URL
     for line in link_lines:
-        scheme = "https://"
-        https_place = line.find(scheme)
-        cursor = line[https_place:]
-        illegal_char_pos = [
-            cursor.find(x) for x in url_illegal_characters if cursor.find(x) >= 0
-        ]
-        end_place = min(illegal_char_pos)
-        url = cursor[:end_place]
-        url = url.strip()
+        line = remove_illegal_character(line)
+        parsed = urlparse(line)
+        if parsed.netloc == urlparse(domain).netloc or parsed.netloc.endswith(
+            f".{urlparse(domain).netloc}"
+        ):
+            urls.append(
+                urlunparse(
+                    [
+                        scheme,
+                        parsed.netloc,
+                        parsed.path,
+                        parsed.params,
+                        parsed.query,
+                        parsed.fragment,
+                    ]
+                )
+            )
 
-        if url.startswith(domain):
-            urls.append(url)
     return urls
 
 
+def remove_illegal_character(text: str) -> str:
+    """Cut *text* at its first URL-illegal character and strip whitespace.
+
+    Text without any character of ``url_illegal_characters`` is only stripped.
+    """
+    positions = (text.find(char) for char in url_illegal_characters)
+    found = [position for position in positions if position >= 0]
+    if not found:
+        return text.strip()
+    return text[: min(found)].strip()
+
+
 def extracted_url_to_url_datastore(
     corpus: Corpus,
     urls: List[str],