Skip to content

Commit 7a9a225

Browse files
authored
Merge pull request #137 from CyberCRI/Refacto/atom-collector-use-xml-extractor
feat: integrate XMLExtractor for link extraction and add logging configuration
2 parents 07fe5a1 + 797ebf2 commit 7a9a225

3 files changed

Lines changed: 102 additions & 20 deletions

File tree

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from unittest import TestCase
2+
3+
from welearn_datastack.collectors.helpers.feed_helpers import (
4+
lines_to_url,
5+
remove_illegal_character,
6+
)
7+
8+
9+
class TestFeedHelpers(TestCase):
    """Unit tests for the feed helpers ``remove_illegal_character`` and ``lines_to_url``."""

    def test_remove_illegal_character_no_modif(self):
        # The input is expected to come back untouched.
        clean_url = "https://www.example.com/article1"
        self.assertEqual(remove_illegal_character(clean_url), clean_url)

    def test_remove_illegal_character_illegal_characters(self):
        # Everything from the first '<' onwards is expected to be dropped.
        dirty_url = 'https://www.example.com/article1</"link'
        expected = "https://www.example.com/article1"
        self.assertEqual(remove_illegal_character(dirty_url), expected)

    def test_line_to_url_correct(self):
        # A "www." host is accepted for the bare "example.com" domain.
        urls = lines_to_url(
            domain="https://example.com",
            link_lines=["https://www.example.com/article1"],
        )
        self.assertEqual(urls.pop(), "https://www.example.com/article1")

    def test_line_to_url_domain_invalid(self):
        # A link on ".com" must be rejected when the feed domain is ".org".
        urls = lines_to_url(
            domain="https://example.org",
            link_lines=["https://www.example.com/article1"],
        )
        self.assertEqual(len(urls), 0)

    def test_line_to_url_valid_domain_but_unsecure_http(self):
        # An http:// link on the right domain is expected back as https://.
        urls = lines_to_url(
            domain="https://example.com",
            link_lines=["http://www.example.com/article1"],
        )
        self.assertEqual(urls.pop(), "https://www.example.com/article1")

welearn_datastack/collectors/atom_collector.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import logging
2+
import os
13
from typing import List
24
from urllib.parse import urlparse
35

@@ -8,6 +10,7 @@
810
lines_to_url,
911
)
1012
from welearn_datastack.data.url_collector import URLCollector
13+
from welearn_datastack.modules.xml_extractor import XMLExtractor
1114
from welearn_datastack.utils_.http_client_utils import get_new_https_session
1215

1316
url_illegal_characters = ['"', "<", ">"]
@@ -20,6 +23,20 @@
2023
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
2124
}
2225

26+
# Logging configuration for this module, driven by two environment variables:
#   LOG_LEVEL  - name of a registered logging level (defaults to "INFO")
#   LOG_FORMAT - logging format string (defaults to the pattern below)
_requested_log_level: str = os.getenv("LOG_LEVEL", "INFO")

# logging.getLevelName() maps a registered level *name* to its numeric value;
# for a name it does not know it returns the string "Level <name>" instead.
log_level: int = logging.getLevelName(_requested_log_level)
log_format: str = os.getenv(
    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
)

if not isinstance(log_level, int):
    # Exceptions do not apply %-style formatting to their arguments, so the
    # message is built explicitly, and it reports the value the user actually
    # supplied rather than the "Level <name>" placeholder.
    raise ValueError(
        f"Log level is not recognized : '{_requested_log_level}'"
    )

# The numeric level is passed straight through; no need to convert it back
# into a level name first.
logging.basicConfig(
    level=log_level,
    format=log_format,
)
logger = logging.getLogger(__name__)
39+
2340

2441
class AtomURLCollector(URLCollector):
2542
def __init__(
@@ -35,16 +52,27 @@ def collect(self) -> List[WeLearnDocument]:
3552
client = get_new_https_session()
3653
res = client.get(url=self.feed_url, headers=headers)
3754
content = res.content.decode("utf-8")
55+
link_lines = []
56+
57+
entries = XMLExtractor(content).extract_content(tag="entry")
58+
for entry in entries:
59+
links = XMLExtractor(entry.content).extract_content_attribute_filter(
60+
tag="link", attribute_name="rel", attribute_value="alternate"
61+
)
62+
if not links:
63+
logger.warning(
64+
"No link found for entry, skipping entry. Entry content: %s",
65+
entry.content,
66+
)
67+
continue
68+
69+
if len(links) > 1:
70+
logger.warning(
71+
"Multiple rel='alternate' links found for entry; using the first. Entry content: %s",
72+
entry.content,
73+
)
3874

39-
flag = False
40-
link_lines: List[str] = []
41-
for line in content.split("\n"):
42-
# If we are in the entry section and we find a link
43-
# The definition, especially "rel" part is empirical
44-
if flag and line.strip().startswith('<link rel="alternate"'):
45-
link_lines.append(line.strip())
46-
if line.strip().startswith("<entry>"):
47-
flag = True
75+
link_lines.append(links[0].attributes.get("href", ""))
4876

4977
urls = lines_to_url(domain, link_lines)
5078

welearn_datastack/collectors/helpers/feed_helpers.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import List
2+
from urllib.parse import urlparse, urlunparse
23

34
from welearn_database.data.models import Corpus, WeLearnDocument
45

@@ -13,23 +14,42 @@ def lines_to_url(domain: str, link_lines: List[str]) -> List[str]:
1314
:return: The list of URL
1415
"""
1516
urls: List[str] = []
17+
scheme = "https"
1618
# Refine lines to get URL
1719
for line in link_lines:
18-
scheme = "https://"
19-
https_place = line.find(scheme)
20-
cursor = line[https_place:]
21-
illegal_char_pos = [
22-
cursor.find(x) for x in url_illegal_characters if cursor.find(x) >= 0
23-
]
24-
end_place = min(illegal_char_pos)
25-
url = cursor[:end_place]
26-
url = url.strip()
20+
line = remove_illegal_character(line)
21+
parsed = urlparse(line)
22+
if parsed.netloc == urlparse(domain).netloc or parsed.netloc.endswith(
23+
f".{urlparse(domain).netloc}"
24+
):
25+
urls.append(
26+
urlunparse(
27+
[
28+
scheme,
29+
parsed.netloc,
30+
parsed.path,
31+
parsed.params,
32+
parsed.query,
33+
parsed.fragment,
34+
]
35+
)
36+
)
2737

28-
if url.startswith(domain):
29-
urls.append(url)
3038
return urls
3139

3240

41+
def remove_illegal_character(text: str) -> str:
    """
    Truncate a string at its first URL-illegal character.

    The characters considered illegal are those listed in the module-level
    ``url_illegal_characters``. Everything from the earliest occurrence of any
    of them onwards is discarded; if none occurs, the text is kept whole.

    :param text: The raw text to clean
    :return: The (possibly truncated) text, stripped of surrounding whitespace
    """
    cut_points: List[int] = []
    for char in url_illegal_characters:
        index = text.find(char)
        # str.find returns -1 when the character is absent
        if index >= 0:
            cut_points.append(index)

    if not cut_points:
        return text.strip()

    return text[: min(cut_points)].strip()
51+
52+
3353
def extracted_url_to_url_datastore(
3454
corpus: Corpus,
3555
urls: List[str],

0 commit comments

Comments
 (0)