Skip to content

Commit e76abc6

Browse files
authored
fix: harden category extraction breadcrumb parsing (#668)
## ℹ️ Description

- Link to the related issue(s): Issue #667
- Harden breadcrumb category extraction so downloads no longer fail when the breadcrumb structure changes.

## 📋 Changes Summary

- Parse breadcrumb anchors dynamically and fall back with debug logging when legacy selectors are needed.
- Added unit coverage for multi-anchor, single-anchor, and fallback scenarios to keep diff coverage above 80%.
- Documented required lint/format/test steps in the PR checklist; no new dependencies.

### ⚙️ Type of Change

- [x] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing usage)
- [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations)

## ✅ Checklist

- [x] I have reviewed my changes to ensure they meet the project's standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **Bug Fixes**
  * Improved category extraction accuracy with enhanced breadcrumb parsing.
  * Better handling for listings with a single breadcrumb (returns stable category identifier).
  * More resilient fallback when breadcrumb data is missing or malformed.
  * Safer normalization of category identifiers to avoid incorrect parsing across site variations.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 9c73696 commit e76abc6

File tree

3 files changed

+73
-20
lines changed

3 files changed

+73
-20
lines changed

src/kleinanzeigen_bot/extract.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
22
# SPDX-License-Identifier: AGPL-3.0-or-later
33
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
4-
import json, mimetypes, os, shutil # isort: skip
4+
from gettext import gettext as _
5+
6+
import json, mimetypes, os, re, shutil # isort: skip
57
import urllib.request as urllib_request
68
from datetime import datetime
79
from typing import Any, Final
@@ -19,6 +21,9 @@
1921

2022
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
2123

24+
_BREADCRUMB_MIN_DEPTH:Final[int] = 2
25+
BREADCRUMB_RE = re.compile(r"/c(\d+)")
26+
2227

2328
class AdExtractor(WebScrapingMixin):
2429
"""
@@ -402,13 +407,39 @@ async def _extract_category_from_ad_page(self) -> str:
402407
403408
:return: a category string of form abc/def, where a-f are digits
404409
"""
405-
category_line = await self.web_find(By.ID, "vap-brdcrmb")
410+
try:
411+
category_line = await self.web_find(By.ID, "vap-brdcrmb")
412+
except TimeoutError as exc:
413+
LOG.warning("Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s", exc)
414+
raise
415+
try:
416+
breadcrumb_links = await self.web_find_all(By.CSS_SELECTOR, "a", parent = category_line)
417+
except TimeoutError:
418+
breadcrumb_links = []
419+
420+
category_ids:list[str] = []
421+
for link in breadcrumb_links:
422+
href = str(link.attrs.get("href", "") or "")
423+
matches = BREADCRUMB_RE.findall(href)
424+
if matches:
425+
category_ids.extend(matches)
426+
427+
# Use the deepest two breadcrumb category codes when available.
428+
if len(category_ids) >= _BREADCRUMB_MIN_DEPTH:
429+
return f"{category_ids[-2]}/{category_ids[-1]}"
430+
if len(category_ids) == 1:
431+
return f"{category_ids[0]}/{category_ids[0]}"
432+
433+
# Fallback to legacy selectors in case the breadcrumb structure is unexpected.
434+
LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids)
406435
category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
407436
category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
408437
href_first:str = str(category_first_part.attrs["href"])
409438
href_second:str = str(category_second_part.attrs["href"])
410-
cat_num_first = href_first.rsplit("/", maxsplit = 1)[-1][1:]
411-
cat_num_second = href_second.rsplit("/", maxsplit = 1)[-1][1:]
439+
cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1]
440+
cat_num_second_raw = href_second.rsplit("/", maxsplit = 1)[-1]
441+
cat_num_first = cat_num_first_raw[1:] if cat_num_first_raw.startswith("c") else cat_num_first_raw
442+
cat_num_second = cat_num_second_raw[1:] if cat_num_second_raw.startswith("c") else cat_num_second_raw
412443
category:str = cat_num_first + "/" + cat_num_second
413444

414445
return category

src/kleinanzeigen_bot/resources/translations.de.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ kleinanzeigen_bot/extract.py:
216216
_extract_contact_from_ad_page:
217217
"No street given in the contact.": "Keine Straße in den Kontaktdaten angegeben."
218218

219+
_extract_category_from_ad_page:
220+
"Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s": "Breadcrumb-Container 'vap-brdcrmb' nicht gefunden; kann Anzeigenkategorie nicht extrahieren: %s"
221+
"Falling back to legacy breadcrumb selectors; collected ids: %s": "Weiche auf ältere Breadcrumb-Selektoren aus; gesammelte IDs: %s"
222+
219223
#################################################
220224
kleinanzeigen_bot/utils/i18n.py:
221225
#################################################

tests/unit/test_extract.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: AGPL-3.0-or-later
33
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
44
import json, os # isort: skip
5+
from gettext import gettext as _
56
from typing import Any, TypedDict
67
from unittest.mock import AsyncMock, MagicMock, call, patch
78

@@ -435,7 +436,7 @@ async def test_extract_description_with_affixes(
435436
page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
436437
test_extractor.page = page_mock
437438

438-
for config, raw_description, _ in description_test_cases: # Changed to _ since we don't use expected_description
439+
for config, raw_description, _expected_description in description_test_cases:
439440
test_extractor.config = test_bot_config.with_values(config)
440441

441442
with patch.multiple(test_extractor,
@@ -584,44 +585,61 @@ async def test_extract_category(self, extractor:AdExtractor) -> None:
584585
second_part = MagicMock()
585586
second_part.attrs = {"href": "/s-spielzeug/c23"}
586587

587-
with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
588-
mock_web_find.side_effect = [
589-
category_line,
590-
first_part,
591-
second_part
592-
]
588+
with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, \
589+
patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part, second_part]) as mock_web_find_all:
593590

594591
result = await extractor._extract_category_from_ad_page()
595592
assert result == "17/23"
596593

597-
mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
598-
mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
599-
mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
594+
mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb")
595+
mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line)
596+
597+
@pytest.mark.asyncio
598+
# pylint: disable=protected-access
599+
async def test_extract_category_single_identifier(self, extractor:AdExtractor) -> None:
600+
"""Test category extraction when only a single breadcrumb code exists."""
601+
category_line = MagicMock()
602+
first_part = MagicMock()
603+
first_part.attrs = {"href": "/s-kleidung/c42"}
604+
605+
with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, \
606+
patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part]) as mock_web_find_all:
607+
608+
result = await extractor._extract_category_from_ad_page()
609+
assert result == "42/42"
610+
611+
mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb")
612+
mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line)
600613

601614
@pytest.mark.asyncio
602615
# pylint: disable=protected-access
603-
async def test_extract_category_with_non_string_href(self, extractor:AdExtractor) -> None:
604-
"""Test category extraction with non-string href attributes to cover str() conversion."""
616+
async def test_extract_category_fallback_to_legacy_selectors(self, extractor:AdExtractor, caplog:pytest.LogCaptureFixture) -> None:
617+
"""Test category extraction when breadcrumb links are not available and legacy selectors are used."""
605618
category_line = MagicMock()
606619
first_part = MagicMock()
607-
# Use non-string href to test str() conversion
608-
first_part.attrs = {"href": 12345} # This will need str() conversion
620+
first_part.attrs = {"href": 12345} # Ensure str() conversion happens
609621
second_part = MagicMock()
610622
second_part.attrs = {"href": 67890} # This will need str() conversion
611623

612-
with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
624+
caplog.set_level("DEBUG")
625+
expected_message = _("Falling back to legacy breadcrumb selectors; collected ids: %s") % []
626+
with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find, \
627+
patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError) as mock_web_find_all:
628+
613629
mock_web_find.side_effect = [
614630
category_line,
615631
first_part,
616632
second_part
617633
]
618634

619635
result = await extractor._extract_category_from_ad_page()
620-
assert result == "2345/7890" # After str() conversion and slicing
636+
assert result == "12345/67890"
637+
assert sum(1 for record in caplog.records if record.message == expected_message) == 1
621638

622639
mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
623640
mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
624641
mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
642+
mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line)
625643

626644
@pytest.mark.asyncio
627645
# pylint: disable=protected-access

0 commit comments

Comments (0)