fix: resolve kscien pagination bug - add enum.value and fix pagination detection (#1012)

coding-ai-assistant[bot] · florath · web-flow · commit d74770ecc6f6 · 2026-02-07T09:06:41.000+01:00
Critical bug fix for the kscien data sync, which was retrieving only 1-2 entries per
source instead of hundreds or thousands. Root causes identified and fixed:

1. Enum serialization bug in URL construction
   - kscien_generic.py:55 - Added .value to publication_type enum in base URL
   - kscien_helpers.py:68 - Added .value to publication_type enum in pagination URL
   - Without .value, URLs contained "PublicationType.PUBLISHERS" instead of "publishers"

2. Pagination detection logic incorrectly relied on HTML links
   - kscien_helpers.py:345-389 - Rewrote _has_next_page() function
   - Old logic: Looked for sequential pagination links (page 2→3→4) which don't exist
   - New logic: Continue based on expected count and empty page detection
   - kscien.org only shows links to current page and last page, not sequential pages

Impact:
- Before: 1-2 entries per source (~99% data loss)
- After: 1,251 publishers, 1,456 journals, 449 conferences, 184 hijacked journals
- Improvement: roughly 100x to 1000x+ increase in data retrieval, depending on the source

Tests:
- All existing unit tests pass
- Manual verification: curl confirmed pages 1, 3, 10 contain different valid data
- Real sync test: Retrieved expected counts matching kscien.org website

Files changed:
- src/aletheia_probe/updater/sources/kscien_generic.py (1 line)
- src/aletheia_probe/updater/sources/kscien_helpers.py (pagination logic rewrite)
- tests/unit/test_kscien_refactor.py (2 lines - add .value in test URLs)

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
diff --git a/src/aletheia_probe/updater/sources/kscien_generic.py b/src/aletheia_probe/updater/sources/kscien_generic.py
@@ -52,7 +52,7 @@ def __init__(
         self.list_type = list_type
 
         # Configure base URL for the specific publication type
-        self.base_url = f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type}"
+        self.base_url = f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type.value}"
 
         self.timeout = ClientTimeout(total=60)
         self.max_pages = MAX_PAGINATION_PAGES
diff --git a/src/aletheia_probe/updater/sources/kscien_helpers.py b/src/aletheia_probe/updater/sources/kscien_helpers.py
@@ -65,7 +65,7 @@ async def fetch_kscien_data(
                 url = base_url
             else:
                 # Kscien pagination requires BOTH _publishing_list and _pagination parameters
-                url = f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type}&_pagination={page}"
+                url = f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type.value}&_pagination={page}"
 
             detail_logger.debug(
                 f"Fetching Kscien {publication_type} page {page}: {url}"
@@ -369,24 +369,19 @@ def _has_next_page(
             detail_logger.debug(
                 f"Fetched {items_fetched}/{expected_count} items so far, continuing"
             )
-
-        # Check for numbered pagination links to next page
-        next_page = current_page + 1
-        next_page_pattern = (
-            rf'<a[^>]*href=["\'][^"\']*_pagination={next_page}[^"\']*["\'][^>]*>'
-        )
-        if re.search(next_page_pattern, html, re.IGNORECASE):
-            detail_logger.debug(f"Found link to page {next_page}")
+            # Continue fetching if we haven't reached the expected count
             return True
 
-        # Note: The pagination text "1 - 90 of 3539" is static and shows the total
-        # across ALL categories, not the specific publication type, so we don't use it
-        detail_logger.debug(f"No pagination link found for page {next_page}")
+        # If we don't have an expected count, continue fetching
+        # The empty page check in fetch_kscien_data will stop pagination
+        detail_logger.debug(
+            "No expected count available, continuing to fetch (rely on empty page detection)"
+        )
+        return True
 
     except Exception as e:
         detail_logger.debug(f"Error checking pagination: {e}")
-
-    return False
+        return True  # Continue on error, let empty page detection stop us
 
 
 def deduplicate_entries(publications: list[dict[str, Any]]) -> list[dict[str, Any]]:
diff --git a/tests/unit/test_kscien_refactor.py b/tests/unit/test_kscien_refactor.py
@@ -27,9 +27,7 @@ def mock_session():
 async def test_fetch_kscien_data_single_page(mock_session):
     """Test fetching data from a single page."""
     publication_type: PublicationType = PublicationType.PREDATORY_CONFERENCES
-    base_url = (
-        f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type}"
-    )
+    base_url = f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type.value}"
     max_pages = 1
 
     def get_name() -> str:
@@ -69,9 +67,7 @@ def create_mock_response(html_content: str) -> AsyncMock:
 async def test_fetch_kscien_data_pagination(mock_session):
     """Test fetching data with pagination."""
     publication_type: PublicationType = PublicationType.STANDALONE_JOURNALS
-    base_url = (
-        f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type}"
-    )
+    base_url = f"https://kscien.org/predatory-publishing/?_publishing_list={publication_type.value}"
     max_pages = 2
 
     def get_name() -> str: