Skip to content

Commit ec8c43f

Browse files
committed
feat: add alternate RSS feed providers
1 parent a1647f0 commit ec8c43f

17 files changed

Lines changed: 717 additions & 79 deletions

TODOS.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# TODOs
2+
3+
Deferred work items from eng review and design discussions.
4+
5+
## GDELT as Third News Provider
6+
7+
**What:** Add GDELT (Global Database of Events, Language, and Tone) as a third news provider.
8+
9+
**Why:** Diversifies beyond Bing+Google. GDELT covers global events with no rate limiting, providing a fallback that doesn't depend on commercial search engines.
10+
11+
**Pros:** Better geographic coverage, no rate limiting, tests the Protocol's extensibility beyond RSS.
12+
13+
**Cons:** Returns JSON (not RSS), so the `NewsProvider` Protocol needs a `fetch_entries()` method or a JSON-to-FeedEntry adapter. Non-trivial change to the provider interface.
14+
15+
**Context:** Design doc Open Question #1. The current `build_feed_url()` + `fetch_feed()` pattern only works for RSS providers. GDELT would need either a new Protocol method or an adapter that converts JSON responses to `FeedEntry` objects.
16+
17+
**Depends on:** Multi-provider PR (provider registry + router).
18+
19+
## Per-Topic Provider Configuration
20+
21+
**What:** Add user-facing config (`config.yml`) to set provider priority order and enable/disable providers globally or per topic.
22+
23+
**Why:** Power users on restricted networks (China, Russia) may need to disable Google entirely. Per-topic pinning lets users optimize provider choice for specific topic domains.
24+
25+
**Pros:** Maximum flexibility for self-hosted users on restricted networks.
26+
27+
**Cons:** Adds config surface area. Violates "user doesn't think about it" for the common case. Only valuable with 3+ providers.
28+
29+
**Context:** Design doc Approach C, rejected for the initial multi-provider PR. Auto-routing handles the common case. Only worth building when the provider ecosystem grows.
30+
31+
**Depends on:** Multi-provider PR + at least one more provider (e.g. GDELT).

app/crud.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,9 @@ def create_article(conn: sqlite3.Connection, article: Article) -> Article:
314314
data = article.to_insert_dict()
315315
cursor = conn.execute(
316316
"""INSERT INTO articles (topic_id, title, url, content_hash,
317-
raw_content, source_feed, fetched_at, processed)
317+
raw_content, source_feed, source_provider, fetched_at, processed)
318318
VALUES (:topic_id, :title, :url, :content_hash,
319-
:raw_content, :source_feed, :fetched_at, :processed)""",
319+
:raw_content, :source_feed, :source_provider, :fetched_at, :processed)""",
320320
data,
321321
)
322322
article.id = cursor.lastrowid

app/migrations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from app.migrations.m006_topic_tags import up as m006_up
1616
from app.migrations.m007_feed_health import up as m007_up
1717
from app.migrations.m008_interval_minutes import up as m008_up
18+
from app.migrations.m009_article_provider import up as m009_up
1819

1920
MIGRATIONS: list[tuple[int, str, Callable[[sqlite3.Connection], None]]] = [
2021
(1, "baseline schema version", m001_up),
@@ -25,4 +26,5 @@
2526
(6, "add tags column to topics", m006_up),
2627
(7, "add feed_health table", m007_up),
2728
(8, "add check_interval_minutes column to topics", m008_up),
29+
(9, "add source_provider column to articles", m009_up),
2830
]
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""Add source_provider column to articles table.
2+
3+
Tracks which news provider (e.g. 'bing_news', 'google_news') was used
4+
to fetch each article. NULL for articles fetched before multi-provider
5+
support and for MANUAL mode articles.
6+
"""
7+
8+
import sqlite3
9+
10+
11+
def up(conn: sqlite3.Connection) -> None:
    """Add the nullable ``source_provider`` TEXT column to ``articles``.

    The column stays NULL for rows created before multi-provider
    support (and, per the module docstring, for MANUAL mode articles).
    """
    statement = "ALTER TABLE articles ADD COLUMN source_provider TEXT"
    conn.execute(statement)

app/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class Article(BaseModel):
8282
content_hash: str
8383
raw_content: str | None = None
8484
source_feed: str
85+
source_provider: str | None = None
8586
fetched_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
8687
processed: bool = False
8788

app/scraping/__init__.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
)
2020
from app.models import Article, Topic
2121
from app.scraping.content import extract_article_content
22-
from app.scraping.google_news import is_google_news_url, resolve_google_news_urls
22+
from app.scraping.google_news import resolve_google_news_urls
2323
from app.scraping.relevance import score_relevance
2424
from app.scraping.rss import FeedEntry, compute_article_hash, fetch_feeds_for_topic
25+
from app.scraping.rss import FeedResponse as FeedResponse
2526

2627
logger = logging.getLogger(__name__)
2728

@@ -75,12 +76,13 @@ def callback(feed_url, success, error_msg):
7576
return callback
7677

7778
# 1. Fetch all feed entries
78-
entries = await fetch_feeds_for_topic(
79+
response = await fetch_feeds_for_topic(
7980
topic,
8081
timeout=feed_fetch_timeout,
8182
max_attempts=feed_max_retries,
8283
health_callback=_make_health_callback(conn),
8384
)
85+
entries = response.entries
8486
if not entries:
8587
return FetchResult(articles=[], total_feed_entries=0)
8688

@@ -130,12 +132,13 @@ def callback(feed_url, success, error_msg):
130132

131133
# 3b. Resolve Google News redirect URLs for entries that need content fetching.
132134
# Done after dedup+limiting to minimize requests (typically ~10 URLs, not 100).
133-
google_urls = [e.url for e, _ in fetch_batch if is_google_news_url(e.url)]
134-
if google_urls:
135-
resolved = await resolve_google_news_urls(google_urls, timeout=feed_fetch_timeout)
136-
for entry, _ in fetch_batch:
137-
if entry.url in resolved:
138-
entry.url = resolved[entry.url]
135+
if response.needs_url_resolution:
136+
google_urls = [e.url for e, _ in fetch_batch if "news.google.com/" in e.url]
137+
if google_urls:
138+
resolved = await resolve_google_news_urls(google_urls, timeout=feed_fetch_timeout)
139+
for entry, _ in fetch_batch:
140+
if entry.url in resolved:
141+
entry.url = resolved[entry.url]
139142

140143
# 4. Extract content concurrently with semaphore (only for entries needing fetch)
141144
semaphore = asyncio.Semaphore(concurrency)
@@ -163,6 +166,7 @@ async def _extract(entry: FeedEntry) -> str:
163166
content_hash=content_hash,
164167
raw_content=reused_content,
165168
source_feed=entry.source_feed,
169+
source_provider=response.provider_name,
166170
)
167171
try:
168172
created = create_article(conn, article)
@@ -183,6 +187,7 @@ async def _extract(entry: FeedEntry) -> str:
183187
content_hash=content_hash,
184188
raw_content=content if isinstance(content, str) and content else None,
185189
source_feed=entry.source_feed,
190+
source_provider=response.provider_name,
186191
)
187192
try:
188193
created = create_article(conn, article)

app/scraping/providers.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""News search provider definitions.
2+
3+
Each provider knows how to build a feed URL for a topic. The
4+
``NewsProvider`` Protocol defines the interface; concrete classes
5+
implement it for specific news sources.
6+
"""
7+
8+
from typing import Protocol
9+
from urllib.parse import quote_plus
10+
11+
from app.models import Topic
12+
13+
14+
def _build_search_query(topic: Topic) -> str:
15+
"""Build a search query string from topic name and description.
16+
17+
Shared by all providers that use keyword-based search URLs.
18+
Includes the topic name plus the first 6 words of the description
19+
(if any) for additional context.
20+
"""
21+
query_parts = [topic.name]
22+
if topic.description:
23+
desc_words = topic.description.split()[:6]
24+
if desc_words:
25+
query_parts.append(" ".join(desc_words))
26+
return " ".join(query_parts)
27+
28+
29+
class NewsProvider(Protocol):
    """Structural interface for news search providers.

    Satisfied by duck typing (``typing.Protocol``) — concrete classes
    such as ``BingNewsProvider`` and ``GoogleNewsProvider`` implement
    it without subclassing.
    """

    # Stable provider identifier, e.g. "bing_news" or "google_news".
    name: str
    # Whether the provider requires an API key. Both built-in
    # providers declare False; the flag exists for future providers.
    requires_api_key: bool

    def build_feed_url(self, topic: Topic) -> str:
        """Return the provider's RSS search-feed URL for *topic*."""
        ...

    def needs_url_resolution(self) -> bool:
        """Return True if feed entry URLs are redirects that must be
        resolved before fetching content (e.g. Google News)."""
        ...
38+
39+
40+
class BingNewsProvider:
    """Bing News RSS provider.

    Entry URLs point straight at the source article, so no redirect
    resolution step is needed after fetching the feed.
    """

    name = "bing_news"
    requires_api_key = False

    _TEMPLATE = "https://www.bing.com/news/search?q={query}&format=rss"

    def build_feed_url(self, topic: Topic) -> str:
        """Return the Bing News search RSS URL for *topic*."""
        return self._TEMPLATE.format(query=quote_plus(_build_search_query(topic)))

    def needs_url_resolution(self) -> bool:
        """Bing entry URLs are already final — nothing to resolve."""
        return False
52+
53+
54+
class GoogleNewsProvider:
    """Google News RSS provider.

    Entry URLs are news.google.com redirects, so callers must resolve
    them asynchronously before fetching article content.
    """

    name = "google_news"
    requires_api_key = False

    _TEMPLATE = "https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en"

    def build_feed_url(self, topic: Topic) -> str:
        """Return the Google News search RSS URL for *topic*."""
        encoded = quote_plus(_build_search_query(topic))
        return self._TEMPLATE.format(query=encoded)

    def needs_url_resolution(self) -> bool:
        """Google entry URLs are redirects — resolution is required."""
        return True

app/scraping/routing.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""Provider routing with health-based cascade.
2+
3+
Tracks per-provider health in-memory and selects the first healthy
4+
provider for each check cycle. Separate from the per-URL ``feed_health``
5+
table (which tracks individual feed URLs for the UI dashboard).
6+
"""
7+
8+
import logging
9+
from dataclasses import dataclass, field
10+
from datetime import UTC, datetime, timedelta
11+
12+
from app.scraping.providers import BingNewsProvider, GoogleNewsProvider, NewsProvider
13+
14+
logger = logging.getLogger(__name__)
15+
16+
# Default provider priority order. Bing is tried first (its entry URLs
# are direct, no redirect resolution needed); Google second (broadest
# coverage, but its redirect URLs need async resolution).
DEFAULT_PROVIDERS: list[NewsProvider] = [BingNewsProvider(), GoogleNewsProvider()]

# Circuit-breaker tuning: after _FAILURE_THRESHOLD consecutive failures
# a provider is skipped for _UNHEALTHY_COOLDOWN before being retried.
_UNHEALTHY_COOLDOWN = timedelta(minutes=30)
_FAILURE_THRESHOLD = 3
22+
23+
24+
@dataclass
class _ProviderHealth:
    """Mutable per-provider failure record kept by ProviderRouter."""

    # Failures since the last success; the record is discarded entirely
    # when the provider succeeds (see ProviderRouter.mark_healthy).
    consecutive_failures: int = 0
    # UTC timestamp of the most recent failure; None until one occurs.
    last_failure: datetime | None = None
28+
29+
30+
@dataclass
31+
class ProviderRouter:
32+
"""Selects providers based on health state.
33+
34+
Health is tracked in-memory: 3+ consecutive failures marks a
35+
provider unhealthy for 30 minutes. State resets on app restart
36+
(desirable, transient failures don't persist).
37+
"""
38+
39+
providers: list[NewsProvider] = field(default_factory=lambda: list(DEFAULT_PROVIDERS))
40+
_health: dict[str, _ProviderHealth] = field(default_factory=dict)
41+
42+
def get_provider(self) -> NewsProvider:
43+
"""Return the first healthy provider."""
44+
for provider in self.providers:
45+
if self._is_healthy(provider.name):
46+
return provider
47+
# All unhealthy — return first (best effort, cooldown will expire)
48+
return self.providers[0]
49+
50+
def get_next_provider(self, after: NewsProvider) -> NewsProvider | None:
51+
"""Return the next healthy provider after the given one, or None."""
52+
found = False
53+
for provider in self.providers:
54+
if found and self._is_healthy(provider.name):
55+
return provider
56+
if provider.name == after.name:
57+
found = True
58+
return None
59+
60+
def mark_unhealthy(self, provider_name: str) -> None:
61+
"""Record a failure for a provider."""
62+
health = self._health.setdefault(provider_name, _ProviderHealth())
63+
health.consecutive_failures += 1
64+
health.last_failure = datetime.now(UTC)
65+
logger.debug(
66+
"Provider %s: failure %d/%d",
67+
provider_name,
68+
health.consecutive_failures,
69+
_FAILURE_THRESHOLD,
70+
)
71+
72+
def mark_healthy(self, provider_name: str) -> None:
73+
"""Reset failure count for a provider on success."""
74+
if provider_name in self._health:
75+
del self._health[provider_name]
76+
77+
def _is_healthy(self, provider_name: str) -> bool:
78+
health = self._health.get(provider_name)
79+
if not health or health.consecutive_failures < _FAILURE_THRESHOLD:
80+
return True
81+
# Cooldown expired — give it another chance
82+
return bool(health.last_failure and datetime.now(UTC) - health.last_failure > _UNHEALTHY_COOLDOWN)
83+
84+
85+
# Module-level singleton — all callers (scheduler, CLI, web layer)
# import this same instance so provider health accumulates in one
# place for the whole process.
router = ProviderRouter()

0 commit comments

Comments
 (0)