Skip to content

Commit ec8c43f

Browse files
committed
feat: add alternate RSS feed providers
1 parent a1647f0 commit ec8c43f

17 files changed

Lines changed: 717 additions & 79 deletions

TODOS.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# TODOs
2+
3+
Deferred work items from eng review and design discussions.
4+
5+
## GDELT as Third News Provider
6+
7+
**What:** Add GDELT (Global Database of Events, Language, and Tone) as a third news provider.
8+
9+
**Why:** Diversifies beyond Bing+Google. GDELT covers global events with no rate limiting, providing a fallback that doesn't depend on commercial search engines.
10+
11+
**Pros:** Better geographic coverage, no rate limiting, tests the Protocol's extensibility beyond RSS.
12+
13+
**Cons:** Returns JSON (not RSS), so the `NewsProvider` Protocol needs a `fetch_entries()` method or a JSON-to-FeedEntry adapter. Non-trivial change to the provider interface.
14+
15+
**Context:** Design doc Open Question #1. The current `build_feed_url()` + `fetch_feed()` pattern only works for RSS providers. GDELT would need either a new Protocol method or an adapter that converts JSON responses to `FeedEntry` objects.
16+
17+
**Depends on:** Multi-provider PR (provider registry + router).
18+
19+
## Per-Topic Provider Configuration
20+
21+
**What:** Add user-facing config (`config.yml`) to set provider priority order and enable/disable providers globally or per topic.
22+
23+
**Why:** Power users on restricted networks (China, Russia) may need to disable Google entirely. Per-topic pinning lets users optimize provider choice for specific topic domains.
24+
25+
**Pros:** Maximum flexibility for self-hosted users on restricted networks.
26+
27+
**Cons:** Adds config surface area. Violates "user doesn't think about it" for the common case. Only valuable with 3+ providers.
28+
29+
**Context:** Design doc Approach C, rejected for the initial multi-provider PR. Auto-routing handles the common case. Only worth building when the provider ecosystem grows.
30+
31+
**Depends on:** Multi-provider PR + at least one more provider (e.g. GDELT).

app/crud.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,9 @@ def create_article(conn: sqlite3.Connection, article: Article) -> Article:
314314
data = article.to_insert_dict()
315315
cursor = conn.execute(
316316
"""INSERT INTO articles (topic_id, title, url, content_hash,
317-
raw_content, source_feed, fetched_at, processed)
317+
raw_content, source_feed, source_provider, fetched_at, processed)
318318
VALUES (:topic_id, :title, :url, :content_hash,
319-
:raw_content, :source_feed, :fetched_at, :processed)""",
319+
:raw_content, :source_feed, :source_provider, :fetched_at, :processed)""",
320320
data,
321321
)
322322
article.id = cursor.lastrowid

app/migrations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from app.migrations.m006_topic_tags import up as m006_up
1616
from app.migrations.m007_feed_health import up as m007_up
1717
from app.migrations.m008_interval_minutes import up as m008_up
18+
from app.migrations.m009_article_provider import up as m009_up
1819

1920
MIGRATIONS: list[tuple[int, str, Callable[[sqlite3.Connection], None]]] = [
2021
(1, "baseline schema version", m001_up),
@@ -25,4 +26,5 @@
2526
(6, "add tags column to topics", m006_up),
2627
(7, "add feed_health table", m007_up),
2728
(8, "add check_interval_minutes column to topics", m008_up),
29+
(9, "add source_provider column to articles", m009_up),
2830
]
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""Add source_provider column to articles table.
2+
3+
Tracks which news provider (e.g. 'bing_news', 'google_news') was used
4+
to fetch each article. NULL for articles fetched before multi-provider
5+
support and for MANUAL mode articles.
6+
"""
7+
8+
import sqlite3
9+
10+
11+
def up(conn: sqlite3.Connection) -> None:
    """Add the nullable ``source_provider`` TEXT column to ``articles``.

    The column stays NULL for rows created before multi-provider
    support (and, per the module docstring, for MANUAL mode articles).
    """
    statement = "ALTER TABLE articles ADD COLUMN source_provider TEXT"
    conn.execute(statement)

app/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class Article(BaseModel):
8282
content_hash: str
8383
raw_content: str | None = None
8484
source_feed: str
85+
source_provider: str | None = None
8586
fetched_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
8687
processed: bool = False
8788

app/scraping/__init__.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
)
2020
from app.models import Article, Topic
2121
from app.scraping.content import extract_article_content
22-
from app.scraping.google_news import is_google_news_url, resolve_google_news_urls
22+
from app.scraping.google_news import resolve_google_news_urls
2323
from app.scraping.relevance import score_relevance
2424
from app.scraping.rss import FeedEntry, compute_article_hash, fetch_feeds_for_topic
25+
from app.scraping.rss import FeedResponse as FeedResponse
2526

2627
logger = logging.getLogger(__name__)
2728

@@ -75,12 +76,13 @@ def callback(feed_url, success, error_msg):
7576
return callback
7677

7778
# 1. Fetch all feed entries
78-
entries = await fetch_feeds_for_topic(
79+
response = await fetch_feeds_for_topic(
7980
topic,
8081
timeout=feed_fetch_timeout,
8182
max_attempts=feed_max_retries,
8283
health_callback=_make_health_callback(conn),
8384
)
85+
entries = response.entries
8486
if not entries:
8587
return FetchResult(articles=[], total_feed_entries=0)
8688

@@ -130,12 +132,13 @@ def callback(feed_url, success, error_msg):
130132

131133
# 3b. Resolve Google News redirect URLs for entries that need content fetching.
132134
# Done after dedup+limiting to minimize requests (typically ~10 URLs, not 100).
133-
google_urls = [e.url for e, _ in fetch_batch if is_google_news_url(e.url)]
134-
if google_urls:
135-
resolved = await resolve_google_news_urls(google_urls, timeout=feed_fetch_timeout)
136-
for entry, _ in fetch_batch:
137-
if entry.url in resolved:
138-
entry.url = resolved[entry.url]
135+
if response.needs_url_resolution:
136+
google_urls = [e.url for e, _ in fetch_batch if "news.google.com/" in e.url]
137+
if google_urls:
138+
resolved = await resolve_google_news_urls(google_urls, timeout=feed_fetch_timeout)
139+
for entry, _ in fetch_batch:
140+
if entry.url in resolved:
141+
entry.url = resolved[entry.url]
139142

140143
# 4. Extract content concurrently with semaphore (only for entries needing fetch)
141144
semaphore = asyncio.Semaphore(concurrency)
@@ -163,6 +166,7 @@ async def _extract(entry: FeedEntry) -> str:
163166
content_hash=content_hash,
164167
raw_content=reused_content,
165168
source_feed=entry.source_feed,
169+
source_provider=response.provider_name,
166170
)
167171
try:
168172
created = create_article(conn, article)
@@ -183,6 +187,7 @@ async def _extract(entry: FeedEntry) -> str:
183187
content_hash=content_hash,
184188
raw_content=content if isinstance(content, str) and content else None,
185189
source_feed=entry.source_feed,
190+
source_provider=response.provider_name,
186191
)
187192
try:
188193
created = create_article(conn, article)

app/scraping/providers.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""News search provider definitions.
2+
3+
Each provider knows how to build a feed URL for a topic. The
4+
``NewsProvider`` Protocol defines the interface; concrete classes
5+
implement it for specific news sources.
6+
"""
7+
8+
from typing import Protocol
9+
from urllib.parse import quote_plus
10+
11+
from app.models import Topic
12+
13+
14+
def _build_search_query(topic: Topic) -> str:
15+
"""Build a search query string from topic name and description.
16+
17+
Shared by all providers that use keyword-based search URLs.
18+
Includes the topic name plus the first 6 words of the description
19+
(if any) for additional context.
20+
"""
21+
query_parts = [topic.name]
22+
if topic.description:
23+
desc_words = topic.description.split()[:6]
24+
if desc_words:
25+
query_parts.append(" ".join(desc_words))
26+
return " ".join(query_parts)
27+
28+
29+
class NewsProvider(Protocol):
    """Structural interface for news search providers.

    Satisfied by duck typing (``typing.Protocol``) — concrete classes
    such as ``BingNewsProvider`` and ``GoogleNewsProvider`` implement
    it without subclassing.
    """

    # Stable provider identifier, e.g. "bing_news" or "google_news".
    name: str
    # Whether the provider requires an API key. Both built-in
    # providers declare False; the flag exists for future providers.
    requires_api_key: bool

    def build_feed_url(self, topic: Topic) -> str:
        """Return the provider's RSS search-feed URL for *topic*."""
        ...

    def needs_url_resolution(self) -> bool:
        """Return True if feed entry URLs are redirects that must be
        resolved before fetching content (e.g. Google News)."""
        ...
38+
39+
40+
class BingNewsProvider:
    """Bing News RSS provider.

    Entry URLs point straight at the source article, so no redirect
    resolution step is needed after fetching the feed.
    """

    name = "bing_news"
    requires_api_key = False

    _TEMPLATE = "https://www.bing.com/news/search?q={query}&format=rss"

    def build_feed_url(self, topic: Topic) -> str:
        """Return the Bing News search RSS URL for *topic*."""
        return self._TEMPLATE.format(query=quote_plus(_build_search_query(topic)))

    def needs_url_resolution(self) -> bool:
        """Bing entry URLs are already final — nothing to resolve."""
        return False
52+
53+
54+
class GoogleNewsProvider:
    """Google News RSS provider.

    Entry URLs are news.google.com redirects, so callers must resolve
    them asynchronously before fetching article content.
    """

    name = "google_news"
    requires_api_key = False

    _TEMPLATE = "https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en"

    def build_feed_url(self, topic: Topic) -> str:
        """Return the Google News search RSS URL for *topic*."""
        encoded = quote_plus(_build_search_query(topic))
        return self._TEMPLATE.format(query=encoded)

    def needs_url_resolution(self) -> bool:
        """Google entry URLs are redirects — resolution is required."""
        return True

app/scraping/routing.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""Provider routing with health-based cascade.
2+
3+
Tracks per-provider health in-memory and selects the first healthy
4+
provider for each check cycle. Separate from the per-URL ``feed_health``
5+
table (which tracks individual feed URLs for the UI dashboard).
6+
"""
7+
8+
import logging
9+
from dataclasses import dataclass, field
10+
from datetime import UTC, datetime, timedelta
11+
12+
from app.scraping.providers import BingNewsProvider, GoogleNewsProvider, NewsProvider
13+
14+
logger = logging.getLogger(__name__)
15+
16+
# Default provider priority order. Bing is tried first (its entry URLs
# are direct, no redirect resolution needed); Google second (broadest
# coverage, but its redirect URLs need async resolution).
DEFAULT_PROVIDERS: list[NewsProvider] = [BingNewsProvider(), GoogleNewsProvider()]

# Circuit-breaker tuning: after _FAILURE_THRESHOLD consecutive failures
# a provider is skipped for _UNHEALTHY_COOLDOWN before being retried.
_UNHEALTHY_COOLDOWN = timedelta(minutes=30)
_FAILURE_THRESHOLD = 3
22+
23+
24+
@dataclass
class _ProviderHealth:
    """Mutable per-provider failure record kept by ProviderRouter."""

    # Failures since the last success; the record is discarded entirely
    # when the provider succeeds (see ProviderRouter.mark_healthy).
    consecutive_failures: int = 0
    # UTC timestamp of the most recent failure; None until one occurs.
    last_failure: datetime | None = None
28+
29+
30+
@dataclass
31+
class ProviderRouter:
32+
"""Selects providers based on health state.
33+
34+
Health is tracked in-memory: 3+ consecutive failures marks a
35+
provider unhealthy for 30 minutes. State resets on app restart
36+
(desirable, transient failures don't persist).
37+
"""
38+
39+
providers: list[NewsProvider] = field(default_factory=lambda: list(DEFAULT_PROVIDERS))
40+
_health: dict[str, _ProviderHealth] = field(default_factory=dict)
41+
42+
def get_provider(self) -> NewsProvider:
43+
"""Return the first healthy provider."""
44+
for provider in self.providers:
45+
if self._is_healthy(provider.name):
46+
return provider
47+
# All unhealthy — return first (best effort, cooldown will expire)
48+
return self.providers[0]
49+
50+
def get_next_provider(self, after: NewsProvider) -> NewsProvider | None:
51+
"""Return the next healthy provider after the given one, or None."""
52+
found = False
53+
for provider in self.providers:
54+
if found and self._is_healthy(provider.name):
55+
return provider
56+
if provider.name == after.name:
57+
found = True
58+
return None
59+
60+
def mark_unhealthy(self, provider_name: str) -> None:
61+
"""Record a failure for a provider."""
62+
health = self._health.setdefault(provider_name, _ProviderHealth())
63+
health.consecutive_failures += 1
64+
health.last_failure = datetime.now(UTC)
65+
logger.debug(
66+
"Provider %s: failure %d/%d",
67+
provider_name,
68+
health.consecutive_failures,
69+
_FAILURE_THRESHOLD,
70+
)
71+
72+
def mark_healthy(self, provider_name: str) -> None:
73+
"""Reset failure count for a provider on success."""
74+
if provider_name in self._health:
75+
del self._health[provider_name]
76+
77+
def _is_healthy(self, provider_name: str) -> bool:
78+
health = self._health.get(provider_name)
79+
if not health or health.consecutive_failures < _FAILURE_THRESHOLD:
80+
return True
81+
# Cooldown expired — give it another chance
82+
return bool(health.last_failure and datetime.now(UTC) - health.last_failure > _UNHEALTHY_COOLDOWN)
83+
84+
85+
# Module-level singleton — all callers (scheduler, CLI, web layer)
# import this same instance so provider health accumulates in one
# place for the whole process.
router = ProviderRouter()

0 commit comments

Comments
 (0)