Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/feed-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ name: Feed Validation
#
# Triggers:
# - push to main: catches drift introduced by merged registry edits
# - schedule (every 6h): catches third-party feed outages on a cadence
# operators can act on without staring at PR checks
# - schedule (daily 00:00 UTC): catches third-party feed outages on a cadence
# operators can act on without staring at PR checks. Earlier 6h cadence
# was 4× the necessary discovery rate — feed outages don't change that
# fast and 542 feeds × 4 runs/day was wasted runner-minutes + third-
# party-fetch volume that no one acted on.
# - workflow_dispatch: manual re-runs from the Actions UI
#
# The --ci flag enforces three guardrails inside scripts/validate-rss-feeds.mjs:
Expand All @@ -27,7 +30,7 @@ on:
- 'shared/rss-allowed-domains.json'
- '.github/workflows/feed-validation.yml'
schedule:
- cron: '0 */6 * * *'
- cron: '0 0 * * *'
workflow_dispatch:

permissions:
Expand Down
14 changes: 13 additions & 1 deletion api/_rss-allowed-domains.js
Original file line number Diff line number Diff line change
Expand Up @@ -311,5 +311,17 @@ export default [
"hirado.hu",
"portfolio.hu",
"www.portfolio.hu",
"www.atv.hu"
"www.atv.hu",
"abcnews.go.com",
"abcnews.com",
"www.corriere.it",
"www.rt.com",
"www.alarabiya.net",
"tuoitrenews.vn",
"www.yonhapnewstv.co.kr",
"www.chosun.com",
"rss.libsyn.com",
"feeds.megaphone.fm",
"rss.art19.com",
"idp.nature.com"
];
14 changes: 13 additions & 1 deletion scripts/shared/rss-allowed-domains.json
Original file line number Diff line number Diff line change
Expand Up @@ -308,5 +308,17 @@
"hirado.hu",
"portfolio.hu",
"www.portfolio.hu",
"www.atv.hu"
"www.atv.hu",
"abcnews.go.com",
"abcnews.com",
"www.corriere.it",
"www.rt.com",
"www.alarabiya.net",
"tuoitrenews.vn",
"www.yonhapnewstv.co.kr",
"www.chosun.com",
"rss.libsyn.com",
"feeds.megaphone.fm",
"rss.art19.com",
"idp.nature.com"
]
60 changes: 54 additions & 6 deletions scripts/validate-rss-feeds.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@ const FETCH_TIMEOUT = 15_000;
const CONCURRENCY = 10;
const STALE_DAYS = 30;

/**
 * Error-message prefixes that mark a failure as config/SSRF-guardrail drift.
 *
 * Single source of truth shared by the code that throws these errors
 * (assertCiAllowed, fetchFeed) and the isConfigDrift classifier, which matches
 * a result's `detail` string with `startsWith` against every value here.
 * Keeping the text in one place means a reworded message (e.g. dropping
 * `(--ci)`) changes the throw site and the classifier together, instead of
 * silently turning a hard failure into a soft warning.
 *
 * The object is frozen so the set of reasons cannot be mutated at runtime.
 */
const CONFIG_DRIFT_REASONS = Object.freeze({
  // The raw URL string could not be parsed by `new URL(...)`.
  INVALID_URL: 'Invalid URL',

  // The parsed URL's protocol is not `https:`; the protocol is appended after the prefix.
  NON_HTTPS: 'Non-https scheme rejected in --ci mode:',

  // The parsed hostname failed the allowlist check; the hostname is appended after the prefix.
  HOST_NOT_ALLOWED: 'Host not in allowlist (--ci):',

  // Thrown by fetchFeed when it gives up following redirects.
  TOO_MANY_REDIRECTS: 'Too many redirects',
});

// --ci flag hardens the validator for trusted-context CI runs (push-to-main
// + schedule workflow). NOT enabled in PR CI — PR CI never runs this script
// because PR contributors can rewrite feeds.ts to make GitHub runners hit
Expand Down Expand Up @@ -95,13 +107,13 @@ function assertCiAllowed(rawUrl) {
try {
parsed = new URL(rawUrl);
} catch {
throw new Error('Invalid URL');
throw new Error(CONFIG_DRIFT_REASONS.INVALID_URL);
}
if (parsed.protocol !== 'https:') {
throw new Error(`Non-https scheme rejected in --ci mode: ${parsed.protocol}`);
throw new Error(`${CONFIG_DRIFT_REASONS.NON_HTTPS} ${parsed.protocol}`);
}
if (!isAllowedDomain(parsed.hostname)) {
throw new Error(`Host not in allowlist (--ci): ${parsed.hostname}`);
throw new Error(`${CONFIG_DRIFT_REASONS.HOST_NOT_ALLOWED} ${parsed.hostname}`);
}
return parsed;
}
Expand Down Expand Up @@ -144,7 +156,7 @@ async function fetchFeed(url) {
// the headers handshake is what we wanted bounded per hop.
return await resp.text();
}
throw new Error('Too many redirects');
throw new Error(CONFIG_DRIFT_REASONS.TOO_MANY_REDIRECTS);
}
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT);
Expand All @@ -162,7 +174,10 @@ async function fetchFeed(url) {
}

function parseNewestDate(xml) {
const parser = new XMLParser({ ignoreAttributes: false });
// processEntities:false — we only read date strings, never decode entity-bearing content.
// fast-xml-parser v5's default entity-expansion threshold trips on legit large feeds
// (Guardian, Fox, Axios, CISA, WHO, MIT, …) and produces false-positive DEAD rows.
const parser = new XMLParser({ ignoreAttributes: false, processEntities: false });
const doc = parser.parse(xml);

const dates = [];
Expand Down Expand Up @@ -295,7 +310,40 @@ async function main() {
console.log(`Summary: ${ok.length} OK, ${stale.length} stale, ${dead.length} dead, ${empty.length} empty` +
(skipped.length ? `, ${skipped.length} skipped` : ''));

if (stale.length || dead.length) process.exit(1);
// Exit policy:
// HARD-FAIL on config/SSRF-guard drift — these are bugs the maintainer can fix.
// Reasons enumerated in CONFIG_DRIFT_REASONS (top of file). Both the throwing
// sites and this classifier consume the same constants so a future reword
// can't silently demote a hard fail to a warning.
// SOFT-FAIL (exit 0 with warning) on third-party state — third-party 4xx/timeouts,
// STALE feeds, EMPTY feeds. These rot naturally; failing the build on them
// produces 100% CI noise and the prior workflow proved no one acts on it.
// Promoting third-party failures to hard-fail requires a registry-cleanup PR
// first; revisit once the long tail is groomed.
const isConfigDrift = (r) =>
typeof r.detail === 'string' &&
Object.values(CONFIG_DRIFT_REASONS).some(prefix => r.detail.startsWith(prefix));
const configDrift = dead.filter(isConfigDrift);
const thirdPartyDead = dead.filter(r => !isConfigDrift(r));

if (configDrift.length) {
console.error(
`\nFAIL: ${configDrift.length} feed(s) violate the CI guardrails ` +
`(allowlist drift or plaintext URL). Fix src/config/feeds.ts and/or the 5 ` +
`allowlist mirrors (shared/rss-allowed-domains.json, .cjs, ` +
`scripts/shared/rss-allowed-domains.json, ` +
`api/_rss-allowed-domains.js, vite.config.ts:RSS_PROXY_ALLOWED_DOMAINS).`
);
process.exit(1);
}

if (stale.length || thirdPartyDead.length || empty.length) {
console.warn(
`\nWARN: ${thirdPartyDead.length} third-party dead, ${stale.length} stale, ` +
`${empty.length} empty. Third-party state — not a build failure. ` +
`Groom src/config/feeds.ts when the count crosses a threshold worth a PR.`
);
}
}

main().catch(err => {
Expand Down
2 changes: 1 addition & 1 deletion server/worldmonitor/news/v1/_feeds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ export const VARIANT_FEEDS: Record<string, Record<string, ServerFeed[]>> = {
{ name: 'Product Hunt', url: 'https://www.producthunt.com/feed' },
],
hardware: [
{ name: "Tom's Hardware", url: 'https://www.tomshardware.com/feeds/all' },
{ name: "Tom's Hardware", url: 'https://www.tomshardware.com/feeds.xml' },
{ name: 'SemiAnalysis', url: 'https://www.semianalysis.com/feed' },
{ name: 'Semiconductor News', url: gn('semiconductor OR chip OR TSMC OR NVIDIA OR Intel when:3d') },
],
Expand Down
14 changes: 13 additions & 1 deletion shared/rss-allowed-domains.json
Original file line number Diff line number Diff line change
Expand Up @@ -308,5 +308,17 @@
"hirado.hu",
"portfolio.hu",
"www.portfolio.hu",
"www.atv.hu"
"www.atv.hu",
"abcnews.go.com",
"abcnews.com",
"www.corriere.it",
"www.rt.com",
"www.alarabiya.net",
"tuoitrenews.vn",
"www.yonhapnewstv.co.kr",
"www.chosun.com",
"rss.libsyn.com",
"feeds.megaphone.fm",
"rss.art19.com",
"idp.nature.com"
]
4 changes: 2 additions & 2 deletions src/config/feeds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ const FULL_FEEDS: Record<string, Feed[]> = {
{ name: 'Al Arabiya', url: { en: rss('https://news.google.com/rss/search?q=site:english.alarabiya.net+when:2d&hl=en-US&gl=US&ceid=US:en'), ar: rss('https://www.alarabiya.net/tools/mrss/?cat=main') } },
// Arab News and Times of Israel removed — 403 from cloud IPs
{ name: 'Guardian ME', url: rss('https://www.theguardian.com/world/middleeast/rss') },
{ name: 'BBC Persian', url: rss('http://feeds.bbci.co.uk/persian/tv-and-radio-37434376/rss.xml') },
{ name: 'BBC Persian', url: rss('https://feeds.bbci.co.uk/persian/rss.xml') },
{ name: 'Iran International', url: rss('https://news.google.com/rss/search?q=site:iranintl.com+when:2d&hl=en-US&gl=US&ceid=US:en') },
{ name: 'Fars News', url: rss('https://news.google.com/rss/search?q=site:farsnews.ir+when:2d&hl=en-US&gl=US&ceid=US:en') },
{ name: 'IRNA', url: rss('https://en.irna.ir/rss') },
Expand Down Expand Up @@ -623,7 +623,7 @@ const TECH_FEEDS: Record<string, Feed[]> = {
{ name: 'Seeking Alpha Tech', url: rss('https://seekingalpha.com/market_currents.xml') },
],
hardware: [
{ name: "Tom's Hardware", url: rss('https://www.tomshardware.com/feeds/all') },
{ name: "Tom's Hardware", url: rss('https://www.tomshardware.com/feeds.xml') },
{ name: 'SemiAnalysis', url: rss('https://news.google.com/rss/search?q=site:semianalysis.com+when:7d&hl=en-US&gl=US&ceid=US:en') },
{ name: 'Semiconductor News', url: rss('https://news.google.com/rss/search?q=semiconductor+OR+chip+OR+TSMC+OR+NVIDIA+OR+Intel+when:3d&hl=en-US&gl=US&ceid=US:en') },
],
Expand Down
4 changes: 4 additions & 0 deletions vite.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,10 @@ const RSS_PROXY_ALLOWED_DOMAINS = new Set([
'www.goodnewsnetwork.org', 'www.positive.news', 'reasonstobecheerful.world',
'www.optimistdaily.com', 'www.sunnyskyz.com', 'www.huffpost.com',
'www.sciencedaily.com', 'feeds.nature.com', 'www.livescience.com', 'www.newscientist.com',
// Feed-registry coverage (PR fix/feed-validation-unblock — kept in sync with shared/rss-allowed-domains.json)
'abcnews.go.com', 'abcnews.com', 'www.corriere.it', 'www.rt.com', 'www.alarabiya.net', 'tuoitrenews.vn',
'www.yonhapnewstv.co.kr', 'www.chosun.com', 'rss.libsyn.com', 'feeds.megaphone.fm', 'rss.art19.com',
'idp.nature.com',
]);

function rssProxyPlugin(): Plugin {
Expand Down
Loading