Commit cc9eff8

check robots.txt before scraping and respect etags to not incur more server cost than needed
1 parent 106b5a2 commit cc9eff8
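
For context, the caching half of this commit boils down to conditional HTTP requests: remember the ETag / Last-Modified validators a feed returned, send them back as If-None-Match / If-Modified-Since on the next run, and skip all parsing when the server answers 304 Not Modified. The sketch below only illustrates that round trip (the helper name and example URL are invented; the actual implementation is in script/rss.ts further down):

// Minimal sketch of a conditional GET, assuming a runtime with the WHATWG
// fetch API (Bun, Node 18+). Not part of the commit itself.
type Validators = { etag?: string; lastModified?: string };

async function fetchIfChanged(
  url: string,
  prev: Validators
): Promise<{ body?: string; next: Validators }> {
  const res = await fetch(url, {
    headers: {
      ...(prev.etag ? { "If-None-Match": prev.etag } : {}),
      ...(prev.lastModified ? { "If-Modified-Since": prev.lastModified } : {}),
    },
  });

  // 304 means the cached copy is still valid – nothing was re-downloaded.
  if (res.status === 304) return { next: prev };

  return {
    body: await res.text(),
    next: {
      etag: res.headers.get("etag") ?? undefined,
      lastModified: res.headers.get("last-modified") ?? undefined,
    },
  };
}

// Example usage (placeholder URL):
// const { body, next } = await fetchIfChanged("https://example.com/feed.atom", {});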

File tree

2 files changed, +189 −45 lines changed


.cache/rss-cache.json

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+{
+  "https://gist.github.com/papes1ns.atom": {
+    "etag": "W/\"1385b5e54c606838b2dfb49968a91aa8\""
+  },
+  "https://spin.atomicobject.com/author/nathan-papes/feed/atom/": {
+    "etag": "W/\"167b45694557adbffd6ce95294611bd1\"",
+    "lastModified": "Tue, 29 Jul 2025 12:00:12 GMT"
+  },
+  "https://www.youtube.com/feeds/videos.xml?channel_id=UCPwv65XQty1QbqE04FitBlQ": {}
+}

script/rss.ts

Lines changed: 179 additions & 45 deletions
@@ -18,6 +18,96 @@ import { Content, Provider } from "@spin.dev/core";
 import fs from "fs/promises";
 import path from "path";
 // (Bun currently ships a TOML parser but no serializer, so we keep a tiny helper)
+// ---------------------------------------------------------------------------
+// Polite scraping constants & utilities -------------------------------------
+const USER_AGENT = "spin.dev/rss-bot (+https://natepapes.com)";
+const CONCURRENCY_LIMIT = 5;
+const CACHE_PATH = path.join(import.meta.dir, "..", ".cache", "rss-cache.json");
+
+type CacheEntry = { etag?: string; lastModified?: string };
+
+let feedCache: Record<string, CacheEntry> = {};
+try {
+  feedCache = JSON.parse(await fs.readFile(CACHE_PATH, "utf8"));
+} catch {
+  /* first run – cache will be created when we save later */
+}
+
+async function saveCache() {
+  await fs.mkdir(path.dirname(CACHE_PATH), { recursive: true });
+  await fs.writeFile(CACHE_PATH, JSON.stringify(feedCache, null, 2));
+}
+
+/** Simple semaphore enforcing max parallel requests per domain */
+function createSemaphore(limit: number) {
+  let active = 0;
+  const queue: Array<() => void> = [];
+  const next = () => {
+    active--;
+    if (queue.length) queue.shift()!();
+  };
+  return async <T>(fn: () => Promise<T>): Promise<T> =>
+    new Promise<T>((res, rej) => {
+      const run = () => {
+        active++;
+        fn().then(res).catch(rej).finally(next);
+      };
+      active < limit ? run() : queue.push(run);
+    });
+}
+
+const acquire = createSemaphore(CONCURRENCY_LIMIT);
+
+/** Fetch wrapper adding UA + retry/back-off for 429/503 */
+async function fetchWithRetry(
+  url: string,
+  init: RequestInit = {},
+  retries = 3
+): Promise<Response> {
+  const opts: RequestInit = {
+    ...init,
+    headers: { "User-Agent": USER_AGENT, ...(init.headers as any) },
+    redirect: "follow",
+  };
+  for (let attempt = 0; attempt <= retries; attempt++) {
+    const res = await fetch(url, opts);
+    if (![429, 503].includes(res.status) || attempt === retries) return res;
+    const retryAfter =
+      Number(res.headers.get("retry-after")) * 1000 || 2 ** attempt * 1000;
+    await new Promise((r) => setTimeout(r, retryAfter));
+  }
+  throw new Error(`Failed after ${retries + 1} retries → ${url}`);
+}
+
+// robots.txt disallow cache per origin
+const robotsCache: Record<string, string[]> = {};
+async function isAllowed(target: URL): Promise<boolean> {
+  const origin = target.origin;
+  if (!(origin in robotsCache)) {
+    try {
+      const res = await fetchWithRetry(`${origin}/robots.txt`);
+      if (!res.ok) throw new Error();
+      const txt = await res.text();
+      const disallow: string[] = [];
+      let inGlobal = false;
+      for (const line of txt.split("\n")) {
+        const trimmed = line.trim();
+        if (/^user-agent:\s*\*/i.test(trimmed)) inGlobal = true;
+        else if (/^user-agent:/i.test(trimmed)) inGlobal = false;
+        else if (inGlobal && /^disallow:/i.test(trimmed)) {
+          const parts = trimmed.split(":");
+          if (parts[1]) disallow.push(parts[1].trim());
+        }
+      }
+      robotsCache[origin] = disallow;
+    } catch {
+      robotsCache[origin] = []; // assume allowed if cannot fetch
+    }
+  }
+  const rules = robotsCache[origin] ?? [];
+  return rules.every((p) => !target.pathname.startsWith(p));
+}
+// ---------------------------------------------------------------------------

 // ---------- Helpers --------------------------------------------------------

@@ -86,7 +176,7 @@ function decodeEntities(str: string): string {
 async function enrichContent(url: string): Promise<Partial<Content>> {
   const partial: Partial<Content> = {};

-  const response = await fetch(url, { redirect: "follow" });
+  const response = await fetchWithRetry(url);

   const rewriter = new HTMLRewriter()
     .on("meta[name='twitter:data2']", {
@@ -189,7 +279,7 @@ function parseFeedEntries(xml: string): Array<{
 // ---------- Main script ----------------------------------------------------

 const ROOT = path.join(import.meta.dir, "..", "providers");
-await fs.rm(ROOT, { recursive: true, force: true });
+// No global deletion; we remove per-provider directory only when feed changed.

 const rssConfigPath = path.join(import.meta.dir, "..", "rss", "rss.toml");
 const configModule = await import(rssConfigPath, { with: { type: "toml" } });
@@ -204,64 +294,108 @@ for (const [providerId, cfg] of Object.entries(configs)) {
     rss: cfg.rss ?? cfg.url ?? "",
   });

-  // Fetch & parse feed ------------------------------------------------------
+  // Fetch & parse feed (ETag/Last-Modified cache) ---------------------------
   console.log(`Fetching feed for ${providerId}…`);
-  const feedResponse = await fetch(providerMeta.rss, { redirect: "follow" });
+
+  const cacheEntry = feedCache[providerMeta.rss] ?? {};
+
+  const feedResponse = await fetchWithRetry(providerMeta.rss, {
+    headers: {
+      ...(cacheEntry.etag ? { "If-None-Match": cacheEntry.etag } : {}),
+      ...(cacheEntry.lastModified
+        ? { "If-Modified-Since": cacheEntry.lastModified }
+        : {}),
+    },
+  });
+
+  if (feedResponse.status === 304) {
+    console.log("ℹ︎ Feed unchanged (304) – skipping.");
+    continue;
+  }
+
+  if (!feedResponse.ok)
+    throw new Error(
+      `Failed to fetch feed (${feedResponse.status}) → ${providerMeta.rss}`
+    );
+
+  feedCache[providerMeta.rss] = {
+    etag: feedResponse.headers.get("etag") ?? undefined,
+    lastModified: feedResponse.headers.get("last-modified") ?? undefined,
+  };
+
   const feedXml = await feedResponse.text();
   const entries = parseFeedEntries(feedXml).slice(0, 30); // limit to 30 most recent

-  // Generate content files in parallel -------------------------------------
+  // If feed is modified, clear existing provider directory to regenerate fresh.
+  const providerDir = path.join(ROOT, providerId);
+
+  // remove old provider directory (including provider.toml & content) to avoid stale files
+  await fs.rm(providerDir, { recursive: true, force: true });
+
+  // Generate content files (concurrency-guarded) ----------------------------
   const usedSlugs = new Set<string>();

   await Promise.all(
-    entries.map(async (entry) => {
-      const rawSlug = slugify(entry.title) || slugify(entry.link);
-      const contentId = uniqueSlug(rawSlug, usedSlugs);
-      usedSlugs.add(contentId);
-
-      const baseContent: Content = Content.parse({
-        id: contentId.toLowerCase(),
-        title: decodeEntities(entry.title),
-        description: decodeEntities(entry.summary ?? ""),
-        url: entry.link,
-        created_at: entry.published.endsWith("Z")
-          ? entry.published
-          : new Date(entry.published).toISOString().replace(/\.\d+Z$/, "Z"), // ISO-8601 string
-        ...(entry.tags?.length ? { tags: entry.tags.map(decodeEntities) } : {}),
-      });
-
-      const enriched = await enrichContent(entry.link);
-
-      if (enriched.title) enriched.title = decodeEntities(enriched.title);
-      if (enriched.description)
-        enriched.description = decodeEntities(enriched.description);
-
-      if (enriched.tags || baseContent.tags) {
-        const merged = Array.from(
-          new Set([...(baseContent.tags ?? []), ...(enriched.tags ?? [])])
+    entries.map((entry) =>
+      acquire(async () => {
+        const linkURL = new URL(entry.link);
+        if (!(await isAllowed(linkURL))) {
+          console.warn(`⚠︎ robots.txt disallow – skipping ${entry.link}`);
+          return;
+        }
+
+        const rawSlug = slugify(entry.title) || slugify(entry.link);
+        const contentId = uniqueSlug(rawSlug, usedSlugs);
+        usedSlugs.add(contentId);
+
+        const baseContent: Content = Content.parse({
+          id: contentId.toLowerCase(),
+          title: decodeEntities(entry.title),
+          description: decodeEntities(entry.summary ?? ""),
+          url: entry.link,
+          created_at: entry.published.endsWith("Z")
+            ? entry.published
+            : new Date(entry.published).toISOString().replace(/\.\d+Z$/, "Z"),
+          ...(entry.tags?.length
+            ? { tags: entry.tags.map(decodeEntities) }
+            : {}),
+        });
+
+        const enriched = await enrichContent(entry.link);
+
+        if (enriched.title) enriched.title = decodeEntities(enriched.title);
+        if (enriched.description)
+          enriched.description = decodeEntities(enriched.description);
+
+        if (enriched.tags || baseContent.tags) {
+          const merged = Array.from(
+            new Set([...(baseContent.tags ?? []), ...(enriched.tags ?? [])])
+          );
+          if (merged.length) baseContent.tags = merged.map(decodeEntities);
+          delete (enriched as any).tags;
+        }
+
+        const content: Content = Content.parse({ ...baseContent, ...enriched });
+
+        const contentDir = path.join(providerDir, "content");
+        await fs.mkdir(contentDir, { recursive: true });
+        await fs.writeFile(
+          path.join(contentDir, `${contentId}.toml`),
+          toTOML(content)
         );
-        if (merged.length) baseContent.tags = merged.map(decodeEntities);
-        delete (enriched as any).tags;
-      }
-
-      const content: Content = Content.parse({ ...baseContent, ...enriched });
-
-      const contentDir = path.join(ROOT, providerId, "content");
-      await fs.mkdir(contentDir, { recursive: true });
-      await fs.writeFile(
-        path.join(contentDir, `${contentId}.toml`),
-        toTOML(content)
-      );
-    })
+      })
+    )
   );

   // Write provider metadata -----------------------------------------------
-  const providerDir = path.join(ROOT, providerId);
   await fs.mkdir(providerDir, { recursive: true });
   await fs.writeFile(
     path.join(providerDir, "provider.toml"),
     toTOML(providerMeta)
   );
 }

-console.log("✅ providers folder refreshed");
+// Persist updated cache once all providers processed
+await saveCache();
+
+console.log("✅ providers folder refreshed (polite mode enabled)");
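
As a quick reviewer aid, the robots.txt handling above reduces to a prefix match against the Disallow rules collected for the User-agent: * group. Below is a self-contained sketch of just that check; the robots.txt text and paths are made up, and the user-agent grouping is omitted for brevity:

// Illustration of the Disallow prefix match used by isAllowed(); the
// robots.txt content below is an invented example, not from a real site.
const robotsTxt = `
User-agent: *
Disallow: /private/
Disallow: /drafts
`;

// Collect the Disallow paths (single group, so no user-agent tracking needed here).
const disallow = robotsTxt
  .split("\n")
  .map((l) => l.trim())
  .filter((l) => /^disallow:/i.test(l))
  .map((l) => l.split(":")[1]?.trim() ?? "")
  .filter(Boolean);

const allowed = (pathname: string) => disallow.every((p) => !pathname.startsWith(p));

console.log(allowed("/blog/some-post"));   // true  – no disallowed prefix matches
console.log(allowed("/private/notes"));    // false – blocked by "Disallow: /private/"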
