@@ -18,6 +18,96 @@ import { Content, Provider } from "@spin.dev/core";
 import fs from "fs/promises";
 import path from "path";
 // (Bun currently ships a TOML parser but no serializer, so we keep a tiny helper)
+// ---------------------------------------------------------------------------
+// Polite scraping constants & utilities --------------------------------------
+const USER_AGENT = "spin.dev/rss-bot (+https://natepapes.com)";
+const CONCURRENCY_LIMIT = 5;
+const CACHE_PATH = path.join(import.meta.dir, "..", ".cache", "rss-cache.json");
+
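+// The cache maps feed URL → HTTP validators (ETag / Last-Modified) from the
+// previous successful fetch; they drive the conditional GETs further below.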
+type CacheEntry = { etag?: string; lastModified?: string };
+
+let feedCache: Record<string, CacheEntry> = {};
+try {
+  feedCache = JSON.parse(await fs.readFile(CACHE_PATH, "utf8"));
+} catch {
+  /* first run – cache will be created when we save later */
+}
+
+async function saveCache() {
+  await fs.mkdir(path.dirname(CACHE_PATH), { recursive: true });
+  await fs.writeFile(CACHE_PATH, JSON.stringify(feedCache, null, 2));
+}
+
+/** Simple semaphore capping concurrent requests (process-wide, not per domain) */
+function createSemaphore(limit: number) {
+  let active = 0;
+  const queue: Array<() => void> = [];
+  const next = () => {
+    active--;
+    if (queue.length) queue.shift()!();
+  };
+  return async <T>(fn: () => Promise<T>): Promise<T> =>
+    new Promise<T>((res, rej) => {
+      const run = () => {
+        active++;
+        fn().then(res).catch(rej).finally(next);
+      };
+      active < limit ? run() : queue.push(run);
+    });
+}
+
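+// All per-entry scraping below funnels through this single gate.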
+const acquire = createSemaphore(CONCURRENCY_LIMIT);
+
+/** Fetch wrapper adding UA + retry/back-off for 429/503 */
+async function fetchWithRetry(
+  url: string,
+  init: RequestInit = {},
+  retries = 3
+): Promise<Response> {
+  const opts: RequestInit = {
+    ...init,
+    headers: { "User-Agent": USER_AGENT, ...(init.headers as any) },
+    redirect: "follow",
+  };
+  for (let attempt = 0; attempt <= retries; attempt++) {
+    const res = await fetch(url, opts);
+    if (![429, 503].includes(res.status) || attempt === retries) return res;
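+    // A numeric Retry-After is seconds; an HTTP-date value makes Number()
+    // return NaN, which falls through to the exponential back-off.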
+    const retryAfter =
+      Number(res.headers.get("retry-after")) * 1000 || 2 ** attempt * 1000;
+    await new Promise((r) => setTimeout(r, retryAfter));
+  }
+  throw new Error(`Failed after ${retries + 1} retries → ${url}`);
+}
+
+// robots.txt disallow cache per origin
+const robotsCache: Record<string, string[]> = {};
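+// Minimal parsing: only Disallow rules in the `User-agent: *` group are
+// honoured; Allow, wildcards and Crawl-delay are out of scope here.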
+async function isAllowed(target: URL): Promise<boolean> {
+  const origin = target.origin;
+  if (!(origin in robotsCache)) {
+    try {
+      const res = await fetchWithRetry(`${origin}/robots.txt`);
+      if (!res.ok) throw new Error();
+      const txt = await res.text();
+      const disallow: string[] = [];
+      let inGlobal = false;
+      for (const line of txt.split("\n")) {
+        const trimmed = line.trim();
+        if (/^user-agent:\s*\*/i.test(trimmed)) inGlobal = true;
+        else if (/^user-agent:/i.test(trimmed)) inGlobal = false;
+        else if (inGlobal && /^disallow:/i.test(trimmed)) {
+          // keep everything after the first ":" so paths containing ":" survive
+          const value = trimmed.slice(trimmed.indexOf(":") + 1).trim();
+          if (value) disallow.push(value);
+        }
+      }
+      robotsCache[origin] = disallow;
+    } catch {
+      robotsCache[origin] = []; // assume allowed if robots.txt cannot be fetched
+    }
+  }
+  const rules = robotsCache[origin] ?? [];
+  return rules.every((p) => !target.pathname.startsWith(p));
+}
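+// e.g. an entry under https://example.com/private/… is skipped when that
+// origin's robots.txt lists "Disallow: /private".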
+// ---------------------------------------------------------------------------
 
 // ---------- Helpers --------------------------------------------------------
 
@@ -86,7 +176,7 @@ function decodeEntities(str: string): string {
 async function enrichContent(url: string): Promise<Partial<Content>> {
   const partial: Partial<Content> = {};
 
-  const response = await fetch(url, { redirect: "follow" });
+  const response = await fetchWithRetry(url);
 
   const rewriter = new HTMLRewriter()
     .on("meta[name='twitter:data2']", {
@@ -189,7 +279,7 @@ function parseFeedEntries(xml: string): Array<{
 // ---------- Main script ----------------------------------------------------
 
 const ROOT = path.join(import.meta.dir, "..", "providers");
-await fs.rm(ROOT, { recursive: true, force: true });
+// No global deletion; we remove each provider's directory only when its feed changed.
 
 const rssConfigPath = path.join(import.meta.dir, "..", "rss", "rss.toml");
 const configModule = await import(rssConfigPath, { with: { type: "toml" } });
@@ -204,64 +294,108 @@ for (const [providerId, cfg] of Object.entries(configs)) {
     rss: cfg.rss ?? cfg.url ?? "",
   });
 
-  // Fetch & parse feed --------------------------------------------------------
+  // Fetch & parse feed (ETag/Last-Modified cache) -----------------------------
   console.log(`Fetching feed for ${providerId}…`);
-  const feedResponse = await fetch(providerMeta.rss, { redirect: "follow" });
+
+  const cacheEntry = feedCache[providerMeta.rss] ?? {};
+
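+  // Conditional GET: send the stored validators so an unchanged feed costs
+  // only a 304 round-trip.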
+  const feedResponse = await fetchWithRetry(providerMeta.rss, {
+    headers: {
+      ...(cacheEntry.etag ? { "If-None-Match": cacheEntry.etag } : {}),
+      ...(cacheEntry.lastModified
+        ? { "If-Modified-Since": cacheEntry.lastModified }
+        : {}),
+    },
+  });
+
+  if (feedResponse.status === 304) {
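+    // Files generated for this provider on a previous run stay in place.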
+    console.log("ℹ︎ Feed unchanged (304) – skipping.");
+    continue;
+  }
+
+  if (!feedResponse.ok)
+    throw new Error(
+      `Failed to fetch feed (${feedResponse.status}) → ${providerMeta.rss}`
+    );
+
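+  // Remember the fresh validators for the next run.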
+  feedCache[providerMeta.rss] = {
+    etag: feedResponse.headers.get("etag") ?? undefined,
+    lastModified: feedResponse.headers.get("last-modified") ?? undefined,
+  };
+
   const feedXml = await feedResponse.text();
   const entries = parseFeedEntries(feedXml).slice(0, 30); // limit to 30 most recent
 
-  // Generate content files in parallel ----------------------------------------
+  // Feed changed: wipe this provider's directory (provider.toml and content)
+  // so stale files from earlier runs don't linger, then regenerate everything.
+  const providerDir = path.join(ROOT, providerId);
+  await fs.rm(providerDir, { recursive: true, force: true });
+
+  // Generate content files (concurrency-guarded) ------------------------------
   const usedSlugs = new Set<string>();
 
   await Promise.all(
-    entries.map(async (entry) => {
-      const rawSlug = slugify(entry.title) || slugify(entry.link);
-      const contentId = uniqueSlug(rawSlug, usedSlugs);
-      usedSlugs.add(contentId);
-
-      const baseContent: Content = Content.parse({
-        id: contentId.toLowerCase(),
-        title: decodeEntities(entry.title),
-        description: decodeEntities(entry.summary ?? ""),
-        url: entry.link,
-        created_at: entry.published.endsWith("Z")
-          ? entry.published
-          : new Date(entry.published).toISOString().replace(/\.\d+Z$/, "Z"), // ISO-8601 string
-        ...(entry.tags?.length ? { tags: entry.tags.map(decodeEntities) } : {}),
-      });
-
-      const enriched = await enrichContent(entry.link);
-
-      if (enriched.title) enriched.title = decodeEntities(enriched.title);
-      if (enriched.description)
-        enriched.description = decodeEntities(enriched.description);
-
-      if (enriched.tags || baseContent.tags) {
-        const merged = Array.from(
-          new Set([...(baseContent.tags ?? []), ...(enriched.tags ?? [])])
+    entries.map((entry) =>
+      acquire(async () => {
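+        // Honour robots.txt before touching the article page.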
+        const linkURL = new URL(entry.link);
+        if (!(await isAllowed(linkURL))) {
+          console.warn(`⚠︎ robots.txt disallow – skipping ${entry.link}`);
+          return;
+        }
+
+        const rawSlug = slugify(entry.title) || slugify(entry.link);
+        const contentId = uniqueSlug(rawSlug, usedSlugs);
+        usedSlugs.add(contentId);
+
+        const baseContent: Content = Content.parse({
+          id: contentId.toLowerCase(),
+          title: decodeEntities(entry.title),
+          description: decodeEntities(entry.summary ?? ""),
+          url: entry.link,
+          created_at: entry.published.endsWith("Z")
+            ? entry.published
+            : new Date(entry.published).toISOString().replace(/\.\d+Z$/, "Z"),
+          ...(entry.tags?.length
+            ? { tags: entry.tags.map(decodeEntities) }
+            : {}),
+        });
+
+        const enriched = await enrichContent(entry.link);
+
+        if (enriched.title) enriched.title = decodeEntities(enriched.title);
+        if (enriched.description)
+          enriched.description = decodeEntities(enriched.description);
+
+        if (enriched.tags || baseContent.tags) {
+          const merged = Array.from(
+            new Set([...(baseContent.tags ?? []), ...(enriched.tags ?? [])])
+          );
+          if (merged.length) baseContent.tags = merged.map(decodeEntities);
+          delete (enriched as any).tags;
+        }
+
+        const content: Content = Content.parse({ ...baseContent, ...enriched });
+
+        const contentDir = path.join(providerDir, "content");
+        await fs.mkdir(contentDir, { recursive: true });
+        await fs.writeFile(
+          path.join(contentDir, `${contentId}.toml`),
+          toTOML(content)
         );
-        if (merged.length) baseContent.tags = merged.map(decodeEntities);
-        delete (enriched as any).tags;
-      }
-
-      const content: Content = Content.parse({ ...baseContent, ...enriched });
-
-      const contentDir = path.join(ROOT, providerId, "content");
-      await fs.mkdir(contentDir, { recursive: true });
-      await fs.writeFile(
-        path.join(contentDir, `${contentId}.toml`),
-        toTOML(content)
-      );
-    })
+      })
+    )
   );
 
   // Write provider metadata --------------------------------------------------
-  const providerDir = path.join(ROOT, providerId);
   await fs.mkdir(providerDir, { recursive: true });
   await fs.writeFile(
     path.join(providerDir, "provider.toml"),
     toTOML(providerMeta)
   );
 }
 
-console.log("✅ providers folder refreshed");
+// Persist the updated cache once all providers are processed.
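+// (If a provider throws above, the run aborts before this point and the
+// previous cache file is left untouched.)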
+await saveCache();
+
+console.log("✅ providers folder refreshed (polite mode enabled)");