11#!/usr/bin/env node
22
3- import { exec } from "child_process" ;
3+ import { execFile } from "child_process" ;
44import { createWriteStream } from "fs" ;
5- import { mkdir , rm , writeFile } from "fs/promises" ;
5+ import { mkdir , readFile , rm , writeFile } from "fs/promises" ;
66import { HttpsProxyAgent } from "https-proxy-agent" ;
77import { JSDOM } from "jsdom" ;
88import fetch from "node-fetch" ;
9+ import { styleText , format as utilFormat } from "node:util" ;
910import { dirname , join , normalize , resolve } from "path" ;
1011import { format , resolveConfig } from "prettier" ;
1112import { pipeline } from "stream/promises" ;
1213import { fileURLToPath } from "url" ;
1314import { promisify } from "util" ;
14- import { styleText , format as utilFormat } from "node:util" ;
1515
16- const execAsync = promisify ( exec ) ;
16+ const execFileAsync = promisify ( execFile ) ;
1717
1818const ARTICLES_CMS_BASE_URL = process . env . ARTICLES_CMS_BASE_URL ;
1919const ARTICLES_CMS_USERNAME = process . env . ARTICLES_CMS_USERNAME ;
@@ -32,6 +32,9 @@ const __filename = fileURLToPath(import.meta.url);
3232const __dirname = dirname ( __filename ) ;
3333
// Local scrape output directory (mirrored to S3 by syncS3).
const OUTPUT_DIR = resolve(join(__dirname, "..", "var", "data", "articles"));
// Local and remote (rclone "remote:bucket/path") locations of the cronjob
// run-history JSON; only the most recent runs are kept (see MAX_HISTORY_ENTRIES).
const RUN_HISTORY_FILE = resolve(join(__dirname, "..", "var", "data", "articles-scraper-cronjob-history.json"));
const RUN_HISTORY_REMOTE_FILE = `${RCLONE_S3_REMOTE}:${S3_BUCKET_NAME}/articles/articles-scraper-cronjob-history.json`;
// Cap on run entries persisted in the history file.
const MAX_HISTORY_ENTRIES = 10;
3538
3639const HTTP_PROXY = process . env . HTTP_PROXY ;
3740
@@ -45,8 +48,17 @@ const logger = {
4548 success : ( ...args ) => console . log ( styleText ( "bgGreen" , formatArgs ( args ) ) ) ,
4649} ;
4750
48- let nbDownloadedFiles = 0 ;
49- let nbDownloadedFilesFailed = 0 ;
// Run-wide counters, accumulated during scraping and persisted in the run history.
const stats = {
  articles: {
    detected: 0, // article slugs discovered in the paginated index
    downloaded: 0, // articles processed successfully
    failed: 0, // articles whose processing threw
  },
  files: {
    downloaded: 0, // asset files saved to disk
    failed: 0, // asset downloads that errored
  },
};
5062
// One task at a time with a delay between them — presumably to avoid
// hammering the CMS; confirm against withConcurrency's implementation.
const CONCURRENCY_LIMIT = 1;
const CONCURRENCY_DELAY = 200; // assumed milliseconds — TODO confirm
@@ -244,11 +256,11 @@ const downloadFile = async (originalFilePath) => {
244256 await pipeline ( response . body , createWriteStream ( newFilePath ) ) ;
245257
246258 logger . log ( `File saved to ${ newFilePath } ` ) ;
247- nbDownloadedFiles ++ ;
259+ stats . files . downloaded ++ ;
248260 return newFilePath ;
249261 } catch ( error ) {
250262 logger . error ( `Failed to download ${ url . href } : ${ error . message } ` ) ;
251- nbDownloadedFilesFailed ++ ;
263+ stats . files . failed ++ ;
252264 throw error ;
253265 }
254266} ;
@@ -278,9 +290,111 @@ const prettify = async (string) => {
278290 } ) ;
279291} ;
280292
/**
 * Loads the persisted run history from RUN_HISTORY_FILE.
 *
 * Falls back to an empty history when the file is absent (normal first-run
 * case, silent), unreadable, corrupt JSON, or lacks a `runs` array.
 *
 * @returns {Promise<{ runs: Array<object> }>} parsed history, never rejects.
 */
const readHistory = async () => {
  const emptyHistory = { runs: [] };

  let rawContent;
  try {
    rawContent = await readFile(RUN_HISTORY_FILE, "utf8");
  } catch (error) {
    // A missing file is expected on the very first run; anything else is noteworthy.
    if (error?.code !== "ENOENT") {
      logger.warn(`Unable to read ${RUN_HISTORY_FILE}, recreating history file.`);
    }
    return emptyHistory;
  }

  try {
    const parsed = JSON.parse(rawContent);
    if (Array.isArray(parsed?.runs)) {
      return parsed;
    }
  } catch {
    // Corrupt JSON: warn (as for any non-ENOENT failure) and start over.
    logger.warn(`Unable to read ${RUN_HISTORY_FILE}, recreating history file.`);
  }

  return emptyHistory;
};
311+
/**
 * Prepends a run entry to the local history file (newest first), trims the
 * list to MAX_HISTORY_ENTRIES, and writes the result back to disk.
 *
 * @param {object} entry - Summary of the current run (timestamps, stats, ...).
 * @returns {Promise<void>}
 */
const appendRunHistory = async (entry) => {
  const history = await readHistory();

  // Newest run first; drop anything beyond the retention cap.
  history.runs = [entry, ...history.runs].slice(0, MAX_HISTORY_ENTRIES);

  await ensureDirectoryExists(RUN_HISTORY_FILE);
  await writeFile(RUN_HISTORY_FILE, JSON.stringify(history, null, 2), { flag: "w" });
};
320+
/**
 * Heuristically detects whether an rclone failure means the history file
 * simply does not exist on the remote yet (first run), as opposed to a
 * genuine transfer error.
 *
 * Fix: the alternation regex was garbled (one space between every character,
 * which made it match almost nothing as intended); reconstructed the phrases
 * from rclone's diagnostic messages.
 *
 * @param {unknown} error - Error thrown by execFileAsync("rclone", ...);
 *   its `stderr` property, when present, holds rclone's diagnostic output.
 * @returns {boolean} true when stderr matches a known "missing object" message.
 */
const isHistoryMissingOnRemote = (error) => {
  const stderr = String(error?.stderr ?? "");
  // NOTE(review): "failed to copy" also appears in some non-missing failures
  // (matching the original's permissive intent) — confirm against rclone output.
  return /not found|no such file|object does not exist|failed to copy/i.test(stderr);
};
325+
/**
 * Copies the shared run-history file from the S3 remote to the local path.
 *
 * Never throws: a missing remote file (first run) and a real transfer
 * failure are both reported through the returned exit code, and the caller
 * carries on with whatever history exists locally.
 *
 * @returns {Promise<{ command: string, exitCode: number }>} the command that
 *   was run and its exit code (0 on success).
 */
const pullHistoryFromS3 = async () => {
  const args = ["copyto", RUN_HISTORY_REMOTE_FILE, RUN_HISTORY_FILE];
  const command = `rclone ${args.join(" ")}`;

  try {
    await execFileAsync("rclone", args);
  } catch (error) {
    // Pick the log level by failure kind; the returned shape is the same either way.
    if (isHistoryMissingOnRemote(error)) {
      logger.info(`No history file found on ${RUN_HISTORY_REMOTE_FILE}, starting with local empty history.`);
    } else {
      logger.warn(`Failed to pull history from ${RUN_HISTORY_REMOTE_FILE}, continuing with local fallback.`);
    }
    return {
      command,
      exitCode: Number.isInteger(error?.code) ? error.code : 1,
    };
  }

  logger.info(`Pulled history file from ${RUN_HISTORY_REMOTE_FILE}`);
  return { command, exitCode: 0 };
};
354+
/**
 * Uploads the local run-history file to the S3 remote.
 *
 * Never throws: failures are logged and surfaced through the returned
 * exit code so the caller decides how fatal they are.
 *
 * @returns {Promise<{ command: string, exitCode: number }>}
 */
const pushHistoryToS3 = async () => {
  const args = ["copyto", RUN_HISTORY_FILE, RUN_HISTORY_REMOTE_FILE];
  const command = `rclone ${args.join(" ")}`;

  let exitCode = 0;
  try {
    await execFileAsync("rclone", args);
    logger.info(`Pushed history file to ${RUN_HISTORY_REMOTE_FILE}`);
  } catch (error) {
    logger.error(`Failed to push history file to ${RUN_HISTORY_REMOTE_FILE}`);
    exitCode = Number.isInteger(error?.code) ? error.code : 1;
  }

  return { command, exitCode };
};
376+
/**
 * Mirrors the local OUTPUT_DIR to the S3 articles prefix via `rclone sync`.
 *
 * Never throws: failures are logged and surfaced through the returned
 * exit code.
 *
 * @returns {Promise<{ command: string, exitCode: number }>}
 */
const syncS3 = async () => {
  const destination = `${RCLONE_S3_REMOTE}:${S3_BUCKET_NAME}/articles`;
  const args = ["sync", OUTPUT_DIR, destination];
  const command = `rclone ${args.join(" ")}`;

  let exitCode = 0;
  try {
    await execFileAsync("rclone", args);
    logger.info(`Synchronised ${OUTPUT_DIR} to ${destination}`);
  } catch (error) {
    logger.error(`Sync failed for command: ${command}`);
    exitCode = Number.isInteger(error?.code) ? error.code : 1;
  }

  return { command, exitCode };
};
285399
286400const cleanOutputDir = async ( ) => {
@@ -413,6 +527,11 @@ const processSingleArticle = async (slug) => {
413527} ;
414528
415529( async ( ) => {
530+ const executedAt = new Date ( ) ;
531+ let didScrapeFail = false ;
532+
533+ await pullHistoryFromS3 ( ) ;
534+
416535 try {
417536 await cleanOutputDir ( ) ;
418537
@@ -423,6 +542,7 @@ const processSingleArticle = async (slug) => {
423542 const { firstPage, lastPage } = await getPageNumbers ( ARTICLES_CMS_BASE_URL ) ;
424543 const pagesRange = getArrayRange ( firstPage , lastPage ) ; // [0,1,2,3,...,n]
425544 const articleSlugsList = ( await withConcurrency ( pagesRange , ( page ) => processArticlesIndex ( ARTICLES_CMS_BASE_URL , page ) ) ) . flat ( ) ;
545+ stats . articles . detected = articleSlugsList . length ;
426546
427547 // la liste paginée des articles par tag (index pour chaque tag)
428548 const response = await fetch ( ARTICLES_CMS_BASE_URL , getFetchOptions ( ) ) ;
@@ -439,21 +559,53 @@ const processSingleArticle = async (slug) => {
439559 } ) ;
440560
441561 // les articles individuels
442- await withConcurrency ( articleSlugsList , ( slug ) => processSingleArticle ( slug ) ) ;
562+ await withConcurrency ( articleSlugsList , async ( slug ) => {
563+ try {
564+ await processSingleArticle ( slug ) ;
565+ stats . articles . downloaded ++ ;
566+ } catch ( error ) {
567+ stats . articles . failed ++ ;
568+ logger . warn ( `Failed to process article ${ slug } : ${ error ?. message ?? "unknown error" } ` ) ;
569+ }
570+ } ) ;
571+
572+ if ( stats . articles . failed > 0 ) {
573+ logger . warn ( `Failed to download ${ stats . articles . failed } article(s).` ) ;
574+ }
443575
444- logger . info ( `Downloaded ${ nbDownloadedFiles } file(s) successfully.` ) ;
445- if ( nbDownloadedFilesFailed > 0 ) {
446- logger . warn ( `Failed to download ${ nbDownloadedFilesFailed } file(s).` ) ;
576+ logger . info ( `Downloaded ${ stats . files . downloaded } file(s) successfully.` ) ;
577+ if ( stats . files . failed > 0 ) {
578+ logger . warn ( `Failed to download ${ stats . files . failed } file(s).` ) ;
447579 } else {
448580 logger . success ( "All files downloaded successfully." ) ;
449581 }
450582 } catch ( error ) {
583+ didScrapeFail = true ;
451584 logger . error ( "Script failed:" , error ) ;
452585 }
453586
454- await syncS3 ( ) ;
587+ const syncResult = await syncS3 ( ) ;
588+ if ( syncResult . exitCode !== 0 ) {
589+ didScrapeFail = true ;
590+ }
591+
592+ await appendRunHistory ( {
593+ executedAt : executedAt . toISOString ( ) ,
594+ durationMs : new Date ( ) . getTime ( ) - executedAt . getTime ( ) ,
595+ stats,
596+ } ) ;
597+
598+ const historyPushResult = await pushHistoryToS3 ( ) ;
599+ if ( historyPushResult . exitCode !== 0 ) {
600+ logger . warn ( "History file sync to S3 failed after local update." ) ;
601+ didScrapeFail = true ;
602+ }
455603
456604 if ( process . env . APP_ENV === "prod" ) {
457605 await cleanOutputDir ( ) ;
458606 }
607+
608+ if ( didScrapeFail ) {
609+ process . exitCode = 1 ;
610+ }
459611} ) ( ) ;
0 commit comments