Skip to content

Commit ad00272

Browse files
authored
feat(articles-scraper): ajout historique bref du script #956 (#990)
1 parent 1988ae1 commit ad00272

File tree

1 file changed

+167
-15
lines changed

1 file changed

+167
-15
lines changed

bin/articles-scraper.mjs

Lines changed: 167 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
#!/usr/bin/env node
22

3-
import { exec } from "child_process";
3+
import { execFile } from "child_process";
44
import { createWriteStream } from "fs";
5-
import { mkdir, rm, writeFile } from "fs/promises";
5+
import { mkdir, readFile, rm, writeFile } from "fs/promises";
66
import { HttpsProxyAgent } from "https-proxy-agent";
77
import { JSDOM } from "jsdom";
88
import fetch from "node-fetch";
9+
import { styleText, format as utilFormat } from "node:util";
910
import { dirname, join, normalize, resolve } from "path";
1011
import { format, resolveConfig } from "prettier";
1112
import { pipeline } from "stream/promises";
1213
import { fileURLToPath } from "url";
1314
import { promisify } from "util";
14-
import { styleText, format as utilFormat } from "node:util";
1515

16-
const execAsync = promisify(exec);
16+
const execFileAsync = promisify(execFile);
1717

1818
const ARTICLES_CMS_BASE_URL = process.env.ARTICLES_CMS_BASE_URL;
1919
const ARTICLES_CMS_USERNAME = process.env.ARTICLES_CMS_USERNAME;
@@ -32,6 +32,9 @@ const __filename = fileURLToPath(import.meta.url);
3232
const __dirname = dirname(__filename);
3333

3434
const OUTPUT_DIR = resolve(join(__dirname, "..", "var", "data", "articles"));
35+
const RUN_HISTORY_FILE = resolve(join(__dirname, "..", "var", "data", "articles-scraper-cronjob-history.json"));
36+
const RUN_HISTORY_REMOTE_FILE = `${RCLONE_S3_REMOTE}:${S3_BUCKET_NAME}/articles/articles-scraper-cronjob-history.json`;
37+
const MAX_HISTORY_ENTRIES = 10;
3538

3639
const HTTP_PROXY = process.env.HTTP_PROXY;
3740

@@ -45,8 +48,17 @@ const logger = {
4548
success: (...args) => console.log(styleText("bgGreen", formatArgs(args))),
4649
};
4750

48-
let nbDownloadedFiles = 0;
49-
let nbDownloadedFilesFailed = 0;
51+
// Run-wide counters. They are incremented while scraping and the whole object
// is stored as-is in the run history entry at the end of the script.
const stats = {
  // detected: number of article slugs found; downloaded/failed: per-article outcome
  articles: { detected: 0, downloaded: 0, failed: 0 },
  // per-file outcome, updated by downloadFile
  files: { downloaded: 0, failed: 0 },
};
5062

5163
const CONCURRENCY_LIMIT = 1;
5264
const CONCURRENCY_DELAY = 200;
@@ -244,11 +256,11 @@ const downloadFile = async (originalFilePath) => {
244256
await pipeline(response.body, createWriteStream(newFilePath));
245257

246258
logger.log(`File saved to ${newFilePath}`);
247-
nbDownloadedFiles++;
259+
stats.files.downloaded++;
248260
return newFilePath;
249261
} catch (error) {
250262
logger.error(`Failed to download ${url.href}: ${error.message}`);
251-
nbDownloadedFilesFailed++;
263+
stats.files.failed++;
252264
throw error;
253265
}
254266
};
@@ -278,9 +290,111 @@ const prettify = async (string) => {
278290
});
279291
};
280292

293+
/**
 * Loads the local run history file (RUN_HISTORY_FILE).
 *
 * Never throws: a missing, unreadable, unparsable or malformed file all yield
 * a fresh empty history so the current run can still be recorded.
 *
 * @returns {Promise<{ runs: unknown[] }>} the parsed history when it exposes a
 *   `runs` array, otherwise `{ runs: [] }`.
 */
const readHistory = async () => {
  try {
    const historyContent = await readFile(RUN_HISTORY_FILE, "utf8");
    const parsedHistory = JSON.parse(historyContent);

    if (Array.isArray(parsedHistory?.runs)) {
      return parsedHistory;
    }

    // Valid JSON but not the expected shape: say so instead of dropping it silently.
    logger.warn(`Unexpected content in ${RUN_HISTORY_FILE} (no "runs" array), recreating history file.`);
  } catch (error) {
    // A missing file is the normal first-run case and is not worth a warning.
    if (error?.code !== "ENOENT") {
      const reason = error?.message ?? "unknown error";
      logger.warn(`Unable to read ${RUN_HISTORY_FILE}, recreating history file: ${reason}`);
    }
  }

  return { runs: [] };
};
311+
312+
/**
 * Records a run at the top of the local history file, keeping only the
 * MAX_HISTORY_ENTRIES most recent entries.
 *
 * @param {object} entry - the run description to store (serialised as JSON).
 * @returns {Promise<void>}
 */
const appendRunHistory = async (entry) => {
  const history = await readHistory();

  // Newest first, then cap the list length.
  history.runs = [entry, ...history.runs].slice(0, MAX_HISTORY_ENTRIES);

  // Presumably creates the parent directory of the given file path; see ensureDirectoryExists.
  await ensureDirectoryExists(RUN_HISTORY_FILE);

  const serialisedHistory = JSON.stringify(history, null, 2);
  await writeFile(RUN_HISTORY_FILE, serialisedHistory, { flag: "w" });
};
320+
321+
/**
 * Tells whether a failed `rclone copyto` looks like "the remote history file
 * does not exist yet" rather than a genuine failure (credentials, network...).
 *
 * @param {{ code?: unknown, stderr?: unknown } | null | undefined} error -
 *   the rejection value of the promisified `execFile` call.
 * @returns {boolean} true when the failure is a "not found" condition.
 */
const isHistoryMissingOnRemote = (error) => {
  // rclone documents exit code 3 as "directory not found" and 4 as "file not found".
  if (error?.code === 3 || error?.code === 4) {
    return true;
  }

  const stderr = String(error?.stderr ?? "");

  // "failed to copy" is intentionally NOT matched: it also matches rclone's
  // generic "Failed to copyto: ..." line, which would classify every failure
  // (including authentication or network errors) as a missing file.
  return /not found|no such file|object does not exist/i.test(stderr);
};
325+
326+
/**
 * Best-effort download of the run history file from the remote
 * (RUN_HISTORY_REMOTE_FILE) to RUN_HISTORY_FILE using `rclone copyto`.
 *
 * Never throws: a failure is logged and reported through the returned exit
 * code so the scraping run can continue with whatever history exists locally.
 *
 * @returns {Promise<{ command: string, exitCode: number }>} the command line
 *   (for reporting only) and rclone's exit code (0 on success, 1 when the
 *   error carries no integer exit code).
 */
const pullHistoryFromS3 = async () => {
  const args = ["copyto", RUN_HISTORY_REMOTE_FILE, RUN_HISTORY_FILE];
  // Display form only; rclone itself is invoked with the argument array.
  const command = `rclone ${args.join(" ")}`;

  try {
    await execFileAsync("rclone", args);
    logger.info(`Pulled history file from ${RUN_HISTORY_REMOTE_FILE}`);

    return { command, exitCode: 0 };
  } catch (error) {
    if (isHistoryMissingOnRemote(error)) {
      // Expected on the very first run: nothing has been pushed yet.
      logger.info(`No history file found on ${RUN_HISTORY_REMOTE_FILE}, starting with local empty history.`);
    } else {
      // Surface rclone's own diagnostics instead of discarding them.
      const reason = String(error?.stderr ?? "").trim() || error?.message || "unknown error";
      logger.warn(`Failed to pull history from ${RUN_HISTORY_REMOTE_FILE}, continuing with local fallback: ${reason}`);
    }

    // error.code is a string (e.g. "ENOENT") when rclone could not be started at all.
    return {
      command,
      exitCode: Number.isInteger(error?.code) ? error.code : 1,
    };
  }
};
354+
355+
/**
 * Uploads the local run history file (RUN_HISTORY_FILE) to the remote
 * (RUN_HISTORY_REMOTE_FILE) using `rclone copyto`.
 *
 * Never throws: a failure is logged and reported through the returned exit code.
 *
 * @returns {Promise<{ command: string, exitCode: number }>} the command line
 *   (for reporting only) and rclone's exit code (0 on success, 1 when the
 *   error carries no integer exit code).
 */
const pushHistoryToS3 = async () => {
  const args = ["copyto", RUN_HISTORY_FILE, RUN_HISTORY_REMOTE_FILE];
  // Display form only; rclone itself is invoked with the argument array.
  const command = `rclone ${args.join(" ")}`;

  try {
    await execFileAsync("rclone", args);
    logger.info(`Pushed history file to ${RUN_HISTORY_REMOTE_FILE}`);

    return { command, exitCode: 0 };
  } catch (error) {
    // Surface rclone's own diagnostics instead of discarding them.
    const reason = String(error?.stderr ?? "").trim() || error?.message || "unknown error";
    logger.error(`Failed to push history file to ${RUN_HISTORY_REMOTE_FILE}: ${reason}`);

    // error.code is a string (e.g. "ENOENT") when rclone could not be started at all.
    return {
      command,
      exitCode: Number.isInteger(error?.code) ? error.code : 1,
    };
  }
};
376+
281377
/**
 * Mirrors OUTPUT_DIR to the `articles` prefix of the S3 bucket using `rclone sync`.
 *
 * Never throws: a failure is logged and reported through the returned exit
 * code, which the caller uses to flag the run as failed.
 *
 * @returns {Promise<{ command: string, exitCode: number }>} the command line
 *   (for reporting only) and rclone's exit code (0 on success, 1 when the
 *   error carries no integer exit code).
 */
const syncS3 = async () => {
  const destination = `${RCLONE_S3_REMOTE}:${S3_BUCKET_NAME}/articles`;
  const args = ["sync", OUTPUT_DIR, destination];
  // Display form only; rclone itself is invoked with the argument array.
  const command = `rclone ${args.join(" ")}`;

  try {
    await execFileAsync("rclone", args);
    logger.info(`Synchronised ${OUTPUT_DIR} to ${destination}`);

    return { command, exitCode: 0 };
  } catch (error) {
    // Surface rclone's own diagnostics instead of discarding them.
    const reason = String(error?.stderr ?? "").trim() || error?.message || "unknown error";
    logger.error(`Sync failed for command: ${command}: ${reason}`);

    // error.code is a string (e.g. "ENOENT") when rclone could not be started at all.
    return {
      command,
      exitCode: Number.isInteger(error?.code) ? error.code : 1,
    };
  }
};
285399

286400
const cleanOutputDir = async () => {
@@ -413,6 +527,11 @@ const processSingleArticle = async (slug) => {
413527
};
414528

415529
(async () => {
530+
const executedAt = new Date();
531+
let didScrapeFail = false;
532+
533+
await pullHistoryFromS3();
534+
416535
try {
417536
await cleanOutputDir();
418537

@@ -423,6 +542,7 @@ const processSingleArticle = async (slug) => {
423542
const { firstPage, lastPage } = await getPageNumbers(ARTICLES_CMS_BASE_URL);
424543
const pagesRange = getArrayRange(firstPage, lastPage); // [0,1,2,3,...,n]
425544
const articleSlugsList = (await withConcurrency(pagesRange, (page) => processArticlesIndex(ARTICLES_CMS_BASE_URL, page))).flat();
545+
stats.articles.detected = articleSlugsList.length;
426546

427547
// la liste paginée des articles par tag (index pour chaque tag)
428548
const response = await fetch(ARTICLES_CMS_BASE_URL, getFetchOptions());
@@ -439,21 +559,53 @@ const processSingleArticle = async (slug) => {
439559
});
440560

441561
// les articles individuels
442-
await withConcurrency(articleSlugsList, (slug) => processSingleArticle(slug));
562+
await withConcurrency(articleSlugsList, async (slug) => {
563+
try {
564+
await processSingleArticle(slug);
565+
stats.articles.downloaded++;
566+
} catch (error) {
567+
stats.articles.failed++;
568+
logger.warn(`Failed to process article ${slug}: ${error?.message ?? "unknown error"}`);
569+
}
570+
});
571+
572+
if (stats.articles.failed > 0) {
573+
logger.warn(`Failed to download ${stats.articles.failed} article(s).`);
574+
}
443575

444-
logger.info(`Downloaded ${nbDownloadedFiles} file(s) successfully.`);
445-
if (nbDownloadedFilesFailed > 0) {
446-
logger.warn(`Failed to download ${nbDownloadedFilesFailed} file(s).`);
576+
logger.info(`Downloaded ${stats.files.downloaded} file(s) successfully.`);
577+
if (stats.files.failed > 0) {
578+
logger.warn(`Failed to download ${stats.files.failed} file(s).`);
447579
} else {
448580
logger.success("All files downloaded successfully.");
449581
}
450582
} catch (error) {
583+
didScrapeFail = true;
451584
logger.error("Script failed:", error);
452585
}
453586

454-
await syncS3();
587+
const syncResult = await syncS3();
588+
if (syncResult.exitCode !== 0) {
589+
didScrapeFail = true;
590+
}
591+
592+
await appendRunHistory({
593+
executedAt: executedAt.toISOString(),
594+
durationMs: new Date().getTime() - executedAt.getTime(),
595+
stats,
596+
});
597+
598+
const historyPushResult = await pushHistoryToS3();
599+
if (historyPushResult.exitCode !== 0) {
600+
logger.warn("History file sync to S3 failed after local update.");
601+
didScrapeFail = true;
602+
}
455603

456604
if (process.env.APP_ENV === "prod") {
457605
await cleanOutputDir();
458606
}
607+
608+
if (didScrapeFail) {
609+
process.exitCode = 1;
610+
}
459611
})();

0 commit comments

Comments
 (0)