-
Notifications
You must be signed in to change notification settings - Fork 7
/
index.js
87 lines (68 loc) · 3 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer');
const TurndownService = require('turndown');
const args = require('yargs').argv;
// Shared HTML -> Markdown converter used for every scraped post.
const turndownService = new TurndownService();

// Command-line options (from the parsed yargs argv) with their defaults.
// A missing or empty option falls back to the default value.
const pageURL = args.url || 'https://justmarkup.com';
// Note: the option is spelled --postSelector (singular) on the command line.
const postsSelector = args.postSelector || '.main .article h2 a';
const titleSelector = args.titleSelector || '.article h1';
const contentSelector = args.contentSelector || '.article .entry-content';

// Output directory, always located below this script's own directory.
// path.join supplies the separator that plain string concatenation dropped for
// values such as `--dir out` (which used to become "<dirname>out"), and the
// trailing path.sep guarantees the result ends with a separator so it can be
// used directly as a prefix: postsDirectory + fileName.
// String() guards against yargs handing over a number (e.g. `--dir 2020`).
const postsDirectory = path.join(__dirname, String(args.dir || '/posts/'), path.sep);
/**
 * Scrapes the post currently loaded in `page` and writes it to
 * `postsDirectory` as a Markdown file named after the page's pathname.
 *
 * @param {object} page - Puppeteer page that has already navigated to the post.
 * @returns {Promise<void>} Resolves once the Markdown file has been written.
 * @throws If a selector wait or lookup fails, or the file cannot be written.
 */
async function savePost(page) {
    // Build the file name from the pathname, replacing every "/" with "-".
    // A post at /log/my-post/ therefore becomes "-log-my-post-.md"; trim the
    // prefix/suffix here if shorter names are wanted.
    const rawPathname = await page.evaluate(() => location.pathname);
    const fileName = rawPathname.replace(/\//g, '-');

    // Title: no timeout option is passed, so Puppeteer's default wait applies.
    await page.waitForSelector(titleSelector);
    const titleHtml = await page.$eval(titleSelector, (element) => element.outerHTML);

    // Content: timeout 0 disables the wait limit for the post body.
    await page.waitForSelector(contentSelector, { timeout: 0 });
    const contentHtml = await page.$eval(contentSelector, (element) => element.innerHTML);

    // Convert title + body to Markdown in one pass.
    const markdown = turndownService.turndown(titleHtml + contentHtml);

    // Synchronous write: the file is completely written before the script
    // moves on, so the process.exit() at the end cannot cut it short.
    fs.writeFileSync(`${postsDirectory}${fileName}.md`, markdown);
}

/**
 * Opens the index page, collects the links matched by `postsSelector` and
 * saves every linked post as Markdown. Failures on a single post are logged
 * and skipped; a failure to open the index page aborts the run.
 *
 * @param {object} browser - Running Puppeteer browser instance.
 * @returns {Promise<void>} Resolves when all posts have been processed.
 */
async function scrapeAllPosts(browser) {
    // open the main page url
    const page = await browser.newPage();
    try {
        await page.goto(pageURL);
        console.log('opened the page: ', pageURL);
    } catch (error) {
        console.log(error);
        console.log('failed to open the page: ', pageURL);
        // Without the index page the selector wait below (timeout: 0) would
        // never resolve, so stop here and report failure instead of hanging.
        process.exitCode = 1;
        return;
    }

    // Find all links to articles
    await page.waitForSelector(postsSelector, { timeout: 0 });
    const postUrls = await page.$$eval(postsSelector, (links) => links.map((link) => link.href));

    // Make sure the output folder exists once, before any file is written.
    // `recursive` also creates missing parent folders for nested --dir values.
    if (!fs.existsSync(postsDirectory)) {
        fs.mkdirSync(postsDirectory, { recursive: true });
    }

    // Visit each page one by one
    for (const postUrl of postUrls) {
        try {
            await page.goto(postUrl);
            console.log('opened the page: ', postUrl);
        } catch (error) {
            console.log(error);
            console.log('failed to open the page: ', postUrl);
            // Skip this post: continuing would scrape whatever document the
            // tab currently holds rather than the requested one.
            continue;
        }

        try {
            await savePost(page);
            // success case, the file was saved
            console.log('Page saved!');
        } catch (error) {
            // Best effort: report the problem and carry on with the next post.
            console.log(error);
            console.log('failed to save the page: ', postUrl);
        }
    }
}

// Entry point: start the browser, scrape everything, and always shut the
// browser down again - even when something unexpected is thrown.
(async () => {
    let browser;
    try {
        // start the browser
        browser = await puppeteer.launch({ args: ['--no-sandbox'] });
        await scrapeAllPosts(browser);
    } catch (error) {
        console.log(error);
        process.exitCode = 1;
    } finally {
        // all done, close the browser
        if (browser) {
            await browser.close();
        }
    }
    // Exit explicitly so lingering handles cannot keep the process alive.
    // Called without an argument, this uses process.exitCode when it was set.
    process.exit();
})();