-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
54 lines (44 loc) · 1.56 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
'use strict';
// See https://www.mattcutts.com/blog/seo-glossary-url-definitions/ for URL semantics
// Entry point of the crawl.
const url = 'https://www.example.com';

// Domain used to recognise internal links and to name the output directory.
const domainName = 'example.com';

// Browser-like User-Agent header sent with every request.
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36';
const scrape = require('website-scraper'),
parse = require('url-parse'),
path = require('path');
/**
 * Decides whether a discovered URL should be downloaded.
 *
 * A URL is accepted when either:
 *  - its host is `domainName` or one of its subdomains (internal page or
 *    resource), or
 *  - it is external but the last path segment ends in a whitelisted file
 *    extension (compared case-insensitively).
 *
 * @param {string} url - URL to test.
 * @returns {boolean} `true` to download the URL, `false` to skip it.
 */
const urlFilter = function(url) {
  // Nothing sensible can be decided for missing or non-string input.
  if (typeof url !== 'string' || url === '') return false;

  const parsedUrl = parse(url);
  const hostname = (parsedUrl.hostname || '').toLowerCase();
  const baseDomain = domainName.toLowerCase();

  // Allow internal URLs. Compare host names rather than searching the whole
  // URL string: a substring test would also accept look-alike hosts such as
  // "notexample.com" and any external URL that merely mentions the domain in
  // its path or query string.
  const isInternalPageOrResource =
    hostname === baseDomain || hostname.endsWith(`.${baseDomain}`);
  if (isInternalPageOrResource) return true;

  // Allow only external resources with whitelisted file extensions. The
  // extension is taken from the last path segment so that dots in directory
  // names are not mistaken for an extension separator.
  const pathname = parsedUrl.pathname || '';
  const fileName = pathname.split('/').pop();
  const dotIndex = fileName.lastIndexOf('.');
  const fileExtension = dotIndex === -1 ? '' : fileName.slice(dotIndex + 1).toLowerCase();

  const whitelist = ['css', 'js', 'txt', 'pdf', 'epub', 'mobi', 'doc', 'docx', 'ppt', 'pptx', 'csv'];
  return whitelist.includes(fileExtension);
};
// Scraper plugin that pauses for a random interval before each request.
const delayPlugin = {
  apply(registerAction) {
    // Resolves after `ms` milliseconds.
    const pause = (ms) => new Promise((done) => {
      setTimeout(done, ms);
    });

    // The server rejects rapid-fire scraping, so each request is preceded by a
    // random wait of 0 to 10000 ms. The request options are handed back
    // untouched.
    registerAction('beforeRequest', async (context) => {
      const { requestOptions } = context;
      const delayMs = Math.round(Math.random() * 10000);
      await pause(delayMs);
      return { requestOptions };
    });
  }
};
// Scraper configuration: start at `url`, keep only URLs accepted by
// `urlFilter`, and write the result to websites/<domainName> next to this
// script.
const options = {
  urls: [url],
  urlFilter,
  directory: path.resolve(__dirname, 'websites', domainName),
  request: {
    headers: {
      'User-Agent': userAgent
    }
  },
  recursive: true,
  plugins: [delayPlugin]
};

// scrape() returns a promise. Handle rejection explicitly so a failed run is
// reported with context and the process exits with a non-zero status instead
// of leaving the promise floating.
scrape(options).catch((error) => {
  console.error(`Failed to scrape ${url}:`, error);
  process.exitCode = 1;
});