Skip to content

Commit 76e792b

Browse files
author
Dan Reeves
authored
Limit similar URLs (#15)
1 parent c9b7670 commit 76e792b

20 files changed

+855
-678
lines changed

index.js

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,37 @@ const cli = meow(`
1212
1313
Options
1414
--outfile, -o Save the backstop config to this file
15+
--debug Logs out errors produced while crawling
1516
--ignore-robots Ignore the sites robots.txt
1617
--ignore-ssl-errors Treat any certificate as valid (e.g. self-signed
1718
or expired)
18-
--debug Logs out errors produced while crawling
1919
--allow-subdomains Allow crawling links found to subdomains of the
2020
current domain
21+
--limit-similar[=3] Limits the number of similar URLs to a set number
22+
Defaults to 3
23+
e.g /blog/1, /blog/2, /blog/3
2124
2225
Examples
2326
$ backstop-crawl http://localhost
24-
`, {
25-
alias: {
26-
o: 'outfile',
27-
},
28-
});
27+
`,
28+
{
29+
alias: {
30+
o: 'outfile',
31+
},
32+
});
33+
34+
// Normalise the --limit-similar flag: when it is set (truthy) but is not an
// integer, fall back to the default of 3. Per the original inline note this
// covers the flag arriving as `true` (presumably a bare `--limit-similar`
// with no `=N` value -- confirm against the CLI parser's behaviour).
// A falsy value (flag absent, or 0) is left untouched.
if (cli.flags.limitSimilar && !Number.isInteger(cli.flags.limitSimilar)) {
    cli.flags.limitSimilar = 3;
}
2940

3041
if (cli.input.length) {
3142
if (validurl(cli.input[0])) {
3243
crawl(cli.input[0], cli.flags);
3344
} else {
34-
console.error(`Error: "${cli.input[0]}" isn't a valid URL`);
45+
console.error(`> Error: "${cli.input[0]}" isn't a valid URL`);
3546
process.exit(1);
3647
}
3748
} else {

lib/crawl.js

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@ const chalk = require('chalk');
88
const mkpath = require('mkpath');
99
const jsonfile = require('jsonfile');
1010
const defaultConf = require('./default-config');
11+
const limitSimilar = require('./limit-similar');
1112

1213
// Case-insensitive pattern for URLs/paths that end in one of the listed file
// extensions. The extensions are grouped so the terminator applies to every
// alternative (an ungrouped trailing `$` would bind only to the last one and
// let e.g. `\.js` match `.jsp` or `.json.html`). An extension counts as
// terminal when it is followed by the end of the string or by the start of a
// query string / fragment, so `/style.css?v=2` is still matched.
const EXT_BLACKLIST = /\.(?:pdf|js|css|png|jpg|jpeg|gif|json|xml|txt)(?:[?#]|$)/i;
1314
const SPINNER_WIDTH = 2;
14-
const urls = [];
15+
let urls = [];
1516

1617
module.exports = function crawl (url, flags) {
1718
const crawler = simplecrawler(url);
@@ -83,6 +84,14 @@ module.exports = function crawl (url, flags) {
8384

8485
// Done. Output the file
8586
crawler.on('complete', () => {
87+
88+
if (flags.limitSimilar) {
89+
spinner.stopAndPersist({
90+
symbol: '>',
91+
text: `Limiting similar urls to ${flags.limitSimilar} of each`,
92+
});
93+
urls = limitSimilar(urls, flags.limitSimilar);
94+
}
8695
defaultConf.scenarios = urls;
8796
const path = dirname(outfile);
8897
mkpath(path, (mkpathErr) => {

lib/limit-similar.js

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/* eslint-disable no-param-reassign, no-else-return */
2+
const urlParse = require('url-parse');
3+
const entries = require('object.entries');
4+
5+
module.exports = function limitSimilar (urls, num) {
6+
7+
const countSimilar = urls
8+
.map(url => url.url)
9+
.reduce((similar, url) => {
10+
const path = urlParse(url).pathname;
11+
const parts = path.trim().split('/').filter(part => part.trim());
12+
// Skip top level urls, e.g. /about
13+
if (parts.length > 1) {
14+
// Insert the anything-thats-not-slash regex into
15+
// the last part of the url
16+
const abstractPath = `${path.replace(/\/[^/]+[^$]?$/, '/[^/]+[^$]?')}$`;
17+
if (abstractPath in similar) {
18+
similar[abstractPath] += 1;
19+
} else {
20+
similar[abstractPath] = 1;
21+
}
22+
}
23+
return similar;
24+
}, {});
25+
26+
const gtNum = entries(countSimilar)
27+
.filter(entry => (entry[1] > num))
28+
.reduce((prev, entry) => Object.assign({}, prev, {
29+
[entry[0]]: entry[1],
30+
}), {});
31+
32+
const repeatedPaths = Object.keys(gtNum);
33+
34+
const filteredCount = {};
35+
const filteredUrls = urls
36+
.filter((url) => {
37+
const pathname = urlParse(url.url).pathname;
38+
for (var path of repeatedPaths) { // eslint-disable-line vars-on-top, no-var
39+
if (pathname.match(new RegExp(path))) {
40+
if (path in filteredCount) {
41+
if (filteredCount[path] < num) {
42+
filteredCount[path] += 1;
43+
return true;
44+
} else {
45+
return false;
46+
}
47+
} else {
48+
filteredCount[path] = 1;
49+
}
50+
}
51+
}
52+
return true;
53+
});
54+
55+
return filteredUrls;
56+
};

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@
4949
"jsonfile": "2.4.0",
5050
"meow": "3.7.0",
5151
"mkpath": "1.0.0",
52+
"object.entries": "^1.0.4",
5253
"ora": "1.1.0",
5354
"simplecrawler": "1.0.3",
55+
"url-parse": "^1.1.7",
5456
"valid-url": "1.0.9"
5557
},
5658
"devDependencies": {

test/fixtures/default-test.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@
2727
"iframe"
2828
]
2929
},
30+
{
31+
"label": "/blog/",
32+
"url": "http://0.0.0.0:8080/blog/",
33+
"selectors": [
34+
"document"
35+
],
36+
"hideSelectors": [
37+
"iframe"
38+
]
39+
},
3040
{
3141
"label": "/test2.html",
3242
"url": "http://0.0.0.0:8080/test2.html",
@@ -36,6 +46,56 @@
3646
"hideSelectors": [
3747
"iframe"
3848
]
49+
},
50+
{
51+
"label": "/blog/one.html",
52+
"url": "http://0.0.0.0:8080/blog/one.html",
53+
"selectors": [
54+
"document"
55+
],
56+
"hideSelectors": [
57+
"iframe"
58+
]
59+
},
60+
{
61+
"label": "/blog/two.html",
62+
"url": "http://0.0.0.0:8080/blog/two.html",
63+
"selectors": [
64+
"document"
65+
],
66+
"hideSelectors": [
67+
"iframe"
68+
]
69+
},
70+
{
71+
"label": "/blog/three.html",
72+
"url": "http://0.0.0.0:8080/blog/three.html",
73+
"selectors": [
74+
"document"
75+
],
76+
"hideSelectors": [
77+
"iframe"
78+
]
79+
},
80+
{
81+
"label": "/blog/four.html",
82+
"url": "http://0.0.0.0:8080/blog/four.html",
83+
"selectors": [
84+
"document"
85+
],
86+
"hideSelectors": [
87+
"iframe"
88+
]
89+
},
90+
{
91+
"label": "/blog/five.html",
92+
"url": "http://0.0.0.0:8080/blog/five.html",
93+
"selectors": [
94+
"document"
95+
],
96+
"hideSelectors": [
97+
"iframe"
98+
]
3999
}
40100
],
41101
"paths": {

test/fixtures/ignore-robots.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@
2727
"iframe"
2828
]
2929
},
30+
{
31+
"label": "/blog/",
32+
"url": "http://0.0.0.0:8080/blog/",
33+
"selectors": [
34+
"document"
35+
],
36+
"hideSelectors": [
37+
"iframe"
38+
]
39+
},
3040
{
3141
"label": "/test2.html",
3242
"url": "http://0.0.0.0:8080/test2.html",
@@ -37,6 +47,56 @@
3747
"iframe"
3848
]
3949
},
50+
{
51+
"label": "/blog/one.html",
52+
"url": "http://0.0.0.0:8080/blog/one.html",
53+
"selectors": [
54+
"document"
55+
],
56+
"hideSelectors": [
57+
"iframe"
58+
]
59+
},
60+
{
61+
"label": "/blog/two.html",
62+
"url": "http://0.0.0.0:8080/blog/two.html",
63+
"selectors": [
64+
"document"
65+
],
66+
"hideSelectors": [
67+
"iframe"
68+
]
69+
},
70+
{
71+
"label": "/blog/three.html",
72+
"url": "http://0.0.0.0:8080/blog/three.html",
73+
"selectors": [
74+
"document"
75+
],
76+
"hideSelectors": [
77+
"iframe"
78+
]
79+
},
80+
{
81+
"label": "/blog/four.html",
82+
"url": "http://0.0.0.0:8080/blog/four.html",
83+
"selectors": [
84+
"document"
85+
],
86+
"hideSelectors": [
87+
"iframe"
88+
]
89+
},
90+
{
91+
"label": "/blog/five.html",
92+
"url": "http://0.0.0.0:8080/blog/five.html",
93+
"selectors": [
94+
"document"
95+
],
96+
"hideSelectors": [
97+
"iframe"
98+
]
99+
},
40100
{
41101
"label": "/no-robots.html",
42102
"url": "http://0.0.0.0:8080/no-robots.html",

test/fixtures/limit-similar-2.json

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
{
2+
"viewports": [
3+
{
4+
"name": "Screen",
5+
"width": 1440,
6+
"height": 900
7+
}
8+
],
9+
"scenarios": [
10+
{
11+
"label": "/",
12+
"url": "http://0.0.0.0:8080/",
13+
"selectors": [
14+
"document"
15+
],
16+
"hideSelectors": [
17+
"iframe"
18+
]
19+
},
20+
{
21+
"label": "/test1.html",
22+
"url": "http://0.0.0.0:8080/test1.html",
23+
"selectors": [
24+
"document"
25+
],
26+
"hideSelectors": [
27+
"iframe"
28+
]
29+
},
30+
{
31+
"label": "/blog/",
32+
"url": "http://0.0.0.0:8080/blog/",
33+
"selectors": [
34+
"document"
35+
],
36+
"hideSelectors": [
37+
"iframe"
38+
]
39+
},
40+
{
41+
"label": "/test2.html",
42+
"url": "http://0.0.0.0:8080/test2.html",
43+
"selectors": [
44+
"document"
45+
],
46+
"hideSelectors": [
47+
"iframe"
48+
]
49+
},
50+
{
51+
"label": "/blog/one.html",
52+
"url": "http://0.0.0.0:8080/blog/one.html",
53+
"selectors": [
54+
"document"
55+
],
56+
"hideSelectors": [
57+
"iframe"
58+
]
59+
},
60+
{
61+
"label": "/blog/two.html",
62+
"url": "http://0.0.0.0:8080/blog/two.html",
63+
"selectors": [
64+
"document"
65+
],
66+
"hideSelectors": [
67+
"iframe"
68+
]
69+
}
70+
],
71+
"paths": {
72+
"bitmaps_reference": "backstop_data/bitmaps_reference",
73+
"bitmaps_test": "backstop_data/bitmaps_test",
74+
"casper_scripts": "backstop_data/casper_scripts",
75+
"html_report": "backstop_data/html_report",
76+
"ci_report": "backstop_data/ci_report"
77+
},
78+
"casperFlags": [],
79+
"engine": "phantomjs",
80+
"report": [
81+
"browser"
82+
],
83+
"debug": false
84+
}

0 commit comments

Comments
 (0)