Skip to content

Commit 472ef28

Browse files
author
yujiosaka
authored
Merge pull request #109 from yujiosaka/support_depth_first_order
feat: support depthPriority option
2 parents ce24a6a + 1cdbb99 commit 472ef28

5 files changed

Lines changed: 90 additions & 26 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
99
### Added
1010

1111
- Emit `newpage` event.
12-
- Support `deniedDomains` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
12+
- Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
1313

1414
### Changed
1515

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Powered by Headless Chrome, the crawler provides [simple APIs](#api-reference) t
99

1010
* Distributed crawling
1111
* Configure concurrency, delay and retry
12-
* Breadth-first search (BFS) to automatically follow links
12+
* Support both [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search) and [breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search) algorithms
1313
* Pluggable cache storages such as [Redis](https://redis.io)
1414
* Support [CSV](https://tools.ietf.org/html/rfc4180) and [JSON Lines](http://jsonlines.org) for exporting results
1515
* Pause at the max request and resume at any time
@@ -180,7 +180,7 @@ browserWSEndpoint, ignoreHTTPSErrors
180180
Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) are executed.
181181

182182
```
183-
url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
183+
url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
184184
```
185185

186186
> **Note**: In practice, setting the options every time you queue the requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity.
@@ -220,7 +220,7 @@ ignoreHTTPSErrors, headless, executablePath, slowMo, args, ignoreDefaultArgs, ha
220220
Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) are executed.
221221

222222
```
223-
url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
223+
url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
224224
```
225225

226226
> **Note**: In practice, setting the options every time you queue the requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity.
@@ -239,6 +239,7 @@ url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryD
239239
* `url` <[string]> Url to navigate to. The url should include scheme, e.g. `https://`.
240240
* `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
241241
* `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
242+
* `depthPriority` <[boolean]> Whether to adjust priority based on its depth, defaults to `true`. Leave default to increase priority for higher depth, which is [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search).
242243
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `true`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
243244
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
244245
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`.

lib/hccrawler.js

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ class HCCrawler extends EventEmitter {
113113
jQuery: true,
114114
persistCache: false,
115115
skipDuplicates: true,
116+
depthPriority: true,
116117
obeyRobotsTxt: true,
117118
followSitemapXml: false,
118119
screenshot: null,
@@ -128,7 +129,7 @@ class HCCrawler extends EventEmitter {
128129
this._onSuccess = options.onSuccess || null;
129130
this._onError = options.onError || null;
130131
this._exportHeader();
131-
this._queue.on('pull', (...args) => this._onPull(...args));
132+
this._queue.on('pull', (...args) => this._startRequest(...args));
132133
this._browser.on('disconnected', () => {
133134
this.emit(HCCrawler.Events.Disconnected);
134135
});
@@ -158,7 +159,7 @@ class HCCrawler extends EventEmitter {
158159
if (!mergedOptions.url) throw new Error('Url must be defined!');
159160
if (mergedOptions.device && !includes(deviceNames, mergedOptions.device)) throw new Error('Specified device is not supported!');
160161
if (mergedOptions.delay > 0 && mergedOptions.maxConcurrency !== 1) throw new Error('Max concurrency must be 1 when delay is set!');
161-
this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS));
162+
this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS), 1);
162163
});
163164
}
164165

@@ -266,10 +267,12 @@ class HCCrawler extends EventEmitter {
266267

267268
/**
268269
* @param {!Object} options
269-
* @param {!number=} depth
270+
* @param {!number} depth
270271
*/
271-
_push(options, depth = 1) {
272-
this._queue.push(options, depth, options.priority);
272+
_push(options, depth) {
273+
let { priority } = options;
274+
if (!priority && options.depthPriority) priority = depth;
275+
this._queue.push(options, depth, priority);
273276
}
274277

275278
/**
@@ -278,15 +281,19 @@ class HCCrawler extends EventEmitter {
278281
* @return {!Promise}
279282
* @private
280283
*/
281-
_onPull(options, depth) {
284+
_startRequest(options, depth) {
282285
return this._skipRequest(options)
283286
.then(skip => {
284287
if (skip) {
285288
this.emit(HCCrawler.Events.RequestSkipped, options);
286289
return Promise.resolve();
287290
}
288291
return this._followSitemap(options, depth)
289-
.then(() => this._request(options, depth));
292+
.then(() => this._request(options, depth))
293+
.then(links => {
294+
this._checkRequestCount();
295+
return delay(options.delay).then(() => this._followLinks(links, options, depth));
296+
});
290297
});
291298
}
292299

@@ -303,9 +310,7 @@ class HCCrawler extends EventEmitter {
303310
this._shouldRequest(options),
304311
])
305312
.then(([requested, allowedRobot, allowedDomain, shouldRequest]) => {
306-
if (requested || !allowedRobot || !allowedDomain || !shouldRequest) {
307-
return true;
308-
}
313+
if (requested || !allowedRobot || !allowedDomain || !shouldRequest) return true;
309314
return false;
310315
});
311316
}
@@ -314,7 +319,7 @@ class HCCrawler extends EventEmitter {
314319
* @param {!Object} options
315320
* @param {!number} depth
316321
* @param {!number=} retryCount
317-
* @return {!Promise}
322+
* @return {!Promise<!Array<!string>>}
318323
* @private
319324
*/
320325
_request(options, depth, retryCount = 0) {
@@ -324,18 +329,12 @@ class HCCrawler extends EventEmitter {
324329
this.emit(HCCrawler.Events.NewPage, crawler.page());
325330
return crawler.crawl()
326331
.then(res => {
327-
res = extend({}, res);
328-
res.options = options;
329-
res.depth = depth;
332+
res = extend({ options, depth }, res);
330333
this.emit(HCCrawler.Events.RequestFinished, res);
331334
return this._success(res)
332-
.then(() => {
333-
this._exportLine(res);
334-
this._checkRequestCount();
335-
this._followLinks(res.links, options, depth);
336-
})
335+
.then(() => { void this._exportLine(res); })
337336
.then(() => crawler.close())
338-
.then(() => delay(options.delay));
337+
.then(() => res.links);
339338
})
340339
.catch(error => {
341340
if (retryCount >= options.retryCount) throw error;
@@ -348,9 +347,8 @@ class HCCrawler extends EventEmitter {
348347
.catch(error => {
349348
this.emit(HCCrawler.Events.RequestFailed, error);
350349
return this._error(error)
351-
.then(() => void this._checkRequestCount())
352350
.then(() => crawler.close())
353-
.then(() => delay(options.delay));
351+
.then(() => []);
354352
});
355353
});
356354
}

test/hccrawler.test.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,38 @@ describe('HCCrawler', () => {
512512
assert.equal(onSuccess.callCount, 3);
513513
});
514514
});
515+
516+
context('when the first page contains several links', () => {
517+
beforeEach(() => {
518+
server.setContent('/1.html', `
519+
go to <a href="${PREFIX}/2.html">/2.html</a>
520+
go to <a href="${PREFIX}/3.html">/3.html</a>
521+
`);
522+
server.setContent('/2.html', `go to <a href="${PREFIX}/4.html">/4.html</a>`);
523+
});
524+
525+
it('follow links with depth first order with maxDepth = 3', () => {
526+
crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3 });
527+
return crawler.onIdle()
528+
.then(() => {
529+
assert.equal(onSuccess.callCount, 4);
530+
assert.equal(onSuccess.firstCall.args[0].depth, 1);
531+
assert.equal(onSuccess.secondCall.args[0].depth, 2);
532+
assert.equal(onSuccess.thirdCall.args[0].depth, 3);
533+
});
534+
});
535+
536+
it('follow links with breadth first order with maxDepth = 3 and depthPriority = false', () => {
537+
crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3, depthPriority: false });
538+
return crawler.onIdle()
539+
.then(() => {
540+
assert.equal(onSuccess.callCount, 4);
541+
assert.equal(onSuccess.firstCall.args[0].depth, 1);
542+
assert.equal(onSuccess.secondCall.args[0].depth, 2);
543+
assert.equal(onSuccess.thirdCall.args[0].depth, 2);
544+
});
545+
});
546+
});
515547
});
516548

517549
context('when the crawler is launched with maxRequest option', () => {

test/helper.test.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ const {
99
escapeQuotes,
1010
getRobotsUrl,
1111
lowerBound,
12+
checkDomainMatch,
1213
getSitemapUrls,
1314
unescape,
1415
stringifyArgument,
@@ -217,6 +218,38 @@ describe('Helper', () => {
217218
});
218219
});
219220

221+
describe('Helper.checkDomainMatch', () => {
222+
it('returns false for empty array', () => {
223+
const actual = checkDomainMatch([], '127.0.0.1');
224+
const expected = false;
225+
assert.equal(actual, expected);
226+
});
227+
228+
it('returns false when no domain fully matches requested hostname', () => {
229+
const actual = checkDomainMatch(['localhost', '0.0.0.0'], '127.0.0.1');
230+
const expected = false;
231+
assert.equal(actual, expected);
232+
});
233+
234+
it('returns false when no domain matches requested hostname by regular expression', () => {
235+
const actual = checkDomainMatch([/^localhost$/, /^\d\.\d\.\d\.\d$/], '127.0.0.1');
236+
const expected = false;
237+
assert.equal(actual, expected);
238+
});
239+
240+
it('returns true when a domain fully matches requested hostname', () => {
241+
const actual = checkDomainMatch(['localhost', '127.0.0.1'], '127.0.0.1');
242+
const expected = true;
243+
assert.equal(actual, expected);
244+
});
245+
246+
it('returns true when a domain fully matches requested hostname by regular expression', () => {
247+
const actual = checkDomainMatch([/^localhost$/, /^\d+\.\d+\.\d+\.\d+$/], '127.0.0.1');
248+
const expected = true;
249+
assert.equal(actual, expected);
250+
});
251+
});
252+
220253
describe('Helper.getSitemapUrls', () => {
221254
it('returns empty array for empty xml', () => {
222255
const actual = getSitemapUrls('');

0 commit comments

Comments
 (0)