Merge pull request #109 from yujiosaka/support_depth_first_order

yujiosaka · web-flow · commit 472ef288d807 · 2018-02-21T19:26:51.000+09:00
feat: support depthPriority option
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 
 - Emit `newpage` event.
-- Support `deniedDomains` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
+- Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
 
 ### changed
 
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@ Powered by Headless Chrome, the crawler provides [simple APIs](#api-reference) t
 
 * Distributed crawling
 * Configure concurrency, delay and retry
-* Breadth-first search (BFS) to automatically follow links
+* Support both [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search) and [breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search) algorithms
 * Pluggable cache storages such as [Redis](https://redis.io)
 * Support [CSV](https://tools.ietf.org/html/rfc4180) and [JSON Lines](http://jsonlines.org) for exporting results
 * Pause at the max request and resume at any time
@@ -180,7 +180,7 @@ browserWSEndpoint, ignoreHTTPSErrors
 Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) are executed.
 
 ```
-url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
 ```
 
 > **Note**: In practice, setting the options every time you queue the requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity.
@@ -220,7 +220,7 @@ ignoreHTTPSErrors, headless, executablePath, slowMo, args, ignoreDefaultArgs, ha
 Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) are executed.
 
 ```
-url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
 ```
 
 > **Note**: In practice, setting the options every time you queue the requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity.
@@ -239,6 +239,7 @@ url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryD
   * `url` <[string]> Url to navigate to. The url should include scheme, e.g. `https://`.
   * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
   * `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
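+  * `depthPriority` <[boolean]> Whether to adjust the priority of a request based on its depth, defaults to `true`. Leave default to give requests at a higher depth a higher priority, which results in [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search); set to `false` for [breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search).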
   * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
   * `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
   * `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`.
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
@@ -113,6 +113,7 @@ class HCCrawler extends EventEmitter {
       jQuery: true,
       persistCache: false,
       skipDuplicates: true,
+      depthPriority: true,
       obeyRobotsTxt: true,
       followSitemapXml: false,
       screenshot: null,
@@ -128,7 +129,7 @@ class HCCrawler extends EventEmitter {
     this._onSuccess = options.onSuccess || null;
     this._onError = options.onError || null;
     this._exportHeader();
-    this._queue.on('pull', (...args) => this._onPull(...args));
+    this._queue.on('pull', (...args) => this._startRequest(...args));
     this._browser.on('disconnected', () => {
       this.emit(HCCrawler.Events.Disconnected);
     });
@@ -158,7 +159,7 @@ class HCCrawler extends EventEmitter {
       if (!mergedOptions.url) throw new Error('Url must be defined!');
       if (mergedOptions.device && !includes(deviceNames, mergedOptions.device)) throw new Error('Specified device is not supported!');
       if (mergedOptions.delay > 0 && mergedOptions.maxConcurrency !== 1) throw new Error('Max concurrency must be 1 when delay is set!');
-      this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS));
+      this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS), 1);
     });
   }
 
@@ -266,10 +267,12 @@ class HCCrawler extends EventEmitter {
 
   /**
    * @param {!Object} options
-   * @param {!number=} depth
+   * @param {!number} depth
    */
-  _push(options, depth = 1) {
-    this._queue.push(options, depth, options.priority);
+  _push(options, depth) {
+    let { priority } = options;
+    if (!priority && options.depthPriority) priority = depth;
+    this._queue.push(options, depth, priority);
   }
 
   /**
@@ -278,15 +281,19 @@ class HCCrawler extends EventEmitter {
    * @return {!Promise}
    * @private
    */
-  _onPull(options, depth) {
+  _startRequest(options, depth) {
     return this._skipRequest(options)
       .then(skip => {
         if (skip) {
           this.emit(HCCrawler.Events.RequestSkipped, options);
           return Promise.resolve();
         }
         return this._followSitemap(options, depth)
-          .then(() => this._request(options, depth));
+          .then(() => this._request(options, depth))
+          .then(links => {
+            this._checkRequestCount();
+            return delay(options.delay).then(() => this._followLinks(links, options, depth));
+          });
       });
   }
 
@@ -303,9 +310,7 @@ class HCCrawler extends EventEmitter {
       this._shouldRequest(options),
     ])
       .then(([requested, allowedRobot, allowedDomain, shouldRequest]) => {
-        if (requested || !allowedRobot || !allowedDomain || !shouldRequest) {
-          return true;
-        }
+        if (requested || !allowedRobot || !allowedDomain || !shouldRequest) return true;
         return false;
       });
   }
@@ -314,7 +319,7 @@ class HCCrawler extends EventEmitter {
    * @param {!Object} options
    * @param {!number} depth
    * @param {!number=} retryCount
-   * @return {!Promise}
+   * @return {!Promise<!Array<!string>>}
    * @private
    */
   _request(options, depth, retryCount = 0) {
@@ -324,18 +329,12 @@ class HCCrawler extends EventEmitter {
         this.emit(HCCrawler.Events.NewPage, crawler.page());
         return crawler.crawl()
           .then(res => {
-            res = extend({}, res);
-            res.options = options;
-            res.depth = depth;
+            res = extend({ options, depth }, res);
             this.emit(HCCrawler.Events.RequestFinished, res);
             return this._success(res)
-              .then(() => {
-                this._exportLine(res);
-                this._checkRequestCount();
-                this._followLinks(res.links, options, depth);
-              })
+              .then(() => { void this._exportLine(res); })
               .then(() => crawler.close())
-              .then(() => delay(options.delay));
+              .then(() => res.links);
           })
           .catch(error => {
             if (retryCount >= options.retryCount) throw error;
@@ -348,9 +347,8 @@ class HCCrawler extends EventEmitter {
           .catch(error => {
             this.emit(HCCrawler.Events.RequestFailed, error);
             return this._error(error)
-              .then(() => void this._checkRequestCount())
               .then(() => crawler.close())
-              .then(() => delay(options.delay));
+              .then(() => []);
           });
       });
   }
diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js
@@ -512,6 +512,38 @@ describe('HCCrawler', () => {
               assert.equal(onSuccess.callCount, 3);
             });
         });
+
+        context('when the first page contains several links', () => {
+          beforeEach(() => {
+            server.setContent('/1.html', `
+            go to <a href="${PREFIX}/2.html">/2.html</a>
+            go to <a href="${PREFIX}/3.html">/3.html</a>
+            `);
+            server.setContent('/2.html', `go to <a href="${PREFIX}/4.html">/4.html</a>`);
+          });
+
+          it('follow links with depth first order with maxDepth = 3', () => {
+            crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3 });
+            return crawler.onIdle()
+              .then(() => {
+                assert.equal(onSuccess.callCount, 4);
+                assert.equal(onSuccess.firstCall.args[0].depth, 1);
+                assert.equal(onSuccess.secondCall.args[0].depth, 2);
+                assert.equal(onSuccess.thirdCall.args[0].depth, 3);
+              });
+          });
+
+          it('follow links with breadth first order with maxDepth = 3 and depthPriority = false', () => {
+            crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3, depthPriority: false });
+            return crawler.onIdle()
+              .then(() => {
+                assert.equal(onSuccess.callCount, 4);
+                assert.equal(onSuccess.firstCall.args[0].depth, 1);
+                assert.equal(onSuccess.secondCall.args[0].depth, 2);
+                assert.equal(onSuccess.thirdCall.args[0].depth, 2);
+              });
+          });
+        });
       });
 
       context('when the crawler is launched with maxRequest option', () => {
diff --git a/test/helper.test.js b/test/helper.test.js
@@ -9,6 +9,7 @@ const {
   escapeQuotes,
   getRobotsUrl,
   lowerBound,
+  checkDomainMatch,
   getSitemapUrls,
   unescape,
   stringifyArgument,
@@ -217,6 +218,38 @@ describe('Helper', () => {
     });
   });
 
+  describe('Helper.checkDomainMatch', () => {
+    it('returns false for empty array', () => {
+      const actual = checkDomainMatch([], '127.0.0.1');
+      const expected = false;
+      assert.equal(actual, expected);
+    });
+
+    it('returns false when no domain fully matches requested hostname', () => {
+      const actual = checkDomainMatch(['localhost', '0.0.0.0'], '127.0.0.1');
+      const expected = false;
+      assert.equal(actual, expected);
+    });
+
+    it('returns false when no domain matches requested hostname by regular expression', () => {
+      const actual = checkDomainMatch([/^localhost$/, /^\d\.\d\.\d\.\d$/], '127.0.0.1');
+      const expected = false;
+      assert.equal(actual, expected);
+    });
+
+    it('returns true when a domain fully matches requested hostname', () => {
+      const actual = checkDomainMatch(['localhost', '127.0.0.1'], '127.0.0.1');
+      const expected = true;
+      assert.equal(actual, expected);
+    });
+
+    it('returns true when a domain fully matches requested hostname by regular expression', () => {
+      const actual = checkDomainMatch([/^localhost$/, /^\d+\.\d+\.\d+\.\d+$/], '127.0.0.1');
+      const expected = true;
+      assert.equal(actual, expected);
+    });
+  });
+
   describe('Helper.getSitemapUrls', () => {
     it('returns empty array for empty xml', () => {
       const actual = getSitemapUrls('');