Skip to content

Commit c485126

Browse files
author
yujiosaka
authored
Merge pull request #223 from yujiosaka/support-redirect-chain
Support redirect chain
2 parents 37ded4f + f9fab14 commit c485126

6 files changed

Lines changed: 141 additions & 32 deletions

File tree

API.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ const HCCrawler = require('headless-chrome-crawler');
7575
* `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
7676
* `onSuccess(response)` <[Function]> Function to be called when `evaluatePage()` succeeds.
7777
* `response` <[Object]>
78+
* `redirectChain` <[Array]<[Object]>> Redirect chain of requests.
79+
* `url` <[string]> Requested url.
80+
* `headers` <[Object]> Request headers.
7881
* `response` <[Object]>
7982
* `ok` <[boolean]> whether the status code is in the range 200-299 or not.
8083
* `status` <[string]> status code of the request.
@@ -83,7 +86,7 @@ const HCCrawler = require('headless-chrome-crawler');
8386
* `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
8487
* `result` <[Serializable]> The result resolved from `evaluatePage()` option.
8588
* `screenshot` <[Buffer]> Buffer with the screenshot image, which is `null` when `screenshot` option not passed.
86-
* `links` <[Array]> List of links found in the requested page.
89+
* `links` <[Array]<[string]>> List of links found in the requested page.
8790
* `depth` <[number]> Depth of the followed links.
8891
* `onError(error)` <[Function]> Function to be called when request fails.
8992
* `error` <[Error]> Error object.
@@ -158,7 +161,8 @@ url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, ret
158161
* `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
159162
* `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
160163
* `depthPriority` <[boolean]> Whether to adjust priority based on its depth, defaults to `true`. Leave default to increase priority for higher depth, which is [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search).
161-
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
164+
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `true`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
165+
* `skipRequestedRedirect` <[boolean]> Whether to skip requests that have already appeared in redirect chains of requests, default to `false`. This option is ignored when `skipDuplicates` is set to `false`.
162166
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
163167
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`.
164168
* `allowedDomains` <[Array]<[string]|[RegExp]>> List of domains allowed to request. Pass `null` or leave default to skip checking allowed domain

lib/crawler.js

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
const map = require('lodash/map');
12
const reduce = require('lodash/reduce');
23
const pick = require('lodash/pick');
34
const isEmpty = require('lodash/isEmpty');
45
const uniq = require('lodash/uniq');
5-
const noop = require('lodash/noop');
66
const devices = require('puppeteer/DeviceDescriptors');
77
const {
88
resolveUrl,
@@ -15,6 +15,10 @@ const GOTO_OPTIONS = [
1515
'timeout',
1616
'waitUntil',
1717
];
18+
const REQUEST_FIELDS = [
19+
'url',
20+
'headers',
21+
];
1822
const RESPONSE_FIELDS = [
1923
'ok',
2024
'url',
@@ -46,6 +50,7 @@ class Crawler {
4650
const links = await this._collectLinks(response.url);
4751
return {
4852
response: this._reduceResponse(response),
53+
redirectChain: this._getRedirectChain(response),
4954
result,
5055
screenshot,
5156
links,
@@ -209,9 +214,9 @@ class Crawler {
209214
* @private
210215
*/
211216
async _scrape() {
212-
const evaluatePage = this._options.evaluatePage || noop;
217+
if (!this._options.evaluatePage) return null;
213218
await this._addJQuery();
214-
return this._page.evaluate(evaluatePage);
219+
return this._page.evaluate(this._options.evaluatePage);
215220
}
216221

217222
/**
@@ -267,7 +272,19 @@ class Crawler {
267272
}
268273

269274
/**
270-
* @param {!Response} response
275+
* @param {!Puppeteer.Request} request
276+
* @return {!Object}
277+
* @private
278+
*/
279+
_reduceRequest(request) {
280+
return reduce(REQUEST_FIELDS, (memo, field) => {
281+
memo[field] = request[field]();
282+
return memo;
283+
}, {});
284+
}
285+
286+
/**
287+
* @param {!Puppeteer.Response} response
271288
* @return {!Object}
272289
* @private
273290
*/
@@ -277,6 +294,15 @@ class Crawler {
277294
return memo;
278295
}, {});
279296
}
297+
298+
/**
299+
* @param {!Puppeteer.Response} response
300+
* @return {!Array<!Object>}
301+
* @private
302+
*/
303+
_getRedirectChain(response) {
304+
return map(response.request().redirectChain(), this._reduceRequest);
305+
}
280306
}
281307

282308
tracePublicAPI(Crawler);

lib/hccrawler.js

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const each = require('lodash/each');
88
const includes = require('lodash/includes');
99
const isString = require('lodash/isString');
1010
const isArray = require('lodash/isArray');
11-
const request = require('request-promise');
11+
const rp = require('request-promise');
1212
// @ts-ignore
1313
const robotsParser = require('robots-parser');
1414
const Puppeteer = require('puppeteer');
@@ -119,6 +119,7 @@ class HCCrawler extends EventEmitter {
119119
depthPriority: true,
120120
obeyRobotsTxt: true,
121121
followSitemapXml: false,
122+
skipRequestedRedirect: false,
122123
screenshot: null,
123124
viewport: null,
124125
}, options);
@@ -284,14 +285,15 @@ class HCCrawler extends EventEmitter {
284285
*/
285286
async _startRequest(options, depth) {
286287
const skip = await this._skipRequest(options);
287-
await this._markRequested(options);
288288
if (skip) {
289289
this.emit(HCCrawler.Events.RequestSkipped, options);
290+
await this._markRequested(options);
290291
return;
291292
}
292293
const allowed = await this._checkAllowedRobots(options);
293294
if (!allowed) {
294295
this.emit(HCCrawler.Events.RequestDisallowed, options);
296+
await this._markRequested(options);
295297
return;
296298
}
297299
await this._followSitemap(options, depth);
@@ -329,10 +331,14 @@ class HCCrawler extends EventEmitter {
329331
this.emit(HCCrawler.Events.NewPage, crawler.page());
330332
try {
331333
const res = await crawler.crawl();
334+
const requested = await this._checkRequestedRedirect(options, res.response);
335+
if (requested) return [];
332336
extend(res, { options, depth });
333337
await crawler.close();
334338
this.emit(HCCrawler.Events.RequestFinished, res);
335339
await this._success(res);
340+
await this._markRequested(options);
341+
await this._markRequestedRedirects(options, res.redirectChain, res.response);
336342
this._exportLine(res);
337343
return res.links;
338344
} catch (error) {
@@ -387,7 +393,7 @@ class HCCrawler extends EventEmitter {
387393
let sitemapXml = await this._cache.get(sitemapUrl);
388394
if (!sitemapXml) {
389395
try {
390-
sitemapXml = await request(sitemapUrl);
396+
sitemapXml = await rp(sitemapUrl);
391397
} catch (error) {
392398
this.emit(HCCrawler.Events.SitemapXmlRequestFailed, error);
393399
sitemapXml = EMPTY_TXT;
@@ -408,7 +414,7 @@ class HCCrawler extends EventEmitter {
408414
let robotsTxt = await this._cache.get(robotsUrl);
409415
if (!robotsTxt) {
410416
try {
411-
robotsTxt = await request(robotsUrl);
417+
robotsTxt = await rp(robotsUrl);
412418
} catch (error) {
413419
this.emit(HCCrawler.Events.RobotsTxtRequestFailed, error);
414420
robotsTxt = EMPTY_TXT;
@@ -454,6 +460,18 @@ class HCCrawler extends EventEmitter {
454460
return !!value;
455461
}
456462

463+
/**
464+
* @param {!Object} options
465+
* @param {!Object} response
466+
* @return {!Promise<!boolean>}
467+
* @private
468+
*/
469+
async _checkRequestedRedirect(options, response) {
470+
if (!options.skipRequestedRedirect) return false;
471+
const requested = await this._checkRequested(extend({}, options, { url: response.url }));
472+
return requested;
473+
}
474+
457475
/**
458476
* @param {!Object} options
459477
* @return {!Promise}
@@ -465,6 +483,21 @@ class HCCrawler extends EventEmitter {
465483
await this._cache.set(key, '1');
466484
}
467485

486+
/**
487+
* @param {!Object} options
488+
* @param {!Array<!Object>} redirectChain
489+
* @param {!Object} response
490+
* @return {!Promise}
491+
* @private
492+
*/
493+
async _markRequestedRedirects(options, redirectChain, response) {
494+
if (!options.skipRequestedRedirect) return;
495+
await Promise.all(map(redirectChain, async request => {
496+
await this._markRequested(extend({}, options, { url: request.url }));
497+
}));
498+
await this._markRequested(extend({}, options, { url: response.url }));
499+
}
500+
468501
/**
469502
* @param {!Object} options
470503
* @return {!Promise<?boolean>}

lib/puppeteer.d.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { Browser, Page, Dialog } from 'puppeteer';
1+
import { Browser, Page, Request, Response, Dialog } from 'puppeteer';
22

33
export as namespace Puppeteer;
4-
export { Browser, Page, Dialog };
4+
export { Browser, Page, Request, Response, Dialog };

0 commit comments

Comments
 (0)