-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.js
More file actions
61 lines (52 loc) · 1.55 KB
/
crawler.js
File metadata and controls
61 lines (52 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
const fs = require('fs')
const url = require('url')
const _ = require('lodash')
const async = require('async')
const cheerio = require('cheerio')
const request = require('request')
// Host of the site to crawl; compared against each discovered link's parsed
// URL to decide whether that link belongs to the crawled site.
const base = 'localhost:4000'

// URL the crawl starts from.
const firstLink = `http://${base}`

// URLs that have already been requested.
const crawled = []

// Link URLs found on crawled pages that point at `base` (may contain duplicates).
const inboundLinks = []
/**
 * Fetch one page and summarise it for the crawler.
 *
 * @param {string} crawlUrl - URL to request.
 * @param {function(?Error, Object=)} callback - Called exactly once. On
 *   failure it receives only the error. On success it receives `null`-ish
 *   error (whatever `request` reported) and a page object with:
 *     - links:       [{ linkText, linkUrl }] for every <a> element, in
 *                    document order (linkUrl is undefined when the anchor
 *                    has no href attribute)
 *     - requestTime: milliseconds between issuing the request and receiving
 *                    the response
 *     - title:       text of the <title> element
 *     - url:         the `crawlUrl` that was requested
 */
const makeRequest = function(crawlUrl, callback){
  console.log("💻💻💻 Scraping...\n")
  const startTime = Date.now()
  request(crawlUrl, function (error, response, body) {
    // When the request fails there is no body to parse; handing an undefined
    // body to cheerio.load can throw, which would hide the real error from
    // the caller. Report the failure and stop here instead.
    if (error) {
      callback(error)
      return
    }

    // Property order is kept as links, requestTime, title, url so that
    // logging the object prints the fields in the same order as before.
    const pageObject = {}
    pageObject.links = []
    pageObject.requestTime = Date.now() - startTime

    const $ = cheerio.load(body)
    pageObject.title = $('title').text()
    pageObject.url = crawlUrl

    $('a').each(function(i, elem){
      pageObject.links.push({linkText: $(elem).text(), linkUrl: elem.attribs.href})
    })

    callback(error, pageObject)
  })
}
/**
 * Crawl `link`, record it as visited, queue every link on the page that
 * points at the crawled site, then move on to the first queued link that has
 * not been visited yet. Prints a "Done" message once nothing is left.
 *
 * A page that fails to load is logged and marked as visited so the crawl
 * carries on with the remaining links instead of aborting.
 *
 * @param {string} link - Absolute URL of the page to crawl.
 */
const myLoop = function(link){
  makeRequest(link, function(error, pageObject){
    // Mark the requested URL as visited whether or not it loaded, so a
    // failing page is never picked again.
    crawled.push(link)

    if(error){
      console.error('Failed to crawl ' + link + ':', error)
    }
    else {
      console.log(pageObject)
      for(const item of pageObject.links){
        const href = item.linkUrl
        // Anchors without an href carry no URL; url.parse rejects non-strings.
        if(typeof href !== 'string'){
          continue
        }

        let parsedUrl
        try {
          parsedUrl = url.parse(href)
        }
        catch (parseError) {
          // A malformed href cannot be followed; ignore it.
          continue
        }

        // `base` includes the port, so compare against `host` (host + port).
        // `hostname` never contains the port and therefore could not match.
        // NOTE(review): relative hrefs have no host and are still skipped here.
        if(parsedUrl.host === base){
          inboundLinks.push(href)
        }
      }
    }

    const nextLink = _.difference(_.uniq(inboundLinks), crawled)
    if(nextLink.length > 0){
      myLoop(nextLink[0])
    }
    else {
      console.log('\n🥳🥳🥳 Done...')
    }
  })
}
// Entry point: start the crawl at the first link.
myLoop(firstLink)