Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 133 additions & 93 deletions pjscrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -483,11 +483,19 @@ var pjs = (function(){
onAlert: function(msg) { log.alert('CLIENT: ' + msg) }
});

/**
 * Report uncaught JavaScript errors raised inside the loaded page.
 * Logs a header line, the error message, then one " file:line" entry
 * per stack frame.
 *
 * @param {string} msg - error message
 * @param {Array} [trace] - stack frames; each is read for `file` and `line`
 */
page.onError = function (msg, trace) {
    log.error('Traceback:');
    log.error(msg);
    // The trace may be absent; guard so the error handler never throws
    // itself and masks the original page error.
    (trace || []).forEach(function (item) {
        log.error(' ' + item.file + ':' + item.line);
    });
};

// add waitFor method
page.waitFor = function(test, callback) {
page.waitFor = function(test, callback, s, complete) {
// check for short-circuit
if (this.evaluate(test)) {
callback(page);
callback(page, s, complete);
} else {
// poll until timeout or success
var elapsed = 0,
Expand All @@ -497,7 +505,7 @@ var pjs = (function(){
log.alert('Timeout after ' + ~~(elapsed / 1000) + ' seconds');
}
window.clearInterval(timeoutId);
callback(page);
callback(page, s, complete);
} else {
elapsed += config.timeoutInterval;
}
Expand Down Expand Up @@ -624,7 +632,19 @@ var pjs = (function(){
}
}
}
runNext();
if (page && s.opts.nextPage) {
if (!s.opts.maxDepth || s.depth < s.opts.maxDepth) {
if (page.evaluate(s.opts.nextPage)) {
s.depth = s.depth + 1;
} else {
runNext();
}
} else {
runNext();
}
} else {
runNext();
}
},
runCounter = 0;
// run each
Expand Down Expand Up @@ -678,110 +698,130 @@ var pjs = (function(){
page.settings = extend(page.settings, config.pageSettings);

// run the scrape
page.open(url, function(status) {
page.open(url, function (status) {
// check for load errors
if (status != "success") {
log.error('Page did not load (status=' + status + '): ' + url);
complete(false);
return;
}
// look for 4xx or 5xx status codes
var statusCodeStart = page.resource && String(page.resource.status).charAt(0);
if (statusCodeStart == '4' || statusCodeStart == '5') {
if (page.resource.status == 404) {
log.error('Page not found: ' + url);
} else {
log.error('Page error code ' + page.resource.status + ' on ' + url);
}
complete(false);
return;
suite.handleLoad(page, status, complete)
});
},
handleLoad: function (page, status, complete) {
var s = this, opts = s.opts, url;
// check for load errors
url = page.evaluate(function () { return window.location + ""; });

// look for 4xx or 5xx status codes
var statusCodeStart = String(page.resource.status).charAt(0);
if (statusCodeStart == '4' || statusCodeStart == '5') {
if (page.resource.status == 404) {
log.error('Page not found: ' + url);
} else {
log.error('Page error code ' + page.resource.status + ' on ' + url);
}
// mark as visited
visited[url] = true;
log.msg('Scraping ' + url);
// load jQuery
page.injectJs('client/jquery.js');
page.evaluate(function() {
window._pjs$ = jQuery.noConflict(true);
complete(false);
return;
}
// mark as visited
visited[url] = true;
log.msg('Scraping ' + url);
// load jQuery
page.injectJs('client/jquery.js');
page.evaluate(function() {
window._pjs$ = jQuery.noConflict(true);
});
// load pjscrape client-side code
page.injectJs('client/pjscrape_client.js');
// attach persistent state
page.evaluate(new Function(
"_pjs.state = " + JSON.stringify(page.state) + ";"
));

// reset the global jQuery vars
if (!opts.noConflict) {
page.evaluate(function() {
window.$ = window.jQuery = window._pjs$;
});
// load pjscrape client-side code
page.injectJs('client/pjscrape_client.js');
// attach persistent state
page.evaluate(new Function(
"_pjs.state = " + JSON.stringify(page.state) + ";"
));
// reset the global jQuery vars
if (!opts.noConflict) {
page.evaluate(function() {
window.$ = window.jQuery = window._pjs$;
});
}
page.waitFor(opts.ready, s.doScraping, s, complete);
},
doScraping: function (page, suite, complete) {
var opts=suite.opts, scrapers=opts.scrapers;
if (page.evaluate(opts.scrapable)) {
page.onLoadStarted = function () {
}
// run scraper(s)
page.waitFor(opts.ready, function(page) {
if (page.evaluate(opts.scrapable)) {
// load script(s) if necessary
if (opts.loadScript) {
opts.loadScript.forEach(function(script) {
page.injectJs(script);
})
}
// run prescrape
page.evaluate(opts.preScrape);
// run each scraper and send any results to writer
if (scrapers && scrapers.length) {
// set up callback manager
var i = 0;
function checkComplete() {
if (++i == scrapers.length) {
// save state
page.state = page.evaluate(function() {
return _pjs.state;
});
// run completion callback
complete(page);
}
}
// run all scrapers
scrapers.forEach(function(scraper) {
if (isFunction(scraper)) {
// standard scraper
suite.addItem(page.evaluate(scraper));
checkComplete();
} else if (typeof scraper == 'string') {
// selector-only scraper
suite.addItem(page.evaluate(new Function(
"return _pjs.getText('" + scraper + "');"
)));
checkComplete();
} else if (scraper.scraper) {
// wrapped scraper, more options (just async now)
if (scraper.async) {
// start the scrape
page.evaluate(scraper.scraper);
// wait for the scraper to return items
page.waitFor(
function() {
return _pjs.items !== undefined
},
function() {
suite.addItem(page.evaluate(function() {
return _pjs.items;
}));
checkComplete();
}
);
}
}
page.onLoadFinished = function (success) {
suite.handleLoad(page, success, complete);
}
// load script(s) if necessary
if (opts.loadScript) {
opts.loadScript.forEach(function(script) {
page.injectJs(script);
})




}
// run prescrape
page.evaluate(opts.preScrape);
// run each scraper and send any results to writer
if (scrapers && scrapers.length) {
// set up callback manager
var i = 0;
function checkComplete() {
if (++i == scrapers.length) {
// save state
page.state = page.evaluate(function() {
return _pjs.state;
});
// run completion callback
complete(page);
}
} else {
complete(page);
}
});
});
// run all scrapers
scrapers.forEach(function(scraper) {
if (isFunction(scraper)) {
// standard scraper
suite.addItem(page.evaluate(scraper));
checkComplete();
} else if (typeof scraper == 'string') {
// selector-only scraper
suite.addItem(page.evaluate(new Function(
"return _pjs.getText('" + scraper + "');"
)));
checkComplete();
} else if (scraper.scraper) {
// wrapped scraper, more options (just async now)
if (scraper.async) {
// start the scrape
page.evaluate(scraper.scraper);
// wait for the scraper to return items
page.waitFor(
function() {
return _pjs.items !== undefined
},
function() {
suite.addItem(page.evaluate(function() {
return _pjs.items;
}));
checkComplete();
}
);
}
}
});
}
} else {
complete(page);
}
}
};



/**
* Run the set of configured scraper suites.
* @name pjs.init
Expand Down
6 changes: 5 additions & 1 deletion tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,11 @@ def test_persistent_state(self):
out = getPjscrapeOutput('test_persistent_state.js')
self.assertEqual(out, '["Page 0","Page 1","Page 2"]')

def test_nextpage(self):
    """Following nextPage once should yield the headings of two pages."""
    expected = '["Test Page: Next page","Test Page: Next page 2"]'
    self.assertEqual(getPjscrapeOutput('test_nextpage.js'), expected)

if __name__ == '__main__':
    # Build a suite from every TestPjscrape case and run it exactly once,
    # with verbose per-test output.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestPjscrape)
    unittest.TextTestRunner(verbosity=2).run(suite)
20 changes: 20 additions & 0 deletions tests/test_nextpage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Suite exercising the nextPage option: scrape the heading, then follow
// the "Next" link to scrape the following page as well.
pjs.addSuite({
    url: 'http://localhost:8888/test_site/nextpage.html',
    scrapers: [
        // collect the text of the page heading
        function() {
            return $('h1').text();
        }
    ],
    // maxDepth caps how many times the nextPage function is evaluated
    maxDepth: 1,
    // click the "Next" link if the page has one; the return value reports
    // whether a navigation was triggered
    nextPage: function () {
        var link = $('[alt="Next"]');
        if (!link.length) {
            return false;
        }
        link.click();
        return true;
    }
});
13 changes: 13 additions & 0 deletions tests/test_site/nextpage.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<!-- Test fixture: start page of the nextPage chain loaded by tests/test_nextpage.js. -->
<html>
<head>
<title>TEST</title>
</head>
<body>
<!-- Heading text collected by the suite's scraper via $('h1').text(). -->
<h1>Test Page: Next page</h1>
<!-- Empty GET form; submitting it navigates to nextpage2.html. -->
<form method="GET" action="nextpage2.html">
</form>
<!-- The suite's nextPage function selects this link with [alt="Next"] and clicks it; the onclick submits the form above. -->
<a alt="Next" href="#" onclick="document.forms[0].submit();">Go to next page</a>
<script type="text/javascript" src="jquery-1.4.1.min.js"></script>
</body>
</html>
13 changes: 13 additions & 0 deletions tests/test_site/nextpage2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<!-- Test fixture: second page of the nextPage chain, reached by submitting the form on nextpage.html. -->
<html>
<head>
<title>TEST</title>
</head>
<body>
<!-- Heading text collected by the suite's scraper via $('h1').text(). -->
<h1>Test Page: Next page 2</h1>
<!-- Empty GET form; submitting it navigates to nextpage3.html. -->
<form method="GET" action="nextpage3.html">
</form>
<!-- Matches the [alt="Next"] selector used by the suite's nextPage function; the onclick submits the form above. -->
<a alt="Next" href="#" onclick="document.forms[0].submit();">Go to next page</a>
<script type="text/javascript" src="jquery-1.4.1.min.js"></script>
</body>
</html>
10 changes: 10 additions & 0 deletions tests/test_site/nextpage3.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!DOCTYPE html>
<!-- Test fixture: last page of the nextPage chain, the form target of nextpage2.html. -->
<html>
<head>
<title>TEST</title>
</head>
<body>
<!-- No form or "Next" link on this page, so a lookup of [alt="Next"] finds nothing here. -->
<h1>Test Page: Next page 3</h1>
<script type="text/javascript" src="jquery-1.4.1.min.js"></script>
</body>
</html>