Skip to content

Commit dc90dc5

Browse files
committed
fix 3 bugs
1 parent 6abdbec commit dc90dc5

File tree

4 files changed

+23
-15
lines changed

4 files changed

+23
-15
lines changed

core/spider/casper_crawler.js

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,7 @@ casper.on('popup.created', function (popup) {
108108
popup.settings.userAgent = user_agent;
109109
popup.onResourceRequested = function (requestData, request) {
110110
requested_count++;
111+
requestData['type'] = 'request';
111112
requested_urls.push(JSON.stringify(requestData));
112113
casper.echo('popup onResourceRequested: ' + requestData.url, 'INFO');
113114
//abort current request, important!

core/spider/core.js

Lines changed: 19 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -122,12 +122,16 @@ exports.FireintheHole = function (frame, timeout) {
122122
return false
123123
}
124124

125-
function startsWith(str, sub) {
126-
return str.lastIndexOf(sub, 0) === 0
125+
function normalizeUrl(url) {
126+
if (!url)
127+
return '';
128+
if (startsWith(url, '//'))
129+
url = window.location.protocol + url;
130+
return url.split('#')[0]
127131
}
128132

129-
function rmFragment(url) {
130-
return url ? url.split('#')[0] : ''
133+
function startsWith(str, sub) {
134+
return str.lastIndexOf(sub, 0) === 0
131135
}
132136

133137
function rndStr(length) {
@@ -260,9 +264,9 @@ exports.FireintheHole = function (frame, timeout) {
260264
if (querystring) {
261265
var data = {
262266
'method': form.method.toUpperCase(),
263-
'url': rmFragment(action),
267+
'url': normalizeUrl(action),
264268
'postData': querystring,
265-
'headers': [{'name': 'Referer', 'value': rmFragment(form.baseURI)}],
269+
'headers': [{'name': 'Referer', 'value': normalizeUrl(form.baseURI)}],
266270
'type': 'static'
267271
};
268272
var request = JSON.stringify(data);
@@ -312,9 +316,9 @@ exports.FireintheHole = function (frame, timeout) {
312316
if (url && validScheme(url) && url.length < 1024) {
313317
var data = {
314318
'method': 'GET',
315-
'url': rmFragment(url),
319+
'url': normalizeUrl(url),
316320
'postData': '',
317-
'headers': [{'name': 'Referer', 'value': rmFragment(document.baseURI)}],
321+
'headers': [{'name': 'Referer', 'value': normalizeUrl(document.baseURI)}],
318322
'type': 'static'
319323
};
320324
var request = JSON.stringify(data);
@@ -349,9 +353,9 @@ exports.FireintheHole = function (frame, timeout) {
349353
continue;
350354
var data = {
351355
'method': 'GET',
352-
'url': rmFragment(url),
356+
'url': normalizeUrl(url),
353357
'postData': '',
354-
'headers': [{'name': 'Referer', 'value': rmFragment(document.baseURI)}],
358+
'headers': [{'name': 'Referer', 'value': normalizeUrl(document.baseURI)}],
355359
'type': 'static'
356360
};
357361
var request = JSON.stringify(data);
@@ -386,8 +390,11 @@ exports.FireintheHole = function (frame, timeout) {
386390
runInFunc(event);
387391
} else if (typeof event === 'object') {
388392
console.info('object event ', event["event"], event["element"].tagName);
389-
if (local_events.indexOf(event['event']) >= 0)
390-
timeout = 100;
393+
if (local_events.indexOf(event['event']) >= 0) {
394+
timeout = 100
395+
} else {
396+
timeout = 1000;
397+
}
391398
var evt = document.createEvent('CustomEvent');
392399
evt.initCustomEvent(event["event"], true, true, null);
393400
event["element"].dispatchEvent(evt);

core/utils/url.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ def normalize_url(url):
5555
return 'http://{}'.format(url)
5656
# //www.test.com/index.php
5757
if not p.scheme:
58-
url = urlparse.urlunsplit(('http', p.netloc, p.path or '/',
58+
url = urlparse.urlunparse(('http', p.netloc, p.path or '/',
5959
p.query, p.params, p.fragment))
6060
return url
6161

@@ -140,6 +140,7 @@ def blocked(self):
140140

141141
if __name__ == '__main__':
142142
urlstring = 'http://www.test.com/fuck/kjskdjf.php?args=kjsdfu&k=kuc&ii=ksc#skdf'
143+
# urlstring = '//mat1.gtimg.com/www/icon/favicon2.ico'
143144
url = URL(urlstring)
144145
print url.path_querystring_pattern
145146
print url.pattern

tspider.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -85,8 +85,7 @@ def cmdparse():
8585
producer.create_task_from_url(url)
8686
# file object
8787
else:
88-
with target:
89-
producer.create_task_from_file(target)
88+
producer.create_task_from_file(target)
9089

9190
redis_handle.close()
9291
map(lambda x: x.join(), consumer_pool)

0 commit comments

Comments
 (0)