forked from scrapy-plugins/scrapy-splash
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrequest.py
More file actions
162 lines (134 loc) · 5.59 KB
/
request.py
File metadata and controls
162 lines (134 loc) · 5.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import copy
import scrapy
from scrapy.http import FormRequest
from scrapy.utils.url import canonicalize_url
from scrapy_splash import SlotPolicy
from scrapy_splash.utils import to_unicode, dict_hash
from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
from scrapy.utils.misc import load_object
try:
    from scrapy.utils.misc import build_from_crawler
except ImportError:  # Scrapy < 2.12
    from scrapy.utils.misc import create_instance

    def build_from_crawler(objcls, crawler, /, *args, **kwargs):
        """Fallback used when scrapy.utils.misc has no build_from_crawler.

        Delegates to ``create_instance``, passing ``None`` in its second
        positional slot and forwarding the crawler plus any extra
        positional/keyword arguments unchanged.
        """
        instance = create_instance(objcls, None, crawler, *args, **kwargs)
        return instance
# XXX: we can't implement SplashRequest without middleware support
# because there is no way to set Splash URL based on settings
# from inside SplashRequest.
class SplashRequest(scrapy.Request):
    """
    scrapy.Request subclass that asks Scrapy to render the page
    using Splash.

    Every Splash option is stored under ``meta['splash']``.
    SplashMiddleware is required for this request to work.
    """

    def __init__(self,
                 url=None,
                 callback=None,
                 method='GET',
                 endpoint='render.html',
                 args=None,
                 splash_url=None,
                 slot_policy=SlotPolicy.PER_DOMAIN,
                 splash_headers=None,
                 dont_process_response=False,
                 dont_send_headers=False,
                 magic_response=True,
                 session_id='default',
                 http_status_from_error_code=True,
                 cache_args=None,
                 meta=None,
                 **kwargs):
        """
        Build the ``meta['splash']`` options and initialise the request.

        ``endpoint``, ``slot_policy``, ``magic_response`` and ``session_id``
        only fill in values that ``meta['splash']`` does not already hold;
        ``splash_url``, ``splash_headers`` and ``cache_args`` overwrite the
        stored value whenever they are not None. The caller's ``meta`` is
        deep-copied, so it is never modified. Remaining keyword arguments
        are forwarded to the parent constructor.
        """
        # A missing URL falls back to a blank page.
        url = to_unicode('about:blank' if url is None else url)

        meta = copy.deepcopy(meta) or {}
        options = meta.setdefault('splash', {})
        options.setdefault('endpoint', endpoint)
        options.setdefault('slot_policy', slot_policy)

        if splash_url is not None:
            options['splash_url'] = splash_url

        if splash_headers is not None:
            options['splash_headers'] = splash_headers

        # magic_response is only recorded when response processing is on.
        if dont_process_response:
            options['dont_process_response'] = True
        else:
            options.setdefault('magic_response', magic_response)

        if dont_send_headers:
            options['dont_send_headers'] = True

        if http_status_from_error_code:
            options['http_status_from_error_code'] = True

        if cache_args is not None:
            options['cache_args'] = cache_args

        # A session id default is applied for the 'execute' endpoint only.
        wants_session = session_id is not None
        if wants_session and options['endpoint'].strip('/') == 'execute':
            options.setdefault('session_id', session_id)

        # The URL is kept in args so that its #fragment is preserved.
        # Later sources win: `args` first, then args already in meta.
        splash_args = {'url': url}
        for extra in (args or {}, options.get('args', {})):
            splash_args.update(extra)
        options['args'] = splash_args

        # Not strictly required, but it hardens Splash requests
        # against AjaxCrawlMiddleware.
        meta['ajax_crawlable'] = True

        super().__init__(url, callback, method, meta=meta, **kwargs)

    @property
    def _processed(self):
        """Value of the ``_splash_processed`` meta key (None if unset)."""
        flag = self.meta.get('_splash_processed')
        return flag

    @property
    def _splash_args(self):
        """The ``args`` dict under ``meta['splash']``, or an empty dict."""
        splash_meta = self.meta.get('splash', {})
        return splash_meta.get('args', {})

    @property
    def _original_url(self):
        """The ``url`` entry of the Splash args, or None when absent."""
        return self._splash_args.get('url')

    @property
    def _original_method(self):
        """The ``http_method`` entry of the Splash args, 'GET' by default."""
        return self._splash_args.get('http_method', 'GET')

    def __repr__(self):
        if self._processed:
            # Show the original method/URL together with the current URL.
            return "<%s %s via %s>" % (self._original_method, self._original_url, self.url)
        return super().__repr__()
class SplashFormRequest(SplashRequest, FormRequest):
    """
    Use SplashFormRequest if you want to make a FormRequest via splash.
    Accepts the same arguments as SplashRequest, and also formdata,
    like FormRequest. First, FormRequest is initialized, and then its
    url, method and body are passed to SplashRequest.
    Note that FormRequest calls escape_ajax on url (via Request._set_url).

    When no formdata is given, FormRequest is not initialized first and
    an unspecified method defaults to 'GET'.
    """

    def __init__(self, url=None, callback=None, method=None, formdata=None,
                 body=None, **kwargs):
        if formdata:
            # First init FormRequest to get url, body and method
            FormRequest.__init__(
                self, url=url, method=method, formdata=formdata)
            url, method, body = self.url, self.method, self.body
        elif method is None:
            # Without formdata nothing resolves the method; forwarding
            # None would end up stored as the literal string 'NONE'.
            method = 'GET'
        # Then pass all other kwargs to SplashRequest
        SplashRequest.__init__(
            self, url=url, callback=callback, method=method, body=body,
            **kwargs)
class SplashRequestFingerprinter:
    """
    Request fingerprinter that also accounts for the 'splash' meta key.

    It wraps a base fingerprinter whose class path is read from the
    SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS setting, falling back
    to Scrapy's default REQUEST_FINGERPRINTER_CLASS.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the fingerprinter for the given crawler."""
        return cls(crawler)

    def __init__(self, crawler):
        base_class_path = crawler.settings.get(
            "SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS",
            REQUEST_FINGERPRINTER_CLASS,
        )
        base_class = load_object(base_class_path)
        self._base_request_fingerprinter = build_from_crawler(
            base_class, crawler)

    def fingerprint(self, request):
        """ Request fingerprint which takes 'splash' meta key into account """
        base_fp = self._base_request_fingerprinter.fingerprint(request)
        meta = request.meta
        if 'splash' in meta:
            # Work on a copy so the request's own meta stays untouched.
            options = copy.deepcopy(meta['splash'])
            splash_args = options.setdefault('args', {})
            if 'url' in splash_args:
                splash_args['url'] = canonicalize_url(
                    splash_args['url'], keep_fragments=True)
            return dict_hash(options, base_fp).encode()
        # Requests without Splash options keep the base fingerprint.
        return base_fp