forked from scrapy-plugins/scrapy-splash
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
130 lines (107 loc) · 3.78 KB
/
utils.py
File metadata and controls
130 lines (107 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import json
import hashlib
import six
from scrapy.http import Headers
from scrapy.utils.python import to_unicode, to_bytes
def dict_hash(obj, start=''):
    """ Return a hash for a dict, based on its contents """
    digest = hashlib.sha1(to_bytes(start))
    # mix in the concrete type so e.g. [1] and (1,) hash differently
    digest.update(to_bytes(obj.__class__.__name__))
    if isinstance(obj, dict):
        # sort keys so the hash is independent of insertion order
        for key in sorted(obj):
            digest.update(to_bytes(key))
            digest.update(to_bytes(dict_hash(obj[key])))
    elif isinstance(obj, (list, tuple)):
        for item in obj:
            digest.update(to_bytes(dict_hash(item)))
    else:
        # leaf values: serialize to text/bytes before hashing.
        # bool must be tested before integers (bool is an int subclass).
        if isinstance(obj, bool):
            serialized = str(int(obj))
        elif isinstance(obj, (six.integer_types, float)):
            serialized = str(obj)
        elif isinstance(obj, (six.text_type, bytes)):
            serialized = obj
        elif obj is None:
            serialized = b''
        else:
            raise ValueError("Unsupported value type: %s" % obj.__class__)
        digest.update(to_bytes(serialized))
    return digest.hexdigest()
def _process(value, sha=False):
    """
    Recursively replace strings in *value* with hashes: full SHA1
    hexdigests when ``sha`` is true, cheap ``('h', hash(...))`` pairs
    otherwise.  Dict keys are always SHA1-hashed so they stay usable
    as JSON object keys.  Non-string leaves pass through unchanged.
    """
    if isinstance(value, (six.text_type, bytes)):
        if sha:
            return hashlib.sha1(to_bytes(value)).hexdigest()
        return 'h', hash(value)
    if isinstance(value, dict):
        return {
            _process(key, sha=True): _process(val, sha)
            for key, val in value.items()
        }
    if isinstance(value, (list, tuple)):
        return [_process(item, sha) for item in value]
    return value
def _fast_hash(value):
    """
    Return a hash for any JSON-serializable value.
    Hash is not guaranteed to be the same in different Python processes,
    but it is very fast to compute for data structures with large string
    values.
    """
    # cheap hash()-based preprocessing avoids SHA1-ing big strings
    processed = _process(value)
    return _json_based_hash(processed)
# memoization table: fast (process-local) hash => stable SHA1 hash
_hash_cache = {}
def json_based_hash(value):
    """
    Return a hash for any JSON-serializable value.

    >>> json_based_hash({"foo": "bar", "baz": [1, 2]})
    '0570066939bea46c610bfdc35b20f37ef09d05ed'
    """
    fast_fp = _fast_hash(value)
    cached = _hash_cache.get(fast_fp)
    if cached is None:
        # slow path: SHA1 every string, then hash the canonical JSON
        cached = _json_based_hash(_process(value, sha=True))
        _hash_cache[fast_fp] = cached
    return cached
def _json_based_hash(value):
v = json.dumps(value, sort_keys=True, ensure_ascii=False).encode('utf8')
return hashlib.sha1(v).hexdigest()
def headers_to_scrapy(headers):
    """
    Return scrapy.http.Headers instance from headers data.
    3 data formats are supported:

    * {name: value, ...} dict;
    * [(name, value), ...] list;
    * [{'name': name, 'value': value'}, ...] list (HAR headers format).
    """
    if not headers:
        # None / empty dict / empty list all yield empty Headers
        return Headers({})
    if isinstance(headers, dict):
        return Headers(headers)
    if isinstance(headers[0], dict):
        # HAR format: list of {'name': ..., 'value': ...} dicts
        pairs = [(entry['name'], entry.get('value', '')) for entry in headers]
        return Headers(pairs)
    return Headers(headers)
def scrapy_headers_to_unicode_dict(headers):
    """
    Convert scrapy.http.Headers instance to a dictionary
    suitable for JSON encoding.
    """
    result = {}
    for name, values in headers.items():
        # Headers stores each value as a list of bytes; join multi-valued
        # headers with commas per the HTTP convention
        result[to_unicode(name)] = to_unicode(b','.join(values))
    return result
def parse_x_splash_saved_arguments_header(value):
    """
    Parse X-Splash-Saved-Arguments header value.

    >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3"
    >>> dct = parse_x_splash_saved_arguments_header(value)
    >>> sorted(list(dct.keys()))
    ['name1', 'name2']
    >>> dct['name1']
    '9a6747fc6259aa374ab4e1bb03074b6ec672cf99'
    >>> dct['name2']
    'ba001160ef96fe2a3f938fea9e6762e204a562b3'

    Binary header values are also supported:

    >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8'))
    >>> dct2 == dct
    True
    """
    text = to_unicode(value)
    # header is a ";"-separated list of "name=sha1" pairs; split on the
    # first "=" only, in case a hash value ever contains one
    pairs = (entry.split('=', 1) for entry in text.split(";"))
    return {name: arg_hash for name, arg_hash in pairs}