Commit 2a27200

improve proxy & session
Signed-off-by: Zhiyuan Chen <[email protected]>
1 parent 5b75e1b commit 2a27200

4 files changed: +73 -53 lines changed


icrawler/defaults.py (+3)

```diff
@@ -1,3 +1,6 @@
+MAX_RETRIES = 3
+BACKOFF_BASE = 1.2
+
 ACCEPT_LANGUAGES = "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2"
 USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
```
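These two constants parameterize the tenacity retry decorator that this commit adds to `Session.request` (see icrawler/utils/session.py below). As a rough model of the resulting backoff, assuming tenacity's documented behavior for `wait_random_exponential` (each sleep is drawn uniformly from a window that widens geometrically with the attempt number; the real implementation also clamps to a maximum):

```python
# Illustrative sketch only, not tenacity internals: model the sleep window
# that wait_random_exponential(exp_base=BACKOFF_BASE) produces per retry.
MAX_RETRIES = 3
BACKOFF_BASE = 1.2

# stop_after_attempt(MAX_RETRIES) permits MAX_RETRIES - 1 retries in total.
for attempt in range(1, MAX_RETRIES):
    window = BACKOFF_BASE**attempt  # tenacity's multiplier defaults to 1s
    print(f"retry {attempt}: sleep ~ uniform(0, {window:.2f}) seconds")
```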
icrawler/utils/proxy_pool.py (+22 -23)

```diff
@@ -1,10 +1,10 @@
-import json
 import logging
 import queue
 import random
 import threading
 import time
 
+import chanfig
 import requests
 from bs4 import BeautifulSoup
 
@@ -44,7 +44,12 @@ def to_dict(self):
             dict: A dict with four keys: ``addr``, ``protocol``,
             ``weight`` and ``last_checked``
         """
-        return dict(addr=self.addr, protocol=self.protocol, weight=self.weight, last_checked=self.last_checked)
+        return {
+            "addr": self.addr,
+            "protocol": self.protocol,
+            "weight": self.weight,
+            "last_checked": self.last_checked,
+        }
 
 
 class ProxyPool:
@@ -146,17 +151,15 @@ def save(self, filename):
             for proxy in self.proxies[protocol]:
                 serializable_proxy = self.proxies[protocol][proxy].to_dict()
                 proxies[protocol].append(serializable_proxy)
-        with open(filename, "w") as fout:
-            json.dump(proxies, fout)
+        chanfig.save(proxies, filename)
 
     def load(self, filename):
         """Load proxies from file"""
-        with open(filename) as fin:
-            proxies = json.load(fin)
-        for protocol in proxies:
-            for proxy in proxies[protocol]:
+        proxies = chanfig.load(filename)
+        for protocol, protocol_proxies in proxies.items():
+            for proxy in protocol_proxies:
                 self.proxies[protocol][proxy["addr"]] = Proxy(
-                    proxy["addr"], proxy["protocol"], proxy["weight"], proxy["last_checked"]
+                    proxy["addr"], protocol, proxy.get("weight", 1.0), proxy.get("last_checked")
                 )
                 self.addr_list[protocol].append(proxy["addr"])
 
@@ -215,7 +218,7 @@ def is_valid(self, addr, protocol="http", timeout=5):
             raise
         except requests.exceptions.Timeout:
             return {"valid": False, "msg": "timeout"}
-        except:
+        except BaseException:  # noqa: B036
             return {"valid": False, "msg": "exception"}
         else:
             if r.status_code == 200:
@@ -278,20 +281,20 @@ def scan(
                 t = threading.Thread(
                     name=f"val-{i + 1:0>2d}",
                     target=self.validate,
-                    kwargs=dict(
-                        proxy_scanner=proxy_scanner,
-                        expected_num=expected_num,
-                        queue_timeout=queue_timeout,
-                        val_timeout=val_timeout,
-                    ),
+                    kwargs={
+                        "proxy_scanner": proxy_scanner,
+                        "expected_num": expected_num,
+                        "queue_timeout": queue_timeout,
+                        "val_timeout": val_timeout,
+                    },
                 )
                 t.daemon = True
                 val_threads.append(t)
                 t.start()
             for t in val_threads:
                 t.join()
             self.logger.info("Proxy scanning done!")
-        except:
+        except BaseException:
             raise
         finally:
             if out_file is not None:
@@ -466,18 +469,14 @@ def scan_free_proxy_list(self):
     def scan_file(self, src_file):
         """Scan candidate proxies from an existing file"""
         self.logger.info(f"start scanning file {src_file} for proxy list...")
-        with open(src_file) as fin:
-            proxies = json.load(fin)
+        proxies = chanfig.load(src_file)
         for protocol in proxies.keys():
             for proxy in proxies[protocol]:
                 self.proxy_queue.put({"addr": proxy["addr"], "protocol": protocol})
 
     def is_scanning(self):
         """Return whether at least one scanning thread is alive"""
-        for t in self.scan_threads:
-            if t.is_alive():
-                return True
-        return False
+        return any(t.is_alive() for t in self.scan_threads)
 
     def scan(self):
         """Start a thread for each registered scan function to scan proxy lists"""
```

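The save/load hunks swap the hand-rolled json calls for chanfig, which chooses a serializer from the file extension, and load now tolerates missing `weight`/`last_checked` keys. A minimal round-trip sketch under those assumptions; `ProxyPool` and `default_scan` come from the existing icrawler API, and `proxies.json` is a hypothetical path:

```python
from icrawler.utils import ProxyPool

pool = ProxyPool()
pool.default_scan(expected_num=10)  # fill the pool using the built-in scanners

# Save: each Proxy is serialized via to_dict(), then written in one
# chanfig.save() call; the .json suffix selects the JSON backend.
pool.save("proxies.json")

# Load: chanfig.load() parses the file; "weight" falls back to 1.0 and
# "last_checked" to None when absent, per the new Proxy(...) call.
restored = ProxyPool()
restored.load("proxies.json")
```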
icrawler/utils/session.py (+46 -29)

```diff
@@ -1,42 +1,59 @@
+from __future__ import annotations
+
+import logging
+from collections.abc import Mapping
 from urllib.parse import urlsplit
 
 import requests
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
+
+from .. import defaults
+from .proxy_pool import ProxyPool
 
 
 class Session(requests.Session):
-    def __init__(self, proxy_pool):
+    def __init__(
+        self, proxy_pool: ProxyPool | None = None, headers: Mapping | None = None, cookies: Mapping | None = None
+    ):
         super().__init__()
+        self.logger = logging.getLogger("cscholars.connection")
         self.proxy_pool = proxy_pool
+        if headers is not None:
+            self.headers.update(headers)
+        if cookies is not None:
+            self.cookies.update(cookies)
 
     def _url_scheme(self, url):
         return urlsplit(url).scheme
 
-    def get(self, url, **kwargs):
-        proxy = self.proxy_pool.get_next(protocol=self._url_scheme(url))
-        if proxy is None:
-            return super().get(url, **kwargs)
-        try:
-            response = super().get(url, proxies=proxy.format(), **kwargs)
-        except requests.exceptions.ConnectionError:
-            self.proxy_pool.decrease_weight(proxy)
-            raise
-        except:
-            raise
-        else:
-            self.proxy_pool.increase_weight(proxy)
-            return response
-
-    def post(self, url, data=None, json=None, **kwargs):
-        proxy = self.proxy_pool.get_next(protocol=self._url_scheme(url))
-        if proxy is None:
-            return super().get(url, data, json, **kwargs)
-        try:
-            response = super().post(url, data, json, proxies=proxy.format(), **kwargs)
-        except requests.exceptions.ConnectionError:
-            self.proxy_pool.decrease_weight(proxy)
-            raise
-        except:
-            raise
+    @retry(
+        stop=stop_after_attempt(defaults.MAX_RETRIES),
+        wait=wait_random_exponential(exp_base=defaults.BACKOFF_BASE),
+        retry=retry_if_exception_type((requests.RequestException, requests.HTTPError, requests.ConnectionError)),
+    )
+    def request(self, method, url, *args, **kwargs):
+        message = f"{method}ing {url}"
+        if args and kwargs:
+            message += f" with {args} and {kwargs}"
+        elif args:
+            message += f" with {args}"
+        elif kwargs:
+            message += f" with {kwargs}"
+        self.logger.debug(message)
+
+        if self.proxy_pool is not None:
+            proxy = self.proxy_pool.get_next(protocol=self._url_scheme(url))
+            self.logger.debug(f"Using proxy: {proxy.format()}")
+            try:
+                response = super().request(method, url, *args, proxies=proxy.format(), **kwargs)
+                response.raise_for_status()
+                self.proxy_pool.increase_weight(proxy)
+            except (requests.ConnectionError, requests.HTTPError):
+                self.proxy_pool.decrease_weight(proxy)
+                raise
         else:
-            self.proxy_pool.increase_weight(proxy)
-            return response
+            response = super().request(method, url, *args, **kwargs)
+
+        if "set-cookie" in response.headers:
+            self.cookies.update(response.cookies)
+        return response
```

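Collapsing `get()` and `post()` into a single `request()` override works because `requests.Session` already funnels every HTTP verb through `request()`; the retry decorator, proxy-weight bookkeeping, and Set-Cookie capture therefore apply uniformly. A usage sketch, assuming icrawler.utils re-exports `Session` and `ProxyPool` as in the existing package; the URL and header are illustrative:

```python
from icrawler.utils import ProxyPool, Session

pool = ProxyPool()
pool.load("proxies.json")  # e.g. a pool saved earlier (see above)

session = Session(
    proxy_pool=pool,  # optional since this commit; omit to connect directly
    headers={"Accept-Language": "en-US,en;q=0.9"},
)

# get() delegates to the overridden request(), so the call is retried up to
# defaults.MAX_RETRIES attempts, and proxied responses are status-checked
# via raise_for_status() before the proxy's weight is increased.
response = session.get("https://example.com")
```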
pyproject.toml (+2 -1)

```diff
@@ -46,11 +46,12 @@ dynamic = [
 dependencies = [
     "beautifulsoup4",
     "bs4",
+    "chanfig",
     "lxml",
     "pillow",
-    "pyyaml",
     "requests",
     "six",
+    "tenacity",
 ]
 urls.documentation = "https://icrawler.readthedocs.io/"
 urls.homepage = "https://icrawler.readthedocs.io/"
```
