diff --git a/CHANGES.rst b/CHANGES.rst index 0173ed3b..a8d0f030 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,43 @@ Changes ======= +0.33.0 (unreleased) +------------------- + +- Added a minimum delay between reuses of any given :ref:`plugin-managed + session `. + + It is :setting:`DOWNLOAD_DELAY` by default. Use + :setting:`ZYTE_API_SESSION_DELAY` to change that or + :setting:`ZYTE_API_SESSION_POOLS` to override it for specific + :setting:`session pools `. + + :setting:`ZYTE_API_SESSION_RANDOMIZE_DELAY` controls whether that minimum + delay is randomized by multiplying it by a random factor between 0.5 and + 1.5. It defaults to :setting:`RANDOMIZE_DOWNLOAD_DELAY`. + +- The value of the :reqmeta:`zyte_api_session_pool` request metadata key and + the return value of the :meth:`SessionConfig.pool() + ` method can now be a dictionary + instead of a string, allowing to override :setting:`ZYTE_API_SESSION_DELAY` + and :setting:`ZYTE_API_SESSION_POOL_SIZE` for the corresponding pool. + + However, they cannot override values defined in + :setting:`ZYTE_API_SESSION_POOLS`. + +- Deprecated the ``ZYTE_API_SESSION_POOL_SIZES`` setting in favor of the new + :setting:`ZYTE_API_SESSION_POOLS` setting, where you can set ``"size"``. + +- Changed the terminology around :ref:`session management ` to try + to make it clearer and more consistent: + + | client-managed sessions → user-managed sessions + | server-managed sessions → Zyte-managed sessions + | scrapy-zyte-api session management → plugin-managed sessions + +- Added a :ref:`session-troubleshooting` section to the :ref:`session` page. + + 0.32.0 (2026-01-20) ------------------- @@ -92,10 +129,10 @@ Changes :http:`request:httpResponseHeaders` will no longer be enabled by default, and :ref:`request header mapping ` is disabled. -* Session pool IDs, of server-managed sessions (:http:`request:sessionContext`) - or :ref:`set through the session management API `, now affect - request fingerprinting: 2 requests identical except for their session pool ID - are *not* considered duplicate requests any longer. +* Session pool IDs, of Zyte-managed sessions (:http:`request:sessionContext`) + or :ref:`plugin-managed sessions `, now affect request + fingerprinting: 2 requests identical except for their session pool ID are + *not* considered duplicate requests any longer. * When it is not clear whether a request will use browser rendering or not, e.g. an :ref:`automatic extraction request ` without an diff --git a/docs/reference/meta.rst b/docs/reference/meta.rst index 41c18266..0ced00e5 100644 --- a/docs/reference/meta.rst +++ b/docs/reference/meta.rst @@ -95,8 +95,8 @@ zyte_api_session_enabled Default: :setting:`ZYTE_API_SESSION_ENABLED` -Whether to use :ref:`scrapy-zyte-api session management ` for the -request (``True``) or not (``False``). +Whether to send the request with a :ref:`plugin-managed session ` +(``True``) or not (``False``). .. seealso:: :meth:`scrapy_zyte_api.SessionConfig.enabled` @@ -141,7 +141,7 @@ zyte_api_session_pool Default: ``""`` -Determines the ID of the session pool to assign to the request, overriding the -:ref:`default pool assignment logic `. +If not falsy, it determines the default pool ID and options for the request. -.. seealso:: :meth:`scrapy_zyte_api.SessionConfig.pool` +It supports the same values as the return value of +:meth:`scrapy_zyte_api.SessionConfig.pool`. 
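For illustration, here is a minimal sketch (hypothetical domain names and values) of how the new dict form and the delay settings described above could be combined in a spider. Remember that :setting:`ZYTE_API_SESSION_POOLS` overrides take precedence over values passed through :reqmeta:`zyte_api_session_pool` or returned by :meth:`~scrapy_zyte_api.SessionConfig.pool`.

.. code-block:: python

    from scrapy import Request, Spider


    class ExampleSpider(Spider):
        name = "example"
        custom_settings = {
            "ZYTE_API_SESSION_ENABLED": True,
            # Global minimum delay (seconds) between reuses of any session.
            "ZYTE_API_SESSION_DELAY": 1.0,
            "ZYTE_API_SESSION_RANDOMIZE_DELAY": True,
            # Per-pool overrides; these take precedence over values from
            # pool() or from request metadata for the same pool ID.
            "ZYTE_API_SESSION_POOLS": {
                "slow.example": {"delay": 5.0, "size": 4},
            },
        }

        def start_requests(self):
            # Dict form of zyte_api_session_pool: the pool ID goes under
            # "id", plus optional "delay", "randomize_delay" and "size"
            # overrides for that pool.
            yield Request(
                "https://fast.example/products",
                meta={
                    "zyte_api_session_pool": {
                        "id": "fast.example",
                        "delay": 0.5,
                        "size": 16,
                    },
                },
            )

        def parse(self, response):
            pass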
diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst index 304ece35..b4b8720d 100644 --- a/docs/reference/settings.rst +++ b/docs/reference/settings.rst @@ -407,6 +407,27 @@ object, for example to read settings: ZYTE_API_SESSION_CHECKER = MySessionChecker +.. setting:: ZYTE_API_SESSION_DELAY + +ZYTE_API_SESSION_DELAY +====================== + +Default: :setting:`DOWNLOAD_DELAY` + +Minimum number of seconds to wait before reusing a :ref:`plugin-managed +session `. + +To override this value for specific pools, use the ``"delay"`` key in a +:class:`dict` value of the :setting:`ZYTE_API_SESSION_POOLS` setting, of the +:reqmeta:`zyte_api_session_pool` request metadata key, or that returned by +:meth:`~scrapy_zyte_api.SessionConfig.pool`. + +Increasing this number can reduce the number of ban-related session +expirations, hence increasing the lifetime of each session. See +:ref:`optimize-sessions`. + +.. seealso:: :setting:`ZYTE_API_SESSION_RANDOMIZE_DELAY` + .. setting:: ZYTE_API_SESSION_ENABLED ZYTE_API_SESSION_ENABLED @@ -414,7 +435,7 @@ ZYTE_API_SESSION_ENABLED Default: ``False`` -Enables :ref:`scrapy-zyte-api session management `. +Enables :ref:`plugin-managed sessions `. .. setting:: ZYTE_API_SESSION_LOCATION @@ -535,22 +556,34 @@ The maximum number of active :ref:`scrapy-zyte-api sessions ` to keep per :ref:`pool `. To override this value for specific pools, use -:setting:`ZYTE_API_SESSION_POOL_SIZES`. +:setting:`ZYTE_API_SESSION_POOLS` or return a dictionary from +:meth:`~scrapy_zyte_api.SessionConfig.pool` containing a ``"size"`` key. Increase this number to lower the frequency with which requests are sent through each session, which on some websites may increase the lifetime of each session. See :ref:`optimize-sessions`. -.. setting:: ZYTE_API_SESSION_POOL_SIZES +.. setting:: ZYTE_API_SESSION_POOLS -ZYTE_API_SESSION_POOL_SIZES -=========================== +ZYTE_API_SESSION_POOLS +====================== Default: ``{}`` :class:`dict` where keys are :ref:`pool ` IDs and values are -overrides of :setting:`ZYTE_API_SESSION_POOL_SIZE` for those pools. +dicts with any combination of the following keys that override the +corresponding setting for that pool: + +- ``"delay"`` overrides :setting:`ZYTE_API_SESSION_DELAY`. + +- ``"randomize_delay"`` overrides + :setting:`ZYTE_API_SESSION_RANDOMIZE_DELAY`. + +- ``"size"`` overrides :setting:`ZYTE_API_SESSION_POOL_SIZE`. + +These overrides take precedence over :attr:`SessionConfig.pool +`. .. setting:: ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS @@ -586,6 +619,15 @@ queue. See :setting:`ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS` for details. +.. setting:: ZYTE_API_SESSION_RANDOMIZE_DELAY + +ZYTE_API_SESSION_RANDOMIZE_DELAY +================================ + +Default: :setting:`RANDOMIZE_DOWNLOAD_DELAY` + +If enabled, :setting:`ZYTE_API_SESSION_DELAY` is randomized each time it is +used by multiplying it by a random factor between 0.5 and 1.5. .. setting:: ZYTE_API_SKIP_HEADERS diff --git a/docs/usage/session.rst b/docs/usage/session.rst index f0b2de28..3127fbb7 100644 --- a/docs/usage/session.rst +++ b/docs/usage/session.rst @@ -1,27 +1,27 @@ .. _session: -================== -Session management -================== +======================= +Plugin-managed sessions +======================= Zyte API provides powerful session APIs: -- :ref:`Client-managed sessions ` give you full control - over session management. +- :ref:`User-managed sessions ` give you full control over + session management. 
-- :ref:`Server-managed sessions ` let Zyte API - handle session management for you. +- :ref:`Zyte-managed sessions ` let Zyte API handle + session management for you. When using scrapy-zyte-api, you can use these session APIs through the corresponding Zyte API fields (:http:`request:session`, :http:`request:sessionContext`). -However, scrapy-zyte-api also provides its own session management API, similar -to that of :ref:`server-managed sessions `, but -built on top of :ref:`client-managed sessions `. +However, scrapy-zyte-api also provides plugin-managed sessions, with an API +similar to that of Zyte-managed sessions, but built on top of user-managed +sessions. -scrapy-zyte-api session management offers some advantages over -:ref:`server-managed sessions `: +Plugin-managed sessions offer some advantages over :ref:`Zyte-managed sessions +`: - You can perform :ref:`session validity checks `, so that the sessions of responses that do not pass those checks are refreshed, and the @@ -34,24 +34,24 @@ scrapy-zyte-api session management offers some advantages over - You have granular control over the session pool size, max errors, etc. See :ref:`optimize-sessions` and :ref:`session-configs`. -However, scrapy-zyte-api session management is not a replacement for -:ref:`server-managed sessions ` or -:ref:`client-managed sessions `: +However, plugin-managed sessions are not a replacement for :ref:`Zyte-managed +sessions ` or :ref:`user-managed sessions +`: -- :ref:`Server-managed sessions ` offer a longer - life time than the :ref:`client-managed sessions ` - that scrapy-zyte-api session management uses, so as long as you do not need - one of the scrapy-zyte-api session management features, server-managed - sessions can be significantly more efficient (fewer total sessions needed +- :ref:`Zyte-managed sessions ` offer a longer life + time than the :ref:`user-managed sessions ` that + plugin-managed sessions use, so as long as you do not need one of the + features of plugin-managed sessions, Zyte-managed sessions can be + significantly more efficient (fewer session-initialization requests needed per crawl). - Zyte API can also optimize server-managed sessions based on the target - website. With scrapy-zyte-api session management, you need to :ref:`handle + Zyte API can also optimize Zyte-managed sessions based on the target + website. With plugin-managed sessions, you need to :ref:`handle optimization yourself `. -- :ref:`Client-managed sessions ` offer full control - over session management, while scrapy-zyte-api session management removes - some of that control to provide an easier API for supported use cases. +- :ref:`User-managed sessions ` offer full control over + session management, while plugin-managed sessions remove some of that + control to provide an easier API for supported use cases. .. _enable-sessions: @@ -134,7 +134,7 @@ To change the :ref:`default session initialization parameters :reqmeta:`zyte_api_session_params` request metadata key. It works similarly to :http:`request:sessionContextParams` from - :ref:`server-managed sessions `, but it supports + :ref:`Zyte-managed sessions `, but it supports arbitrary Zyte API parameters instead of a specific subset. If it does not define a ``"url"``, the URL of the request :ref:`triggering @@ -247,7 +247,7 @@ overrides `. The :setting:`ZYTE_API_SESSION_POOL_SIZE` setting determines the desired number of concurrent, active, working sessions per pool. 
The -:setting:`ZYTE_API_SESSION_POOL_SIZES` setting allows defining different values +:setting:`ZYTE_API_SESSION_POOLS` setting allows defining different values for specific pools. .. _pool-size: @@ -274,7 +274,6 @@ The session pool assigned to a request affects the :ref:`fingerprint considered different requests, i.e. not duplicate requests, even if they are otherwise identical. - .. _optimize-sessions: Optimizing sessions @@ -290,17 +289,17 @@ Here are some things you can try: - On some websites, sending too many requests too fast through a session can cause the target website to ban that session. - - On those websites, you can increase the number of sessions in the pool - (:setting:`ZYTE_API_SESSION_POOL_SIZE`). The more different sessions you - use, the more slowly you send requests through each session. - - Mind, however, that :ref:`client-managed sessions ` - expire after `15 minutes since creation or 2 minutes since the last request - `_. - At a certain point, increasing :setting:`ZYTE_API_SESSION_POOL_SIZE` - without increasing :setting:`CONCURRENT_REQUESTS - ` and :setting:`CONCURRENT_REQUESTS_PER_DOMAIN + + On those websites, you can increase :setting:`ZYTE_API_SESSION_DELAY`, + :setting:`ZYTE_API_SESSION_POOL_SIZE`, or both, to lower the rate of + session reuse. + + Mind, however, that :ref:`user-managed sessions ` expire + after 15 minutes since creation or 2 minutes since the last request (see + :http:`request:session`). At a certain point, increasing + :setting:`ZYTE_API_SESSION_POOL_SIZE` without increasing + :setting:`CONCURRENT_REQUESTS ` and + :setting:`CONCURRENT_REQUESTS_PER_DOMAIN ` accordingly can be counterproductive. @@ -317,10 +316,9 @@ Here are some things you can try: If you do not need :ref:`session checking ` and your :ref:`initialization parameters ` are only -:http:`request:browserHtml` and :http:`request:actions`, :ref:`server-managed +:http:`request:browserHtml` and :http:`request:actions`, :ref:`Zyte-managed sessions ` might be a more cost-effective choice, as -they live much longer than :ref:`client-managed sessions -`. +they live much longer than :ref:`user-managed sessions `. .. _session-configs: @@ -445,7 +443,7 @@ implementation may also close your spider with a custom reason by raising a Session stats ============= -The following stats exist for scrapy-zyte-api session management: +The following stats exist for plugin-managed sessions: ``scrapy-zyte-api/sessions/pools/{pool}/init/check-error`` Number of times that a session for pool ``{pool}`` triggered an unexpected @@ -501,3 +499,28 @@ The following stats exist for scrapy-zyte-api session management: ``scrapy-zyte-api/sessions/use/disabled`` Number of processed requests for which session management was disabled. + +.. _session-troubleshooting: + +Troubleshooting +=============== + +.. _session-troubleshooting-could-not-get-session-id: + +RuntimeError: Could not get a session ID +---------------------------------------- + +If you see this exception, indicating that after a given number of attempts, +with a given minimum wait time between attempts, it was not possible to get a +session ID from the session rotation queue, consider the following +possibilities: + +- A bug in your session validation code may be causing it to return ``False`` + for a valid response. + + This is specially likely if you see this issue for very few, specific + requests, while most requests work fine. 
+ +- The values of the :setting:`ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS` and + :setting:`ZYTE_API_SESSION_QUEUE_WAIT_TIME` settings may be too low for + your scenario, in which case you can modify them accordingly. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3c8ab1ff..f17a0fec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,15 +22,16 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] requires-python = ">=3.10" -# Sync with [pinned] @ tox.ini +# Sync with [min] @ tox.ini dependencies = [ "packaging>=20.0", "scrapy>=2.0.1", + "typing_extensions>=4.1.0", "zyte-api>=0.6.0", ] [project.optional-dependencies] -# Sync with [testenv:pinned-provider] @ tox.ini +# Sync with [testenv:min-provider] @ tox.ini provider = [ "andi>=0.6.0", "scrapy-poet>=0.22.3", @@ -39,6 +40,7 @@ provider = [ ] x402 = [ "zyte-api[x402]>=0.8.0", + "x402<2.0.0", ] [project.urls] diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index cbf8ca98..73f9914e 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -1,10 +1,24 @@ import json +import random +import time from asyncio import Task, create_task, sleep from collections import defaultdict, deque from copy import deepcopy from functools import partial from logging import getLogger -from typing import Any, DefaultDict, Deque, Dict, List, Optional, Set, Type, Union +from typing import ( + Any, + DefaultDict, + Tuple, + Dict, + List, + Optional, + Set, + Type, + TypedDict, + Union, + cast, +) from uuid import uuid4 from weakref import WeakKeyDictionary @@ -25,11 +39,23 @@ _close_spider, ) +try: + from typing import NotRequired # Python 3.11+ +except ImportError: + from typing_extensions import NotRequired # Python 3.10 + logger = getLogger(__name__) SESSION_INIT_META_KEY = "_is_session_init_request" ZYTE_API_META_KEYS = ("zyte_api", "zyte_api_automap", "zyte_api_provider") +def _troubleshoot(slug): + return ( + f"https://scrapy-zyte-api.readthedocs.io/en/latest/usage/session.html" + f"#session-troubleshooting-{slug}" + ) + + def get_request_session_id(request: Request) -> Optional[str]: """Return the session ID of *request*, or ``None`` if it does not have a session ID assigned.""" @@ -149,6 +175,22 @@ class TooManyBadSessionInits(RuntimeError): pass +class PoolConfig(TypedDict): + id: str + delay: NotRequired[float] + randomize_delay: NotRequired[bool] + size: NotRequired[int] + + +class PoolOptions(TypedDict): + delay: NotRequired[float] + randomize_delay: NotRequired[bool] + size: NotRequired[int] + + +QueueSession = Tuple[str, float] # (session_id, next_use_timestamp) + + class SessionConfig: """Default session configuration for :ref:`scrapy-zyte-api sessions `.""" @@ -248,8 +290,9 @@ def process_request(self, request: Request) -> Optional[Request]: """ return None - def pool(self, request: Request) -> str: - """Return the ID of the session pool to use for *request*. + def pool(self, request: Request) -> str | PoolConfig: + """Return the ID of the session pool to use for *request*, or a + :class:`dict` with additional session pool config. The main aspects of the default implementation are described in :ref:`session-pools`. @@ -263,6 +306,30 @@ def pool(self, request: Request) -> str: used, the pool ID is the target domain followed by an at sign and the comma-separated values of the non-empty fields from :data:`ADDRESS_FIELDS` (e.g. ``example.com@US,NY,10001``). 
+ + Instead of a string, this method can also return a :class:`dict` + containing the pool ID under the ``id`` key, and optionally any other + key supported by :setting:`ZYTE_API_SESSION_POOLS`. For example: + + .. code-block:: python + + def pool(self, request): + if "ecommerce.example" in urlparse_cached(request).netloc: + return { + "id": "ecommerce.example", + "delay": 2.0, + "size": 16, + } + return super().pool(request) + + The values of optional keys take precedence over the corresponding + pool-independent settings, e.g. ``delay`` takes precedence over + :setting:`ZYTE_API_SESSION_DELAY` for the corresponding pool ID, but do + not override those defined in :setting:`ZYTE_API_SESSION_POOLS`. + + For any given pool ID, the values of optional keys are only taken into + account when the pool ID is first encountered. You cannot use this + method to change them at run time. """ meta_pool = request.meta.get("zyte_api_session_pool", "") if meta_pool: @@ -557,11 +624,27 @@ def __init__(self, crawler: Crawler): settings = crawler.settings - pool_size = settings.getint("ZYTE_API_SESSION_POOL_SIZE", 8) - self._pending_initial_sessions: Dict[str, int] = defaultdict(lambda: pool_size) - pool_sizes = settings.getdict("ZYTE_API_SESSION_POOL_SIZES", {}) - for pool, size in pool_sizes.items(): - self._pending_initial_sessions[pool] = size + self._default_pool_delay = settings.getfloat( + "ZYTE_API_SESSION_DELAY", settings.getfloat("DOWNLOAD_DELAY") + ) + self._randomize_delay = settings.getbool( + "ZYTE_API_SESSION_RANDOMIZE_DELAY", + settings.getbool("RANDOMIZE_DOWNLOAD_DELAY"), + ) + self._default_pool_size = settings.getint("ZYTE_API_SESSION_POOL_SIZE", 8) + self._pending_initial_sessions: Dict[str, int] = {} + self._pool_configs = settings.getdict("ZYTE_API_SESSION_POOLS") + pool_sizes = settings.getdict("ZYTE_API_SESSION_POOL_SIZES") + if pool_sizes: + logger.warning( + "ZYTE_API_SESSION_POOL_SIZES is deprecated, use " + "ZYTE_API_SESSION_POOLS instead" + ) + for pool_id, pool_size in pool_sizes.items(): + self._pool_configs.setdefault(pool_id, {}).setdefault("size", pool_size) + for pool, config in self._pool_configs.items(): + if "size" in config: + self._pending_initial_sessions[pool] = config["size"] self._max_check_failures = settings.getint( "ZYTE_API_SESSION_MAX_CHECK_FAILURES", 1 @@ -574,7 +657,7 @@ def __init__(self, crawler: Crawler): max_bad_inits = settings.getint("ZYTE_API_SESSION_MAX_BAD_INITS", 8) self._max_bad_inits: Dict[str, int] = defaultdict(lambda: max_bad_inits) max_bad_inits_per_pool = settings.getdict( - "ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL", {} + "ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL" ) for pool, pool_max_bad_inits in max_bad_inits_per_pool.items(): self._max_bad_inits[pool] = pool_max_bad_inits @@ -616,7 +699,7 @@ def __init__(self, crawler: Crawler): # If the queue is empty, sleep and try again. Sessions from the pool # will be appended to the queue as they are initialized and ready to # use. - self._queues: Dict[str, Deque[str]] = defaultdict(deque) + self._queues: Dict[str, deque[QueueSession]] = defaultdict(deque) self._queue_max_attempts = settings.getint( "ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS", 60 ) @@ -661,10 +744,49 @@ def get_pool(self, request): session_config = self._get_session_config(request) try: pool = session_config.pool(request) - except Exception: - raise PoolError - self._pool_cache[request] = pool - return pool + except Exception as exception: + message = ( + f"Exception raised on session config pool() method call " + f"for request {request}." 
+ ) + raise PoolError(message) from exception + options: PoolOptions + if isinstance(pool, str): + pool_id = pool + options = {} + else: + try: + pool_id = pool["id"] + except (KeyError, TypeError) as exception: + message = ( + f'Exception raised when accessing pool["id"] on the ' + f"return value of the session config pool() method call " + f"for request {request}." + ) + raise PoolError(message) from exception + else: + options = cast( + PoolOptions, {k: v for k, v in pool.items() if k != "id"} + ) + delay = options.get("delay", self._default_pool_delay) + randomize_delay = options.get("randomize_delay", self._randomize_delay) + size = options.get("size", self._default_pool_size) + if pool_id not in self._pool_configs: + self._pool_configs[pool_id] = { + "delay": delay, + "size": size, + "randomize_delay": randomize_delay, + } + self._pending_initial_sessions[pool_id] = size + else: + config = self._pool_configs[pool_id] + config.setdefault("delay", delay) + config.setdefault("randomize_delay", randomize_delay) + if "size" not in config: + self._pending_initial_sessions[pool_id] = size + config.setdefault("size", size) + self._pool_cache[request] = pool_id + return pool_id async def _init_session(self, session_id: str, request: Request, pool: str) -> bool: assert self._crawler.engine @@ -759,15 +881,22 @@ async def _create_session(self, request: Request, pool: str) -> str: self._bad_inits[pool] += 1 if self._bad_inits[pool] >= self._max_bad_inits[pool]: raise TooManyBadSessionInits - self._queues[pool].append(session_id) + pool_config = self._pool_configs[pool] + delay = pool_config["delay"] + sleep_delay = next_use_delay = delay + if pool_config["randomize_delay"]: + next_use_delay *= random.uniform(0.5, 1.5) + sleep_delay *= random.uniform(0.5, 1.5) + await sleep(sleep_delay) + next_use = time.time() + next_use_delay + self._queues[pool].append((session_id, next_use)) return session_id async def _next_from_queue(self, request: Request, pool: str) -> str: - session_id = None attempts = 0 - while session_id not in self._pools[pool]: # After 1st loop: invalid session. + while True: try: - session_id = self._queues[pool].popleft() + session_id, next_use = self._queues[pool].popleft() except IndexError: # No ready-to-use session available. attempts += 1 if attempts >= self._queue_max_attempts: @@ -775,21 +904,28 @@ async def _next_from_queue(self, request: Request, pool: str) -> str: f"Could not get a session ID from the session " f"rotation queue after {attempts} attempts, waiting " f"at least {self._queue_wait_time} seconds between " - f"attempts. Either the values of the " - f"ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS and " - f"ZYTE_API_SESSION_QUEUE_WAIT_TIME settings are too " - f"low for your scenario, in which case you can modify " - f"them accordingly, or there might be a bug with " - f"scrapy-zyte-api session management. If you think it " - f"could be the later, please report the issue at " - f"https://github.com/scrapy-plugins/scrapy-zyte-api/issues/new " - f"providing a minimal reproducible example if " - f"possible, or debug logs and stats otherwise." + f"attempts. 
See " + f"{_troubleshoot('could-not-get-session-id')}" ) await sleep(self._queue_wait_time) - assert session_id is not None - self._queues[pool].append(session_id) - return session_id + continue + if session_id not in self._pools[pool]: + continue # Invalid session + now = time.time() + if next_use > now: + wait = next_use - now + logger.debug( + f"Waiting {wait:.3f} seconds for session {session_id} " + f"from pool {pool!r} to become available" + ) + await sleep(wait) + now = time.time() + pool_config = self._pool_configs[pool] + next_use_delay = pool_config["delay"] + if pool_config["randomize_delay"]: + next_use_delay *= random.uniform(0.5, 1.5) + self._queues[pool].append((session_id, now + next_use_delay)) + return session_id async def _next(self, request) -> str: """Return the ID of the next working session in the session pool @@ -1023,7 +1159,7 @@ async def process_exception( reason=reason, ) - def get_pool(self, request: Request): + def get_pool(self, request: Request) -> PoolConfig | str | None: return ( self._sessions.get_pool(request) if self._sessions.is_enabled(request) diff --git a/tests/__init__.py b/tests/__init__.py index 94b8c764..627a35ee 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -24,7 +24,9 @@ _API_KEY = "a" +UNSET = object() DEFAULT_CLIENT_CONCURRENCY = AsyncZyteAPI(api_key=_API_KEY).n_conn + SETTINGS_T = Dict[str, Any] SETTINGS: SETTINGS_T = { "DOWNLOAD_HANDLERS": { @@ -70,7 +72,12 @@ "TELNETCONSOLE_ENABLED": False, "ZYTE_API_KEY": _API_KEY, } -UNSET = object() + +SESSION_SETTINGS: SETTINGS_T = { + "ZYTE_API_SESSION_DELAY": 0, + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": 0, +} class DummySpider(Spider): @@ -194,3 +201,11 @@ async def process_response(middleware, request, response) -> Request | None: else: maybe_awaitable = middleware.process_response(request, response, spider=None) await _ensure_awaitable(maybe_awaitable) + + +def get_session_stats(crawler): + return { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } diff --git a/tests/conftest.py b/tests/conftest.py index 6acd1c42..53e49161 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,3 +15,6 @@ def fresh_mockserver(): with MockServer() as server: yield server + + +pytest.register_assert_rewrite("tests.helpers") diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 00000000..be9a689d --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,24 @@ +from typing import Any + +from . 
import get_session_stats + + +def assert_session_stats(crawler, expected: dict[str, Any]): + actual = get_session_stats(crawler) + if not expected or any(k.startswith("scrapy-zyte-api/sessions") for k in expected): + pass + elif any(k.startswith("/") for k in expected): + expected = {f"scrapy-zyte-api/sessions{k}": v for k, v in expected.items()} + elif any(isinstance(v, dict) for v in expected.values()): + expected = { + f"scrapy-zyte-api/sessions/pools/{pool}/{stat}": value + for pool, stats in expected.items() + for stat, value in stats.items() + } + else: + expected = { + f"scrapy-zyte-api/sessions/pools/{pool}/{stat}": value + for pool, (init, use) in expected.items() + for stat, value in (("init/check-passed", init), ("use/check-passed", use)) + } + assert actual == expected diff --git a/tests/test_addon.py b/tests/test_addon.py index dfe07ba9..947261cd 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -282,3 +282,56 @@ def test_no_poet_setting_changes(initial_settings, expected_settings): ) def test_poet_setting_changes(initial_settings, expected_settings): _test_setting_changes(initial_settings, expected_settings) + + +@pytest.mark.parametrize( + ("manual_settings", "addon_settings"), + ( + ( + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + {}, + ), + ( + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + {"ZYTE_API_RETRY_POLICY": "zyte_api.zyte_api_retrying"}, + ), + ( + { + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + }, + {"ZYTE_API_RETRY_POLICY": "zyte_api.aggressive_retrying"}, + ), + ( + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + ), + ( + { + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + }, + { + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + }, + ), + ( + {"ZYTE_API_RETRY_POLICY": "tests.UNSET"}, + {"ZYTE_API_RETRY_POLICY": "tests.UNSET"}, + ), + ), +) +@deferred_f_from_coro_f +async def test_sessions(manual_settings, addon_settings): + crawler = await get_crawler_zyte_api( + { + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_SESSION_ENABLED": True, + **manual_settings, + }, + poet=False, + ) + addon_crawler = await get_crawler_zyte_api( + {"ZYTE_API_SESSION_ENABLED": True, **addon_settings}, use_addon=True, poet=False + ) + assert serialize_settings(crawler.settings) == serialize_settings( + addon_crawler.settings + ) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 9c3dd15d..e7fcf1bc 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -36,6 +36,7 @@ DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, + UNSET, get_crawler, get_download_handler, get_downloader_middleware, @@ -237,9 +238,6 @@ async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): assert not resp.headers -UNSET = object() - - @deferred_f_from_coro_f @pytest.mark.parametrize( "setting,enabled", diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 722d1740..86bf213d 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -734,7 +734,7 @@ async def test_page_params(): # Session pool IDs affect fingerprinting, but session initialization # parameters do not. 
# - # When using server-managed requests, that means that a different + # When using Zyte-managed sessions, that means that a different # sessionContext parameter affects the fingerprint, while a different # sessionContextParameters does not, even if sessionContext remains the # same (which would be a user error). diff --git a/tests/test_sessions.py b/tests/test_sessions.py deleted file mode 100644 index 9341938c..00000000 --- a/tests/test_sessions.py +++ /dev/null @@ -1,3122 +0,0 @@ -from collections import deque -from copy import copy, deepcopy -from math import floor -from typing import Any, Dict, Optional, Tuple, Union -from unittest.mock import patch - -import pytest -from aiohttp.client_exceptions import ServerConnectionError -from scrapy.utils.defer import deferred_f_from_coro_f -from scrapy import Request, Spider, signals -from scrapy.exceptions import CloseSpider -from scrapy.http import Response -from scrapy.utils.httpobj import urlparse_cached -from scrapy.utils.misc import load_object -from zyte_api import RequestError - -from scrapy_zyte_api import ( - SESSION_AGGRESSIVE_RETRY_POLICY, - SESSION_DEFAULT_RETRY_POLICY, - LocationSessionConfig, - SessionConfig, - get_request_session_id, - is_session_init_request, - session_config, -) -from scrapy_zyte_api._session import SESSION_INIT_META_KEY, session_config_registry -from scrapy_zyte_api.utils import ( - _RAW_CLASS_SETTING_SUPPORT, - _REQUEST_ERROR_HAS_QUERY, - maybe_deferred_to_future, -) - -from . import get_crawler, serialize_settings - -UNSET = object() - - -@pytest.mark.parametrize( - ("setting", "meta", "outcome"), - ( - (UNSET, UNSET, False), - (UNSET, True, True), - (UNSET, False, False), - (True, UNSET, True), - (True, True, True), - (True, False, False), - (False, UNSET, False), - (False, True, True), - (False, False, False), - ), -) -@deferred_f_from_coro_f -async def test_enabled(setting, meta, outcome, mockserver): - settings = {"ZYTE_API_URL": mockserver.urljoin("/")} - if setting is not UNSET: - settings["ZYTE_API_SESSION_ENABLED"] = setting - meta_dict = {} - if meta is not UNSET: - meta_dict = {"zyte_api_session_enabled": meta} - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request("https://example.com", meta=meta_dict) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - if outcome: - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - else: - assert session_stats == { - "scrapy-zyte-api/sessions/use/disabled": 1, - } - - -@pytest.mark.parametrize( - ("params_setting", "params_meta", "location_setting", "location_meta", "outcome"), - ( - (UNSET, UNSET, UNSET, UNSET, False), - (UNSET, UNSET, UNSET, None, False), - (UNSET, UNSET, UNSET, False, False), - (UNSET, UNSET, UNSET, True, True), - (UNSET, UNSET, False, UNSET, False), - (UNSET, UNSET, False, None, False), - (UNSET, UNSET, False, False, False), - (UNSET, UNSET, False, True, True), - (UNSET, UNSET, True, UNSET, True), - (UNSET, UNSET, True, None, False), - (UNSET, UNSET, True, False, False), - (UNSET, UNSET, True, True, True), - (UNSET, False, UNSET, UNSET, False), - (UNSET, False, 
UNSET, None, False), - (UNSET, False, UNSET, False, False), - (UNSET, False, UNSET, True, False), - (UNSET, False, False, UNSET, False), - (UNSET, False, False, None, False), - (UNSET, False, False, False, False), - (UNSET, False, False, True, False), - (UNSET, False, True, UNSET, False), - (UNSET, False, True, None, False), - (UNSET, False, True, False, False), - (UNSET, False, True, True, False), - (UNSET, True, UNSET, UNSET, True), - (UNSET, True, UNSET, None, True), - (UNSET, True, UNSET, False, True), - (UNSET, True, UNSET, True, True), - (UNSET, True, False, UNSET, True), - (UNSET, True, False, None, True), - (UNSET, True, False, False, True), - (UNSET, True, False, True, True), - (UNSET, True, True, UNSET, True), - (UNSET, True, True, None, True), - (UNSET, True, True, False, True), - (UNSET, True, True, True, True), - (False, UNSET, UNSET, UNSET, False), - (False, UNSET, UNSET, None, False), - (False, UNSET, UNSET, False, False), - (False, UNSET, UNSET, True, True), - (False, UNSET, False, UNSET, False), - (False, UNSET, False, None, False), - (False, UNSET, False, False, False), - (False, UNSET, False, True, True), - (False, UNSET, True, UNSET, False), - (False, UNSET, True, None, False), - (False, UNSET, True, False, False), - (False, UNSET, True, True, True), - (False, False, UNSET, UNSET, False), - (False, False, UNSET, None, False), - (False, False, UNSET, False, False), - (False, False, UNSET, True, False), - (False, False, False, UNSET, False), - (False, False, False, None, False), - (False, False, False, False, False), - (False, False, False, True, False), - (False, False, True, UNSET, False), - (False, False, True, None, False), - (False, False, True, False, False), - (False, False, True, True, False), - (False, True, UNSET, UNSET, True), - (False, True, UNSET, None, True), - (False, True, UNSET, False, True), - (False, True, UNSET, True, True), - (False, True, False, UNSET, True), - (False, True, False, None, True), - (False, True, False, False, True), - (False, True, False, True, True), - (False, True, True, UNSET, True), - (False, True, True, None, True), - (False, True, True, False, True), - (False, True, True, True, True), - (True, UNSET, UNSET, UNSET, True), - (True, UNSET, UNSET, None, True), - (True, UNSET, UNSET, False, False), - (True, UNSET, UNSET, True, True), - (True, UNSET, False, UNSET, True), - (True, UNSET, False, None, True), - (True, UNSET, False, False, False), - (True, UNSET, False, True, True), - (True, UNSET, True, UNSET, True), - (True, UNSET, True, None, True), - (True, UNSET, True, False, False), - (True, UNSET, True, True, True), - (True, False, UNSET, UNSET, False), - (True, False, UNSET, None, False), - (True, False, UNSET, False, False), - (True, False, UNSET, True, False), - (True, False, False, UNSET, False), - (True, False, False, None, False), - (True, False, False, False, False), - (True, False, False, True, False), - (True, False, True, UNSET, False), - (True, False, True, None, False), - (True, False, True, False, False), - (True, False, True, True, False), - (True, True, UNSET, UNSET, True), - (True, True, UNSET, None, True), - (True, True, UNSET, False, True), - (True, True, UNSET, True, True), - (True, True, False, UNSET, True), - (True, True, False, None, True), - (True, True, False, False, True), - (True, True, False, True, True), - (True, True, True, UNSET, True), - (True, True, True, None, True), - (True, True, True, False, True), - (True, True, True, True, True), - ), -) -@deferred_f_from_coro_f -async def 
test_params_precedence( - params_setting, params_meta, location_setting, location_meta, outcome, mockserver -): - postal_codes = {True: "10001", False: "10002"} - pool = ( - "postal-code-10001.example[0]" - if params_meta in postal_codes - else ( - f"postal-code-10001.example@{postal_codes[location_meta]}" - if location_meta in postal_codes - else "postal-code-10001.example" - ) - ) - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - meta: Dict[str, Any] = {} - - if params_setting is not UNSET: - settings["ZYTE_API_SESSION_PARAMS"] = { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": postal_codes[params_setting]}, - } - ] - } - if params_meta is not UNSET: - meta["zyte_api_session_params"] = { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": postal_codes[params_meta]}, - } - ] - } - if location_setting is not UNSET: - settings["ZYTE_API_SESSION_LOCATION"] = { - "postalCode": postal_codes[location_setting] - } - if location_meta is None: - meta["zyte_api_session_location"] = {} - elif location_meta is not UNSET: - meta["zyte_api_session_location"] = {"postalCode": postal_codes[location_meta]} - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request( - "https://postal-code-10001.example", - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": postal_codes[True]}, - } - ] - }, - **meta, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - if outcome: - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/check-passed": 1, - f"scrapy-zyte-api/sessions/pools/{pool}/use/check-passed": 1, - } - else: - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/failed": 1, - } - - -@pytest.mark.parametrize( - ("params", "close_reason", "stats"), - ( - ( - {"browserHtml": True}, - "bad_session_inits", - { - "scrapy-zyte-api/sessions/pools/forbidden.example/init/failed": 1, - }, - ), - ( - {"browserHtml": True, "url": "https://example.com"}, - "failed_forbidden_domain", - { - "scrapy-zyte-api/sessions/pools/forbidden.example/init/check-passed": 1, - }, - ), - ), -) -@deferred_f_from_coro_f -async def test_url_override(params, close_reason, stats, mockserver): - """If session params define a URL, that URL is used for session - initialization. 
Otherwise, the URL from the request getting the session - assigned first is used for session initialization.""" - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_PARAMS": params, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://forbidden.example"] - - def parse(self, response): - pass - - def closed(self, reason): - self.close_reason = reason - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert crawler.spider.close_reason == close_reason - assert session_stats == stats - - -class ConstantChecker: - def __init__(self, result): - self._result = result - - def check(self, response: Response, request: Request) -> bool: - if self._result in (True, False): - return self._result - raise self._result - - -class TrueChecker(ConstantChecker): - def __init__(self): - super().__init__(True) - - -class FalseChecker(ConstantChecker): - def __init__(self): - super().__init__(False) - - -class CloseSpiderChecker(ConstantChecker): - def __init__(self): - super().__init__(CloseSpider("closed_by_checker")) - - -class UnexpectedExceptionChecker(ConstantChecker): - def __init__(self): - super().__init__(Exception) - - -class TrueCrawlerChecker(ConstantChecker): - @classmethod - def from_crawler(cls, crawler): - return cls(crawler) - - def __init__(self, crawler): - super().__init__(crawler.settings["ZYTE_API_SESSION_ENABLED"]) - - -class FalseCrawlerChecker(ConstantChecker): - @classmethod - def from_crawler(cls, crawler): - return cls(crawler) - - def __init__(self, crawler): - super().__init__(not crawler.settings["ZYTE_API_SESSION_ENABLED"]) - - -class UseChecker(ConstantChecker): - """Always pass for session initialization requests, apply the check logic - only on session use requests.""" - - def check(self, response: Response, request: Request) -> bool: - if response.meta.get(SESSION_INIT_META_KEY, False) is True: - return True - return super().check(response, request) - - -class FalseUseChecker(FalseChecker, UseChecker): - pass - - -class CloseSpiderUseChecker(CloseSpiderChecker, UseChecker): - pass - - -class UnexpectedExceptionUseChecker(UnexpectedExceptionChecker, UseChecker): - pass - - -class OnlyPassFirstInitChecker: - def __init__(self): - self.on_first_init = True - - def check(self, response: Response, request: Request) -> bool: - if self.on_first_init: - self.on_first_init = False - return True - return False - - -# NOTE: There is no use checker subclass for TrueChecker because the outcome -# would be the same (always return True), and there are no use checker -# subclasses for the crawler classes because the init use is enough to verify -# that using the crawler works. - -CHECKER_TESTS: Tuple[Tuple[str, str, Dict[str, int]], ...] 
= ( - ( - "tests.test_sessions.TrueChecker", - "finished", - { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - }, - ), - ( - "tests.test_sessions.FalseChecker", - "bad_session_inits", - {"scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1}, - ), - ( - "tests.test_sessions.FalseUseChecker", - "finished", - { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, - "scrapy-zyte-api/sessions/pools/example.com/use/check-failed": 1, - }, - ), - ("tests.test_sessions.CloseSpiderChecker", "closed_by_checker", {}), - ( - "tests.test_sessions.CloseSpiderUseChecker", - "closed_by_checker", - { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - }, - ), - ( - "tests.test_sessions.UnexpectedExceptionChecker", - "bad_session_inits", - {"scrapy-zyte-api/sessions/pools/example.com/init/check-error": 1}, - ), - ( - "tests.test_sessions.UnexpectedExceptionUseChecker", - "finished", - { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, - "scrapy-zyte-api/sessions/pools/example.com/use/check-error": 1, - }, - ), - ( - "tests.test_sessions.TrueCrawlerChecker", - "finished", - { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - }, - ), - ( - "tests.test_sessions.FalseCrawlerChecker", - "bad_session_inits", - {"scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1}, - ), - ( - "tests.test_sessions.OnlyPassFirstInitChecker", - "bad_session_inits", - { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-failed": 1, - }, - ), -) - - -@pytest.mark.parametrize( - ("checker", "close_reason", "stats"), - ( - *CHECKER_TESTS, - *( - pytest.param( - load_object(checker), - close_reason, - stats, - marks=pytest.mark.skipif( - not _RAW_CLASS_SETTING_SUPPORT, - reason=( - "Configuring component classes instead of their " - "import paths requires Scrapy 2.4+." 
- ), - ), - ) - for checker, close_reason, stats in CHECKER_TESTS - ), - ), -) -@deferred_f_from_coro_f -async def test_checker(checker, close_reason, stats, mockserver): - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_CHECKER": checker, - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - pass - - def closed(self, reason): - self.close_reason = reason - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert crawler.spider.close_reason == close_reason - assert session_stats == stats - - -@pytest.mark.parametrize( - ("postal_code", "url", "close_reason", "stats"), - ( - ( - None, - "https://postal-code-10001-soft.example", - "finished", - { - "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/use/check-passed": 1, - }, - ), - ( - "10001", - "https://postal-code-10001-soft.example", - "finished", - { - "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/use/check-passed": 1, - }, - ), - ( - "10002", - "https://postal-code-10001-soft.example", - "bad_session_inits", - { - "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/init/check-failed": 1 - }, - ), - ( - "10001", - "https://no-location-support.example", - "unsupported_set_location", - {}, - ), - ), -) -@deferred_f_from_coro_f -async def test_checker_location(postal_code, url, close_reason, stats, mockserver): - """The default checker looks into the outcome of the ``setLocation`` action - if a location meta/setting was used.""" - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - if postal_code is not None: - settings["ZYTE_API_SESSION_LOCATION"] = {"postalCode": postal_code} - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": postal_code}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - def closed(self, reason): - self.close_reason = reason - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert crawler.spider.close_reason == close_reason - assert session_stats == stats - - -class CloseSpiderURLChecker: - def check(self, response: Response, request: Request) -> bool: - if "fail" in request.url: - raise CloseSpider("closed_by_checker") - return True - - -@deferred_f_from_coro_f -async def test_checker_close_spider_use(mockserver): - """A checker can raise CloseSpider not only during session initialization, - but also during session use.""" - settings = { - "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.CloseSpiderURLChecker", - "ZYTE_API_SESSION_ENABLED": True, - 
"ZYTE_API_SESSION_MAX_BAD_INITS": 1, - "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com/fail"] - - def parse(self, response): - pass - - def closed(self, reason): - self.close_reason = reason - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert crawler.spider.close_reason == "closed_by_checker" - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - } - - -@pytest.mark.parametrize( - ("setting", "value"), - ( - (0, 1), - (1, 1), - (2, 2), - (None, 8), - ), -) -@deferred_f_from_coro_f -async def test_max_bad_inits(setting, value, mockserver): - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_PARAMS": {"browserHtml": True, "httpResponseBody": True}, - } - if setting is not None: - settings["ZYTE_API_SESSION_MAX_BAD_INITS"] = setting - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/failed": value, - } - - -@pytest.mark.parametrize( - ("global_setting", "pool_setting", "value"), - ( - (None, 0, 1), - (None, 1, 1), - (None, 2, 2), - (3, None, 3), - ), -) -@deferred_f_from_coro_f -async def test_max_bad_inits_per_pool(global_setting, pool_setting, value, mockserver): - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_PARAMS": {"browserHtml": True, "httpResponseBody": True}, - } - if global_setting is not None: - settings["ZYTE_API_SESSION_MAX_BAD_INITS"] = global_setting - if pool_setting is not None: - settings["ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL"] = { - "pool.example": pool_setting - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com", "https://pool.example"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/failed": ( - 8 if global_setting is None else global_setting - ), - "scrapy-zyte-api/sessions/pools/pool.example/init/failed": value, - } - - -@pytest.mark.parametrize( - ("setting", "value"), - ( - (None, 1), - (0, 1), - (1, 1), - (2, 2), - ), -) -@deferred_f_from_coro_f -async def test_max_check_failures(setting, value, mockserver): - retry_times = 2 - settings = { - "RETRY_TIMES": retry_times, - "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", - "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.FalseUseChecker", - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, - "ZYTE_API_SESSION_POOL_SIZE": 1, - "ZYTE_API_URL": 
mockserver.urljoin("/"), - } - if setting is not None: - settings["ZYTE_API_SESSION_MAX_CHECK_FAILURES"] = setting - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": floor( - (retry_times + 1) / value - ) - + 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-failed": retry_times + 1, - } - - -@pytest.mark.parametrize( - ("setting", "value"), - ( - (None, 1), - (0, 1), - (1, 1), - (2, 2), - ), -) -@deferred_f_from_coro_f -async def test_max_errors(setting, value, mockserver): - retry_times = 2 - settings = { - "RETRY_TIMES": retry_times, - "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, - "ZYTE_API_SESSION_POOL_SIZE": 1, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - if setting is not None: - settings["ZYTE_API_SESSION_MAX_ERRORS"] = setting - - class TestSpider(Spider): - name = "test" - start_urls = ["https://temporary-download-error.example"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/temporary-download-error.example/init/check-passed": floor( - (retry_times + 1) / value - ) - + 1, - "scrapy-zyte-api/sessions/pools/temporary-download-error.example/use/failed": retry_times - + 1, - } - - -class DomainChecker: - def check(self, response: Response, request: Request) -> bool: - domain = urlparse_cached(request).netloc - return "fail" not in domain - - -@deferred_f_from_coro_f -async def test_check_overrides_error(mockserver): - """Max errors are ignored if a session does not pass its session check.""" - retry_times = 2 - settings = { - "RETRY_TIMES": retry_times, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.DomainChecker", - "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_ERRORS": 2, - "ZYTE_API_SESSION_POOL_SIZE": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://session-check-fails.example"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/session-check-fails.example/init/check-passed": retry_times - + 2, - "scrapy-zyte-api/sessions/pools/session-check-fails.example/use/check-failed": retry_times - + 1, - } - - -@pytest.mark.parametrize( - ("meta", "pool"), - ( - ({}, "example.com"), - ({"zyte_api_session_location": {"postalCode": "10001"}}, "example.com@10001"), - ( - {"zyte_api_session_location": {"postalCode": "10001", "foo": "bar"}}, - "example.com@10001", - ), - ( - { - 
"zyte_api_session_location": { - "addressCountry": "US", - "addressRegion": "TX", - } - }, - "example.com@US,TX", - ), - ( - { - "zyte_api_session_location": { - "addressCountry": "ES", - "addressRegion": "Pontevedra", - "streetAddress": "Rúa do Príncipe, 123", - "postalCode": "12345", - } - }, - "example.com@ES,Pontevedra,12345,Rúa do Príncipe, 123", - ), - ( - { - "zyte_api_session_params": {"foo": "bar"}, - "zyte_api_session_location": {"postalCode": "10001"}, - }, - "example.com[0]", - ), - ( - { - "zyte_api_session_pool": "foo", - "zyte_api_session_params": {"foo": "bar"}, - "zyte_api_session_location": {"postalCode": "10001"}, - }, - "foo", - ), - ), -) -@deferred_f_from_coro_f -async def test_pool(meta, pool, mockserver): - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - } - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request("https://example.com", meta=meta) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/check-passed": 1, - f"scrapy-zyte-api/sessions/pools/{pool}/use/check-passed": 1, - } - - -@deferred_f_from_coro_f -async def test_pool_params(mockserver, caplog): - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_POOL_SIZE": 1, - } - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request( - "https://example.com/a", - meta={"zyte_api_session_params": {"foo": "bar"}}, - ) - yield Request( - "https://example.com/b", - meta={"zyte_api_session_params": {"foo": "bar"}}, - ) - yield Request( - "https://example.com/c", - meta={"zyte_api_session_params": {"foo": "baz"}}, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - caplog.clear() - caplog.set_level("INFO") - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com[0]/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com[0]/use/check-passed": 2, - "scrapy-zyte-api/sessions/pools/example.com[1]/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com[1]/use/check-passed": 1, - } - expected_logs = { - ( - "INFO", - "Session pool example.com[0] uses these session initialization parameters: {'foo': 'bar'}", - ): 0, - ( - "INFO", - "Session pool example.com[1] uses these session initialization parameters: {'foo': 'baz'}", - ): 0, - } - for record in caplog.records: - entry = (record.levelname, record.msg) - if entry in expected_logs: - expected_logs[entry] += 1 - assert all(v == 1 for v in expected_logs.values()) - - -@pytest.mark.parametrize( - ("setting", "value"), - ( - (1, 1), - (2, 2), - (None, 8), - ), -) -@deferred_f_from_coro_f -async def test_pool_size(setting, value, mockserver): - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - 
} - if setting is not None: - settings["ZYTE_API_SESSION_POOL_SIZE"] = setting - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] * (value + 1) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": value, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": value + 1, - } - - -@pytest.mark.parametrize( - ("global_setting", "pool_setting", "value"), - ( - (None, 1, 1), - (None, 2, 2), - (3, None, 3), - ), -) -@deferred_f_from_coro_f -async def test_pool_sizes(global_setting, pool_setting, value, mockserver): - settings = { - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - } - if global_setting is not None: - settings["ZYTE_API_SESSION_POOL_SIZE"] = global_setting - if pool_setting is not None: - settings["ZYTE_API_SESSION_POOL_SIZES"] = {"pool.example": pool_setting} - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com", "https://pool.example"] * (value + 1) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": ( - value if pool_setting is None else min(value + 1, 8) - ), - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": value + 1, - "scrapy-zyte-api/sessions/pools/pool.example/init/check-passed": value, - "scrapy-zyte-api/sessions/pools/pool.example/use/check-passed": value + 1, - } - - -def mock_request_error(*, status=200, response_content=None): - kwargs: Dict[str, Any] = {} - if _REQUEST_ERROR_HAS_QUERY: - kwargs["query"] = {} - return RequestError( - history=None, - request_info=None, - response_content=response_content, - status=status, - **kwargs, - ) - - -# Number of times to test request errors that must be retried forever. 
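For quick reference, a settings sketch combining the global pool size with a per-pool override, the same pairing the pool size tests above exercise; the domain name and numbers are illustrative:

    # settings.py fragment (illustrative values)
    ZYTE_API_SESSION_ENABLED = True
    ZYTE_API_SESSION_POOL_SIZE = 3  # default number of sessions kept per pool
    ZYTE_API_SESSION_POOL_SIZES = {
        "pool.example": 1,  # override: keep a single session for this pool
    }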
-FOREVER_TIMES = 100 - - -class fast_forward: - def __init__(self, time): - self.time = time - - -@pytest.mark.parametrize( - ("retrying", "outcomes", "exhausted"), - ( - *( - (retry_policy, outcomes, exhausted) - for retry_policy in ( - SESSION_DEFAULT_RETRY_POLICY, - SESSION_AGGRESSIVE_RETRY_POLICY, - ) - for status in (520, 521) - for outcomes, exhausted in ( - ( - (mock_request_error(status=status),), - True, - ), - ( - (mock_request_error(status=429),), - False, - ), - ( - ( - mock_request_error(status=429), - mock_request_error(status=status), - ), - True, - ), - ) - ), - ), -) -@deferred_f_from_coro_f -@patch("time.monotonic") -async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): - monotonic_mock.return_value = 0 - last_outcome = outcomes[-1] - outcomes = deque(outcomes) - - def wait(retry_state): - return 0.0 - - retrying = copy(retrying) - retrying.wait = wait - - async def run(): - while True: - try: - outcome = outcomes.popleft() - except IndexError: - return - else: - if isinstance(outcome, fast_forward): - monotonic_mock.return_value += outcome.time - continue - raise outcome - - run = retrying.wraps(run) - try: - await run() - except Exception as outcome: - assert exhausted - assert outcome is last_outcome - else: - assert not exhausted - - -try: - from scrapy import addons # noqa: F401 -except ImportError: - ADDON_SUPPORT = False -else: - ADDON_SUPPORT = True - - -@pytest.mark.parametrize( - ("manual_settings", "addon_settings"), - ( - ( - {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, - {}, - ), - ( - {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, - {"ZYTE_API_RETRY_POLICY": "zyte_api.zyte_api_retrying"}, - ), - ( - { - "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" - }, - {"ZYTE_API_RETRY_POLICY": "zyte_api.aggressive_retrying"}, - ), - ( - {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, - {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, - ), - ( - { - "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" - }, - { - "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" - }, - ), - ( - {"ZYTE_API_RETRY_POLICY": "tests.test_sessions.UNSET"}, - {"ZYTE_API_RETRY_POLICY": "tests.test_sessions.UNSET"}, - ), - ), -) -@deferred_f_from_coro_f -@pytest.mark.skipif( - not ADDON_SUPPORT, reason="No add-on support in this version of Scrapy" -) -async def test_addon(manual_settings, addon_settings): - crawler = await get_crawler( - { - "ZYTE_API_TRANSPARENT_MODE": True, - "ZYTE_API_SESSION_ENABLED": True, - **manual_settings, - }, - poet=False, - ) - addon_crawler = await get_crawler( - {"ZYTE_API_SESSION_ENABLED": True, **addon_settings}, use_addon=True, poet=False - ) - assert serialize_settings(crawler.settings) == serialize_settings( - addon_crawler.settings - ) - - -@deferred_f_from_coro_f -async def test_session_config(mockserver): - pytest.importorskip("web_poet") - - @session_config( - [ - "postal-code-10001-a.example", - "postal-code-10001-a-fail.example", - "postal-code-10001-a-alternative.example", - ] - ) - class CustomSessionConfig(SessionConfig): - def params(self, request: Request): - return { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - } - - def check(self, response: Response, request: Request) -> bool: - domain = urlparse_cached(request).netloc - return "fail" not in domain - - def pool(self, request: Request) -> str: - 
domain = urlparse_cached(request).netloc - if domain == "postal-code-10001-a-alternative.example": - return "postal-code-10001-a.example" - return domain - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = [ - "https://postal-code-10001-a.example", - "https://postal-code-10001-a-alternative.example", - "https://postal-code-10001-a-fail.example", - "https://postal-code-10001-b.example", - ] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001-a.example/init/check-passed": 2, - "scrapy-zyte-api/sessions/pools/postal-code-10001-a.example/use/check-passed": 2, - "scrapy-zyte-api/sessions/pools/postal-code-10001-a-fail.example/init/check-failed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-b.example/init/failed": 1, - } - - # Clean up the session config registry, and check it, otherwise we could - # affect other tests. - - session_config_registry.__init__() # type: ignore[misc] - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001-a.example/init/failed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-a-alternative.example/init/failed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-a-fail.example/init/failed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-b.example/init/failed": 1, - } - - -@deferred_f_from_coro_f -async def test_session_config_check_meta(mockserver): - """When initializing a session, known zyte_api_session-prefixed params - should be included in the session initialization request, so that they can - be used from check methods validating those requests. - - For example, when validating a location, access to - zyte_api_session_location may be necessary. 
- """ - pytest.importorskip("web_poet") - - params = { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - } - - @session_config(["example.com"]) - class CustomSessionConfig(SessionConfig): - def check(self, response, request): - return ( - bool(self.location(request)) - and response.meta["zyte_api_session_params"] == params - and ( - ( - response.meta.get("_is_session_init_request", False) - and "zyte_api_session_foo" not in response.meta - ) - or response.meta["zyte_api_session_foo"] == "bar" - ) - ) - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": params, - "zyte_api_session_params": params, - "zyte_api_session_location": {"postalCode": "10001"}, - "zyte_api_session_foo": "bar", - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com[0]/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com[0]/use/check-passed": 1, - } - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_enabled(mockserver): - pytest.importorskip("web_poet") - - @session_config(["enabled.example", "disabled.example"]) - class CustomSessionConfig(SessionConfig): - def enabled(self, request: Request): - return "enabled" in urlparse_cached(request).netloc - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://enabled.example", "https://disabled.example"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/use/disabled": 1, - "scrapy-zyte-api/sessions/pools/enabled.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/enabled.example/use/check-passed": 1, - } - - # Clean up the session config registry. 
- session_config_registry.__init__() # type: ignore[misc] - - -@pytest.mark.parametrize( - ("settings", "meta", "used"), - ( - ({}, {}, True), - ( - { - "ZYTE_API_SESSION_PARAMS": { - "actions": [ - {"action": "setLocation", "address": {"postalCode": "10002"}} - ] - } - }, - {}, - False, - ), - ({"ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}}, {}, False), - ( - {}, - { - "zyte_api_session_params": { - "actions": [ - {"action": "setLocation", "address": {"postalCode": "10002"}} - ] - } - }, - False, - ), - ({}, {"zyte_api_session_location": {"postalCode": "10002"}}, False), - ), -) -@deferred_f_from_coro_f -async def test_session_config_location(settings, meta, used, mockserver): - """Overriding location in SessionConfig, if done according to the docs, - only has an effect when neither spider-level nor request-level variables - are used to modify params.""" - pytest.importorskip("web_poet") - - @session_config(["postal-code-10001.example"]) - class CustomSessionConfig(SessionConfig): - def location(self, request: Request): - return super().location(request) or {"postalCode": "10001"} - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - **settings, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://postal-code-10001.example"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - **meta, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - if used: - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 1, - } - else: - pool = ( - "postal-code-10001.example[0]" - if "zyte_api_session_params" in meta - else ( - "postal-code-10001.example@10002" - if "zyte_api_session_location" in meta - else "postal-code-10001.example" - ) - ) - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/failed": 1, - } - - # Clean up the session config registry. 
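A distilled sketch of the location() override pattern that the test above validates: falling back to the parent implementation first preserves the precedence of request metadata and settings. The postal code and domain are illustrative:

    from typing import Any, Dict

    from scrapy import Request
    from scrapy_zyte_api import SessionConfig, session_config


    @session_config(["postal-code-10001.example"])
    class FallbackLocationSessionConfig(SessionConfig):
        def location(self, request: Request) -> Dict[str, Any]:
            # Returning the parent value first keeps the ZYTE_API_SESSION_LOCATION
            # setting and the zyte_api_session_location meta key in control; the
            # literal address is only used when nothing else sets a location.
            return super().location(request) or {"postalCode": "10001"}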
- session_config_registry.__init__() # type: ignore[misc] - - -@pytest.mark.parametrize( - ("settings", "meta", "used"), - ( - ({}, {}, True), - ( - { - "ZYTE_API_SESSION_PARAMS": { - "actions": [ - {"action": "setLocation", "address": {"postalCode": "10002"}} - ] - } - }, - {}, - False, - ), - ({"ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}}, {}, True), - ( - {}, - { - "zyte_api_session_params": { - "actions": [ - {"action": "setLocation", "address": {"postalCode": "10002"}} - ] - } - }, - False, - ), - ({}, {"zyte_api_session_location": {"postalCode": "10002"}}, True), - ), -) -@deferred_f_from_coro_f -async def test_session_config_location_bad(settings, meta, used, mockserver): - """Overriding location in SessionConfig, if it does not return - super().location() when truthy, breaks params precedence for location meta - key and setting, but does not break raw params meta key and setting.""" - pytest.importorskip("web_poet") - - @session_config(["postal-code-10001.example"]) - class CustomSessionConfig(SessionConfig): - def location(self, request: Request): - return {"postalCode": "10001"} - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - **settings, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://postal-code-10001.example"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - **meta, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - pool = ( - "postal-code-10001.example[0]" - if "zyte_api_session_params" in meta - else ( - "postal-code-10001.example@10002" - if "zyte_api_session_location" in meta - else "postal-code-10001.example" - ) - ) - if used: - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/check-passed": 1, - f"scrapy-zyte-api/sessions/pools/{pool}/use/check-passed": 1, - } - else: - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/failed": 1, - } - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_params_location(mockserver): - """A custom session config can be used to customize the params for - location, e.g. 
to include extra actions, while still relying on the default - check to determine whether or not the session remains valid based on the - outcome of the ``setLocation`` action.""" - pytest.importorskip("web_poet") - - @session_config(["postal-code-10001.example"]) - class CustomSessionConfig(SessionConfig): - def params(self, request: Request): - return { - "actions": [ - { - "action": "waitForNavigation", - }, - { - "action": "setLocation", - "address": self.location(request), - }, - ] - } - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://postal-code-10001.example"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 1, - } - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_params_location_no_set_location(mockserver): - """A custom session config can be used to customize the params for - location to the point where they do not use a ``setLocation`` action. In - that case, the default session check will return ``True`` by default, i.e. 
- it will not fail due to not finding ``setLocation`` in response actions - data.""" - pytest.importorskip("web_poet") - - @session_config(["example.com"]) - class CustomSessionConfig(SessionConfig): - def params(self, request: Request): - postal_code = self.location(request)["postalCode"] - return { - "actions": [ - { - "action": "click", - "selector": {"type": "css", "value": f"#zip{postal_code}"}, - }, - ] - } - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@pytest.mark.parametrize( - ("meta", "settings", "pool", "outcome"), - ( - ({}, {}, "postal-code-10001.example", False), - ( - { - "zyte_api_session_params": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - }, - ] - } - }, - {}, - "postal-code-10001.example[0]", - True, - ), - ( - {"zyte_api_session_location": {"postalCode": "10001"}}, - {}, - "postal-code-10001.example@10001", - False, - ), - ( - {}, - { - "ZYTE_API_SESSION_PARAMS": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - }, - ] - } - }, - "postal-code-10001.example", - True, - ), - ( - {}, - {"ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}}, - "postal-code-10001.example", - False, - ), - ), -) -@deferred_f_from_coro_f -async def test_session_config_params_precedence( - meta, settings, pool, outcome, mockserver -): - """A params override should have no impact on the use of the - zyte_api_session_params request metadata key or the use of the - ZYTE_API_SESSION_PARAMS setting. 
However, it can nullify locations if not - implemented with support for them as the default implementation has.""" - pytest.importorskip("web_poet") - - @session_config(["postal-code-10001.example"]) - class CustomSessionConfig(SessionConfig): - def params(self, request: Request): - return { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10002"}, - }, - ] - } - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - **settings, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://postal-code-10001.example"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - }, - ], - }, - **meta, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - if outcome: - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/check-passed": 1, - f"scrapy-zyte-api/sessions/pools/{pool}/use/check-passed": 1, - } - else: - assert session_stats == { - f"scrapy-zyte-api/sessions/pools/{pool}/init/failed": 1, - } - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_params_error(mockserver): - pytest.importorskip("web_poet") - - @session_config(["example.com"]) - class CustomSessionConfig(SessionConfig): - def params(self, request: Request): - raise Exception - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/param-error": 1, - } - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_pool_caching(mockserver): - pytest.importorskip("web_poet") - - @session_config(["example.com"]) - class CustomSessionConfig(SessionConfig): - def __init__(self, crawler): - super().__init__(crawler) - self.pools = deque(("example.com",)) - - def pool(self, request: Request): - # The following code would fail on the second call, which never - # happens due to pool caching. 
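A distilled sketch of the location-aware params() override from the earlier location params test: extra actions can be added as long as the configured location still reaches a setLocation action, so the default check keeps working. The domain and action list are illustrative:

    from typing import Any, Dict

    from scrapy import Request
    from scrapy_zyte_api import SessionConfig, session_config


    @session_config(["postal-code-10001.example"])
    class ExtraActionSessionConfig(SessionConfig):
        def params(self, request: Request) -> Dict[str, Any]:
            return {
                "actions": [
                    # Extra action performed before setting the location.
                    {"action": "waitForNavigation"},
                    # Reusing self.location(request) keeps the default
                    # setLocation-based session check working.
                    {"action": "setLocation", "address": self.location(request)},
                ]
            }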
- return self.pools.popleft() - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - pass - - def closed(self, reason): - self.close_reason = reason - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - assert crawler.spider.close_reason == "finished" - - # Clean up the session config registry. - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_pool_error(mockserver): - # NOTE: This error should only happen during the initial process_request - # call. By the time the code reaches process_response, the cached pool - # value for that request is reused, so there is no new call to - # SessionConfig.pool that could fail during process_response only. - - pytest.importorskip("web_poet") - - @session_config(["example.com"]) - class CustomSessionConfig(SessionConfig): - def pool(self, request: Request): - raise Exception - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - pass - - def closed(self, reason): - self.close_reason = reason - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == {} - assert crawler.spider.close_reason == "pool_error" - - # Clean up the session config registry. 
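A distilled sketch of the pool() override pattern used in several tests above: two interchangeable domains share a single session pool, and pool() runs only once per request thanks to the caching the previous test relies on. Domain names are illustrative:

    from scrapy import Request
    from scrapy.utils.httpobj import urlparse_cached
    from scrapy_zyte_api import SessionConfig, session_config


    @session_config(["a.example", "mirror-of-a.example"])
    class SharedPoolSessionConfig(SessionConfig):
        def pool(self, request: Request) -> str:
            # Both domains draw sessions from the a.example pool; the result is
            # cached per request, so this is only computed once.
            domain = urlparse_cached(request).netloc
            if domain == "mirror-of-a.example":
                return "a.example"
            return domain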
- session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_no_web_poet(mockserver): - """If web-poet is not installed, @session_config raises a RuntimeError.""" - try: - import web_poet # noqa: F401 - except ImportError: - pass - else: - pytest.skip("Test only relevant when web-poet is not installed.") - - with pytest.raises(RuntimeError): - - @session_config(["example.com"]) - class CustomSessionConfig(SessionConfig): - pass - - -@deferred_f_from_coro_f -async def test_session_config_process_request_change_request(mockserver): - pytest.importorskip("web_poet") - - @session_config("example.com") - class CustomSessionConfig(SessionConfig): - def __init__(self, crawler): - super().__init__(crawler) - self.session_data = {} - - def check(self, response: Response, request: Request) -> bool: - if is_session_init_request(request): - session_id = get_request_session_id(request) - self.session_data[session_id] = {"foo": "bar"} - return super().check(response, request) - - def process_request(self, request: Request) -> Optional[Request]: - session_id = get_request_session_id(request) - foo = self.session_data[session_id]["foo"] - request.headers["foo"] = foo - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - request_headers = [] - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - request_headers.append(response.request.headers["foo"]) - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - assert request_headers == [b"bar"] - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - - # Clean up the session config registry, and check it, otherwise we could - # affect other tests. 
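A distilled sketch of the process_request() pattern exercised above: data captured during the session check is reused for later requests assigned to the same session. It assumes is_session_init_request and get_request_session_id are importable from scrapy_zyte_api, as the tests use them; the header name and values are illustrative:

    from typing import Optional

    from scrapy import Request
    from scrapy.http import Response
    from scrapy_zyte_api import (
        SessionConfig,
        get_request_session_id,
        is_session_init_request,
        session_config,
    )


    @session_config("example.com")
    class HeaderInjectingSessionConfig(SessionConfig):
        def __init__(self, crawler):
            super().__init__(crawler)
            self.session_data = {}

        def check(self, response: Response, request: Request) -> bool:
            if is_session_init_request(request):
                # Remember something from the init response for this session.
                self.session_data[get_request_session_id(request)] = {"foo": "bar"}
            return super().check(response, request)

        def process_request(self, request: Request) -> Optional[Request]:
            # Reuse the captured data on requests assigned to the session;
            # returning None keeps the (possibly mutated) request as is.
            data = self.session_data.get(get_request_session_id(request), {})
            if "foo" in data:
                request.headers["foo"] = data["foo"]
            return None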
- - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_config_process_request_new_request(mockserver): - pytest.importorskip("web_poet") - - @session_config("example.com") - class CustomSessionConfig(SessionConfig): - def __init__(self, crawler): - super().__init__(crawler) - self.session_data = {} - - def check(self, response: Response, request: Request) -> bool: - if is_session_init_request(request): - session_id = get_request_session_id(request) - self.session_data[session_id] = {"foo": "bar"} - return super().check(response, request) - - def process_request(self, request: Request) -> Optional[Request]: - session_id = get_request_session_id(request) - foo = self.session_data[session_id]["foo"] - new_url = request.url.rstrip("/") + f"/{foo}" - return request.replace(url=new_url) - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - output_urls = [] - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def parse(self, response): - output_urls.append(response.url) - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - assert output_urls == ["https://example.com/bar"] - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - - # Clean up the session config registry, and check it, otherwise we could - # affect other tests. - - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_location_session_config(mockserver): - pytest.importorskip("web_poet") - - @session_config( - [ - "postal-code-10001.example", - "postal-code-10001-fail.example", - "postal-code-10001-alternative.example", - ] - ) - class CustomSessionConfig(LocationSessionConfig): - def location_params( - self, request: Request, location: Dict[str, Any] - ) -> Dict[str, Any]: - assert location == {"postalCode": "10002"} - return { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - } - - def location_check( - self, response: Response, request: Request, location: Dict[str, Any] - ) -> bool: - assert location == {"postalCode": "10002"} - domain = urlparse_cached(request).netloc - return "fail" not in domain - - def pool(self, request: Request) -> str: - domain = urlparse_cached(request).netloc - if domain == "postal-code-10001-alternative.example": - return "postal-code-10001.example" - return domain - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - # We set a location to force the location-specific methods of the - # session config class to be called, but we set the wrong location so - # that the test would not pass were it not for our custom - # implementation which ignores the input location and instead sets the - # right one. 
- "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = [ - "https://postal-code-10001.example", - "https://postal-code-10001-alternative.example", - "https://postal-code-10001-fail.example", - ] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2, - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2, - "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1, - } - - # Clean up the session config registry, and check it, otherwise we could - # affect other tests. - - session_config_registry.__init__() # type: ignore[misc] - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1, - "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1, - } - - -@deferred_f_from_coro_f -async def test_location_session_config_no_methods(mockserver): - """If no location_* methods are defined, LocationSessionConfig works the - same as SessionConfig.""" - pytest.importorskip("web_poet") - - @session_config( - [ - "postal-code-10001.example", - "postal-code-10001-alternative.example", - ] - ) - class CustomSessionConfig(LocationSessionConfig): - def pool(self, request: Request) -> str: - domain = urlparse_cached(request).netloc - if domain == "postal-code-10001-alternative.example": - return "postal-code-10001.example" - return domain - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = [ - "https://postal-code-10001.example", - "https://postal-code-10001-alternative.example", - ] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - 
"scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2, - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2, - } - - # Clean up the session config registry, and check it, otherwise we could - # affect other tests. - - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_location_session_config_no_location(mockserver): - """If no location is configured, the methods are never called.""" - pytest.importorskip("web_poet") - - @session_config(["postal-code-10001.example", "a.example"]) - class CustomSessionConfig(LocationSessionConfig): - def location_params( - self, request: Request, location: Dict[str, Any] - ) -> Dict[str, Any]: - assert False - - def location_check( - self, response: Response, request: Request, location: Dict[str, Any] - ) -> bool: - assert False - - settings = { - "RETRY_TIMES": 0, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://postal-code-10001.example", "https://a.example"] - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - for url in self.start_urls: - yield Request( - url, - meta={ - "zyte_api_automap": { - "actions": [ - { - "action": "setLocation", - "address": {"postalCode": "10001"}, - } - ] - }, - }, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, - "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1, - } - - # Clean up the session config registry, and check it, otherwise we could - # affect other tests. 
- - session_config_registry.__init__() # type: ignore[misc] - - -@deferred_f_from_coro_f -async def test_session_refresh(mockserver): - """If a response does not pass a session validity check, the session is - discarded, and the request is retried with a different session.""" - - class Tracker: - def __init__(self): - self.sessions = [] - - def track_session(self, request: Request, spider: Spider): - self.sessions.append(request.meta["zyte_api"]["session"]["id"]) - - tracker = Tracker() - - settings = { - "RETRY_TIMES": 1, - "ZYTE_API_URL": mockserver.urljoin("/"), - "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.DomainChecker", - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, - "ZYTE_API_SESSION_POOL_SIZE": 1, - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://session-check-fails.example"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - crawler.signals.connect( - tracker.track_session, signal=signals.request_reached_downloader - ) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/session-check-fails.example/init/check-passed": 3, - "scrapy-zyte-api/sessions/pools/session-check-fails.example/use/check-failed": 2, - } - assert len(tracker.sessions) == 5 - assert tracker.sessions[0] == tracker.sessions[1] - assert tracker.sessions[0] != tracker.sessions[2] - assert tracker.sessions[2] == tracker.sessions[3] - assert tracker.sessions[0] != tracker.sessions[4] - assert tracker.sessions[2] != tracker.sessions[4] - - -@deferred_f_from_coro_f -async def test_session_refresh_concurrent(mockserver): - """When more than 1 request is using the same session concurrently, it can - happen that more than 1 response triggers a session refresh. 
In those - cases, the same session should be refreshed only once, not once per - response triggering a refresh.""" - settings = { - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_MAX_BAD_INITS": 1, - "ZYTE_API_SESSION_MAX_ERRORS": 1, - "ZYTE_API_SESSION_POOL_SIZE": 1, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com/"] - - def parse(self, response): - for n in range(2): - yield Request(f"https://example.com/{n}?temporary-download-error") - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/init/failed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/failed": 2, - } - - -@deferred_f_from_coro_f -async def test_cookies(mockserver): - class Tracker: - def __init__(self): - self.cookies = [] - - def track(self, request: Request, spider: Spider): - cookie = request.headers.get(b"Cookie", None) - self.cookies.append(cookie) - - tracker = Tracker() - - settings = { - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_TRANSPARENT_MODE": True, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request( - "https://example.com", - cookies={"a": "b"}, - meta={"zyte_api_session_enabled": False}, - ) - - def parse(self, response): - yield Request( - "https://example.com/2", - meta={"zyte_api_session_enabled": False}, - callback=self.parse2, - ) - - def parse2(self, response): - yield Request( - "https://example.com/3", - callback=self.parse3, - ) - - def parse3(self, response): - yield Request( - "https://example.com/4", - meta={"dont_merge_cookies": False}, - callback=self.parse4, - ) - - def parse4(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 2, - "scrapy-zyte-api/sessions/use/disabled": 2, - } - - assert tracker.cookies == [ - # The 1st request sets cookies and disables session management, so - # cookies are set. - b"a=b", - # The 2nd request disables session management, and gets the cookies set - # by the previous request in the global cookiejar. - b"a=b", - # The 3rd request uses session management, and neither the session init - # request nor the actual request using the session get cookies. - None, - None, - # The 4th request uses session management but sets dont_merge_cookies - # to ``False``, so while session init does not use cookies, the actual - # request using the session gets the cookies. 
- None, - b"a=b", - ] - - -@deferred_f_from_coro_f -async def test_empty_queue(mockserver): - """After a pool is full, there might be a situation when the middleware - tries to assign a session to a request but all sessions of the pool are - pending creation or a refresh. In those cases, the assign process should - wait until a session becomes available in the queue.""" - settings = { - "ZYTE_API_SESSION_POOL_SIZE": 1, - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - # We send 2 requests in parallel, so only the first one gets a session - # created on demand, and the other one is forced to wait until that - # session is initialized. - start_urls = ["https://example.com/1", "https://example.com/2"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 2, - } - - -@deferred_f_from_coro_f -async def test_empty_queue_limit(mockserver): - settings = { - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS": 1, - "ZYTE_API_SESSION_QUEUE_WAIT_TIME": 0, - "ZYTE_API_SESSION_POOL_SIZE": 1, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com/1", "https://example.com/2"] - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - - -class SessionIDRemovingDownloaderMiddleware: - def process_exception( - self, request: Request, exception: Exception, spider: Spider | None = None - ) -> Union[Request, None]: - if not isinstance(exception, RequestError) or request.meta.get( - "_is_session_init_request", False - ): - return None - - del request.meta["zyte_api_automap"]["session"] - del request.meta["zyte_api_provider"]["session"] - return None - - -@deferred_f_from_coro_f -async def test_missing_session_id(mockserver, caplog): - """If a session ID is missing from a request that should have had it - assigned, a warning is logged about it.""" - - settings = { - "DOWNLOADER_MIDDLEWARES": { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, - "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, - "tests.test_sessions.SessionIDRemovingDownloaderMiddleware": 675, - }, - "RETRY_TIMES": 0, - "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, - "ZYTE_API_SESSION_POOL_SIZE": 1, - "ZYTE_API_TRANSPARENT_MODE": True, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://temporary-download-error.example"] - - def parse(self, response): - pass - - caplog.clear() - caplog.set_level("WARNING") - crawler = await get_crawler(settings, 
spider_cls=TestSpider, setup_engine=False) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/temporary-download-error.example/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/temporary-download-error.example/use/failed": 1, - } - assert "had no session ID assigned, unexpectedly" in caplog.text - - -@pytest.mark.parametrize( - ("settings", "meta", "meta_key"), - ( - ( - {}, - {}, - "zyte_api", - ), - ( - {}, - {"zyte_api": {}}, - "zyte_api", - ), - ( - {}, - {"zyte_api": {"httpResponseBody": True}}, - "zyte_api", - ), - ( - {}, - {"zyte_api_automap": True}, - "zyte_api_automap", - ), - ( - {"ZYTE_API_TRANSPARENT_MODE": True}, - {}, - "zyte_api_automap", - ), - ( - {"ZYTE_API_TRANSPARENT_MODE": True}, - {"zyte_api_automap": False}, - "zyte_api", - ), - ( - {"ZYTE_API_TRANSPARENT_MODE": True}, - {"zyte_api_automap": {}}, - "zyte_api_automap", - ), - ( - {"ZYTE_API_TRANSPARENT_MODE": True}, - {"zyte_api_automap": True}, - "zyte_api_automap", - ), - ), -) -@deferred_f_from_coro_f -async def test_assign_meta_key(settings, meta, meta_key, mockserver): - """Session ID is set in the zyte_api_provider meta key always, and in - either zyte_api or zyte_api_automap depending on some settings and meta - keys.""" - - class Tracker: - def __init__(self): - self.meta: Dict[str, Any] = {} - - def track(self, request: Request, spider: Spider): - self.meta = deepcopy(request.meta) - - tracker = Tracker() - - settings = { - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_URL": mockserver.urljoin("/"), - **settings, - } - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request( - "https://example.com", - meta=meta, - ) - - def parse(self, response): - pass - - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - - assert ( - tracker.meta["zyte_api_provider"]["session"] - == tracker.meta[meta_key]["session"] - ) - other_meta_key = "zyte_api" if meta_key != "zyte_api" else "zyte_api_automap" - assert tracker.meta.get(other_meta_key, False) is False - - -@deferred_f_from_coro_f -async def test_provider(mockserver): - pytest.importorskip("scrapy_poet") - - from scrapy_poet import DummyResponse - from zyte_common_items import Product - - class Tracker: - def __init__(self): - self.query: Dict[str, Any] = {} - - def track(self, request: Request, spider: Spider): - self.query = request.meta["zyte_api"] - - tracker = Tracker() - - settings = { - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - - async def start(self): - for request in self.start_requests(): - yield request - - def start_requests(self): - yield Request("https://example.com", callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - crawler = await get_crawler(settings, 
spider_cls=TestSpider, setup_engine=False) - crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, - } - assert "product" in tracker.query - - -class ExceptionRaisingDownloaderMiddleware: - @classmethod - def from_crawler(cls, crawler): - return cls(crawler) - - def __init__(self, crawler): - self.crawler = crawler - - async def process_request( - self, request: Request, spider: Spider | None = None - ) -> None: - if request.meta.get("_is_session_init_request", False): - return - raise self.crawler.exception - - -@pytest.mark.parametrize( - ("exception", "stat", "reason"), - ( - ( - mock_request_error( - status=422, response_content=b'{"type": "/problem/session-expired"}' - ), - "expired", - "session_expired", - ), - ( - mock_request_error(status=520), - "failed", - "download_error", - ), - ( - mock_request_error(status=521), - "failed", - "download_error", - ), - ( - mock_request_error(status=500), - None, - None, - ), - ( - ServerConnectionError(), - None, - None, - ), - ( - RuntimeError(), - None, - None, - ), - ), -) -@deferred_f_from_coro_f -async def test_exceptions(exception, stat, reason, mockserver, caplog): - settings = { - "DOWNLOADER_MIDDLEWARES": { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, - "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, - "tests.test_sessions.ExceptionRaisingDownloaderMiddleware": 675, - }, - "RETRY_TIMES": 0, - "ZYTE_API_SESSION_ENABLED": True, - "ZYTE_API_TRANSPARENT_MODE": True, - "ZYTE_API_URL": mockserver.urljoin("/"), - } - - class TestSpider(Spider): - name = "test" - start_urls = ["https://example.com"] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def parse(self, response): - pass - - caplog.clear() - caplog.set_level("ERROR") - crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) - crawler.exception = exception - await maybe_deferred_to_future(crawler.crawl()) - - session_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/sessions") - } - if stat is not None: - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, - f"scrapy-zyte-api/sessions/pools/example.com/use/{stat}": 1, - } - else: - assert session_stats == { - "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, - } - if reason is not None: - assert reason in caplog.text - - -@pytest.mark.parametrize( - ("meta", "expected"), - ( - ({}, False), - ({SESSION_INIT_META_KEY: False}, False), - ({SESSION_INIT_META_KEY: True}, True), - ), -) -def test_is_session_init_request(meta, expected): - actual = is_session_init_request(Request("https://example.com", meta=meta)) - assert expected == actual diff --git a/tests/test_sessions_assign.py b/tests/test_sessions_assign.py new file mode 100644 index 00000000..43324ba6 --- /dev/null +++ b/tests/test_sessions_assign.py @@ -0,0 +1,107 @@ +from copy import deepcopy +from typing import Any, Dict + +import pytest +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider, signals + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("settings", "meta", "meta_key"), + ( + ( + {}, + {}, + "zyte_api", + ), + ( + {}, + {"zyte_api": {}}, + "zyte_api", + ), + ( + {}, + {"zyte_api": {"httpResponseBody": True}}, + "zyte_api", + ), + ( + {}, + {"zyte_api_automap": True}, + "zyte_api_automap", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {}, + "zyte_api_automap", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {"zyte_api_automap": False}, + "zyte_api", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {"zyte_api_automap": {}}, + "zyte_api_automap", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {"zyte_api_automap": True}, + "zyte_api_automap", + ), + ), +) +@deferred_f_from_coro_f +async def test_assign_meta_key(settings, meta, meta_key, mockserver): + """Session ID is set in the zyte_api_provider meta key always, and in + either zyte_api or zyte_api_automap depending on some settings and meta + keys.""" + + class Tracker: + def __init__(self): + self.meta: Dict[str, Any] = {} + + def track(self, request: Request, spider: Spider): + self.meta = deepcopy(request.meta) + + tracker = Tracker() + + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + **settings, + } + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request( + "https://example.com", + meta=meta, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, {"example.com": (1, 1)}) + + assert ( + tracker.meta["zyte_api_provider"]["session"] + == tracker.meta[meta_key]["session"] + ) + other_meta_key = "zyte_api" if meta_key != "zyte_api" else "zyte_api_automap" + assert tracker.meta.get(other_meta_key, False) is False diff --git a/tests/test_sessions_check_custom.py b/tests/test_sessions_check_custom.py new file mode 100644 index 00000000..02c14b5b --- /dev/null +++ b/tests/test_sessions_check_custom.py @@ -0,0 +1,319 @@ +from typing import Dict, Tuple, Union + +import pytest +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider +from scrapy.exceptions import CloseSpider +from scrapy.http import Response +from scrapy.utils.misc import load_object + +from scrapy_zyte_api import SessionConfig, session_config +from scrapy_zyte_api._session import SESSION_INIT_META_KEY, session_config_registry +from scrapy_zyte_api.utils import ( + _RAW_CLASS_SETTING_SUPPORT, + maybe_deferred_to_future, +) + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + +mod = "tests.test_sessions_check_custom." 
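For reference, a small sketch of observing which plugin-managed session a request was assigned from a request_reached_downloader signal handler, reading the same zyte_api_provider meta key the new assignment test inspects; the class name and log wording are illustrative:

    from scrapy import Request, Spider


    class SessionLogger:
        def track(self, request: Request, spider: Spider) -> None:
            # Connect with:
            # crawler.signals.connect(SessionLogger().track,
            #                         signal=signals.request_reached_downloader)
            session = request.meta.get("zyte_api_provider", {}).get("session")
            if session is not None:
                spider.logger.debug(
                    "Session %s assigned to %s", session["id"], request.url
                )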
+ + +class ConstantChecker: + def __init__(self, result): + self._result = result + + def check(self, response: Response, request: Request) -> bool: + if self._result in (True, False): + return self._result + raise self._result + + +class TrueChecker(ConstantChecker): + def __init__(self): + super().__init__(True) + + +class FalseChecker(ConstantChecker): + def __init__(self): + super().__init__(False) + + +class CloseSpiderChecker(ConstantChecker): + def __init__(self): + super().__init__(CloseSpider("closed_by_checker")) + + +class UnexpectedExceptionChecker(ConstantChecker): + def __init__(self): + super().__init__(Exception) + + +class TrueCrawlerChecker(ConstantChecker): + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def __init__(self, crawler): + super().__init__(crawler.settings["ZYTE_API_SESSION_ENABLED"]) + + +class FalseCrawlerChecker(ConstantChecker): + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def __init__(self, crawler): + super().__init__(not crawler.settings["ZYTE_API_SESSION_ENABLED"]) + + +class UseChecker(ConstantChecker): + """Always pass for session initialization requests, apply the check logic + only on session use requests.""" + + def check(self, response: Response, request: Request) -> bool: + if response.meta.get(SESSION_INIT_META_KEY, False) is True: + return True + return super().check(response, request) + + +class FalseUseChecker(FalseChecker, UseChecker): + pass + + +class CloseSpiderUseChecker(CloseSpiderChecker, UseChecker): + pass + + +class UnexpectedExceptionUseChecker(UnexpectedExceptionChecker, UseChecker): + pass + + +class OnlyPassFirstInitChecker: + def __init__(self): + self.on_first_init = True + + def check(self, response: Response, request: Request) -> bool: + if self.on_first_init: + self.on_first_init = False + return True + return False + + +# NOTE: There is no use checker subclass for TrueChecker because the outcome +# would be the same (always return True), and there are no use checker +# subclasses for the crawler classes because the init use is enough to verify +# that using the crawler works. 
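A distilled sketch of a standalone session checker shaped like the classes above and wired in through the ZYTE_API_SESSION_CHECKER setting; the ban-page marker and dotted path are illustrative:

    from scrapy import Request
    from scrapy.http import Response


    class BanPageChecker:
        """Fail the session check when a response looks like a ban page."""

        def check(self, response: Response, request: Request) -> bool:
            return b"Access denied" not in response.body


    # settings.py (illustrative dotted path):
    # ZYTE_API_SESSION_CHECKER = "myproject.sessions.BanPageChecker"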
+ +CHECKER_TESTS: Tuple[ + Tuple[ + str, + str, + Dict[str, Union[Tuple[int, int], Dict[str, int]]], + ], + ..., +] = ( + (f"{mod}TrueChecker", "finished", {"example.com": (1, 1)}), + ( + f"{mod}FalseChecker", + "bad_session_inits", + {"example.com": {"init/check-failed": 1}}, + ), + ( + f"{mod}FalseUseChecker", + "finished", + {"example.com": {"init/check-passed": 2, "use/check-failed": 1}}, + ), + (f"{mod}CloseSpiderChecker", "closed_by_checker", {}), + ( + f"{mod}CloseSpiderUseChecker", + "closed_by_checker", + {"example.com": {"init/check-passed": 1}}, + ), + ( + f"{mod}UnexpectedExceptionChecker", + "bad_session_inits", + {"example.com": {"init/check-error": 1}}, + ), + ( + f"{mod}UnexpectedExceptionUseChecker", + "finished", + {"example.com": {"init/check-passed": 2, "use/check-error": 1}}, + ), + (f"{mod}TrueCrawlerChecker", "finished", {"example.com": (1, 1)}), + ( + f"{mod}FalseCrawlerChecker", + "bad_session_inits", + {"example.com": {"init/check-failed": 1}}, + ), + ( + f"{mod}OnlyPassFirstInitChecker", + "bad_session_inits", + { + "example.com": { + "init/check-passed": 1, + "init/check-failed": 1, + "use/check-failed": 1, + } + }, + ), +) + + +@pytest.mark.parametrize( + ("checker", "close_reason", "stats"), + ( + *CHECKER_TESTS, + *( + pytest.param( + load_object(checker), + close_reason, + stats, + marks=pytest.mark.skipif( + not _RAW_CLASS_SETTING_SUPPORT, + reason=( + "Configuring component classes instead of their " + "import paths requires Scrapy 2.4+." + ), + ), + ) + for checker, close_reason, stats in CHECKER_TESTS + ), + ), +) +@deferred_f_from_coro_f +async def test_checker(checker, close_reason, stats, mockserver): + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": checker, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert crawler.spider.close_reason == close_reason + assert_session_stats(crawler, stats) + + +class CloseSpiderURLChecker: + def check(self, response: Response, request: Request) -> bool: + if "fail" in request.url: + raise CloseSpider("closed_by_checker") + return True + + +@deferred_f_from_coro_f +async def test_checker_close_spider_use(mockserver): + """A checker can raise CloseSpider not only during session initialization, + but also during session use.""" + settings = { + **SESSION_SETTINGS, + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions_check_custom.CloseSpiderURLChecker", + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com/fail"] + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert crawler.spider.close_reason == "closed_by_checker" + assert_session_stats(crawler, {"example.com": {"init/check-passed": 1}}) + + +@deferred_f_from_coro_f +async def test_session_config_check_meta(mockserver): + """When initializing a session, known zyte_api_session-prefixed params + should be included in the 
session initialization request, so that they can + be used from check methods validating those requests. + + For example, when validating a location, access to + zyte_api_session_location may be necessary. + """ + pytest.importorskip("web_poet") + + params = { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + } + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + def check(self, response, request): + return ( + bool(self.location(request)) + and response.meta["zyte_api_session_params"] == params + and ( + ( + response.meta.get("_is_session_init_request", False) + and "zyte_api_session_foo" not in response.meta + ) + or response.meta["zyte_api_session_foo"] == "bar" + ) + ) + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": params, + "zyte_api_session_params": params, + "zyte_api_session_location": {"postalCode": "10001"}, + "zyte_api_session_foo": "bar", + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, {"example.com[0]": (1, 1)}) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] diff --git a/tests/test_sessions_check_default.py b/tests/test_sessions_check_default.py new file mode 100644 index 00000000..2ea4790e --- /dev/null +++ b/tests/test_sessions_check_default.py @@ -0,0 +1,86 @@ +import pytest +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider + +from scrapy_zyte_api.utils import ( + maybe_deferred_to_future, +) + +from . 
import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("postal_code", "url", "close_reason", "stats"), + ( + ( + None, + "https://postal-code-10001-soft.example", + "finished", + {"postal-code-10001-soft.example": (1, 1)}, + ), + ( + "10001", + "https://postal-code-10001-soft.example", + "finished", + {"postal-code-10001-soft.example": (1, 1)}, + ), + ( + "10002", + "https://postal-code-10001-soft.example", + "bad_session_inits", + {"postal-code-10001-soft.example": {"init/check-failed": 1}}, + ), + ( + "10001", + "https://no-location-support.example", + "unsupported_set_location", + {}, + ), + ), +) +@deferred_f_from_coro_f +async def test_checker_location(postal_code, url, close_reason, stats, mockserver): + """The default checker looks into the outcome of the ``setLocation`` action + if a location meta/setting was used.""" + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + if postal_code is not None: + settings["ZYTE_API_SESSION_LOCATION"] = {"postalCode": postal_code} + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_code}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert crawler.spider.close_reason == close_reason + assert_session_stats(crawler, stats) diff --git a/tests/test_sessions_check_errors.py b/tests/test_sessions_check_errors.py new file mode 100644 index 00000000..ce851b4a --- /dev/null +++ b/tests/test_sessions_check_errors.py @@ -0,0 +1,52 @@ +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider +from scrapy.http import Response +from scrapy.utils.httpobj import urlparse_cached + +from scrapy_zyte_api.utils import ( + maybe_deferred_to_future, +) + +from . 
import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +class DomainChecker: + def check(self, response: Response, request: Request) -> bool: + domain = urlparse_cached(request).netloc + return "fail" not in domain + + +@deferred_f_from_coro_f +async def test_check_overrides_error(mockserver): + """Max errors are ignored if a session does not pass its session check.""" + retry_times = 2 + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": retry_times, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions_check_errors.DomainChecker", + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_MAX_ERRORS": 2, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://session-check-fails.example"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "session-check-fails.example": { + "init/check-passed": retry_times + 2, + "use/check-failed": retry_times + 1, + } + }, + ) diff --git a/tests/test_sessions_config.py b/tests/test_sessions_config.py new file mode 100644 index 00000000..79dd9b76 --- /dev/null +++ b/tests/test_sessions_config.py @@ -0,0 +1,273 @@ +from typing import Optional + +import pytest +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider +from scrapy.http import Response +from scrapy.utils.httpobj import urlparse_cached + +from scrapy_zyte_api import ( + SessionConfig, + get_request_session_id, + is_session_init_request, + session_config, +) +from scrapy_zyte_api._session import session_config_registry +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@deferred_f_from_coro_f +async def test_session_config(mockserver): + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001-a.example", + "postal-code-10001-a-fail.example", + "postal-code-10001-a-alternative.example", + ] + ) + class CustomSessionConfig(SessionConfig): + def params(self, request: Request): + return { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + } + + def check(self, response: Response, request: Request) -> bool: + domain = urlparse_cached(request).netloc + return "fail" not in domain + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-a-alternative.example": + return "postal-code-10001-a.example" + return domain + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001-a.example", + "https://postal-code-10001-a-alternative.example", + "https://postal-code-10001-a-fail.example", + "https://postal-code-10001-b.example", + ] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "postal-code-10001-a.example": { + "init/check-passed": 2, + "use/check-passed": 2, + }, + "postal-code-10001-a-fail.example": {"init/check-failed": 1}, + "postal-code-10001-b.example": {"init/failed": 1}, + }, + ) + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. 
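+    # Calling __init__() on the existing registry instance resets it in
+    # place, so references to it held elsewhere remain valid.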
+ + session_config_registry.__init__() # type: ignore[misc] + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "postal-code-10001-a.example": {"init/failed": 1}, + "postal-code-10001-a-alternative.example": {"init/failed": 1}, + "postal-code-10001-a-fail.example": {"init/failed": 1}, + "postal-code-10001-b.example": {"init/failed": 1}, + }, + ) + + +@deferred_f_from_coro_f +async def test_session_config_no_web_poet(mockserver): + """If web-poet is not installed, @session_config raises a RuntimeError.""" + try: + import web_poet # noqa: F401 + except ImportError: + pass + else: + pytest.skip("Test only relevant when web-poet is not installed.") + + with pytest.raises(RuntimeError): + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + pass + + +@deferred_f_from_coro_f +async def test_session_config_process_request_change_request(mockserver): + pytest.importorskip("web_poet") + + @session_config("example.com") + class CustomSessionConfig(SessionConfig): + def __init__(self, crawler): + super().__init__(crawler) + self.session_data = {} + + def check(self, response: Response, request: Request) -> bool: + if is_session_init_request(request): + session_id = get_request_session_id(request) + self.session_data[session_id] = {"foo": "bar"} + return super().check(response, request) + + def process_request(self, request: Request) -> Optional[Request]: + session_id = get_request_session_id(request) + foo = self.session_data[session_id]["foo"] + request.headers["foo"] = foo + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + request_headers = [] + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + request_headers.append(response.request.headers["foo"]) + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert request_headers == [b"bar"] + + assert_session_stats(crawler, {"example.com": (1, 1)}) + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. 
+ + session_config_registry.__init__() # type: ignore[misc] + + +@deferred_f_from_coro_f +async def test_session_config_process_request_new_request(mockserver): + pytest.importorskip("web_poet") + + @session_config("example.com") + class CustomSessionConfig(SessionConfig): + def __init__(self, crawler): + super().__init__(crawler) + self.session_data = {} + + def check(self, response: Response, request: Request) -> bool: + if is_session_init_request(request): + session_id = get_request_session_id(request) + self.session_data[session_id] = {"foo": "bar"} + return super().check(response, request) + + def process_request(self, request: Request) -> Optional[Request]: + session_id = get_request_session_id(request) + foo = self.session_data[session_id]["foo"] + new_url = request.url.rstrip("/") + f"/{foo}" + return request.replace(url=new_url) + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + output_urls = [] + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + output_urls.append(response.url) + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert output_urls == ["https://example.com/bar"] + + assert_session_stats(crawler, {"example.com": (1, 1)}) + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + +@deferred_f_from_coro_f +async def test_session_config_params_error(mockserver): + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + def params(self, request: Request): + raise Exception + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, {"example.com": {"init/param-error": 1}}) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] diff --git a/tests/test_sessions_cookies.py b/tests/test_sessions_cookies.py new file mode 100644 index 00000000..22f0303a --- /dev/null +++ b/tests/test_sessions_cookies.py @@ -0,0 +1,94 @@ +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider, signals + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@deferred_f_from_coro_f +async def test_cookies(mockserver): + class Tracker: + def __init__(self): + self.cookies = [] + + def track(self, request: Request, spider: Spider): + cookie = request.headers.get(b"Cookie", None) + self.cookies.append(cookie) + + tracker = Tracker() + + settings = { + **SESSION_SETTINGS, + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request( + "https://example.com", + cookies={"a": "b"}, + meta={"zyte_api_session_enabled": False}, + ) + + def parse(self, response): + yield Request( + "https://example.com/2", + meta={"zyte_api_session_enabled": False}, + callback=self.parse2, + ) + + def parse2(self, response): + yield Request( + "https://example.com/3", + callback=self.parse3, + ) + + def parse3(self, response): + yield Request( + "https://example.com/4", + meta={"dont_merge_cookies": False}, + callback=self.parse4, + ) + + def parse4(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "/pools/example.com/init/check-passed": 2, + "/pools/example.com/use/check-passed": 2, + "/use/disabled": 2, + }, + ) + + assert tracker.cookies == [ + # The 1st request sets cookies and disables session management, so + # cookies are set. + b"a=b", + # The 2nd request disables session management, and gets the cookies set + # by the previous request in the global cookiejar. + b"a=b", + # The 3rd request uses session management, and neither the session init + # request nor the actual request using the session get cookies. + None, + None, + # The 4th request uses session management but sets dont_merge_cookies + # to ``False``, so while session init does not use cookies, the actual + # request using the session gets the cookies. + None, + b"a=b", + ] diff --git a/tests/test_sessions_enabled.py b/tests/test_sessions_enabled.py new file mode 100644 index 00000000..ce01d231 --- /dev/null +++ b/tests/test_sessions_enabled.py @@ -0,0 +1,97 @@ +import pytest +from scrapy import Request, Spider +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy.utils.httpobj import urlparse_cached + +from scrapy_zyte_api import ( + SessionConfig, + session_config, +) +from scrapy_zyte_api._session import session_config_registry +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import get_crawler, UNSET +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("setting", "meta", "outcome"), + ( + (UNSET, UNSET, False), + (UNSET, True, True), + (UNSET, False, False), + (True, UNSET, True), + (True, True, True), + (True, False, False), + (False, UNSET, False), + (False, True, True), + (False, False, False), + ), +) +@deferred_f_from_coro_f +async def test_enabled(setting, meta, outcome, mockserver): + settings = {"ZYTE_API_URL": mockserver.urljoin("/")} + if setting is not UNSET: + settings["ZYTE_API_SESSION_ENABLED"] = setting + meta_dict = {} + if meta is not UNSET: + meta_dict = {"zyte_api_session_enabled": meta} + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request("https://example.com", meta=meta_dict) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + if outcome: + assert_session_stats(crawler, {"example.com": (1, 1)}) + else: + assert_session_stats(crawler, {"/use/disabled": 1}) + + +@deferred_f_from_coro_f +async def test_session_config_enabled(mockserver): + pytest.importorskip("web_poet") + + @session_config(["enabled.example", "disabled.example"]) + class CustomSessionConfig(SessionConfig): + def enabled(self, request: Request): + return "enabled" in urlparse_cached(request).netloc + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://enabled.example", "https://disabled.example"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "/use/disabled": 1, + "/pools/enabled.example/init/check-passed": 1, + "/pools/enabled.example/use/check-passed": 1, + }, + ) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] diff --git a/tests/test_sessions_errors.py b/tests/test_sessions_errors.py new file mode 100644 index 00000000..3b5656d2 --- /dev/null +++ b/tests/test_sessions_errors.py @@ -0,0 +1,300 @@ +from collections import deque +from copy import copy +from typing import Any, Dict, Union +from unittest.mock import patch + +import pytest +from aiohttp.client_exceptions import ServerConnectionError +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider +from scrapy.http import Response +from zyte_api import RequestError + +from scrapy_zyte_api import ( + SESSION_AGGRESSIVE_RETRY_POLICY, + SESSION_DEFAULT_RETRY_POLICY, +) +from scrapy_zyte_api.utils import ( + _REQUEST_ERROR_HAS_QUERY, + maybe_deferred_to_future, +) + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +def mock_request_error(*, status=200, response_content=None): + kwargs: Dict[str, Any] = {} + if _REQUEST_ERROR_HAS_QUERY: + kwargs["query"] = {} + return RequestError( + history=None, + request_info=None, + response_content=response_content, + status=status, + **kwargs, + ) + + +# Number of times to test request errors that must be retried forever. 
+FOREVER_TIMES = 100 + + +class fast_forward: + def __init__(self, time): + self.time = time + + +@pytest.mark.parametrize( + ("retrying", "outcomes", "exhausted"), + ( + *( + (retry_policy, outcomes, exhausted) + for retry_policy in ( + SESSION_DEFAULT_RETRY_POLICY, + SESSION_AGGRESSIVE_RETRY_POLICY, + ) + for status in (520, 521) + for outcomes, exhausted in ( + ( + (mock_request_error(status=status),), + True, + ), + ( + (mock_request_error(status=429),), + False, + ), + ( + ( + mock_request_error(status=429), + mock_request_error(status=status), + ), + True, + ), + ) + ), + ), +) +@deferred_f_from_coro_f +@patch("time.monotonic") +async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): + monotonic_mock.return_value = 0 + last_outcome = outcomes[-1] + outcomes = deque(outcomes) + + def wait(retry_state): + return 0.0 + + retrying = copy(retrying) + retrying.wait = wait + + async def run(): + while True: + try: + outcome = outcomes.popleft() + except IndexError: + return + else: + if isinstance(outcome, fast_forward): + monotonic_mock.return_value += outcome.time + continue + raise outcome + + run = retrying.wraps(run) + try: + await run() + except Exception as outcome: + assert exhausted + assert outcome is last_outcome + else: + assert not exhausted + + +class SessionIDRemovingDownloaderMiddleware: + def process_exception( + self, request: Request, exception: Exception, spider: Spider | None = None + ) -> Union[Request, None]: + if not isinstance(exception, RequestError) or request.meta.get( + "_is_session_init_request", False + ): + return None + + del request.meta["zyte_api_automap"]["session"] + del request.meta["zyte_api_provider"]["session"] + return None + + +class SessionIDRemovingResponseMiddleware: + def process_response( + self, request: Request, response: Response, spider: Spider | None = None + ) -> Response: + if request.meta.get("_is_session_init_request", False): + return response + for meta_key in ("zyte_api", "zyte_api_automap", "zyte_api_provider"): + if meta_key in request.meta and isinstance(request.meta[meta_key], dict): + request.meta[meta_key].pop("session", None) + return response + + +@deferred_f_from_coro_f +async def test_missing_session_id(mockserver, caplog): + """If a session ID is missing from a request that should have had it + assigned, a warning is logged about it.""" + + settings = { + **SESSION_SETTINGS, + "DOWNLOADER_MIDDLEWARES": { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, + "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, + "tests.test_sessions_errors.SessionIDRemovingDownloaderMiddleware": 675, + }, + "RETRY_TIMES": 0, + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://temporary-download-error.example"] + + def parse(self, response): + pass + + caplog.clear() + caplog.set_level("WARNING") + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + {"temporary-download-error.example": {"init/check-passed": 1, "use/failed": 1}}, + ) + assert "had no session ID assigned, unexpectedly" in caplog.text + + +class ExceptionRaisingDownloaderMiddleware: + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + 
+ def __init__(self, crawler): + self.crawler = crawler + + async def process_request( + self, request: Request, spider: Spider | None = None + ) -> None: + if request.meta.get("_is_session_init_request", False): + return + raise self.crawler.exception + + +@pytest.mark.parametrize( + ("exception", "stat", "reason"), + ( + ( + mock_request_error( + status=422, response_content=b'{"type": "/problem/session-expired"}' + ), + "expired", + "session_expired", + ), + ( + mock_request_error(status=520), + "failed", + "download_error", + ), + ( + mock_request_error(status=521), + "failed", + "download_error", + ), + ( + mock_request_error(status=500), + None, + None, + ), + ( + ServerConnectionError(), + None, + None, + ), + ( + RuntimeError(), + None, + None, + ), + ), +) +@deferred_f_from_coro_f +async def test_exceptions(exception, stat, reason, mockserver, caplog): + settings = { + **SESSION_SETTINGS, + "DOWNLOADER_MIDDLEWARES": { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, + "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, + "tests.test_sessions_errors.ExceptionRaisingDownloaderMiddleware": 675, + }, + "RETRY_TIMES": 0, + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def parse(self, response): + pass + + caplog.clear() + caplog.set_level("ERROR") + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.exception = exception + await maybe_deferred_to_future(crawler.crawl()) + + if stat is not None: + assert_session_stats( + crawler, + {"example.com": {"init/check-passed": 2, f"use/{stat}": 1}}, + ) + else: + assert_session_stats(crawler, {"example.com": {"init/check-passed": 1}}) + if reason is not None: + assert reason in caplog.text + + +@deferred_f_from_coro_f +async def test_missing_session_id_on_response(mockserver, caplog): + settings = { + **SESSION_SETTINGS, + "DOWNLOADER_MIDDLEWARES": { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, + "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, + "tests.test_sessions_errors.SessionIDRemovingResponseMiddleware": 675, + }, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions_check_errors.DomainChecker", + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://session-check-fails.example"] + + def parse(self, response): + pass + + caplog.clear() + caplog.set_level("WARNING") + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert "had no session ID assigned, unexpectedly" in caplog.text diff --git a/tests/test_sessions_init_bad.py b/tests/test_sessions_init_bad.py new file mode 100644 index 00000000..f9cfc21a --- /dev/null +++ b/tests/test_sessions_init_bad.py @@ -0,0 +1,52 @@ +import pytest +from scrapy import Spider +from scrapy.utils.defer import deferred_f_from_coro_f + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("global_setting", "pool_setting", "value"), + ( + (None, 0, 1), + (None, 1, 1), + (None, 2, 2), + (3, None, 3), + ), +) +@deferred_f_from_coro_f +async def test_max_bad_inits_per_pool(global_setting, pool_setting, value, mockserver): + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_PARAMS": {"browserHtml": True, "httpResponseBody": True}, + } + if global_setting is not None: + settings["ZYTE_API_SESSION_MAX_BAD_INITS"] = global_setting + if pool_setting is not None: + settings["ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL"] = { + "pool.example": pool_setting + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com", "https://pool.example"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "example.com": { + "init/failed": 8 if global_setting is None else global_setting + }, + "pool.example": {"init/failed": value}, + }, + ) diff --git a/tests/test_sessions_init_location.py b/tests/test_sessions_init_location.py new file mode 100644 index 00000000..ed79ee87 --- /dev/null +++ b/tests/test_sessions_init_location.py @@ -0,0 +1,343 @@ +import pytest +from scrapy import Request, Spider +from scrapy.utils.defer import deferred_f_from_coro_f + +from scrapy_zyte_api import SessionConfig, session_config +from scrapy_zyte_api._session import session_config_registry +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("settings", "meta", "used"), + ( + ({}, {}, True), + ( + { + "ZYTE_API_SESSION_PARAMS": { + "actions": [ + {"action": "setLocation", "address": {"postalCode": "10002"}} + ] + } + }, + {}, + False, + ), + ({"ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}}, {}, False), + ( + {}, + { + "zyte_api_session_params": { + "actions": [ + {"action": "setLocation", "address": {"postalCode": "10002"}} + ] + } + }, + False, + ), + ({}, {"zyte_api_session_location": {"postalCode": "10002"}}, False), + ), +) +@deferred_f_from_coro_f +async def test_session_config_location(settings, meta, used, mockserver): + """Overriding location in SessionConfig, if done according to the docs, + only has an effect when neither spider-level nor request-level variables + are used to modify params.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example"]) + class CustomSessionConfig(SessionConfig): + def location(self, request: Request): + return super().location(request) or {"postalCode": "10001"} + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + **settings, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + **meta, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await 
maybe_deferred_to_future(crawler.crawl()) + + if used: + assert_session_stats(crawler, {"postal-code-10001.example": (1, 1)}) + else: + pool = ( + "postal-code-10001.example[0]" + if "zyte_api_session_params" in meta + else ( + "postal-code-10001.example@10002" + if "zyte_api_session_location" in meta + else "postal-code-10001.example" + ) + ) + assert_session_stats(crawler, {pool: {"init/failed": 1}}) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@pytest.mark.parametrize( + ("settings", "meta", "used"), + ( + ({}, {}, True), + ( + { + "ZYTE_API_SESSION_PARAMS": { + "actions": [ + {"action": "setLocation", "address": {"postalCode": "10002"}} + ] + } + }, + {}, + False, + ), + ({"ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}}, {}, True), + ( + {}, + { + "zyte_api_session_params": { + "actions": [ + {"action": "setLocation", "address": {"postalCode": "10002"}} + ] + } + }, + False, + ), + ({}, {"zyte_api_session_location": {"postalCode": "10002"}}, True), + ), +) +@deferred_f_from_coro_f +async def test_session_config_location_bad(settings, meta, used, mockserver): + """Overriding location in SessionConfig, if it does not return + super().location() when truthy, breaks params precedence for location meta + key and setting, but does not break raw params meta key and setting.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example"]) + class CustomSessionConfig(SessionConfig): + def location(self, request: Request): + return {"postalCode": "10001"} + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + **settings, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + **meta, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + pool = ( + "postal-code-10001.example[0]" + if "zyte_api_session_params" in meta + else ( + "postal-code-10001.example@10002" + if "zyte_api_session_location" in meta + else "postal-code-10001.example" + ) + ) + if used: + assert_session_stats( + crawler, {pool: {"init/check-passed": 1, "use/check-passed": 1}} + ) + else: + assert_session_stats(crawler, {pool: {"init/failed": 1}}) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@deferred_f_from_coro_f +async def test_session_config_params_location(mockserver): + """A custom session config can be used to customize the params for + location, e.g. 
to include extra actions, while still relying on the default + check to determine whether or not the session remains valid based on the + outcome of the ``setLocation`` action.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example"]) + class CustomSessionConfig(SessionConfig): + def params(self, request: Request): + return { + "actions": [ + { + "action": "waitForNavigation", + }, + { + "action": "setLocation", + "address": self.location(request), + }, + ] + } + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, {"postal-code-10001.example": (1, 1)}) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@deferred_f_from_coro_f +async def test_session_config_params_location_no_set_location(mockserver): + """A custom session config can be used to customize the params for + location to the point where they do not use a ``setLocation`` action. In + that case, the default session check will return ``True`` by default, i.e. + it will not fail due to not finding ``setLocation`` in response actions + data.""" + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + def params(self, request: Request): + postal_code = self.location(request)["postalCode"] + return { + "actions": [ + { + "action": "click", + "selector": {"type": "css", "value": f"#zip{postal_code}"}, + }, + ] + } + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, {"example.com": (1, 1)}) + + # Clean up the session config registry. 
+ session_config_registry.__init__() # type: ignore[misc] diff --git a/tests/test_sessions_init_location_config.py b/tests/test_sessions_init_location_config.py new file mode 100644 index 00000000..9264a1ee --- /dev/null +++ b/tests/test_sessions_init_location_config.py @@ -0,0 +1,267 @@ +from typing import Any, Dict + +import pytest +from scrapy import Request, Spider +from scrapy.http import Response +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy.utils.httpobj import urlparse_cached + +from scrapy_zyte_api import LocationSessionConfig, session_config +from scrapy_zyte_api._session import session_config_registry +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@deferred_f_from_coro_f +async def test_location_session_config(mockserver): + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001.example", + "postal-code-10001-fail.example", + "postal-code-10001-alternative.example", + ] + ) + class CustomSessionConfig(LocationSessionConfig): + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + assert location == {"postalCode": "10002"} + return { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + } + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + assert location == {"postalCode": "10002"} + domain = urlparse_cached(request).netloc + return "fail" not in domain + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-alternative.example": + return "postal-code-10001.example" + return domain + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + # We set a location to force the location-specific methods of the + # session config class to be called, but we set the wrong location so + # that the test would not pass were it not for our custom + # implementation which ignores the input location and instead sets the + # right one. + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001.example", + "https://postal-code-10001-alternative.example", + "https://postal-code-10001-fail.example", + ] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "postal-code-10001.example": { + "init/check-passed": 2, + "use/check-passed": 2, + }, + "postal-code-10001-fail.example": {"init/check-failed": 1}, + }, + ) + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. 
+ + session_config_registry.__init__() # type: ignore[misc] + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "postal-code-10001.example": {"init/failed": 1}, + "postal-code-10001-alternative.example": {"init/failed": 1}, + "postal-code-10001-fail.example": {"init/failed": 1}, + }, + ) + + +@deferred_f_from_coro_f +async def test_location_session_config_no_methods(mockserver): + """If no location_* methods are defined, LocationSessionConfig works the + same as SessionConfig.""" + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001.example", + "postal-code-10001-alternative.example", + ] + ) + class CustomSessionConfig(LocationSessionConfig): + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-alternative.example": + return "postal-code-10001.example" + return domain + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001.example", + "https://postal-code-10001-alternative.example", + ] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + {"postal-code-10001.example": {"init/check-passed": 2, "use/check-passed": 2}}, + ) + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. 
+ + session_config_registry.__init__() # type: ignore[misc] + + +@deferred_f_from_coro_f +async def test_location_session_config_no_location(mockserver): + """If no location is configured, the methods are never called.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example", "a.example"]) + class CustomSessionConfig(LocationSessionConfig): + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + assert False + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + assert False + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example", "https://a.example"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "postal-code-10001.example": {"init/failed": 1}, + "a.example": {"init/check-passed": 1, "use/check-passed": 1}, + }, + ) + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] diff --git a/tests/test_sessions_init_precedence.py b/tests/test_sessions_init_precedence.py new file mode 100644 index 00000000..8630d1cf --- /dev/null +++ b/tests/test_sessions_init_precedence.py @@ -0,0 +1,330 @@ +from typing import Any, Dict + +import pytest +from scrapy import Request, Spider +from scrapy.utils.defer import deferred_f_from_coro_f + +from scrapy_zyte_api import SessionConfig, session_config +from scrapy_zyte_api._session import session_config_registry +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import SESSION_SETTINGS, get_crawler, UNSET +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("params_setting", "params_meta", "location_setting", "location_meta", "outcome"), + ( + (UNSET, UNSET, UNSET, UNSET, False), + (UNSET, UNSET, UNSET, None, False), + (UNSET, UNSET, UNSET, False, False), + (UNSET, UNSET, UNSET, True, True), + (UNSET, UNSET, False, UNSET, False), + (UNSET, UNSET, False, None, False), + (UNSET, UNSET, False, False, False), + (UNSET, UNSET, False, True, True), + (UNSET, UNSET, True, UNSET, True), + (UNSET, UNSET, True, None, False), + (UNSET, UNSET, True, False, False), + (UNSET, UNSET, True, True, True), + (UNSET, False, UNSET, UNSET, False), + (UNSET, False, UNSET, None, False), + (UNSET, False, UNSET, False, False), + (UNSET, False, UNSET, True, False), + (UNSET, False, False, UNSET, False), + (UNSET, False, False, None, False), + (UNSET, False, False, False, False), + (UNSET, False, False, True, False), + (UNSET, False, True, UNSET, False), + (UNSET, False, True, None, False), + (UNSET, False, True, False, False), + (UNSET, False, True, True, False), + (UNSET, True, UNSET, UNSET, True), + (UNSET, True, UNSET, None, True), + (UNSET, True, UNSET, False, True), + (UNSET, True, UNSET, True, True), + (UNSET, True, False, UNSET, True), + (UNSET, True, False, None, True), + (UNSET, True, False, False, True), + (UNSET, True, False, True, True), + (UNSET, True, True, UNSET, True), + (UNSET, True, True, None, True), + (UNSET, True, True, False, True), + (UNSET, True, True, True, True), + (False, UNSET, UNSET, UNSET, False), + (False, UNSET, UNSET, None, False), + (False, UNSET, UNSET, False, False), + (False, UNSET, UNSET, True, True), + (False, UNSET, False, UNSET, False), + (False, UNSET, False, None, False), + (False, UNSET, False, False, False), + (False, UNSET, False, True, True), + (False, UNSET, True, UNSET, False), + (False, UNSET, True, None, False), + (False, UNSET, True, False, False), + (False, UNSET, True, True, True), + (False, False, UNSET, UNSET, False), + (False, False, UNSET, None, False), + (False, False, UNSET, False, False), + (False, False, UNSET, True, False), + (False, False, False, UNSET, False), + (False, False, False, None, False), + (False, False, False, False, False), + (False, False, False, True, False), + (False, False, True, UNSET, False), + (False, False, True, None, False), + (False, False, True, False, False), + (False, False, True, True, False), + (False, True, UNSET, UNSET, True), + (False, True, UNSET, None, True), + (False, True, UNSET, False, True), + (False, True, UNSET, True, True), + (False, True, False, UNSET, True), + (False, True, False, None, True), + (False, True, False, False, True), + (False, True, False, True, True), + (False, True, True, UNSET, True), + (False, True, True, None, True), + (False, True, True, False, True), + (False, True, True, True, True), + (True, UNSET, UNSET, UNSET, True), + (True, UNSET, UNSET, None, True), + (True, UNSET, UNSET, False, False), + (True, UNSET, UNSET, True, True), + (True, UNSET, False, UNSET, True), + (True, UNSET, False, None, True), + (True, UNSET, False, False, False), + (True, UNSET, False, True, True), + (True, UNSET, True, UNSET, True), + (True, UNSET, True, None, True), + (True, UNSET, True, False, False), + (True, UNSET, True, True, True), + (True, False, UNSET, UNSET, False), + (True, False, UNSET, None, False), + (True, False, UNSET, False, False), + (True, False, UNSET, True, False), + (True, False, False, UNSET, False), + (True, False, False, None, 
False), + (True, False, False, False, False), + (True, False, False, True, False), + (True, False, True, UNSET, False), + (True, False, True, None, False), + (True, False, True, False, False), + (True, False, True, True, False), + (True, True, UNSET, UNSET, True), + (True, True, UNSET, None, True), + (True, True, UNSET, False, True), + (True, True, UNSET, True, True), + (True, True, False, UNSET, True), + (True, True, False, None, True), + (True, True, False, False, True), + (True, True, False, True, True), + (True, True, True, UNSET, True), + (True, True, True, None, True), + (True, True, True, False, True), + (True, True, True, True, True), + ), +) +@deferred_f_from_coro_f +async def test_params_precedence( + params_setting, params_meta, location_setting, location_meta, outcome, mockserver +): + postal_codes = {True: "10001", False: "10002"} + pool = ( + "postal-code-10001.example[0]" + if params_meta in postal_codes + else ( + f"postal-code-10001.example@{postal_codes[location_meta]}" + if location_meta in postal_codes + else "postal-code-10001.example" + ) + ) + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + meta: Dict[str, Any] = {} + + if params_setting is not UNSET: + settings["ZYTE_API_SESSION_PARAMS"] = { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_codes[params_setting]}, + } + ] + } + if params_meta is not UNSET: + meta["zyte_api_session_params"] = { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_codes[params_meta]}, + } + ] + } + if location_setting is not UNSET: + settings["ZYTE_API_SESSION_LOCATION"] = { + "postalCode": postal_codes[location_setting] + } + if location_meta is None: + meta["zyte_api_session_location"] = {} + elif location_meta is not UNSET: + meta["zyte_api_session_location"] = {"postalCode": postal_codes[location_meta]} + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request( + "https://postal-code-10001.example", + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_codes[True]}, + } + ] + }, + **meta, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + if outcome: + assert_session_stats( + crawler, {pool: {"init/check-passed": 1, "use/check-passed": 1}} + ) + else: + assert_session_stats(crawler, {pool: {"init/failed": 1}}) + + +@pytest.mark.parametrize( + ("meta", "settings", "pool", "outcome"), + ( + ({}, {}, "postal-code-10001.example", False), + ( + { + "zyte_api_session_params": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + }, + ] + } + }, + {}, + "postal-code-10001.example[0]", + True, + ), + ( + {"zyte_api_session_location": {"postalCode": "10001"}}, + {}, + "postal-code-10001.example@10001", + False, + ), + ( + {}, + { + "ZYTE_API_SESSION_PARAMS": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + }, + ] + } + }, + "postal-code-10001.example", + True, + ), + ( + {}, + {"ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}}, + "postal-code-10001.example", + False, + ), + ), +) +@deferred_f_from_coro_f +async def test_session_config_params_precedence( + meta, settings, pool, outcome, mockserver +): + """A params override 
should have no impact on the use of the + zyte_api_session_params request metadata key or the use of the + ZYTE_API_SESSION_PARAMS setting. However, it can nullify locations if not + implemented with support for them as the default implementation has.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example"]) + class CustomSessionConfig(SessionConfig): + def params(self, request: Request): + return { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10002"}, + }, + ] + } + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + **settings, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + }, + ], + }, + **meta, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + if outcome: + assert_session_stats( + crawler, {pool: {"init/check-passed": 1, "use/check-passed": 1}} + ) + else: + assert_session_stats(crawler, {pool: {"init/failed": 1}}) + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] diff --git a/tests/test_sessions_init_url.py b/tests/test_sessions_init_url.py new file mode 100644 index 00000000..257bd385 --- /dev/null +++ b/tests/test_sessions_init_url.py @@ -0,0 +1,53 @@ +import pytest +from scrapy import Spider +from scrapy.utils.defer import deferred_f_from_coro_f + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("params", "close_reason", "stats"), + ( + ( + {"browserHtml": True}, + "bad_session_inits", + {"forbidden.example": {"init/failed": 1}}, + ), + ( + {"browserHtml": True, "url": "https://example.com"}, + "failed_forbidden_domain", + {"forbidden.example": {"init/check-passed": 1}}, + ), + ), +) +@deferred_f_from_coro_f +async def test_url_override(params, close_reason, stats, mockserver): + """If session params define a URL, that URL is used for session + initialization. 
Otherwise, the URL from the request getting the session + assigned first is used for session initialization.""" + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_PARAMS": params, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://forbidden.example"] + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert crawler.spider.close_reason == close_reason + assert_session_stats(crawler, stats) diff --git a/tests/test_sessions_max.py b/tests/test_sessions_max.py new file mode 100644 index 00000000..2efa0c70 --- /dev/null +++ b/tests/test_sessions_max.py @@ -0,0 +1,109 @@ +from math import floor +from urllib.parse import urlparse + +import pytest +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Spider + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + +RETRY_TIMES = 2 +TEST_CASES = [ + *( + ( + "https://example.com", + ( + { + **( + {} + if setting is None + else {"ZYTE_API_SESSION_MAX_BAD_INITS": setting} + ), + "ZYTE_API_SESSION_PARAMS": { + "browserHtml": True, + "httpResponseBody": True, + }, + } + ), + {"init/failed": value}, + ) + for setting, value in ((0, 1), (1, 1), (2, 2), (None, 8)) + ), + *( + ( + "https://example.com", + ( + { + **( + {} + if setting is None + else {"ZYTE_API_SESSION_MAX_CHECK_FAILURES": setting} + ), + "RETRY_TIMES": RETRY_TIMES, + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions_check_custom.FalseUseChecker", + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + ), + { + "init/check-passed": floor((RETRY_TIMES + 1) / value) + 1, + "use/check-failed": RETRY_TIMES + 1, + }, + ) + for setting, value in ((None, 1), (0, 1), (1, 1), (2, 2)) + ), + *( + ( + "https://temporary-download-error.example", + ( + { + **( + {} + if setting is None + else {"ZYTE_API_SESSION_MAX_ERRORS": setting} + ), + "RETRY_TIMES": RETRY_TIMES, + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + ), + { + "init/check-passed": floor((RETRY_TIMES + 1) / value) + 1, + "use/failed": RETRY_TIMES + 1, + }, + ) + for setting, value in ((None, 1), (0, 1), (1, 1), (2, 2)) + ), +] + + +@pytest.mark.parametrize( + ("start_url", "settings", "expected_stats"), + TEST_CASES, +) +@deferred_f_from_coro_f +async def test_max(start_url, settings, expected_stats, mockserver): + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": 0.001, + **settings, + } + + class TestSpider(Spider): + name = "test" + start_urls = [start_url] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + pool = urlparse(start_url).netloc + assert_session_stats(crawler, {pool: expected_stats}) diff --git a/tests/test_sessions_poet.py b/tests/test_sessions_poet.py new file mode 100644 index 00000000..07ab4056 --- /dev/null +++ b/tests/test_sessions_poet.py @@ -0,0 +1,53 @@ 
+import pytest + +pytest.importorskip("scrapy_poet") + +from typing import Any, Dict + +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider, signals + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + +from scrapy_poet import DummyResponse +from zyte_common_items import Product + + +@deferred_f_from_coro_f +async def test_provider(mockserver): + class Tracker: + def __init__(self): + self.query: Dict[str, Any] = {} + + def track(self, request: Request, spider: Spider): + self.query = request.meta["zyte_api"] + + tracker = Tracker() + + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request("https://example.com", callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, {"example.com": (1, 1)}) + assert "product" in tracker.query diff --git a/tests/test_sessions_pool.py b/tests/test_sessions_pool.py new file mode 100644 index 00000000..a5a791e7 --- /dev/null +++ b/tests/test_sessions_pool.py @@ -0,0 +1,524 @@ +from asyncio import sleep +from collections import deque + +import pytest +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider + +from scrapy_zyte_api import SessionConfig, session_config +from scrapy_zyte_api._session import ( + ScrapyZyteAPISessionDownloaderMiddleware, + session_config_registry, +) +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . 
import SESSION_SETTINGS, get_crawler, get_downloader_middleware +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("meta", "pool"), + ( + ({}, "example.com"), + ({"zyte_api_session_location": {"postalCode": "10001"}}, "example.com@10001"), + ( + {"zyte_api_session_location": {"postalCode": "10001", "foo": "bar"}}, + "example.com@10001", + ), + ( + { + "zyte_api_session_location": { + "addressCountry": "US", + "addressRegion": "TX", + } + }, + "example.com@US,TX", + ), + ( + { + "zyte_api_session_location": { + "addressCountry": "ES", + "addressRegion": "Pontevedra", + "streetAddress": "Rúa do Príncipe, 123", + "postalCode": "12345", + } + }, + "example.com@ES,Pontevedra,12345,Rúa do Príncipe, 123", + ), + ( + { + "zyte_api_session_params": {"foo": "bar"}, + "zyte_api_session_location": {"postalCode": "10001"}, + }, + "example.com[0]", + ), + ( + { + "zyte_api_session_pool": "foo", + "zyte_api_session_params": {"foo": "bar"}, + "zyte_api_session_location": {"postalCode": "10001"}, + }, + "foo", + ), + ), +) +@deferred_f_from_coro_f +async def test_pool(meta, pool, mockserver): + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request("https://example.com", meta=meta) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, {pool: {"init/check-passed": 1, "use/check-passed": 1}} + ) + + +@deferred_f_from_coro_f +async def test_pool_params(mockserver, caplog): + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + yield Request( + "https://example.com/a", + meta={"zyte_api_session_params": {"foo": "bar"}}, + ) + yield Request( + "https://example.com/b", + meta={"zyte_api_session_params": {"foo": "bar"}}, + ) + yield Request( + "https://example.com/c", + meta={"zyte_api_session_params": {"foo": "baz"}}, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + caplog.clear() + caplog.set_level("INFO") + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "example.com[0]": {"init/check-passed": 1, "use/check-passed": 2}, + "example.com[1]": {"init/check-passed": 1, "use/check-passed": 1}, + }, + ) + expected_logs = { + ( + "INFO", + "Session pool example.com[0] uses these session initialization parameters: {'foo': 'bar'}", + ): 0, + ( + "INFO", + "Session pool example.com[1] uses these session initialization parameters: {'foo': 'baz'}", + ): 0, + } + for record in caplog.records: + entry = (record.levelname, record.msg) + if entry in expected_logs: + expected_logs[entry] += 1 + assert all(v == 1 for v in expected_logs.values()) + + +@deferred_f_from_coro_f +async def test_session_config_pool_caching(mockserver): + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + def __init__(self, crawler): + super().__init__(crawler) + self.pools = deque(("example.com",)) + + def pool(self, request: Request): + # The following code would fail 
on the second call, which never
+            # happens due to pool caching.
+            return self.pools.popleft()
+
+    settings = {
+        **SESSION_SETTINGS,
+        "RETRY_TIMES": 0,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"},
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://example.com"]
+
+        def parse(self, response):
+            pass
+
+        def closed(self, reason):
+            self.close_reason = reason
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await maybe_deferred_to_future(crawler.crawl())
+
+    assert_session_stats(
+        crawler, {"example.com": {"init/check-passed": 1, "use/check-passed": 1}}
+    )
+    assert crawler.spider.close_reason == "finished"
+
+    # Clean up the session config registry.
+    session_config_registry.__init__()  # type: ignore[misc]
+
+
+@pytest.mark.parametrize("outcome", [Exception, 123, {}])
+@deferred_f_from_coro_f
+async def test_pool_error(mockserver, outcome):
+    pytest.importorskip("web_poet")
+
+    @session_config(["example.com"])
+    class CustomSessionConfig(SessionConfig):
+        def pool(self, request: Request):
+            if isinstance(outcome, type) and issubclass(outcome, Exception):
+                raise outcome
+            return outcome
+
+    settings = {
+        **SESSION_SETTINGS,
+        "RETRY_TIMES": 0,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"},
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://example.com"]
+
+        def parse(self, response):
+            pass
+
+        def closed(self, reason):
+            self.close_reason = reason
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await maybe_deferred_to_future(crawler.crawl())
+
+    assert_session_stats(crawler, {})
+    assert crawler.spider.close_reason == "pool_error"
+
+    # Clean up the session config registry.
+    session_config_registry.__init__()  # type: ignore[misc]
+
+
+@deferred_f_from_coro_f
+async def test_mw_get_pool(mockserver):
+    settings = {
+        **SESSION_SETTINGS,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+    }
+    crawler = await get_crawler(settings)
+    mw = get_downloader_middleware(crawler, ScrapyZyteAPISessionDownloaderMiddleware)
+    request = Request("https://example.com", meta={"zyte_api_session_pool": "foo"})
+    assert mw.get_pool(request) == "foo"
+
+    # get_pool() returns None if plugin-managed sessions are disabled.
+ settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + } + crawler = await get_crawler(settings) + mw = get_downloader_middleware(crawler, ScrapyZyteAPISessionDownloaderMiddleware) + assert mw.get_pool(request) is None + + +@pytest.mark.parametrize( + ("settings", "meta", "expected"), + ( + ({}, None, 0.0), + ({"DOWNLOAD_DELAY": 1.0}, None, 1.0), + ({"ZYTE_API_SESSION_DELAY": 1.5}, None, 1.5), + ({}, "example.com", 0.0), + ({}, {"id": "example.com", "delay": 1.5}, 1.5), + ( + {"ZYTE_API_SESSION_POOLS": {"example.com": {"delay": 0.5}}}, + {"id": "example.com", "delay": 1.5}, + 0.5, + ), + ), +) +@deferred_f_from_coro_f +async def test_delay(settings, meta, expected, mockserver, monkeypatch): + queue_wait_time = expected + 0.1 + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": queue_wait_time, + "ZYTE_API_SESSION_RANDOMIZE_DELAY": False, + **settings, + } + + sleep_calls = [] + + async def fake_sleep(delay): + if delay != pytest.approx(queue_wait_time): + sleep_calls.append(delay) + await sleep(0) + + monkeypatch.setattr("scrapy_zyte_api._session.sleep", fake_sleep) + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for url in self.start_urls: + if meta is None: + yield Request(url) + else: + yield Request(url, meta={"zyte_api_session_pool": meta}) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert len(sleep_calls) == 1 + assert sleep_calls[0] == pytest.approx(expected) + + +@deferred_f_from_coro_f +async def test_delay_reuse(mockserver, monkeypatch): + """Ensure that non-random delays during session reuse (as opposed to + creation) work as expected.""" + expected = 0.0 # No delay by default + queue_wait_time = expected + 0.1 + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": queue_wait_time, + "ZYTE_API_SESSION_RANDOMIZE_DELAY": False, + } + + sleep_calls = [] + + async def fake_sleep(delay): + if delay != pytest.approx(queue_wait_time): + sleep_calls.append(delay) + await sleep(0) + + monkeypatch.setattr("scrapy_zyte_api._session.sleep", fake_sleep) + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] * 2 + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert len(sleep_calls) == 1 + assert sleep_calls[0] == pytest.approx(expected) + + +@pytest.mark.parametrize( + ("settings", "start_requests"), + ( + ({"ZYTE_API_SESSION_RANDOMIZE_DELAY": True}, ["https://example.com"] * 2), + ( + {"ZYTE_API_SESSION_POOLS": {"example.com": {"randomize_delay": True}}}, + ["https://example.com"] * 2, + ), + ( + {}, + [ + Request( + "https://example.com", + meta={ + "zyte_api_session_pool": { + "id": "example.com", + "randomize_delay": True, + } + }, + ) + for _ in range(2) + ], + ), + ), +) +@deferred_f_from_coro_f +async def test_delay_random(settings, start_requests, mockserver, monkeypatch): + base_delay = 1.0 + queue_wait_time = base_delay * 2 + settings = { + "RANDOMIZE_DOWNLOAD_DELAY": False, + "ZYTE_API_URL": mockserver.urljoin("/"), + 
"ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_SESSION_DELAY": base_delay, + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": queue_wait_time, + **settings, + } + + sleep_calls = [] + + async def fake_sleep(delay): + if delay != pytest.approx(queue_wait_time): + sleep_calls.append(delay) + await sleep(0) + + monkeypatch.setattr("scrapy_zyte_api._session.sleep", fake_sleep) + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for item in start_requests: + if isinstance(item, str): + yield Request(item, dont_filter=True) + else: + yield item.replace(dont_filter=True) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert len(sleep_calls) == 2 + assert any(call != pytest.approx(base_delay) for call in sleep_calls) + + +@pytest.mark.parametrize( + ("settings", "start_requests", "expected_stats"), + ( + ( + {"ZYTE_API_SESSION_POOL_SIZE": 1}, + ["https://example.com"] * (1 + 1), + {"example.com": (1, 1 + 1)}, + ), + ( + {}, + ["https://example.com"] * (8 + 1), + {"example.com": (8, 8 + 1)}, + ), + ( + {"ZYTE_API_SESSION_POOL_SIZES": {"pool.example": 1}}, + (["https://example.com", "https://pool.example"] * (1 + 1)), + {"example.com": (1 + 1, 1 + 1), "pool.example": (1, 1 + 1)}, + ), + ( + { + "ZYTE_API_SESSION_POOL_SIZES": {"example.com": 2}, + "ZYTE_API_SESSION_POOLS": {"example.com": {}}, + }, + ["https://example.com"] * (2 + 1), + {"example.com": (2, 2 + 1)}, + ), + ( + { + "ZYTE_API_SESSION_POOL_SIZES": {"example.com": 2}, + "ZYTE_API_SESSION_POOLS": {"example.com": {"size": 1}}, + }, + ["https://example.com"] * (1 + 1), + {"example.com": (1, 1 + 1)}, + ), + ( + {"ZYTE_API_SESSION_POOL_SIZE": 1}, + [ + Request( + "https://example.com", + meta={"zyte_api_session_pool": {"id": "example.com", "size": 2}}, + ) + for _ in range(2 + 1) + ], + {"example.com": (2, 2 + 1)}, + ), + ( + {"ZYTE_API_SESSION_POOLS": {"example.com": {"size": 1}}}, + [ + Request( + "https://example.com", + meta={"zyte_api_session_pool": {"id": "example.com", "size": 2}}, + ) + for _ in range(2 + 1) + ], + {"example.com": (1, 2 + 1)}, + ), + ), +) +@deferred_f_from_coro_f +async def test_size(settings, start_requests, expected_stats, mockserver, caplog): + settings = { + **SESSION_SETTINGS, + "ZYTE_API_URL": mockserver.urljoin("/"), + **settings, + } + + caplog.clear() + caplog.set_level("WARNING") + + class TestSpider(Spider): + name = "test" + + async def start(self): + for request in self.start_requests(): + yield request + + def start_requests(self): + for item in start_requests: + if isinstance(item, str): + yield Request(item, dont_filter=True) + else: + yield item.replace(dont_filter=True) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, expected_stats) + + if "ZYTE_API_SESSION_POOL_SIZES" in settings: + assert any( + "ZYTE_API_SESSION_POOL_SIZES is deprecated" in rec.getMessage() + for rec in caplog.records + ) diff --git a/tests/test_sessions_queue.py b/tests/test_sessions_queue.py new file mode 100644 index 00000000..a1d120c6 --- /dev/null +++ b/tests/test_sessions_queue.py @@ -0,0 +1,39 @@ +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Spider +import pytest + +from 
scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@pytest.mark.parametrize( + ("attempts", "expected_stats"), + ( + (None, {"example.com": (1, 2)}), + (1, {"example.com": (1, 1)}), + ), +) +@deferred_f_from_coro_f +async def test_empty_queue(attempts, expected_stats, mockserver): + settings = { + **SESSION_SETTINGS, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": 0.001, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + if attempts is not None: + settings["ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS"] = attempts + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] * 2 + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats(crawler, expected_stats) diff --git a/tests/test_sessions_refresh.py b/tests/test_sessions_refresh.py new file mode 100644 index 00000000..fb5966c0 --- /dev/null +++ b/tests/test_sessions_refresh.py @@ -0,0 +1,99 @@ +from scrapy.utils.defer import deferred_f_from_coro_f +from scrapy import Request, Spider, signals + +from scrapy_zyte_api.utils import maybe_deferred_to_future + +from . import SESSION_SETTINGS, get_crawler +from .helpers import assert_session_stats + + +@deferred_f_from_coro_f +async def test_session_refresh(mockserver): + """If a response does not pass a session validity check, the session is + discarded, and the request is retried with a different session.""" + + class Tracker: + def __init__(self): + self.sessions = [] + + def track_session(self, request: Request, spider: Spider): + self.sessions.append(request.meta["zyte_api"]["session"]["id"]) + + tracker = Tracker() + + settings = { + **SESSION_SETTINGS, + "RETRY_TIMES": 1, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions_check_errors.DomainChecker", + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://session-check-fails.example"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect( + tracker.track_session, signal=signals.request_reached_downloader + ) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "session-check-fails.example": { + "init/check-passed": 3, + "use/check-failed": 2, + } + }, + ) + assert len(tracker.sessions) == 5 + assert tracker.sessions[0] == tracker.sessions[1] + assert tracker.sessions[0] != tracker.sessions[2] + assert tracker.sessions[2] == tracker.sessions[3] + assert tracker.sessions[0] != tracker.sessions[4] + assert tracker.sessions[2] != tracker.sessions[4] + + +@deferred_f_from_coro_f +async def test_session_refresh_concurrent(mockserver): + """When more than 1 request is using the same session concurrently, it can + happen that more than 1 response triggers a session refresh. 
In those + cases, the same session should be refreshed only once, not once per + response triggering a refresh.""" + settings = { + **SESSION_SETTINGS, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + "ZYTE_API_SESSION_MAX_ERRORS": 1, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com/"] + + def parse(self, response): + for n in range(2): + yield Request(f"https://example.com/{n}?temporary-download-error") + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await maybe_deferred_to_future(crawler.crawl()) + + assert_session_stats( + crawler, + { + "example.com": { + "init/check-passed": 1, + "init/failed": 1, + "use/check-passed": 1, + "use/failed": 2, + } + }, + ) diff --git a/tests/test_sessions_utils.py b/tests/test_sessions_utils.py new file mode 100644 index 00000000..d620b311 --- /dev/null +++ b/tests/test_sessions_utils.py @@ -0,0 +1,18 @@ +import pytest +from scrapy import Request + +from scrapy_zyte_api import is_session_init_request +from scrapy_zyte_api._session import SESSION_INIT_META_KEY + + +@pytest.mark.parametrize( + ("meta", "expected"), + ( + ({}, False), + ({SESSION_INIT_META_KEY: False}, False), + ({SESSION_INIT_META_KEY: True}, True), + ), +) +def test_is_session_init_request(meta, expected): + actual = is_session_init_request(Request("https://example.com", meta=meta)) + assert expected == actual diff --git a/tox.ini b/tox.ini index aaeedf25..c0f03129 100644 --- a/tox.ini +++ b/tox.ini @@ -68,6 +68,7 @@ deps = {[min]deps} zyte-api==0.6.0 parsel==1.7.0 + typing_extensions==4.1.0 # Earliest supported Scrapy version. [testenv:min-scrapy-2x0] @@ -104,6 +105,7 @@ deps = {[min]deps} scrapy==2.5.0 zyte-api==0.6.0 + typing_extensions==4.1.0 # First Scrapy version since 2.4.0 where installing the reactor earlier is not # necessary. @@ -113,6 +115,7 @@ deps = {[min]deps} scrapy==2.6.0 zyte-api==0.6.0 + typing_extensions==4.1.0 # First Scrapy version with centralizing request fingerprinting support. [testenv:min-scrapy-2x7] @@ -121,6 +124,7 @@ deps = {[min]deps} scrapy==2.7.0 zyte-api==0.6.0 + typing_extensions==4.1.0 [testenv:min-extra] basepython=python3.10 @@ -147,7 +151,10 @@ deps = {[min]deps} parsel==1.7.0 scrapy==2.0.1 + # https://github.com/zytedata/python-zyte-api/blob/0.8.0/setup.py#L29-L30 zyte-api[x402]==0.8.0 + eth-account==0.13.7 + x402==0.1.1 [testenv:extra] basepython=python3.13