diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 398daf4e..049c96fe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,8 +1,8 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.0 + rev: v0.15.1 hooks: - - id: ruff + - id: ruff-check args: [ --fix ] - id: ruff-format - repo: https://github.com/adamchainz/blacken-docs diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py index 41814277..1c383efb 100644 --- a/docs/_ext/__init__.py +++ b/docs/_ext/__init__.py @@ -5,8 +5,12 @@ def http_api_reference_role( - name, rawtext, text, lineno, inliner, options={}, content=[] + name, rawtext, text, lineno, inliner, options=None, content=None ): + if options is None: + options = {} + if content is None: + content = [] match = re.search( r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text ) diff --git a/docs/conf.py b/docs/conf.py index 311edaa3..a947f3ad 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -2,7 +2,7 @@ from pathlib import Path project = "scrapy-zyte-api" -copyright = "2023, Zyte Group Ltd" +project_copyright = "2023, Zyte Group Ltd" author = "Zyte Group Ltd" release = "0.32.0" diff --git a/pyproject.toml b/pyproject.toml index 3c8ab1ff..7c043746 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -129,3 +129,141 @@ filterwarnings = [ "ignore:RetryMiddleware\\.process_spider_exception\\(\\):scrapy.exceptions.ScrapyDeprecationWarning", "ignore::scrapy.exceptions.ScrapyDeprecationWarning:scrapy_poet", ] + +[tool.ruff.lint] +extend-select = [ + # flake8-builtins + "A", + # flake8-async + "ASYNC", + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # flake8-commas + "COM", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-pytest-style + "PT", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Trailing comma missing + "COM812", + # Missing docstring in public module + "D100", + # Missing docstring in public class + "D101", + # Missing docstring in public method + "D102", + # Missing docstring in public function + "D103", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in public nested class + "D106", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should end with a period + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + # First line should not be the function's "signature" + "D402", + # First word of the first line should be properly capitalized + "D403", + # No blank lines allowed between a section header and its content + "D412", + # `try`-`except` within a loop incurs performance overhead + "PERF203", + # Too many return 
statements + "PLR0911", + # Too many branches + "PLR0912", + # Too many arguments in function definition + "PLR0913", + # Too many statements + "PLR0915", + # Magic value used in comparison + "PLR2004", + # String contains ambiguous {}. + "RUF001", + # Docstring contains ambiguous {}. + "RUF002", + # Comment contains ambiguous {}. + "RUF003", + # Mutable class attributes should be annotated with `typing.ClassVar` + "RUF012", + # Use of `assert` detected + "S101", +] + +[tool.ruff.lint.isort] +split-on-trailing-comma = false + +[tool.ruff.lint.per-file-ignores] +# we need to use typing.Set[] over modern alternatives with web-poet<0.19.0 && Python<3.11 +# see https://github.com/scrapinghub/web-poet/pull/219 +"scrapy_zyte_api/providers.py" = ["UP006", "UP035"] +"tests/**" = ["S"] + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/scrapy_zyte_api/__init__.py b/scrapy_zyte_api/__init__.py index a11ad09d..2ba0b71c 100644 --- a/scrapy_zyte_api/__init__.py +++ b/scrapy_zyte_api/__init__.py @@ -7,7 +7,6 @@ # Register web-poet serializers from . import _serialization # noqa: F401 - from ._annotations import ExtractFrom, actions, custom_attrs from ._middlewares import ( ScrapyZyteAPIDownloaderMiddleware, @@ -51,25 +50,25 @@ session_config_registry = _session_config_registry __all__ = [ - "ExtractFrom", + "SESSION_AGGRESSIVE_RETRY_POLICY", + "SESSION_DEFAULT_RETRY_POLICY", "Actions", + "Addon", + "ExtractFrom", "Geolocation", - "Screenshot", + "LocationSessionConfig", "ScrapyZyteAPIDownloadHandler", - "ScrapyZyteAPIRequestFingerprinter", "ScrapyZyteAPIDownloaderMiddleware", "ScrapyZyteAPIRefererSpiderMiddleware", - "ScrapyZyteAPISpiderMiddleware", + "ScrapyZyteAPIRequestFingerprinter", "ScrapyZyteAPISessionDownloaderMiddleware", - "Addon", + "ScrapyZyteAPISpiderMiddleware", + "Screenshot", + "SessionConfig", "actions", "custom_attrs", "get_request_session_id", "is_session_init_request", "session_config", "session_config_registry", - "LocationSessionConfig", - "SessionConfig", - "SESSION_DEFAULT_RETRY_POLICY", - "SESSION_AGGRESSIVE_RETRY_POLICY", ] diff --git a/scrapy_zyte_api/_annotations.py b/scrapy_zyte_api/_annotations.py index b5a08d70..bd7bfede 100644 --- a/scrapy_zyte_api/_annotations.py +++ b/scrapy_zyte_api/_annotations.py @@ -1,5 +1,6 @@ +from collections.abc import Iterable from enum import Enum -from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Tuple, TypedDict +from typing import Any, TypedDict class ExtractFrom(str, Enum): @@ -18,48 +19,48 @@ class ExtractFrom(str, Enum): class _Selector(TypedDict, total=False): type: str value: str - state: Optional[str] + state: str | None class Action(TypedDict, total=False): action: str - address: Optional[dict] - args: Optional[dict] - button: Optional[str] - delay: Optional[float] - id: Optional[str] - key: Optional[str] - keyword: Optional[str] - left: Optional[int] - maxPageHeight: Optional[int] - maxScrollCount: Optional[int] - maxScrollDelay: Optional[float] - onError: Optional[str] - options: Optional[dict] - selector: Optional[_Selector] - source: Optional[str] - text: Optional[str] - timeout: Optional[float] - top: Optional[int] - url: Optional[str] - urlMatchingOptions: Optional[str] - urlPattern: Optional[str] - values: Optional[List[str]] - waitForNavigationTimeout: Optional[float] - waitUntil: Optional[str] - - -class _ActionResult(TypedDict, total=False): + address: dict | None + args: dict | None + button: str | None + delay: float | None + id: str | None + key: str | None + keyword: str | None + left: 
int | None + maxPageHeight: int | None + maxScrollCount: int | None + maxScrollDelay: float | None + onError: str | None + options: dict | None + selector: _Selector | None + source: str | None + text: str | None + timeout: float | None + top: int | None + url: str | None + urlMatchingOptions: str | None + urlPattern: str | None + values: list[str] | None + waitForNavigationTimeout: float | None + waitUntil: str | None + + +class _ActionResult(TypedDict, total=False): # noqa: PYI049 action: str elapsedTime: float status: str - error: Optional[str] + error: str | None def make_hashable(obj: Any) -> Any: """Converts input into hashable form, to use in ``Annotated``.""" if isinstance(obj, (tuple, list)): - return tuple((make_hashable(e) for e in obj)) + return tuple(make_hashable(e) for e in obj) if isinstance(obj, dict): return frozenset((make_hashable(k), make_hashable(v)) for k, v in obj.items()) @@ -78,15 +79,16 @@ def _from_hashable(obj: Any) -> Any: return obj -def actions(value: Iterable[Action]) -> Tuple[Any, ...]: +def actions(value: Iterable[Action]) -> tuple[Any, ...]: """Convert an iterable of :class:`~scrapy_zyte_api.Action` dicts into a hashable value.""" # both lists and dicts are not hashable and we need dep types to be hashable return tuple(make_hashable(action) for action in value) def custom_attrs( - input: Dict[str, Any], options: Optional[Dict[str, Any]] = None -) -> Tuple[FrozenSet[Any], Optional[FrozenSet[Any]]]: + input: dict[str, Any], # noqa: A002 + options: dict[str, Any] | None = None, +) -> tuple[frozenset[Any], frozenset[Any] | None]: input_wrapped = make_hashable(input) options_wrapped = make_hashable(options) if options else None return input_wrapped, options_wrapped diff --git a/scrapy_zyte_api/_cookies.py b/scrapy_zyte_api/_cookies.py index 60ae268b..ac41f978 100644 --- a/scrapy_zyte_api/_cookies.py +++ b/scrapy_zyte_api/_cookies.py @@ -1,12 +1,12 @@ from http.cookiejar import Cookie -from typing import Any, Dict, List, Optional +from typing import Any from urllib.parse import urlparse from scrapy.http import Request from scrapy.http.cookies import CookieJar -def _get_cookie_jar(request: Request, cookie_jars: Dict[Any, CookieJar]) -> CookieJar: +def _get_cookie_jar(request: Request, cookie_jars: dict[Any, CookieJar]) -> CookieJar: jar_id = request.meta.get("cookiejar") return cookie_jars[jar_id] @@ -24,9 +24,9 @@ def _get_cookie_domain(cookie, url): def _process_cookies( - api_response: Dict[str, Any], + api_response: dict[str, Any], request: Request, - cookie_jars: Optional[Dict[Any, CookieJar]], + cookie_jars: dict[Any, CookieJar] | None, ): if not cookie_jars: return @@ -64,7 +64,7 @@ def _process_cookies( def _get_all_cookies( - request: Request, cookie_jars: Dict[Any, CookieJar] -) -> List[Cookie]: + request: Request, cookie_jars: dict[Any, CookieJar] +) -> list[Cookie]: cookie_jar = _get_cookie_jar(request, cookie_jars) return list(cookie_jar.jar) diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index ca3f38f8..23f14944 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -92,14 +92,14 @@ def _get_spm_mw(self): spm_mw_classes = [] try: - from scrapy_crawlera import CrawleraMiddleware + from scrapy_crawlera import CrawleraMiddleware # noqa: PLC0415 except ImportError: pass else: spm_mw_classes.append(CrawleraMiddleware) try: - from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware + from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware # noqa: PLC0415 except ImportError: pass else: 
diff --git a/scrapy_zyte_api/_page_inputs.py b/scrapy_zyte_api/_page_inputs.py index d1ec0585..64e7efce 100644 --- a/scrapy_zyte_api/_page_inputs.py +++ b/scrapy_zyte_api/_page_inputs.py @@ -1,5 +1,4 @@ from base64 import b64decode -from typing import List, Optional import attrs @@ -15,7 +14,7 @@ class Actions: """ #: Results of actions. - results: Optional[List[_ActionResult]] + results: list[_ActionResult] | None @attrs.define @@ -26,8 +25,6 @@ class Geolocation: `. """ - pass - @attrs.define class Screenshot: diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 3196e2ca..32a18d01 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -1,8 +1,9 @@ from base64 import b64decode, b64encode +from collections.abc import Iterable, Mapping from copy import copy from logging import getLogger from os import environ -from typing import Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union +from typing import Any from warnings import warn from scrapy import Request @@ -71,7 +72,7 @@ # purposes, i.e. 2 requests with a different value for that field but otherwise # identical should be treated as different requests, not as duplicate requests. # -_REQUEST_PARAMS: Dict[str, Dict[str, Any]] = { +_REQUEST_PARAMS: dict[str, dict[str, Any]] = { "url": { "default": _NoDefault, }, @@ -271,10 +272,10 @@ ANY_VALUE = object() ANY_VALUE_T = Any -SKIP_HEADER_T = Dict[bytes, Union[ANY_VALUE_T, str]] +SKIP_HEADER_T = dict[bytes, ANY_VALUE_T | str] -def _may_use_browser(api_params: Dict[str, Any]) -> bool: +def _may_use_browser(api_params: dict[str, Any]) -> bool: """Return ``False`` if *api_params* indicate with certainty that browser rendering will not be used, or ``True`` otherwise.""" for key in _BROWSER_KEYS: @@ -285,9 +286,9 @@ def _may_use_browser(api_params: Dict[str, Any]) -> bool: return True if "httpResponseBody" in extract_froms: return False - if api_params.get("httpResponseBody", _DEFAULT_API_PARAMS["httpResponseBody"]): - return False - return True + return not api_params.get( + "httpResponseBody", _DEFAULT_API_PARAMS["httpResponseBody"] + ) def session_id_to_session(session_id): @@ -301,7 +302,7 @@ def str_to_bool(value): def _is_safe_header(k, v, /, *, api_params, request): k = k.strip() lowercase_k = to_bytes(k.lower()) - if not (lowercase_k.startswith(b"zyte-") or lowercase_k.startswith(b"x-crawlera-")): + if not (lowercase_k.startswith((b"zyte-", b"x-crawlera-"))): return True decoded_k = to_unicode(k) @@ -522,28 +523,29 @@ def _is_safe_header(k, v, /, *, api_params, request): def _process_manual_custom_http_request_headers( - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, ) -> None: - headers = [] - for header_dict in api_params.pop("customHttpRequestHeaders"): + headers = [ + header_dict + for header_dict in api_params.pop("customHttpRequestHeaders") if _is_safe_header( header_dict["name"], header_dict["value"], api_params=api_params, request=request, - ): - headers.append(header_dict) + ) + ] if headers: api_params["customHttpRequestHeaders"] = headers def _iter_headers( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, header_parameter: str, -) -> Iterable[Tuple[bytes, bytes, bytes]]: +) -> Iterable[tuple[bytes, bytes, bytes]]: headers = api_params.get(header_parameter) if headers not in (None, True): logger.warning( @@ -566,7 +568,7 @@ def _iter_headers( def _map_custom_http_request_headers( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, 
skip_headers: SKIP_HEADER_T, ): @@ -585,9 +587,9 @@ def _map_custom_http_request_headers( def _map_request_headers( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, - browser_headers: Dict[bytes, str], + browser_headers: dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, ): request_headers = {} @@ -613,7 +615,7 @@ def _map_request_headers( def _warn_about_request_headers( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, skip_headers: SKIP_HEADER_T, ): @@ -633,7 +635,7 @@ def _warn_about_request_headers( ) -def _get_extract_from(api_params: Dict[str, Any], extract_type: str) -> Union[str, Any]: +def _get_extract_from(api_params: dict[str, Any], extract_type: str) -> str | Any: options = api_params.get(f"{extract_type}Options", {}) default_extract_from = _REQUEST_PARAMS[extract_type].get( "default_extract_from", _NoDefault @@ -641,7 +643,7 @@ def _get_extract_from(api_params: Dict[str, Any], extract_type: str) -> Union[st return options.get("extractFrom", default_extract_from) -def _get_extract_froms(api_params: Dict[str, Any]) -> Set[str]: +def _get_extract_froms(api_params: dict[str, Any]) -> set[str]: result = set() for key in _EXTRACT_KEYS: if not api_params.get(key, False): @@ -652,10 +654,10 @@ def _get_extract_froms(api_params: Dict[str, Any]) -> Set[str]: def _set_request_headers_from_request( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, skip_headers: SKIP_HEADER_T, - browser_headers: Dict[bytes, str], + browser_headers: dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, ): """Updates *api_params*, in place, based on *request*.""" @@ -717,7 +719,7 @@ def proxy_mode_browser_html_enabled(request: Request) -> bool: def _set_http_response_body_from_request( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, ): if not any( @@ -736,9 +738,9 @@ def _set_http_response_body_from_request( def _set_http_response_headers_from_request( *, - api_params: Dict[str, Any], - default_params: Dict[str, Any], - meta_params: Dict[str, Any], + api_params: dict[str, Any], + default_params: dict[str, Any], + meta_params: dict[str, Any], ): if api_params.get("httpResponseBody"): api_params.setdefault("httpResponseHeaders", True) @@ -758,7 +760,7 @@ def _set_http_response_headers_from_request( def _set_http_response_cookies_from_request( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], ): api_params.setdefault("experimental", {}) api_params["experimental"].setdefault("responseCookies", True) @@ -768,9 +770,9 @@ def _set_http_response_cookies_from_request( def _set_http_request_cookies_from_request( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, - cookie_jars: Dict[Any, CookieJar], + cookie_jars: dict[Any, CookieJar], max_cookies: int, ): api_params.setdefault("experimental", {}) @@ -831,7 +833,7 @@ def _set_http_request_cookies_from_request( def _set_http_request_method_from_request( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, ): method = api_params.get("httpRequestMethod") @@ -853,7 +855,7 @@ def _set_http_request_method_from_request( def _set_http_request_body_from_request( *, - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, ): body = api_params.get("httpRequestBody") @@ -879,8 +881,8 @@ def _set_http_request_body_from_request( def _unset_unneeded_api_params( *, - api_params: Dict[str, Any], - default_params: Dict[str, Any], + api_params: dict[str, Any], + 
default_params: dict[str, Any], request: Request, ): for param, default_value in _DEFAULT_API_PARAMS.items(): @@ -899,16 +901,16 @@ def _unset_unneeded_api_params( def _update_api_params_from_request( - api_params: Dict[str, Any], + api_params: dict[str, Any], request: Request, *, - default_params: Dict[str, Any], - meta_params: Dict[str, Any], + default_params: dict[str, Any], + meta_params: dict[str, Any], skip_headers: SKIP_HEADER_T, - browser_headers: Dict[bytes, str], + browser_headers: dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, cookies_enabled: bool, - cookie_jars: Optional[Dict[Any, CookieJar]], + cookie_jars: dict[Any, CookieJar] | None, max_cookies: int, ): _set_http_response_body_from_request(api_params=api_params, request=request) @@ -944,30 +946,29 @@ def _update_api_params_from_request( def _copy_meta_params_as_dict( - meta_params: Dict[str, Any], + meta_params: dict[str, Any], *, param: str, request: Request, ): if meta_params is True: return {} - elif not isinstance(meta_params, Mapping): + if not isinstance(meta_params, Mapping): raise ValueError( f"'{param}' parameters in the request meta should be provided as " f"a dictionary, got {type(meta_params)} instead in {request}." ) - else: - return copy(meta_params) + return copy(meta_params) def _merge_params( *, - default_params: Dict[str, Any], - meta_params: Dict[str, Any], + default_params: dict[str, Any], + meta_params: dict[str, Any], param: str, setting: str, request: Request, - context: Optional[List[str]] = None, + context: list[str] | None = None, ): params = copy(default_params) meta_params = copy(meta_params) @@ -980,7 +981,7 @@ def _merge_params( param=param, setting=setting, request=request, - context=context + [k], + context=[*context, k], ) if meta_params[k] not in (None, {}): continue @@ -988,7 +989,7 @@ def _merge_params( if k in params: params.pop(k) else: - qual_param = ".".join(context + [k]) + qual_param = ".".join([*context, k]) logger.warning( f"In request {request} {param!r} parameter {qual_param} is " f"None, which is a value reserved to unset parameters defined " @@ -1002,7 +1003,7 @@ def _merge_params( def _get_raw_params( request: Request, *, - default_params: Dict[str, Any], + default_params: dict[str, Any], ): meta_params = request.meta.get("zyte_api", False) if meta_params is False: @@ -1013,6 +1014,7 @@ def _get_raw_params( f"Setting the zyte_api request metadata key to " f"{meta_params!r} is deprecated. 
Use False instead.", DeprecationWarning, + stacklevel=1, ) return None @@ -1035,12 +1037,12 @@ def _get_automap_params( request: Request, *, default_enabled: bool, - default_params: Dict[str, Any], + default_params: dict[str, Any], skip_headers: SKIP_HEADER_T, - browser_headers: Dict[bytes, str], + browser_headers: dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, cookies_enabled: bool, - cookie_jars: Optional[Dict[Any, CookieJar]], + cookie_jars: dict[Any, CookieJar] | None, max_cookies: int, ): meta_params = request.meta.get("zyte_api_automap", default_enabled) @@ -1080,17 +1082,17 @@ def _get_automap_params( def _get_api_params( request: Request, *, - default_params: Dict[str, Any], + default_params: dict[str, Any], transparent_mode: bool, - automap_params: Dict[str, Any], + automap_params: dict[str, Any], skip_headers: SKIP_HEADER_T, - browser_headers: Dict[bytes, str], + browser_headers: dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, - job_id: Optional[str], + job_id: str | None, cookies_enabled: bool, - cookie_jars: Optional[Dict[Any, CookieJar]], + cookie_jars: dict[Any, CookieJar] | None, max_cookies: int, -) -> Optional[dict]: +) -> dict | None: """Returns a dictionary of API parameters that must be sent to Zyte API for the specified request, or None if the request should not be sent through Zyte API.""" @@ -1182,7 +1184,7 @@ def _load_mw_skip_headers(crawler): return mw_skip_headers -def _load_browser_headers(settings) -> Dict[bytes, str]: +def _load_browser_headers(settings) -> dict[bytes, str]: browser_headers = settings.getdict( "ZYTE_API_BROWSER_HEADERS", {"Referer": "referer"}, @@ -1220,8 +1222,7 @@ def __init__(self, crawler, cookies_enabled=None): def _request_skip_headers(self, request): result = dict(self._mw_skip_headers) for name in request.meta.get("_pre_mw_headers", set()): - if name in result: - del result[name] + result.pop(name, None) return result def parse(self, request): diff --git a/scrapy_zyte_api/_request_fingerprinter.py b/scrapy_zyte_api/_request_fingerprinter.py index e9b548bc..110d1f05 100644 --- a/scrapy_zyte_api/_request_fingerprinter.py +++ b/scrapy_zyte_api/_request_fingerprinter.py @@ -6,7 +6,7 @@ logger = getLogger(__name__) -try: # noqa: C901 +try: from scrapy.utils.request import RequestFingerprinter as _ # noqa: F401 except ImportError: if not TYPE_CHECKING: @@ -35,7 +35,7 @@ def from_crawler(cls, crawler): @staticmethod def _poet_is_configured(settings): try: - from scrapy_poet import InjectionMiddleware + from scrapy_poet import InjectionMiddleware # noqa: PLC0415 except ImportError: return False for k, v in settings.get("DOWNLOADER_MIDDLEWARES", {}).items(): @@ -49,7 +49,7 @@ def __init__(self, crawler): self._poet_is_configured(settings) ) if poet_is_configured: - from scrapy_poet import ( + from scrapy_poet import ( # noqa: PLC0415 ScrapyPoetRequestFingerprinter as DefaultFallbackRequestFingerprinter, ) else: @@ -65,7 +65,7 @@ def __init__(self, crawler): ) if poet_is_configured and not isinstance( self._fallback_request_fingerprinter, - cast(type, DefaultFallbackRequestFingerprinter), + cast("type", DefaultFallbackRequestFingerprinter), ): logger.warning( f"scrapy-poet is enabled, but your custom value for the " @@ -82,7 +82,7 @@ def __init__(self, crawler): f"setting instead." 
) self._fallback_fingerprinter_is_poets = False - self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary() + self._cache: WeakKeyDictionary[Request, bytes] = WeakKeyDictionary() self._param_parser = _ParamParser(crawler, cookies_enabled=False) self._crawler = crawler @@ -149,6 +149,6 @@ def fingerprint(self, request): fingerprint += deps_key if serialized_page_params is not None: fingerprint += serialized_page_params - self._cache[request] = hashlib.sha1(fingerprint).digest() + self._cache[request] = hashlib.sha1(fingerprint).digest() # noqa: S324 return self._cache[request] return self._fallback_request_fingerprinter.fingerprint(request) diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index cbf8ca98..01579b6c 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -1,10 +1,11 @@ +import contextlib import json from asyncio import Task, create_task, sleep from collections import defaultdict, deque from copy import deepcopy from functools import partial from logging import getLogger -from typing import Any, DefaultDict, Deque, Dict, List, Optional, Set, Type, Union +from typing import Any from uuid import uuid4 from weakref import WeakKeyDictionary @@ -21,8 +22,8 @@ from .utils import ( # type: ignore[attr-defined] _DOWNLOAD_NEEDS_SPIDER, _build_from_crawler, - deferred_to_future, _close_spider, + deferred_to_future, ) logger = getLogger(__name__) @@ -30,7 +31,7 @@ ZYTE_API_META_KEYS = ("zyte_api", "zyte_api_automap", "zyte_api_provider") -def get_request_session_id(request: Request) -> Optional[str]: +def get_request_session_id(request: Request) -> str | None: """Return the session ID of *request*, or ``None`` if it does not have a session ID assigned.""" for meta_key in ZYTE_API_META_KEYS: @@ -157,7 +158,7 @@ class SessionConfig: #: :ref:`creating a pool ID for a request ` based on the #: content of the :reqmeta:`zyte_api_session_location` metadata key. See #: :meth:`pool`. - ADDRESS_FIELDS: List[str] = [ + ADDRESS_FIELDS: list[str] = [ "addressCountry", "addressRegion", "postalCode", @@ -182,7 +183,7 @@ def __init__(self, crawler): self._checker = None self._enabled = crawler.settings.getbool("ZYTE_API_SESSION_ENABLED", False) self._pool_counters = defaultdict(int) - self._param_pools: DefaultDict[str, Dict[str, int]] = defaultdict(dict) + self._param_pools: defaultdict[str, dict[str, int]] = defaultdict(dict) def enabled(self, request: Request) -> bool: """Return ``True`` if the request should use sessions from @@ -193,7 +194,7 @@ def enabled(self, request: Request) -> bool: """ return request.meta.get("zyte_api_session_enabled", self._enabled) - def process_request(self, request: Request) -> Optional[Request]: + def process_request(self, request: Request) -> Request | None: """Process *request* after it has been assigned a session. Return ``None`` to send the request as is, or return a new request @@ -290,7 +291,7 @@ def pool(self, request: Request) -> str: return f"{netloc}@{location_id}" return netloc - def location(self, request: Request) -> Dict[str, str]: + def location(self, request: Request) -> dict[str, str]: """Return the address :class:`dict` to use for location-based session initialization for *request*. 
@@ -335,7 +336,7 @@ def location(self, request: Request) -> Dict[str, str]: """ return request.meta.get("zyte_api_session_location", self._setting_location) - def params(self, request: Request) -> Dict[str, Any]: + def params(self, request: Request) -> dict[str, Any]: """Return the Zyte API request parameters to use to initialize a session for *request*. @@ -417,14 +418,14 @@ def check(self, response: Response, request: Request) -> bool: except ImportError: class SessionConfigRulesRegistry: - def session_config_cls(self, request: Request) -> Type[SessionConfig]: + def session_config_cls(self, request: Request) -> type[SessionConfig]: return SessionConfig def session_config( self, include, *, - instead_of: Optional[Type] = SessionConfig, + instead_of: type | None = SessionConfig, exclude=None, priority: int = 500, **kwargs, @@ -498,9 +499,9 @@ def __init__(self): rules = [ApplyRule(for_patterns=Patterns(include=[""]), use=SessionConfig)] # type: ignore[arg-type] super().__init__(rules=rules) - def session_config_cls(self, request: Request) -> Type[SessionConfig]: + def session_config_cls(self, request: Request) -> type[SessionConfig]: cls = SessionConfig - overrides: Dict[Type[SessionConfig], Type[SessionConfig]] = ( + overrides: dict[type[SessionConfig], type[SessionConfig]] = ( self.overrides_for(request.url) # type: ignore[assignment] ) while cls in overrides: @@ -511,8 +512,8 @@ def session_config( self, include: Strings, *, - instead_of: Optional[Type[SessionConfig]] = SessionConfig, - exclude: Optional[Strings] = None, + instead_of: type[SessionConfig] | None = SessionConfig, + exclude: Strings | None = None, priority: int = 500, **kwargs, ): @@ -558,7 +559,7 @@ def __init__(self, crawler: Crawler): settings = crawler.settings pool_size = settings.getint("ZYTE_API_SESSION_POOL_SIZE", 8) - self._pending_initial_sessions: Dict[str, int] = defaultdict(lambda: pool_size) + self._pending_initial_sessions: dict[str, int] = defaultdict(lambda: pool_size) pool_sizes = settings.getdict("ZYTE_API_SESSION_POOL_SIZES", {}) for pool, size in pool_sizes.items(): self._pending_initial_sessions[pool] = size @@ -566,19 +567,19 @@ def __init__(self, crawler: Crawler): self._max_check_failures = settings.getint( "ZYTE_API_SESSION_MAX_CHECK_FAILURES", 1 ) - self._check_failures: Dict[str, int] = defaultdict(int) + self._check_failures: dict[str, int] = defaultdict(int) self._max_errors = settings.getint("ZYTE_API_SESSION_MAX_ERRORS", 1) - self._errors: Dict[str, int] = defaultdict(int) + self._errors: dict[str, int] = defaultdict(int) max_bad_inits = settings.getint("ZYTE_API_SESSION_MAX_BAD_INITS", 8) - self._max_bad_inits: Dict[str, int] = defaultdict(lambda: max_bad_inits) + self._max_bad_inits: dict[str, int] = defaultdict(lambda: max_bad_inits) max_bad_inits_per_pool = settings.getdict( "ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL", {} ) for pool, pool_max_bad_inits in max_bad_inits_per_pool.items(): self._max_bad_inits[pool] = pool_max_bad_inits - self._bad_inits: Dict[str, int] = defaultdict(int) + self._bad_inits: dict[str, int] = defaultdict(int) # Transparent mode, needed to determine whether to set the session # using ``zyte_api`` or ``zyte_api_automap``. @@ -597,7 +598,7 @@ def __init__(self, crawler: Crawler): # # As soon as a session expires, it is removed from its pool, and a task # to initialize that new session is started. 
- self._pools: Dict[str, Set[str]] = defaultdict(set) + self._pools: dict[str, set[str]] = defaultdict(set) self._pool_cache: WeakKeyDictionary[Request, str] = WeakKeyDictionary() # The queue is a rotating list of session IDs to use. @@ -616,7 +617,7 @@ def __init__(self, crawler: Crawler): # If the queue is empty, sleep and try again. Sessions from the pool # will be appended to the queue as they are initialized and ready to # use. - self._queues: Dict[str, Deque[str]] = defaultdict(deque) + self._queues: dict[str, deque[str]] = defaultdict(deque) self._queue_max_attempts = settings.getint( "ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS", 60 ) @@ -628,12 +629,12 @@ def __init__(self, crawler: Crawler): # # Keeping a reference to those tasks until they are done is necessary # to prevent garbage collection to remove the tasks. - self._init_tasks: Set[Task] = set() + self._init_tasks: set[Task] = set() self._session_config_cache: WeakKeyDictionary[Request, SessionConfig] = ( WeakKeyDictionary() ) - self._session_config_map: Dict[Type[SessionConfig], SessionConfig] = {} + self._session_config_map: dict[type[SessionConfig], SessionConfig] = {} self._setting_params = settings.getdict("ZYTE_API_SESSION_PARAMS") @@ -661,8 +662,8 @@ def get_pool(self, request): session_config = self._get_session_config(request) try: pool = session_config.pool(request) - except Exception: - raise PoolError + except Exception as ex: + raise PoolError from ex self._pool_cache[request] = pool return pool @@ -768,7 +769,7 @@ async def _next_from_queue(self, request: Request, pool: str) -> str: while session_id not in self._pools[pool]: # After 1st loop: invalid session. try: session_id = self._queues[pool].popleft() - except IndexError: # No ready-to-use session available. + except IndexError as ex: # No ready-to-use session available. attempts += 1 if attempts >= self._queue_max_attempts: raise RuntimeError( @@ -785,7 +786,7 @@ async def _next_from_queue(self, request: Request, pool: str) -> str: f"https://github.com/scrapy-plugins/scrapy-zyte-api/issues/new " f"providing a minimal reproducible example if " f"possible, or debug logs and stats otherwise." - ) + ) from ex await sleep(self._queue_wait_time) assert session_id is not None self._queues[pool].append(session_id) @@ -826,10 +827,8 @@ def _start_session_refresh(self, session_id: str, request: Request, pool: str): task = create_task(self._create_session(request, pool)) self._init_tasks.add(task) task.add_done_callback(self._init_tasks.discard) - try: + with contextlib.suppress(KeyError): del self._errors[session_id] - except KeyError: - pass def _start_request_session_refresh(self, request: Request, pool: str): session_id = get_request_session_id(request) @@ -885,7 +884,7 @@ async def check(self, response: Response, request: Request) -> bool: self._start_request_session_refresh(request, pool) return False - async def assign(self, request: Request) -> Optional[Request]: + async def assign(self, request: Request) -> Request | None: """Assign a working session to *request*. 
If the session config creates a new request instead of modifying the @@ -971,12 +970,12 @@ def __init__(self, crawler: Crawler): async def process_request( self, request: Request, spider: Spider | None = None - ) -> Optional[Request]: + ) -> Request | None: return await self._sessions.assign(request) async def process_response( self, request: Request, response: Response, spider: Spider | None = None - ) -> Union[Request, Response, None]: + ) -> Request | Response | None: if isinstance(response, DummyResponse): return response @@ -997,7 +996,7 @@ async def process_response( async def process_exception( self, request: Request, exception: Exception, spider: Spider | None = None - ) -> Union[Request, None]: + ) -> Request | None: if ( not isinstance(exception, RequestError) or self._sessions.is_init_request(request) @@ -1041,7 +1040,7 @@ class LocationSessionConfig(SessionConfig): as a parameter. """ - def params(self, request: Request) -> Dict[str, Any]: + def params(self, request: Request) -> dict[str, Any]: if not (location := self.location(request)): return super().params(request) return self.location_params(request, location) @@ -1052,15 +1051,15 @@ def check(self, response: Response, request: Request) -> bool: return self.location_check(response, request, location) def location_params( - self, request: Request, location: Dict[str, Any] - ) -> Dict[str, Any]: + self, request: Request, location: dict[str, Any] + ) -> dict[str, Any]: """Like :class:`SessionConfig.params `, but it is only called when a location is set, and gets that *location* as a parameter.""" return super().params(request) def location_check( - self, response: Response, request: Request, location: Dict[str, Any] + self, response: Response, request: Request, location: dict[str, Any] ) -> bool: """Like :class:`SessionConfig.check `, but it is only called when a diff --git a/scrapy_zyte_api/addon.py b/scrapy_zyte_api/addon.py index b428357a..ee4f3b93 100644 --- a/scrapy_zyte_api/addon.py +++ b/scrapy_zyte_api/addon.py @@ -49,7 +49,7 @@ def _setdefault(settings, setting, cls, pos): class Addon: def update_settings(self, settings: BaseSettings) -> None: - from scrapy.settings.default_settings import ( + from scrapy.settings.default_settings import ( # noqa: PLC0415 REQUEST_FINGERPRINTER_CLASS as _SCRAPY_DEFAULT_REQUEST_FINGEPRINTER_CLASS, ) @@ -79,7 +79,7 @@ def update_settings(self, settings: BaseSettings) -> None: settings.set( "REQUEST_FINGERPRINTER_CLASS", "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter", - cast(int, settings.getpriority("REQUEST_FINGERPRINTER_CLASS")), + cast("int", settings.getpriority("REQUEST_FINGERPRINTER_CLASS")), ) else: settings.set( @@ -115,11 +115,11 @@ def update_settings(self, settings: BaseSettings) -> None: settings.set("ZYTE_API_TRANSPARENT_MODE", True, "addon") try: - from scrapy_poet import InjectionMiddleware + from scrapy_poet import InjectionMiddleware # noqa: PLC0415 except ImportError: pass else: - from scrapy_zyte_api.providers import ZyteApiProvider + from scrapy_zyte_api.providers import ZyteApiProvider # noqa: PLC0415 if not _POET_ADDON_SUPPORT: _setdefault( @@ -136,5 +136,5 @@ def update_settings(self, settings: BaseSettings) -> None: settings.set( "ZYTE_API_RETRY_POLICY", _SESSION_RETRY_POLICIES.get(loaded_retry_policy, retry_policy), - cast(int, settings.getpriority("ZYTE_API_RETRY_POLICY")), + cast("int", settings.getpriority("ZYTE_API_RETRY_POLICY")), ) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 176705ca..0573942f 100644 --- 
a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -2,8 +2,7 @@ import logging import time from copy import deepcopy -from typing import Any, Optional, Union - +from typing import Any from scrapy import Spider, signals from scrapy.crawler import Crawler @@ -13,6 +12,7 @@ from scrapy.settings import Settings from scrapy.utils.misc import load_object from scrapy.utils.reactor import verify_installed_reactor +from twisted.internet.defer import ensureDeferred from zyte_api import AsyncZyteAPI, RequestError from zyte_api.apikey import NoApiKey @@ -36,8 +36,8 @@ def _body_max_size_exceeded( body_size: int, - warnsize: Optional[int], - maxsize: Optional[int], + warnsize: int | None, + maxsize: int | None, request_url: str, ) -> bool: if warnsize and body_size > warnsize: @@ -90,7 +90,7 @@ def __init__( self, settings: Settings, crawler: Crawler, - client: Optional[AsyncZyteAPI] = None, + client: AsyncZyteAPI | None = None, ): if not settings.getbool("ZYTE_API_ENABLED", True): raise NotConfigured( @@ -190,13 +190,13 @@ def _build_client(settings): user_agent=settings.get("_ZYTE_API_USER_AGENT", USER_AGENT), **kwargs, ) - except NoApiKey: + except NoApiKey as ex: message = ( "No authentication data provided. See " "https://scrapy-zyte-api.readthedocs.io/en/latest/setup.html#auth" ) logger.warning(message) - raise NotConfigured(message) + raise NotConfigured(message) from ex def _create_handler(self, path: Any) -> Any: dhcls = load_object(path) @@ -262,9 +262,9 @@ def _update_stats(self, api_params): ) for error_type, count in self._client.agg_stats.api_error_types.items(): - error_type = error_type or "/" + error_type = error_type or "/" # noqa: PLW2901 if not error_type.startswith("/"): - error_type = f"/{error_type}" + error_type = f"/{error_type}" # noqa: PLW2901 self._stats.set_value(f"{prefix}/error_types{error_type}", count) for counter in ( @@ -276,7 +276,7 @@ def _update_stats(self, api_params): async def _download_request( self, api_params: dict, request: Request - ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: + ) -> ZyteAPITextResponse | ZyteAPIResponse | None: # Define url by default retrying = request.meta.get("zyte_api_retry_policy") if retrying: @@ -304,9 +304,8 @@ async def _download_request( # would cause AutoThrottle to adjust the download delay of the # request slot, and we do not want AutoThrottle to do that for Zyte # API slots since Zyte API already handles throtling. 
- if ( - not self._autothrottle_is_enabled - or _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT + if not self._autothrottle_is_enabled or ( + _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT and request.meta.get("autothrottle_dont_adjust_delay", False) ): request.meta["download_latency"] = time.time() - start_time @@ -359,8 +358,6 @@ def _truncate_params(self, params): if _DOWNLOAD_REQUEST_RETURNS_DEFERRED: def close(self) -> Deferred: - from twisted.internet.defer import ensureDeferred - async def _close(): if self._fallback_handler and hasattr(self._fallback_handler, "close"): await self._fallback_handler.close() @@ -375,7 +372,7 @@ async def close(self) -> None: # type: ignore[misc] await self._fallback_handler.close() await self._close() - async def _close(self) -> None: # NOQA + async def _close(self) -> None: await self._session.close() @@ -384,7 +381,7 @@ def __init__( self, settings: Settings, crawler: Crawler, - client: Optional[AsyncZyteAPI] = None, + client: AsyncZyteAPI | None = None, ): super().__init__(settings, crawler, client) self._fallback_handler = self._create_handler( @@ -397,7 +394,7 @@ def __init__( self, settings: Settings, crawler: Crawler, - client: Optional[AsyncZyteAPI] = None, + client: AsyncZyteAPI | None = None, ): super().__init__(settings, crawler, client) self._fallback_handler = self._create_handler( @@ -413,7 +410,7 @@ def __init__( self, settings: Settings, crawler: Crawler, - client: Optional[AsyncZyteAPI] = None, + client: AsyncZyteAPI | None = None, ): super().__init__(settings, crawler, client) self._fallback_handler = self._create_handler( diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 1ad156fe..b37ea4c3 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,11 +1,10 @@ -from collections.abc import Coroutine -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast +from collections.abc import Callable, Coroutine, Sequence +from typing import TYPE_CHECKING, Any, Set, cast from andi.typeutils import is_typing_annotated, strip_annotated from scrapy import Request from scrapy.crawler import Crawler from scrapy_poet import PageObjectInputProvider -from twisted.internet.defer import Deferred from web_poet import ( AnyResponse, BrowserHtml, @@ -46,9 +45,13 @@ from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot from scrapy_zyte_api._annotations import _ActionResult, _from_hashable -from scrapy_zyte_api.responses import ZyteAPITextResponse from scrapy_zyte_api.utils import _ENGINE_HAS_DOWNLOAD_ASYNC, maybe_deferred_to_future +if TYPE_CHECKING: + from twisted.internet.defer import Deferred + + from scrapy_zyte_api.responses import ZyteAPITextResponse + try: # requires Scrapy >= 2.8 from scrapy.http.request import NO_CALLBACK @@ -56,7 +59,7 @@ NO_CALLBACK = None # type: ignore[assignment] -_ITEM_KEYWORDS: Dict[type, str] = { +_ITEM_KEYWORDS: dict[type, str] = { Product: "product", ProductList: "productList", ProductNavigation: "productNavigation", @@ -68,7 +71,7 @@ JobPostingNavigation: "jobPostingNavigation", Serp: "serp", } -_AUTO_PAGES: Set[type] = { +_AUTO_PAGES: set[type] = { AutoArticlePage, AutoArticleListPage, AutoArticleNavigationPage, @@ -113,7 +116,7 @@ def __init__(self, *args, **kwargs): def is_provided(self, type_: Callable) -> bool: return super().is_provided(strip_annotated(type_)) - def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): + def _track_auto_fields(self, crawler: Crawler, request: Request, cls: type): assert 
crawler.stats if cls not in _ITEM_KEYWORDS: return @@ -138,16 +141,16 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): cls_fqn = get_fq_class_name(cls) crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) - async def __call__( # noqa: C901 + async def __call__( self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: """Makes a Zyte API request to provide BrowserResponse and/or item dependencies.""" - results: List[Any] = [] + results: list[Any] = [] http_response = None screenshot_requested = Screenshot in to_provide for cls in list(to_provide): - self._track_auto_fields(crawler, request, cast(type, cls)) + self._track_auto_fields(crawler, request, cast("type", cls)) item = self.injector.weak_cache.get(request, {}).get(cls) if item: results.append(item) @@ -177,8 +180,8 @@ async def __call__( # noqa: C901 **request.meta.get("zyte_api_provider", {}), } - to_provide_stripped: Set[type] = set() - extract_from_seen: Dict[str, str] = {} + to_provide_stripped: set[type] = set() + extract_from_seen: dict[str, str] = {} item_requested: bool = False for cls in to_provide: @@ -344,7 +347,7 @@ async def __call__( # noqa: C901 results.append(result) continue if cls_stripped is Actions and is_typing_annotated(cls): - actions_result: Optional[List[_ActionResult]] + actions_result: list[_ActionResult] | None if "actions" in api_response.raw_api_response: actions_result = [ _ActionResult(**action_result) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 70ec0d25..877e98de 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,7 +1,7 @@ import datetime as dt from base64 import b64decode from copy import copy -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, TypeAlias, cast from scrapy import Request from scrapy.http import Headers, HtmlResponse, Response, TextResponse @@ -26,11 +26,11 @@ class ZyteAPIMixin: "content-encoding", } - def __init__(self, *args, raw_api_response: Optional[Dict] = None, **kwargs): + def __init__(self, *args, raw_api_response: dict | None = None, **kwargs): super().__init__(*args, **kwargs) self._raw_api_response = raw_api_response if not _RESPONSE_HAS_ATTRIBUTES: - self.attributes: Tuple[str, ...] = ( + self.attributes: tuple[str, ...] = ( "url", "status", "headers", @@ -54,7 +54,7 @@ def replace(self, *args, **kwargs): return cls(*args, **kwargs) @property - def raw_api_response(self) -> Optional[Dict]: + def raw_api_response(self) -> dict | None: """Contains the raw API response from Zyte API. For the full list of parameters, see :ref:`zapi-reference`. 
@@ -85,12 +85,12 @@ def _response_cookie_to_header_value(cookie): return result @classmethod - def _prepare_headers(cls, api_response: Dict[str, Any]): - result: Dict[str, List[str]] = {} - input_headers: Optional[List[Dict[str, str]]] = api_response.get( + def _prepare_headers(cls, api_response: dict[str, Any]): + result: dict[str, list[str]] = {} + input_headers: list[dict[str, str]] | None = api_response.get( "httpResponseHeaders" ) - response_cookies: Optional[List[Dict[str, str]]] = api_response.get( + response_cookies: list[dict[str, str]] | None = api_response.get( "experimental", {} ).get("responseCookies") if input_headers: @@ -113,9 +113,7 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): class ZyteAPITextResponse(ZyteAPIMixin, HtmlResponse): @classmethod - def from_api_response( - cls, api_response: Dict, *, request: Optional[Request] = None - ): + def from_api_response(cls, api_response: dict, *, request: Request | None = None): """Alternative constructor to instantiate the response from the raw Zyte API response. """ @@ -146,9 +144,7 @@ def replace(self, *args, **kwargs): class ZyteAPIResponse(ZyteAPIMixin, Response): @classmethod - def from_api_response( - cls, api_response: Dict, *, request: Optional[Request] = None - ): + def from_api_response(cls, api_response: dict, *, request: Request | None = None): """Alternative constructor to instantiate the response from the raw Zyte API response. """ @@ -163,18 +159,18 @@ def from_api_response( ) -_IMMUTABLE_JSON = Union[None, str, int, float, bool] -_JSON = Union[ - None, str, int, float, bool, List["_JSON"], Dict[_IMMUTABLE_JSON, "_JSON"] -] -_API_RESPONSE = Dict[str, _JSON] +_IMMUTABLE_JSON: TypeAlias = None | str | int | float | bool +_JSON: TypeAlias = ( + None | str | int | float | bool | list["_JSON"] | dict[_IMMUTABLE_JSON, "_JSON"] +) +_API_RESPONSE: TypeAlias = dict[str, _JSON] def _process_response( api_response: _API_RESPONSE, request: Request, - cookie_jars: Optional[Dict[Any, CookieJar]], -) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: + cookie_jars: dict[Any, CookieJar] | None, +) -> ZyteAPITextResponse | ZyteAPIResponse | None: """Given a Zyte API Response and the ``scrapy.Request`` that asked for it, this returns either a ``ZyteAPITextResponse`` or ``ZyteAPIResponse`` depending on which if it can properly decode the HTTP Body or have access to browserHtml. 
@@ -196,13 +192,13 @@ def _process_response( if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"): # a plain dict here doesn't work correctly on Scrapy < 2.1 scrapy_headers = Headers() - for header in cast(List[Dict[str, str]], api_response["httpResponseHeaders"]): + for header in cast("list[dict[str, str]]", api_response["httpResponseHeaders"]): scrapy_headers[header["name"].encode()] = header["value"].encode() response_cls = responsetypes.from_args( headers=scrapy_headers, - url=cast(str, api_response["url"]), + url=cast("str", api_response["url"]), # FIXME: update this when python-zyte-api supports base64 decoding - body=b64decode(api_response["httpResponseBody"]), # type: ignore + body=b64decode(api_response["httpResponseBody"]), # type: ignore[arg-type] ) if issubclass(response_cls, TextResponse): return ZyteAPITextResponse.from_api_response(api_response, request=request) diff --git a/scrapy_zyte_api/utils.py b/scrapy_zyte_api/utils.py index 8d7d6085..4c1c5837 100644 --- a/scrapy_zyte_api/utils.py +++ b/scrapy_zyte_api/utils.py @@ -1,12 +1,15 @@ import asyncio +import inspect import sys +from collections.abc import Coroutine from importlib.metadata import version -from typing import Any, Coroutine +from typing import Any, TypeVar, Union from warnings import catch_warnings, filterwarnings import scrapy from packaging.version import Version from scrapy.utils.reactor import is_asyncio_reactor_installed +from twisted.internet.defer import Deferred from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT from .__version__ import __version__ @@ -88,12 +91,6 @@ def _build_from_crawler( try: from scrapy.utils.defer import deferred_to_future, maybe_deferred_to_future except ImportError: # Scrapy < 2.7.0 - import asyncio - from typing import TYPE_CHECKING, TypeVar, Union - from warnings import catch_warnings, filterwarnings - - if TYPE_CHECKING: - from twisted.internet.defer import Deferred def set_asyncio_event_loop(): try: @@ -149,8 +146,6 @@ def _is_asyncio_available() -> bool: # https://github.com/scrapy/scrapy/blob/0b9d8da09dd2cb1b74ddf025107e6f584839fbff/scrapy/utils/defer.py#L525 def _schedule_coro(coro: Coroutine[Any, Any, Any]) -> None: if not _is_asyncio_available(): - from twisted.internet.defer import Deferred - Deferred.fromCoroutine(coro) return loop = asyncio.get_event_loop() @@ -169,10 +164,6 @@ def _close_spider(crawler, reason): except ImportError: # pragma: no cover # Scrapy < 2.14 - import inspect - - from twisted.internet.defer import Deferred - def _ensure_awaitable(o): # type: ignore[no-redef] if isinstance(o, Deferred): return maybe_deferred_to_future(o) diff --git a/tests/__init__.py b/tests/__init__.py index 94b8c764..54d61739 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,7 @@ from contextlib import asynccontextmanager, contextmanager from copy import deepcopy from os import environ -from typing import Any, Dict, Optional +from typing import Any from urllib.request import Request from packaging.version import Version @@ -16,16 +16,16 @@ from scrapy_zyte_api.addon import Addon from scrapy_zyte_api.handler import _ScrapyZyteAPIBaseDownloadHandler from scrapy_zyte_api.utils import ( # type: ignore[attr-defined] + _DOWNLOAD_REQUEST_RETURNS_DEFERRED, _POET_ADDON_SUPPORT, _ensure_awaitable, maybe_deferred_to_future, - _DOWNLOAD_REQUEST_RETURNS_DEFERRED, ) _API_KEY = "a" DEFAULT_CLIENT_CONCURRENCY = AsyncZyteAPI(api_key=_API_KEY).n_conn -SETTINGS_T = Dict[str, Any] +SETTINGS_T = dict[str, Any] SETTINGS: SETTINGS_T 
= { "DOWNLOAD_HANDLERS": { "http": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler", @@ -105,7 +105,7 @@ def get_download_handler(crawler, schema): @asynccontextmanager async def make_handler( - settings: SETTINGS_T, api_url: Optional[str] = None, *, use_addon: bool = False + settings: SETTINGS_T, api_url: str | None = None, *, use_addon: bool = False ): if api_url is not None: settings["ZYTE_API_URL"] = api_url @@ -118,7 +118,7 @@ async def make_handler( yield handler finally: if handler is not None: - await handler._close() # NOQA + await handler._close() def serialize_settings(settings): diff --git a/tests/conftest.py b/tests/conftest.py index 6acd1c42..631a3ae6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,17 +1,15 @@ import pytest +from .mockserver import MockServer + @pytest.fixture(scope="session") def mockserver(): - from .mockserver import MockServer - with MockServer() as server: yield server -@pytest.fixture(scope="function") +@pytest.fixture def fresh_mockserver(): - from .mockserver import MockServer - with MockServer() as server: yield server diff --git a/tests/mockserver.py b/tests/mockserver.py index 2c8f55a9..42cd142b 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -1,4 +1,5 @@ from __future__ import annotations + import argparse import json import socket @@ -8,28 +9,30 @@ from contextlib import asynccontextmanager from importlib import import_module from subprocess import PIPE, Popen -from typing import Dict, List, Optional +from typing import TYPE_CHECKING from urllib.parse import urlparse from scrapy import Request -from twisted.internet import reactor -from twisted.internet.defer import Deferred from twisted.internet.task import deferLater from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET, Site -from scrapy_zyte_api._annotations import _ActionResult, ExtractFrom -from scrapy_zyte_api.responses import _API_RESPONSE +from scrapy_zyte_api._annotations import ExtractFrom, _ActionResult + +from . import SETTINGS, download_request, make_handler + +if TYPE_CHECKING: + from twisted.internet.defer import Deferred -from . 
import SETTINGS, make_handler, download_request + from scrapy_zyte_api.responses import _API_RESPONSE # https://github.com/scrapy/scrapy/blob/02b97f98e74a994ad3e4d74e7ed55207e508a576/tests/mockserver.py#L27C1-L33C19 -def getarg(request, name, default=None, type=None): +def getarg(request, name, default=None, type_=None): if name in request.args: value = request.args[name][0] - if type is not None: - value = type(value) + if type_ is not None: + value = type_(value) return value return default @@ -52,6 +55,8 @@ class LeafResource(Resource): isLeaf = True def deferRequest(self, request, delay, f, *a, **kw): + from twisted.internet import reactor # noqa: PLC0415 + def _cancelrequest(_): # silence CancelledError d.addErrback(lambda _: None) @@ -196,10 +201,7 @@ def render_POST(self, request): break else: headers = request_data.get("requestHeaders", {}) - if "referer" in headers: - referer = headers["referer"] - else: - referer = None + referer = headers.get("referer") if referer is not None: assert isinstance(response_data["httpResponseHeaders"], list) response_data["httpResponseHeaders"].append( @@ -208,7 +210,7 @@ def render_POST(self, request): actions = request_data.get("actions") if actions: - results: List[_ActionResult] = [] + results: list[_ActionResult] = [] for action in actions: result: _ActionResult = { "action": action["action"], @@ -240,9 +242,8 @@ def render_POST(self, request): assert isinstance(response_data["product"], dict) assert isinstance(response_data["product"]["name"], str) extract_from = request_data.get("productOptions", {}).get("extractFrom") - if extract_from: - if extract_from == ExtractFrom.httpResponseBody: - response_data["product"]["name"] += " (from httpResponseBody)" + if extract_from == ExtractFrom.httpResponseBody: + response_data["product"]["name"] += " (from httpResponseBody)" if "geolocation" in request_data: response_data["product"]["name"] += ( @@ -300,11 +301,11 @@ def _delayedRender(self, request, seconds): class MockServer: def __init__(self, resource=None, port=None): resource = resource or DefaultResource - self.resource = "{}.{}".format(resource.__module__, resource.__name__) + self.resource = f"{resource.__module__}.{resource.__name__}" self.proc = None self.host = socket.gethostbyname(socket.gethostname()) self.port = port or get_ephemeral_port() - self.root_url = "http://%s:%d" % (self.host, self.port) + self.root_url = f"http://{self.host}:{self.port}" def __enter__(self): self.proc = Popen( @@ -333,13 +334,15 @@ def urljoin(self, path): return self.root_url + path @asynccontextmanager - async def make_handler(self, settings: Optional[Dict] = None): + async def make_handler(self, settings: dict | None = None): settings = settings or {} async with make_handler(settings, self.urljoin("/")) as handler: yield handler def main(): + from twisted.internet import reactor # noqa: PLC0415 + parser = argparse.ArgumentParser() parser.add_argument("resource") parser.add_argument("--port", type=int) @@ -352,11 +355,7 @@ def main(): def print_listening(): host = http_port.getHost() - print( - "Mock server {} running at http://{}:{}".format( - resource, host.host, host.port - ) - ) + print(f"Mock server {resource} running at http://{host.host}:{host.port}") # Typing issue: https://github.com/twisted/twisted/issues/9909 reactor.callWhenRunning(print_listening) # type: ignore[attr-defined] diff --git a/tests/test_addon.py b/tests/test_addon.py index dfe07ba9..16922b08 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -1,12 +1,9 @@ -from typing 
import Optional, Type - import pytest - from scrapy import Request, Spider -from scrapy.utils.defer import deferred_f_from_coro_f from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler from scrapy.http.response import Response from scrapy.settings.default_settings import TWISTED_REACTOR +from scrapy.utils.defer import deferred_f_from_coro_f from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred, succeed @@ -22,8 +19,8 @@ _POET_ADDON_SUPPORT, ) +from . import download_request, get_download_handler, make_handler, serialize_settings from . import get_crawler as get_crawler_zyte_api -from . import get_download_handler, make_handler, serialize_settings, download_request pytest.importorskip("scrapy.addons") @@ -32,7 +29,7 @@ except ImportError: POET = False InjectionMiddleware = None - ZyteApiProvider: Optional[Type] = None + ZyteApiProvider: type | None = None else: POET = True from scrapy_zyte_api.providers import ZyteApiProvider @@ -200,7 +197,7 @@ def _test_setting_changes(initial_settings, expected_settings): ) @pytest.mark.parametrize( ("initial_settings", "expected_settings"), - ( + [ ( {}, BASE_EXPECTED, @@ -248,7 +245,7 @@ def _test_setting_changes(initial_settings, expected_settings): }, }, ), - ), + ], ) def test_no_poet_setting_changes(initial_settings, expected_settings): _test_setting_changes(initial_settings, expected_settings) @@ -267,7 +264,7 @@ def test_no_poet_setting_changes(initial_settings, expected_settings): ) @pytest.mark.parametrize( ("initial_settings", "expected_settings"), - ( + [ ( {}, { @@ -278,7 +275,7 @@ def test_no_poet_setting_changes(initial_settings, expected_settings): }, }, ), - ), + ], ) def test_poet_setting_changes(initial_settings, expected_settings): _test_setting_changes(initial_settings, expected_settings) diff --git a/tests/test_annotations.py b/tests/test_annotations.py index a6537d00..7d1fac70 100644 --- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize( - "input,expected", + ("input_", "expected"), [ ([], ()), ({}, frozenset()), @@ -38,12 +38,12 @@ ), ], ) -def test_make_hashable(input, expected): - assert make_hashable(input) == expected +def test_make_hashable(input_, expected): + assert make_hashable(input_) == expected @pytest.mark.parametrize( - "input,expected", + ("input_", "expected"), [ ((), []), (frozenset(), {}), @@ -72,12 +72,12 @@ def test_make_hashable(input, expected): ), ], ) -def test_from_hashable(input, expected): - assert _from_hashable(input) == expected +def test_from_hashable(input_, expected): + assert _from_hashable(input_) == expected @pytest.mark.parametrize( - "input,expected", + ("input_", "expected"), [ ([], ()), ([{}], (frozenset(),)), @@ -90,12 +90,12 @@ def test_from_hashable(input, expected): ), ], ) -def test_actions(input, expected): - assert actions(input) == expected +def test_actions(input_, expected): + assert actions(input_) == expected @pytest.mark.parametrize( - "input,options,expected", + ("input_", "options", "expected"), [ ({}, None, (frozenset(), None)), ({"foo": "bar"}, None, (frozenset({("foo", "bar")}), None)), @@ -106,5 +106,5 @@ def test_actions(input, expected): ), ], ) -def test_custom_attrs(input, options, expected): - assert custom_attrs(input, options) == expected +def test_custom_attrs(input_, options, expected): + assert custom_attrs(input_, options) == expected diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 9c3dd15d..24f2afce 100644 --- 
a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -4,11 +4,10 @@ from functools import partial from http.cookiejar import Cookie from inspect import isclass -from typing import Any, Dict, List, Optional, Type, cast +from typing import Any, cast from unittest import mock import pytest -from _pytest.logging import LogCaptureFixture # NOQA from scrapy import Request, Spider from scrapy.crawler import Crawler from scrapy.downloadermiddlewares.cookies import CookiesMiddleware @@ -19,7 +18,7 @@ from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT from scrapy.utils.defer import deferred_f_from_coro_f -from twisted.internet.defer import Deferred +from twisted.internet.defer import Deferred, succeed from zyte_api import RequestError from scrapy_zyte_api._cookies import _get_cookie_jar @@ -36,13 +35,13 @@ DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, + download_request, get_crawler, get_download_handler, get_downloader_middleware, - set_env, - download_request, process_request, process_response, + set_env, ) from .mockserver import DelayedResource, MockServer, produce_request_response @@ -68,7 +67,6 @@ class ParamsDownloadHandler(_ScrapyZyteAPIBaseDownloadHandler): def download_request(self, request: Request, spider: Spider) -> Deferred: params = self._param_parser.parse(request) self._crawler.signals.send_catch_log(params_signal, params=params) - from twisted.internet.defer import succeed return succeed(Response(request.url)) @@ -81,7 +79,7 @@ async def download_request(self, request: Request) -> Response: # type: ignore[ def inject_cookies( - cookies: Optional[List[Dict[str, Any]]], request: Request, crawler: Crawler + cookies: list[dict[str, Any]] | None, request: Request, crawler: Crawler ) -> None: if cookies is None: return @@ -115,10 +113,10 @@ def inject_cookies( async def request_to_params( request: Request, - settings: Optional[SETTINGS_T] = None, + settings: SETTINGS_T | None = None, is_start_request: bool = False, - cookies: Optional[List[Dict[str, Any]]] = None, -) -> Dict[str, Any]: + cookies: list[dict[str, Any]] | None = None, +) -> dict[str, Any]: """Convert a Scrapy request to a Zyte API parameters dictionary.""" start_request = request if is_start_request else Request(url="data:,") @@ -141,7 +139,7 @@ async def parse(self, response): if not is_start_request: yield request - param_sets: List[Dict[str, Any]] = [] + param_sets: list[dict[str, Any]] = [] def track_params(params): param_sets.append(params) @@ -180,7 +178,7 @@ def track_params(params): ], ) @deferred_f_from_coro_f -async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): +async def test_response_binary(meta: dict[str, dict[str, Any]], mockserver): """Test that binary (i.e. non-text) responses from Zyte API are successfully mapped to a subclass of Response that is not also a subclass of TextResponse. @@ -214,7 +212,7 @@ async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): ), ], ) -async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): +async def test_response_html(meta: dict[str, dict[str, Any]], mockserver): """Test that HTML responses from Zyte API are successfully mapped to a subclass of TextResponse. 
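A recurring pattern in the hunks above and below is annotation modernization: `typing.Optional[Dict[str, Any]]` becomes `dict[str, Any] | None`, using built-in generics (PEP 585) and union syntax (PEP 604), backed by the `from __future__ import annotations` line visible at the top of tests/mockserver.py. A minimal runnable sketch of the rewritten style — `build_params` is a hypothetical helper, not code from this patch:

    from __future__ import annotations

    from typing import Any

    # Previously spelled:
    #     def build_params(overrides: Optional[Dict[str, Any]] = None) -> List[str]:
    def build_params(overrides: dict[str, Any] | None = None) -> list[str]:
        # None as the default avoids a mutable default argument.
        overrides = overrides or {}
        return sorted(overrides)

    print(build_params({"browserHtml": True}))  # ['browserHtml']

On interpreters older than 3.10 the `X | None` spelling is only valid inside annotations once the future import is in effect, which is why it accompanies these rewrites.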
@@ -242,7 +240,7 @@ async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): @deferred_f_from_coro_f @pytest.mark.parametrize( - "setting,enabled", + ("setting", "enabled"), [ (UNSET, True), (True, True), @@ -289,7 +287,7 @@ async def test_coro_handling(zyte_api: bool, mockserver): @deferred_f_from_coro_f @pytest.mark.parametrize( - "meta, exception_type, exception_text", + ("meta", "exception_type", "exception_text"), [ ( {"zyte_api": {"echoData": Request("http://test.com")}}, @@ -313,9 +311,9 @@ async def test_coro_handling(zyte_api: bool, mockserver): ], ) async def test_exceptions( - caplog: LogCaptureFixture, - meta: Dict[str, Dict[str, Any]], - exception_type: Type[Exception], + caplog: pytest.LogCaptureFixture, + meta: dict[str, dict[str, Any]], + exception_type: type[Exception], exception_text: str, mockserver, ): @@ -387,9 +385,9 @@ async def parse(self, response): assert response_indexes[0] == expected_first_index -AUTOMAP_PARAMS: Dict[str, Any] = {} +AUTOMAP_PARAMS: dict[str, Any] = {} BROWSER_HEADERS = {b"referer": "referer"} -DEFAULT_PARAMS: Dict[str, Any] = {} +DEFAULT_PARAMS: dict[str, Any] = {} TRANSPARENT_MODE = False SKIP_HEADERS = { b"cookie": ANY_VALUE, @@ -412,9 +410,8 @@ async def parse(self, response): @deferred_f_from_coro_f async def test_params_parser_input_default(mockserver): async with mockserver.make_handler() as handler: - for key in GET_API_PARAMS_KWARGS: + for key, expected in GET_API_PARAMS_KWARGS.items(): actual = getattr(handler._param_parser, f"_{key}") - expected = GET_API_PARAMS_KWARGS[key] assert actual == expected, key @@ -444,7 +441,7 @@ async def test_param_parser_input_custom(mockserver): @deferred_f_from_coro_f @pytest.mark.parametrize( - "output,uses_zyte_api", + ("output", "uses_zyte_api"), [ (None, False), ({}, True), @@ -471,14 +468,14 @@ async def test_param_parser_output_side_effects(output, uses_zyte_api, mockserve handler._fallback_handler.download_request.assert_called() -DEFAULT_AUTOMAP_PARAMS: Dict[str, Any] = { +DEFAULT_AUTOMAP_PARAMS: dict[str, Any] = { "httpResponseBody": True, "httpResponseHeaders": True, } @pytest.mark.parametrize( - "setting,meta,expected", + ("setting", "meta", "expected"), [ (False, None, None), (False, {}, None), @@ -597,7 +594,10 @@ async def test_bad_meta_type(key, value): request = Request(url="https://example.com", meta={key: value}) crawler = await get_crawler() param_parser = _ParamParser(crawler) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="parameters in the request meta should be provided as a dictionary", + ): param_parser.parse(request) @@ -648,7 +648,7 @@ async def test_default_params_none(mockserver, caplog): @pytest.mark.parametrize( - "setting,meta,expected,warnings", + ("setting", "meta", "expected", "warnings"), [ ({}, {}, {}, []), ({}, {"b": 2}, {"b": 2}, []), @@ -672,7 +672,7 @@ async def test_default_params_none(mockserver, caplog): ], ) @pytest.mark.parametrize( - "setting_key,meta_key,ignore_keys", + ("setting_key", "meta_key", "ignore_keys"), [ ("ZYTE_API_DEFAULT_PARAMS", "zyte_api", set()), ( @@ -718,7 +718,7 @@ async def test_default_params_merging( @pytest.mark.parametrize( - "setting,meta", + ("setting", "meta"), [ # append ( @@ -738,7 +738,7 @@ async def test_default_params_merging( ], ) @pytest.mark.parametrize( - "setting_key,meta_key", + ("setting_key", "meta_key"), [ ("ZYTE_API_DEFAULT_PARAMS", "zyte_api"), ( @@ -790,7 +790,7 @@ async def _test_param_processing( @pytest.mark.parametrize( - "meta,expected,warnings", + ("meta", 
"expected", "warnings"), [ # If no other known main output is specified in meta, httpResponseBody # is requested. @@ -878,7 +878,7 @@ async def test_automap_main_outputs(meta, expected, warnings, caplog): @pytest.mark.parametrize( - "meta,expected,warnings", + ("meta", "expected", "warnings"), [ # Test cases where httpResponseHeaders is not specifically set to True # or False, where it is automatically set to True if httpResponseBody @@ -1016,7 +1016,7 @@ async def test_automap_header_output(meta, expected, warnings, caplog): @pytest.mark.parametrize( - "method,meta,expected,warnings", + ("method", "meta", "expected", "warnings"), [ # The GET HTTP method is not mapped, since it is the default method. ( @@ -1249,7 +1249,7 @@ async def test_automap_method(method, meta, expected, warnings, caplog): @pytest.mark.parametrize( - "headers,meta,expected,warnings", + ("headers", "meta", "expected", "warnings"), [ # If httpResponseBody is True, implicitly or explicitly, # Request.headers are mapped as customHttpRequestHeaders. @@ -2320,7 +2320,7 @@ async def test_automap_headers(headers, meta, expected, warnings, caplog): @pytest.mark.parametrize( - "settings,headers,meta,expected,warnings", + ("settings", "headers", "meta", "expected", "warnings"), [ # You may update the ZYTE_API_SKIP_HEADERS setting to remove # headers that the customHttpRequestHeaders parameter starts supporting @@ -2372,7 +2372,7 @@ async def test_automap_header_settings( @pytest.mark.parametrize( - "meta,expected,warnings", + ("meta", "expected", "warnings"), [ ( { @@ -2419,7 +2419,7 @@ async def test_manual_custom_http_request_headers_processing( ) -REQUEST_INPUT_COOKIES_EMPTY: Dict[str, str] = {} +REQUEST_INPUT_COOKIES_EMPTY: dict[str, str] = {} REQUEST_INPUT_COOKIES_MINIMAL_DICT = {"a": "b"} REQUEST_INPUT_COOKIES_MINIMAL_LIST = [{"name": "a", "value": "b"}] REQUEST_INPUT_COOKIES_MAXIMAL = [ @@ -2432,7 +2432,7 @@ async def test_manual_custom_http_request_headers_processing( @pytest.mark.parametrize( - "settings,cookies,meta,params,expected,warnings,cookie_jar", + ("settings", "cookies", "meta", "params", "expected", "warnings", "cookie_jar"), [ # Cookies, both for requests and for responses, are enabled based on # both ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED (default: False) and @@ -2450,7 +2450,7 @@ async def test_manual_custom_http_request_headers_processing( setup_warnings or ( run_time_warnings - if cast(Dict, settings).get("COOKIES_ENABLED", True) + if cast("dict", settings).get("COOKIES_ENABLED", True) else [] ), [], @@ -2513,7 +2513,7 @@ async def test_manual_custom_http_request_headers_processing( "httpResponseHeaders": True, "experimental": { "responseCookies": True, - **cast(Dict, output_cookies), + **cast("dict", output_cookies), }, }, [], @@ -2840,7 +2840,7 @@ async def test_manual_custom_http_request_headers_processing( { "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, }, - input, + input_, {}, {}, { @@ -2854,7 +2854,7 @@ async def test_manual_custom_http_request_headers_processing( [], [], ) - for input, output in ( + for input_, output in ( ( REQUEST_INPUT_COOKIES_MINIMAL_DICT, REQUEST_OUTPUT_COOKIES_MINIMAL, @@ -2943,7 +2943,7 @@ async def test_automap_all_cookies(meta): """Because of scenarios like cross-domain redirects and browser rendering, Zyte API requests should include all cookie jar cookies, regardless of the target URL domain.""" - settings: Dict[str, Any] = { + settings: dict[str, Any] = { "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_TRANSPARENT_MODE": True, } @@ -2983,7 +2983,7 @@ async 
def test_automap_all_cookies(meta): # Have the response set 2 cookies for c.example, with and without a domain, # and a cookie for and d.example. - api_response: Dict[str, Any] = { + api_response: dict[str, Any] = { "url": "https://c.example", "httpResponseBody": "", "statusCode": 200, @@ -3051,7 +3051,7 @@ async def test_automap_cookie_jar(meta): url="https://example.com/3", meta={**meta, "cookiejar": "a"}, cookies={"x": "w"} ) request4 = Request(url="https://example.com/4", meta={**meta, "cookiejar": "a"}) - settings: Dict[str, Any] = { + settings: dict[str, Any] = { "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_TRANSPARENT_MODE": True, } @@ -3103,7 +3103,7 @@ async def test_automap_cookie_jar(meta): ) @deferred_f_from_coro_f async def test_automap_cookie_limit(meta, caplog): - settings: Dict[str, Any] = { + settings: dict[str, Any] = { "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_MAX_COOKIES": 1, "ZYTE_API_TRANSPARENT_MODE": True, @@ -3257,7 +3257,7 @@ async def test_automap_custom_cookie_middleware(): @pytest.mark.parametrize( - "body,meta,expected,warnings", + ("body", "meta", "expected", "warnings"), [ # The body is copied into httpRequestBody, base64-encoded. ( @@ -3333,7 +3333,7 @@ async def test_automap_body(body, meta, expected, warnings, caplog): @pytest.mark.parametrize( - "meta,expected,warnings", + ("meta", "expected", "warnings"), [ # When httpResponseBody, browserHtml, screenshot, automatic extraction # properties, or httpResponseHeaders, are unnecessarily set to False, @@ -3407,7 +3407,7 @@ async def test_automap_default_parameter_cleanup(meta, expected, warnings, caplo @pytest.mark.parametrize( - "default_params,meta,expected,warnings", + ("default_params", "meta", "expected", "warnings"), [ ( {}, @@ -3526,7 +3526,7 @@ async def test_middleware_headers_cb_requests_skip(): request = Request(url="https://example.com") settings = { "ZYTE_API_SKIP_HEADERS": list( - set(header.decode() for header in SKIP_HEADERS) + {header.decode() for header in SKIP_HEADERS} | { "Referer", } @@ -3614,7 +3614,7 @@ async def test_middleware_headers_default_skip(): "User-Agent": DEFAULT_USER_AGENT, }, "ZYTE_API_SKIP_HEADERS": list( - set(header.decode() for header in SKIP_HEADERS) + {header.decode() for header in SKIP_HEADERS} | {*DEFAULT_REQUEST_HEADERS, "Accept-Encoding", "Referer", "User-Agent"} ), "ZYTE_API_TRANSPARENT_MODE": True, @@ -3697,7 +3697,7 @@ async def test_middleware_headers_request_headers_skip(): ) settings = { "ZYTE_API_SKIP_HEADERS": list( - set(header.decode() for header in SKIP_HEADERS) + {header.decode() for header in SKIP_HEADERS} | {*DEFAULT_REQUEST_HEADERS, "Accept-Encoding", "Referer", "User-Agent"} ), "ZYTE_API_TRANSPARENT_MODE": True, @@ -3788,7 +3788,7 @@ async def test_middleware_headers_custom_middleware_before_skip(): request = Request("https://example.com") settings = { "ZYTE_API_SKIP_HEADERS": list( - set(header.decode() for header in SKIP_HEADERS) + {header.decode() for header in SKIP_HEADERS} | {*DEFAULT_REQUEST_HEADERS, "Accept-Encoding", "Referer", "User-Agent"} ), "ZYTE_API_TRANSPARENT_MODE": True, @@ -3817,7 +3817,7 @@ async def test_middleware_headers_request_copy(): @pytest.mark.parametrize( ("extract_from", "headers", "warnings"), - ( + [ *( (extract_from, headers, warnings) for extract_from in (None, "httpResponseBody", "browserHtml") @@ -3859,12 +3859,12 @@ async def test_middleware_headers_request_copy(): ), ) ), - ), + ], ) @deferred_f_from_coro_f async def test_serp_header_mapping(extract_from, headers, warnings, caplog): 
"""serp does not support headers.""" - meta: Dict[str, Any] = {"serp": True} + meta: dict[str, Any] = {"serp": True} if extract_from: meta["serpOptions"] = {"extractFrom": extract_from} request = Request( @@ -3889,7 +3889,7 @@ async def test_serp_header_mapping(extract_from, headers, warnings, caplog): @pytest.mark.parametrize( - "meta,expected,warnings", + ("meta", "expected", "warnings"), [ ( {}, diff --git a/tests/test_handler.py b/tests/test_handler.py index 9c4a8f69..1f0a1312 100644 --- a/tests/test_handler.py +++ b/tests/test_handler.py @@ -9,11 +9,11 @@ from unittest import mock import pytest -from scrapy.utils.defer import deferred_f_from_coro_f from scrapy import Request, Spider from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler from scrapy.exceptions import NotConfigured from scrapy.settings import Settings +from scrapy.utils.defer import deferred_f_from_coro_f from scrapy.utils.test import get_crawler from zyte_api import RetryFactory from zyte_api.constants import API_URL @@ -27,14 +27,22 @@ _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT, _POET_ADDON_SUPPORT, _X402_SUPPORT, - _build_from_crawler, USER_AGENT, + _build_from_crawler, maybe_deferred_to_future, ) -from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, UNSET +from . import ( + DEFAULT_CLIENT_CONCURRENCY, + SETTINGS, + SETTINGS_T, + UNSET, + download_request, + get_download_handler, + make_handler, + set_env, +) from . import get_crawler as get_crawler_zyte_api -from . import get_download_handler, make_handler, set_env, download_request from .mockserver import MockServer try: @@ -45,11 +53,11 @@ @pytest.mark.parametrize( "concurrency", - ( + [ 1, DEFAULT_CLIENT_CONCURRENCY, DEFAULT_CLIENT_CONCURRENCY + 1, - ), + ], ) @deferred_f_from_coro_f async def test_concurrency_configuration(concurrency): @@ -71,7 +79,7 @@ async def test_concurrency_configuration(concurrency): @pytest.mark.parametrize( ("scenario", "expected"), - ( + [ ( {}, NotConfigured, @@ -150,7 +158,7 @@ async def test_concurrency_configuration(concurrency): if _X402_SUPPORT else NotConfigured, ), - ), + ], ) def test_auth(scenario: dict[str, Any], expected: type[Exception] | dict[str, str]): env = scenario.get("env", {}) @@ -183,8 +191,8 @@ def build_hander(): @pytest.mark.parametrize( - "setting,expected", - ( + ("setting", "expected"), + [ ( UNSET, API_URL, @@ -205,7 +213,7 @@ def build_hander(): "https://api.example.com", "https://api.example.com", ), - ), + ], ) def test_api_url(setting, expected): settings: SETTINGS_T = {"ZYTE_API_KEY": "a"} @@ -231,7 +239,7 @@ def test_custom_client(): @deferred_f_from_coro_f @pytest.mark.parametrize( - "settings,meta,expected", + ("settings", "meta", "expected"), [ ({}, {}, None), ( @@ -285,7 +293,7 @@ async def test_retry_policy( @pytest.mark.parametrize( ("settings", "meta", "is_set"), - ( + [ ({}, {"zyte_api": {"foo": "bar"}}, True), ( {}, @@ -328,7 +336,7 @@ async def test_retry_policy( {"autothrottle_dont_adjust_delay": False}, True, ), - ), + ], ) @deferred_f_from_coro_f async def test_download_latency(settings, meta, is_set, mockserver): @@ -434,7 +442,7 @@ def test_single_client(): @deferred_f_from_coro_f @pytest.mark.parametrize( - "settings,enabled", + ("settings", "enabled"), [ ({}, False), ({"ZYTE_API_LOG_REQUESTS": False}, False), @@ -459,7 +467,7 @@ async def test_log_request_toggle( @deferred_f_from_coro_f @pytest.mark.parametrize( - "settings,short_str,long_str,truncated_str", + ("settings", "short_str", "long_str", "truncated_str"), [ ({}, "a" * 64, "a" * 65, "a" * 63 + 
"..."), ({"ZYTE_API_LOG_REQUESTS_TRUNCATE": 0}, "a" * 64, "a" * 65, "a" * 65), @@ -551,7 +559,9 @@ def test_log_request_truncate_negative(enabled): "ZYTE_API_LOG_REQUESTS_TRUNCATE": -1, } crawler = get_crawler(settings_dict=settings) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match=r"ZYTE_API_LOG_REQUESTS_TRUNCATE setting \(-1\) is invalid" + ): _build_from_crawler(ScrapyZyteAPIDownloadHandler, crawler) @@ -571,8 +581,8 @@ async def test_trust_env(enabled): @pytest.mark.parametrize( - "user_agent,expected", - ( + ("user_agent", "expected"), + [ ( None, USER_AGENT, @@ -581,7 +591,7 @@ async def test_trust_env(enabled): "zyte-crawlers/0.0.1", "zyte-crawlers/0.0.1", ), - ), + ], ) def test_user_agent_for_build_client(user_agent, expected): settings: Settings = Settings( @@ -680,7 +690,7 @@ async def test_fallback_setting(): @pytest.mark.parametrize( - "body_size, warnsize, maxsize, expected_result, expected_warnings", + ("body_size", "warnsize", "maxsize", "expected_result", "expected_warnings"), [ # Warning only (exceeds warnsize but not maxsize) ( @@ -722,7 +732,7 @@ def test_body_max_size_exceeded( if expected_warnings: for call, expected_warning in zip( - logger.warning.call_args_list, expected_warnings + logger.warning.call_args_list, expected_warnings, strict=True ): assert call[0][0] == expected_warning else: @@ -731,7 +741,7 @@ def test_body_max_size_exceeded( @deferred_f_from_coro_f @pytest.mark.parametrize( - "body_size, warnsize, maxsize, expect_null", + ("body_size", "warnsize", "maxsize", "expect_null"), [ (500, None, None, False), # No limits, should return response ( @@ -761,17 +771,19 @@ async def test_download_request_limits( mock_api_response = mock.Mock(body=b"x" * body_size) # Patch the `from_api_response` method of ZyteAPITextResponse only for the test - with mock.patch.object( - ZyteAPITextResponse, "from_api_response", return_value=mock_api_response - ): - with mock.patch( + with ( + mock.patch.object( + ZyteAPITextResponse, "from_api_response", return_value=mock_api_response + ), + mock.patch( "scrapy_zyte_api.responses._process_response", return_value=mock_api_response, - ): - request = Request("https://example.com") - result = await handler._download_request({}, request) - - if expect_null: - assert result is None - else: - assert result is not None + ), + ): + request = Request("https://example.com") + result = await handler._download_request({}, request) + + if expect_null: + assert result is None + else: + assert result is not None diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index ee953b20..0fe657ff 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, cast +from typing import Any, cast from unittest import SkipTest import pytest @@ -16,11 +16,11 @@ from scrapy_zyte_api.utils import ( # type: ignore[attr-defined] _GET_SLOT_NEEDS_SPIDER, _PROCESS_SPIDER_OUTPUT_ASYNC_SUPPORT, + _PROCESS_SPIDER_OUTPUT_REQUIRES_SPIDER, + _PROCESS_START_REQUIRES_SPIDER, _START_REQUESTS_CAN_YIELD_ITEMS, _build_from_crawler, maybe_deferred_to_future, - _PROCESS_SPIDER_OUTPUT_REQUIRES_SPIDER, - _PROCESS_START_REQUIRES_SPIDER, ) from . 
import SETTINGS, process_request @@ -35,7 +35,7 @@ async def request_processor(middleware, request: Request): assert await process_request(middleware, request) is None -async def aiter(list_): +async def aiter_(list_): for item in list_: yield item @@ -45,7 +45,7 @@ async def start_request_processor(middleware, request: Request): args = (None,) if _PROCESS_START_REQUIRES_SPIDER else () result = [ request - async for request in middleware.process_start(aiter([request]), *args) + async for request in middleware.process_start(aiter_([request]), *args) ] else: result = list(middleware.process_start_requests([request], None)) @@ -59,7 +59,7 @@ async def spider_output_processor(middleware, request: Request): result = [ request async for request in middleware.process_spider_output_async( - response, aiter([request]), *args + response, aiter_([request]), *args ) ] else: @@ -68,7 +68,7 @@ async def spider_output_processor(middleware, request: Request): @pytest.mark.parametrize( - ["mw_cls", "processor"], + ("mw_cls", "processor"), [ (ScrapyZyteAPIDownloaderMiddleware, request_processor), (ScrapyZyteAPISpiderMiddleware, start_request_processor), @@ -76,7 +76,7 @@ async def spider_output_processor(middleware, request: Request): ], ) @pytest.mark.parametrize( - ["settings", "preserve"], + ("settings", "preserve"), [ ({}, True), ({"ZYTE_API_PRESERVE_DELAY": False}, False), @@ -233,7 +233,7 @@ async def start(self): yield request def start_requests(self): - for i in range(spider_requests): + for _ in range(spider_requests): meta = {"zyte_api": {"browserHtml": True}} yield Request("https://example.com", meta=meta, dont_filter=True) @@ -403,7 +403,7 @@ def parse(self, response): @pytest.mark.parametrize( - "setting,attribute,conflict", + ("setting", "attribute", "conflict"), [ (None, None, False), (None, False, False), @@ -418,10 +418,7 @@ def parse(self, response): ) @deferred_f_from_coro_f async def test_spm_conflict_smartproxy(setting, attribute, conflict): - try: - import scrapy_zyte_smartproxy # noqa: F401 - except ImportError: - raise SkipTest("scrapy-zyte-smartproxy missing") + pytest.importorskip("scrapy_zyte_smartproxy") class SPMSpider(Spider): name = "spm_spider" @@ -435,7 +432,7 @@ class SPMSpider(Spider): "ZYTE_SMARTPROXY_APIKEY": "foo", **SETTINGS, } - mws = dict(cast(Dict[Any, int], settings["DOWNLOADER_MIDDLEWARES"])) + mws = dict(cast("dict[Any, int]", settings["DOWNLOADER_MIDDLEWARES"])) mws["scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware"] = 610 settings["DOWNLOADER_MIDDLEWARES"] = mws @@ -450,7 +447,7 @@ class SPMSpider(Spider): try: - import scrapy_crawlera # noqa: F401 + import scrapy_crawlera except ImportError: scrapy_crawlera = None SCRAPY_CRAWLERA_VERSION = Version("1.2.3") @@ -459,7 +456,7 @@ class SPMSpider(Spider): @pytest.mark.parametrize( - "setting,attribute,conflict", + ("setting", "attribute", "conflict"), [ (None, None, False), (None, False, False), @@ -469,7 +466,7 @@ class SPMSpider(Spider): (False, True, True), (True, None, True), # https://github.com/scrapy-plugins/scrapy-zyte-smartproxy/commit/49ebedd8b1d48cf2667db73f18da3e2c2c7fbfa7 - (True, False, SCRAPY_CRAWLERA_VERSION < Version("1.7")), + (True, False, SCRAPY_CRAWLERA_VERSION < Version("1.7")), # noqa: SIM300 (True, True, True), ], ) @@ -490,7 +487,7 @@ class CrawleraSpider(Spider): "CRAWLERA_APIKEY": "foo", **SETTINGS, } - mws = dict(cast(Dict[Any, int], settings["DOWNLOADER_MIDDLEWARES"])) + mws = dict(cast("dict[Any, int]", settings["DOWNLOADER_MIDDLEWARES"])) mws["scrapy_crawlera.CrawleraMiddleware"] = 610 
settings["DOWNLOADER_MIDDLEWARES"] = mws diff --git a/tests/test_providers.py b/tests/test_providers.py index 472e631c..6dfa9c19 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,14 +7,15 @@ pytest.importorskip("scrapy_poet") import attrs -from scrapy.utils.defer import deferred_f_from_coro_f from scrapy import Request, Spider +from scrapy.statscollectors import MemoryStatsCollector +from scrapy.utils.defer import deferred_f_from_coro_f from scrapy_poet import DummyResponse from scrapy_poet.utils.testing import HtmlResource, crawl_single_item from twisted.internet import reactor -from twisted.web.client import Agent from twisted.internet.defer import Deferred from twisted.internet.protocol import Protocol +from twisted.web.client import Agent from web_poet import ( AnyResponse, BrowserHtml, @@ -147,12 +148,12 @@ async def test_provider(mockserver): assert item["html"] == "Hello

World!

" assert item["response_html"] == "Hello

World!

" assert item["product"] == Product.from_dict( - dict( - url=url, - name="Product name", - price="10", - currency="USD", - ) + { + "url": url, + "name": "Product name", + "price": "10", + "currency": "USD", + } ) @@ -190,7 +191,7 @@ def parse_( # type: ignore[override] settings = deepcopy(SETTINGS) settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100} - item, url, _ = await _crawl_single_item( + item, *_ = await _crawl_single_item( ItemDepSpider, HtmlResource, settings, port=port ) count_resp = await maybe_deferred_to_future( @@ -217,7 +218,7 @@ def parse_(self, response: DummyResponse, product: Product, my_item: MyItem): # settings = deepcopy(SETTINGS) settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100} - item, url, _ = await _crawl_single_item( + item, *_ = await _crawl_single_item( ItemDepSpider, HtmlResource, settings, port=port ) count_resp = await maybe_deferred_to_future( @@ -251,7 +252,7 @@ def parse_( # type: ignore[override] settings = deepcopy(SETTINGS) settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1} - item, url, _ = await _crawl_single_item( + item, *_ = await _crawl_single_item( ItemDepSpider, HtmlResource, settings, port=port ) count_resp = await maybe_deferred_to_future( @@ -327,12 +328,12 @@ def parse_(self, response: DummyResponse, page: AnnotatedProductPage): # type: AnnotatedZyteAPISpider, HtmlResource, settings ) assert item["product"] == Product.from_dict( - dict( - url=url, - name="Product name (from httpResponseBody)", - price="10", - currency="USD", - ) + { + "url": url, + "name": "Product name (from httpResponseBody)", + "price": "10", + "currency": "USD", + } ) @@ -383,12 +384,12 @@ def parse_(self, response: DummyResponse, page: AnnotatedProductPage): # type: AnnotatedZyteAPISpider, HtmlResource, settings ) assert item["product"] == Product.from_dict( - dict( - url=url, - name="Product name", - price="10", - currency="USD", - ) + { + "url": url, + "name": "Product name", + "price": "10", + "currency": "USD", + } ) @@ -409,7 +410,7 @@ def parse_(self, response: DummyResponse, page: GeoProductPage): # type: ignore settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - item, url, _ = await _crawl_single_item(GeoZyteAPISpider, HtmlResource, settings) + item, *_ = await _crawl_single_item(GeoZyteAPISpider, HtmlResource, settings) assert item["product"].name == "Product name (country DE)" @@ -428,7 +429,7 @@ def parse_(self, response: DummyResponse, page: GeoProductPage): # type: ignore settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - item, url, _ = await _crawl_single_item(GeoZyteAPISpider, HtmlResource, settings) + item, *_ = await _crawl_single_item(GeoZyteAPISpider, HtmlResource, settings) assert item is None assert "Geolocation dependencies must be annotated" in caplog.text @@ -470,12 +471,12 @@ def parse_(self, response: DummyResponse, page: CustomAttrsPage): # type: ignor CustomAttrsZyteAPISpider, HtmlResource, settings ) assert item["product"] == Product.from_dict( - dict( - url=url, - name="Product name", - price="10", - currency="USD", - ) + { + "url": url, + "name": "Product name", + "price": "10", + "currency": "USD", + } ) assert item["custom_attrs"] == CustomAttributes.from_dict( { @@ -513,12 +514,12 @@ def parse_(self, response: DummyResponse, page: 
CustomAttrsPage): # type: ignor CustomAttrsZyteAPISpider, HtmlResource, settings ) assert item["product"] == Product.from_dict( - dict( - url=url, - name="Product name", - price="10", - currency="USD", - ) + { + "url": url, + "name": "Product name", + "price": "10", + "currency": "USD", + } ) assert item["custom_attrs"] == { "attr1": "foo", @@ -1076,7 +1077,7 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - item, url, _ = await _crawl_single_item(ActionZyteAPISpider, HtmlResource, settings) + item, *_ = await _crawl_single_item(ActionZyteAPISpider, HtmlResource, settings) assert isinstance(item["product"], Product) assert item["action_results"] == Actions( [ @@ -1126,8 +1127,6 @@ async def test_auto_field_stats_no_override(mockserver): """When requesting an item directly from Zyte API, without an override to change fields, stats reflect the entire list of item fields.""" - from scrapy.statscollectors import MemoryStatsCollector - duplicate_stat_calls: defaultdict[str, int] = defaultdict(int) class OnlyOnceStatsCollector(MemoryStatsCollector): @@ -1659,17 +1658,17 @@ async def test_multiple_types(mockserver): assert item["html"] == "<html><body>Hello<h1>World!</h1></body></html>" assert item["response_html"] == "<html><body>Hello<h1>World!</h1></body></html>
" assert item["product"] == Product.from_dict( - dict( - url=url, - name="Product name", - price="10", - currency="USD", - ) + { + "url": url, + "name": "Product name", + "price": "10", + "currency": "USD", + } ) assert item["productNavigation"] == ProductNavigation.from_dict( - dict( - url=url, - name="Product navigation", - pageNumber=0, - ) + { + "url": url, + "name": "Product navigation", + "pageNumber": 0, + } ) diff --git a/tests/test_referer.py b/tests/test_referer.py index fd823a23..fa91942a 100644 --- a/tests/test_referer.py +++ b/tests/test_referer.py @@ -1,8 +1,8 @@ from copy import deepcopy import pytest -from scrapy.utils.defer import deferred_f_from_coro_f from scrapy import Spider, signals +from scrapy.utils.defer import deferred_f_from_coro_f from scrapy.utils.test import get_crawler from scrapy_zyte_api.utils import _POET_ADDON_SUPPORT, maybe_deferred_to_future @@ -19,13 +19,13 @@ @pytest.mark.parametrize( ("settings", "meta", "headers", "expected"), - ( + [ # Default behavior of non-Zyte-API, transparent/automap, and manual # Zyte API requests. ({}, {}, {}, True), (SETTINGS, {"zyte_api_automap": False}, {}, True), (SETTINGS, {"zyte_api_automap": True}, {}, False), - (SETTINGS, {}, {}, False if ADDON_SUPPORT else True), + (SETTINGS, {}, {}, not ADDON_SUPPORT), ( SETTINGS, {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}}, @@ -261,7 +261,7 @@ {}, "https://example.com", ), - ), + ], ) @deferred_f_from_coro_f async def test_main(settings, meta, headers, expected, mockserver): diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 722d1740..686a1039 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -2,8 +2,8 @@ import pytest from packaging.version import Version -from scrapy.utils.defer import deferred_f_from_coro_f from scrapy import __version__ as SCRAPY_VERSION +from scrapy.utils.defer import deferred_f_from_coro_f if Version(SCRAPY_VERSION) < Version("2.7"): pytest.skip("Skipping tests for Scrapy ≥ 2.7", allow_module_level=True) @@ -65,7 +65,7 @@ async def test_poet_installed_but_disabled(caplog): """If the scrapy-poet package is installed but its main middleware, InjectionMiddleware, is not set in DOWNLOADER_MIDDLEWARES, do not try to use its API for request fingerprinting.""" - from web_poet import WebPage + from web_poet import WebPage # noqa: PLC0415 no_deps_request = Request("https://example.com") @@ -127,8 +127,8 @@ async def test_headers(): @pytest.mark.parametrize( - "url,params,fingerprint", - ( + ("url", "params", "fingerprint"), + [ ( "https://example.com", {}, @@ -209,7 +209,7 @@ async def test_headers(): {"actions": [{"action": "click", "selector": ".button"}]}, b"\x83\xfa\x04\xfal\xc6d(\xe1\x06\xf1>b\xed\xbe\xb1\xf2\xac5E", ), - ), + ], ) @deferred_f_from_coro_f async def test_known_fingerprints(url, params, fingerprint): @@ -300,8 +300,8 @@ async def test_only_end_parameters_matter(): @pytest.mark.parametrize( - "url1,url2,match", - ( + ("url1", "url2", "match"), + [ ( "https://example.com", "https://example.com", @@ -342,7 +342,7 @@ async def test_only_end_parameters_matter(): "https://example.com#2", True, ), - ), + ], ) @deferred_f_from_coro_f async def test_url(url1, url2, match): @@ -363,8 +363,8 @@ def merge_dicts(*dicts): @pytest.mark.parametrize( - "params,match", - ( + ("params", "match"), + [ # As long as browserHtml or screenshot are True, different fragments # make for different fingerprints, regardless of other parameters. 
Same # for extraction types if browserHtml is set in *Options.extractFrom. @@ -476,7 +476,7 @@ def merge_dicts(*dicts): {"browserHtml": False, "screenshot": False}, ) ), - ), + ], ) @deferred_f_from_coro_f async def test_url_fragments(params, match): @@ -525,7 +525,7 @@ async def test_request_body(): async def test_deps(): """Test that some injected dependencies do not affect fingerprinting at all (e.g. HttpClient) while others do (e.g. WebPage).""" - from web_poet import HttpClient, WebPage + from web_poet import HttpClient, WebPage # noqa: PLC0415 request = Request("https://example.com") raw_request = Request( @@ -730,7 +730,7 @@ async def test_page_params(): @pytest.mark.parametrize( ("settings", "meta1", "meta2", "fingerprint_matches"), - ( + [ # Session pool IDs affect fingerprinting, but session initialization # parameters do not. # @@ -855,7 +855,7 @@ async def test_page_params(): }, True, ), - ), + ], ) @deferred_f_from_coro_f async def test_session_pool_ids(settings, meta1, meta2, fingerprint_matches): diff --git a/tests/test_responses.py b/tests/test_responses.py index 241672b6..f046cc69 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -1,7 +1,7 @@ from base64 import b64encode from collections import defaultdict from functools import partial -from typing import Any, Dict, cast +from typing import Any, cast import pytest from scrapy import Request @@ -110,7 +110,7 @@ def raw_api_response_mixed(): @pytest.mark.parametrize( - "api_response,cls", + ("api_response", "cls"), [ (raw_api_response_browser, ZyteAPITextResponse), (raw_api_response_body, ZyteAPIResponse), @@ -134,7 +134,7 @@ def test_init(api_response, cls): @pytest.mark.parametrize( - "api_response,cls,content_length", + ("api_response", "cls", "content_length"), [ (raw_api_response_browser, ZyteAPITextResponse, 44), (raw_api_response_body, ZyteAPIResponse, 44), @@ -164,7 +164,7 @@ def test_text_from_api_response(api_response, cls, content_length): @pytest.mark.parametrize( - "api_response,cls", + ("api_response", "cls"), [ (raw_api_response_browser, ZyteAPITextResponse), (raw_api_response_body, ZyteAPIResponse), @@ -189,7 +189,9 @@ def test_response_replace(api_response, cls): } # Attempting to replace the raw_api_response value would raise an error - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="Replacing the value of 'raw_api_response' isn't allowed" + ): orig_response.replace(raw_api_response=new_raw_api_response) @@ -222,7 +224,7 @@ def format_to_httpResponseBody(body, encoding="utf-8"): @pytest.mark.parametrize( - "api_response,cls", + ("api_response", "cls"), [ (raw_api_response_browser, ZyteAPITextResponse), (raw_api_response_body, ZyteAPIResponse), @@ -259,14 +261,14 @@ def test_response_headers_removal(api_response, cls): @pytest.mark.parametrize( - "fields,cls,keep", + ("fields", "cls", "keep"), [ # Only keep the Set-Cookie header if experimental.responseCookies is # not received. 
*( ( { - **cast(Dict[Any, Any], output_fields), + **cast("dict[Any, Any]", output_fields), "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, @@ -343,7 +345,7 @@ def test__process_response_no_body(): "product": {"name": "shoes"}, } - resp = _process_response(api_response, Request(cast(str, api_response["url"]))) + resp = _process_response(api_response, Request(cast("str", api_response["url"]))) assert isinstance(resp, Response) assert resp.body == b"" @@ -400,7 +402,7 @@ def test__process_response_body_only_infer_encoding(): @pytest.mark.parametrize( - "encoding,content_type", + ("encoding", "content_type"), [ ("utf-8", "text/html; charset=UTF-8"), ("gb18030", "text/html; charset=gb2312"), @@ -425,7 +427,7 @@ def test__process_response_body_and_headers(encoding, content_type): @pytest.mark.parametrize( - "body,expected,actual_encoding,inferred_encoding", + ("body", "expected", "actual_encoding", "inferred_encoding"), [ ("plain", "plain", "cp1252", "cp1252"), ( @@ -512,7 +514,7 @@ def test__process_response_non_text(): } ], } - resp = _process_response(api_response, Request(cast(str, api_response["url"]))) + resp = _process_response(api_response, Request(cast("str", api_response["url"]))) assert isinstance(resp, Response) with pytest.raises(NotSupported): @@ -554,7 +556,7 @@ def test__process_response_browserhtml(api_response): ], ) @pytest.mark.parametrize( - "kwargs,expected_status_code", + ("kwargs", "expected_status_code"), [ ({}, 200), ({"statusCode": 200}, 200), diff --git a/tests/test_sessions.py b/tests/test_sessions.py index 9341938c..846d6631 100644 --- a/tests/test_sessions.py +++ b/tests/test_sessions.py @@ -1,15 +1,15 @@ from collections import deque from copy import copy, deepcopy from math import floor -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any from unittest.mock import patch import pytest from aiohttp.client_exceptions import ServerConnectionError -from scrapy.utils.defer import deferred_f_from_coro_f from scrapy import Request, Spider, signals from scrapy.exceptions import CloseSpider from scrapy.http import Response +from scrapy.utils.defer import deferred_f_from_coro_f from scrapy.utils.httpobj import urlparse_cached from scrapy.utils.misc import load_object from zyte_api import RequestError @@ -37,7 +37,7 @@ @pytest.mark.parametrize( ("setting", "meta", "outcome"), - ( + [ (UNSET, UNSET, False), (UNSET, True, True), (UNSET, False, False), @@ -47,7 +47,7 @@ (False, UNSET, False), (False, True, True), (False, False, False), - ), + ], ) @deferred_f_from_coro_f async def test_enabled(setting, meta, outcome, mockserver): @@ -92,7 +92,7 @@ def parse(self, response): @pytest.mark.parametrize( ("params_setting", "params_meta", "location_setting", "location_meta", "outcome"), - ( + [ (UNSET, UNSET, UNSET, UNSET, False), (UNSET, UNSET, UNSET, None, False), (UNSET, UNSET, UNSET, False, False), @@ -201,7 +201,7 @@ def parse(self, response): (True, True, True, None, True), (True, True, True, False, True), (True, True, True, True, True), - ), + ], ) @deferred_f_from_coro_f async def test_params_precedence( @@ -222,7 +222,7 @@ async def test_params_precedence( "ZYTE_API_SESSION_ENABLED": True, "ZYTE_API_SESSION_MAX_BAD_INITS": 1, } - meta: Dict[str, Any] = {} + meta: dict[str, Any] = {} if params_setting is not UNSET: settings["ZYTE_API_SESSION_PARAMS"] = { @@ -298,7 +298,7 @@ def parse(self, response): @pytest.mark.parametrize( ("params", "close_reason", "stats"), - ( 
+ [ ( {"browserHtml": True}, "bad_session_inits", @@ -313,7 +313,7 @@ def parse(self, response): "scrapy-zyte-api/sessions/pools/forbidden.example/init/check-passed": 1, }, ), - ), + ], ) @deferred_f_from_coro_f async def test_url_override(params, close_reason, stats, mockserver): @@ -436,7 +436,7 @@ def check(self, response: Response, request: Request) -> bool: # subclasses for the crawler classes because the init use is enough to verify # that using the crawler works. -CHECKER_TESTS: Tuple[Tuple[str, str, Dict[str, int]], ...] = ( +CHECKER_TESTS: tuple[tuple[str, str, dict[str, int]], ...] = ( ( "tests.test_sessions.TrueChecker", "finished", @@ -506,7 +506,7 @@ def check(self, response: Response, request: Request) -> bool: @pytest.mark.parametrize( ("checker", "close_reason", "stats"), - ( + [ *CHECKER_TESTS, *( pytest.param( @@ -523,7 +523,7 @@ def check(self, response: Response, request: Request) -> bool: ) for checker, close_reason, stats in CHECKER_TESTS ), - ), + ], ) @deferred_f_from_coro_f async def test_checker(checker, close_reason, stats, mockserver): @@ -559,7 +559,7 @@ def closed(self, reason): @pytest.mark.parametrize( ("postal_code", "url", "close_reason", "stats"), - ( + [ ( None, "https://postal-code-10001-soft.example", @@ -592,7 +592,7 @@ def closed(self, reason): "unsupported_set_location", {}, ), - ), + ], ) @deferred_f_from_coro_f async def test_checker_location(postal_code, url, close_reason, stats, mockserver): @@ -691,12 +691,12 @@ def closed(self, reason): @pytest.mark.parametrize( ("setting", "value"), - ( + [ (0, 1), (1, 1), (2, 2), (None, 8), - ), + ], ) @deferred_f_from_coro_f async def test_max_bad_inits(setting, value, mockserver): @@ -730,12 +730,12 @@ def parse(self, response): @pytest.mark.parametrize( ("global_setting", "pool_setting", "value"), - ( + [ (None, 0, 1), (None, 1, 1), (None, 2, 2), (3, None, 3), - ), + ], ) @deferred_f_from_coro_f async def test_max_bad_inits_per_pool(global_setting, pool_setting, value, mockserver): @@ -776,12 +776,12 @@ def parse(self, response): @pytest.mark.parametrize( ("setting", "value"), - ( + [ (None, 1), (0, 1), (1, 1), (2, 2), - ), + ], ) @deferred_f_from_coro_f async def test_max_check_failures(setting, value, mockserver): @@ -824,12 +824,12 @@ def parse(self, response): @pytest.mark.parametrize( ("setting", "value"), - ( + [ (None, 1), (0, 1), (1, 1), (2, 2), - ), + ], ) @deferred_f_from_coro_f async def test_max_errors(setting, value, mockserver): @@ -915,7 +915,7 @@ def parse(self, response): @pytest.mark.parametrize( ("meta", "pool"), - ( + [ ({}, "example.com"), ({"zyte_api_session_location": {"postalCode": "10001"}}, "example.com@10001"), ( @@ -957,7 +957,7 @@ def parse(self, response): }, "foo", ), - ), + ], ) @deferred_f_from_coro_f async def test_pool(meta, pool, mockserver): @@ -1060,11 +1060,11 @@ def parse(self, response): @pytest.mark.parametrize( ("setting", "value"), - ( + [ (1, 1), (2, 2), (None, 8), - ), + ], ) @deferred_f_from_coro_f async def test_pool_size(setting, value, mockserver): @@ -1098,11 +1098,11 @@ def parse(self, response): @pytest.mark.parametrize( ("global_setting", "pool_setting", "value"), - ( + [ (None, 1, 1), (None, 2, 2), (3, None, 3), - ), + ], ) @deferred_f_from_coro_f async def test_pool_sizes(global_setting, pool_setting, value, mockserver): @@ -1141,7 +1141,7 @@ def parse(self, response): def mock_request_error(*, status=200, response_content=None): - kwargs: Dict[str, Any] = {} + kwargs: dict[str, Any] = {} if _REQUEST_ERROR_HAS_QUERY: kwargs["query"] = {} return 
RequestError( @@ -1164,7 +1164,7 @@ def __init__(self, time): @pytest.mark.parametrize( ("retrying", "outcomes", "exhausted"), - ( + [ *( (retry_policy, outcomes, exhausted) for retry_policy in ( @@ -1190,7 +1190,7 @@ def __init__(self, time): ), ) ), - ), + ], ) @deferred_f_from_coro_f @patch("time.monotonic") @@ -1222,7 +1222,7 @@ async def run(): await run() except Exception as outcome: assert exhausted - assert outcome is last_outcome + assert outcome is last_outcome # noqa: PT017 else: assert not exhausted @@ -1237,7 +1237,7 @@ async def run(): @pytest.mark.parametrize( ("manual_settings", "addon_settings"), - ( + [ ( {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, {}, @@ -1268,7 +1268,7 @@ async def run(): {"ZYTE_API_RETRY_POLICY": "tests.test_sessions.UNSET"}, {"ZYTE_API_RETRY_POLICY": "tests.test_sessions.UNSET"}, ), - ), + ], ) @deferred_f_from_coro_f @pytest.mark.skipif( @@ -1522,7 +1522,7 @@ def parse(self, response): @pytest.mark.parametrize( ("settings", "meta", "used"), - ( + [ ({}, {}, True), ( { @@ -1548,7 +1548,7 @@ def parse(self, response): False, ), ({}, {"zyte_api_session_location": {"postalCode": "10002"}}, False), - ), + ], ) @deferred_f_from_coro_f async def test_session_config_location(settings, meta, used, mockserver): @@ -1631,7 +1631,7 @@ def parse(self, response): @pytest.mark.parametrize( ("settings", "meta", "used"), - ( + [ ({}, {}, True), ( { @@ -1657,7 +1657,7 @@ def parse(self, response): False, ), ({}, {"zyte_api_session_location": {"postalCode": "10002"}}, True), - ), + ], ) @deferred_f_from_coro_f async def test_session_config_location_bad(settings, meta, used, mockserver): @@ -1889,7 +1889,7 @@ def parse(self, response): @pytest.mark.parametrize( ("meta", "settings", "pool", "outcome"), - ( + [ ({}, {}, "postal-code-10001.example", False), ( { @@ -1933,7 +1933,7 @@ def parse(self, response): "postal-code-10001.example", False, ), - ), + ], ) @deferred_f_from_coro_f async def test_session_config_params_precedence( @@ -2157,7 +2157,7 @@ def closed(self, reason): async def test_session_config_no_web_poet(mockserver): """If web-poet is not installed, @session_config raises a RuntimeError.""" try: - import web_poet # noqa: F401 + import web_poet # noqa: F401, PLC0415 except ImportError: pass else: @@ -2186,7 +2186,7 @@ def check(self, response: Response, request: Request) -> bool: self.session_data[session_id] = {"foo": "bar"} return super().check(response, request) - def process_request(self, request: Request) -> Optional[Request]: + def process_request(self, request: Request) -> Request | None: session_id = get_request_session_id(request) foo = self.session_data[session_id]["foo"] request.headers["foo"] = foo @@ -2243,7 +2243,7 @@ def check(self, response: Response, request: Request) -> bool: self.session_data[session_id] = {"foo": "bar"} return super().check(response, request) - def process_request(self, request: Request) -> Optional[Request]: + def process_request(self, request: Request) -> Request | None: session_id = get_request_session_id(request) foo = self.session_data[session_id]["foo"] new_url = request.url.rstrip("/") + f"/{foo}" @@ -2298,8 +2298,8 @@ async def test_location_session_config(mockserver): ) class CustomSessionConfig(LocationSessionConfig): def location_params( - self, request: Request, location: Dict[str, Any] - ) -> Dict[str, Any]: + self, request: Request, location: dict[str, Any] + ) -> dict[str, Any]: assert location == {"postalCode": "10002"} return { "actions": [ @@ -2311,7 +2311,7 @@ def 
location_params( } def location_check( - self, response: Response, request: Request, location: Dict[str, Any] + self, response: Response, request: Request, location: dict[str, Any] ) -> bool: assert location == {"postalCode": "10002"} domain = urlparse_cached(request).netloc @@ -2485,14 +2485,14 @@ async def test_location_session_config_no_location(mockserver): @session_config(["postal-code-10001.example", "a.example"]) class CustomSessionConfig(LocationSessionConfig): def location_params( - self, request: Request, location: Dict[str, Any] - ) -> Dict[str, Any]: - assert False + self, request: Request, location: dict[str, Any] + ) -> dict[str, Any]: + raise AssertionError def location_check( - self, response: Response, request: Request, location: Dict[str, Any] + self, response: Response, request: Request, location: dict[str, Any] ) -> bool: - assert False + raise AssertionError settings = { "RETRY_TIMES": 0, @@ -2799,7 +2799,7 @@ def parse(self, response): class SessionIDRemovingDownloaderMiddleware: def process_exception( self, request: Request, exception: Exception, spider: Spider | None = None - ) -> Union[Request, None]: + ) -> Request | None: if not isinstance(exception, RequestError) or request.meta.get( "_is_session_init_request", False ): @@ -2856,7 +2856,7 @@ def parse(self, response): @pytest.mark.parametrize( ("settings", "meta", "meta_key"), - ( + [ ( {}, {}, @@ -2897,7 +2897,7 @@ def parse(self, response): {"zyte_api_automap": True}, "zyte_api_automap", ), - ), + ], ) @deferred_f_from_coro_f async def test_assign_meta_key(settings, meta, meta_key, mockserver): @@ -2907,7 +2907,7 @@ async def test_assign_meta_key(settings, meta, meta_key, mockserver): class Tracker: def __init__(self): - self.meta: Dict[str, Any] = {} + self.meta: dict[str, Any] = {} def track(self, request: Request, spider: Spider): self.meta = deepcopy(request.meta) @@ -2962,12 +2962,12 @@ def parse(self, response): async def test_provider(mockserver): pytest.importorskip("scrapy_poet") - from scrapy_poet import DummyResponse - from zyte_common_items import Product + from scrapy_poet import DummyResponse # noqa: PLC0415 + from zyte_common_items import Product # noqa: PLC0415 class Tracker: def __init__(self): - self.query: Dict[str, Any] = {} + self.query: dict[str, Any] = {} def track(self, request: Request, spider: Spider): self.query = request.meta["zyte_api"] @@ -3026,7 +3026,7 @@ async def process_request( @pytest.mark.parametrize( ("exception", "stat", "reason"), - ( + [ ( mock_request_error( status=422, response_content=b'{"type": "/problem/session-expired"}' @@ -3059,7 +3059,7 @@ async def process_request( None, None, ), - ), + ], ) @deferred_f_from_coro_f async def test_exceptions(exception, stat, reason, mockserver, caplog): @@ -3111,11 +3111,11 @@ def parse(self, response): @pytest.mark.parametrize( ("meta", "expected"), - ( + [ ({}, False), ({SESSION_INIT_META_KEY: False}, False), ({SESSION_INIT_META_KEY: True}, True), - ), + ], ) def test_is_session_init_request(meta, expected): actual = is_session_init_request(Request("https://example.com", meta=meta))
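
Several hunks above also tighten bare pytest.raises(ValueError) context managers with a match= argument (flake8-pytest-style's PT011, pytest-raises-too-broad): match is a regular expression applied via re.search to the string form of the raised exception, so the test can no longer pass on an unrelated ValueError. A minimal, self-contained sketch of the pattern — the validator below is a hypothetical stand-in, not code from this patch:

    import pytest

    def set_truncate_limit(limit: int) -> None:
        # Hypothetical stand-in for the kind of setting validation tested above.
        if limit < 0:
            raise ValueError(f"truncate limit ({limit}) is invalid, it must be 0 or higher")

    def test_negative_limit_rejected() -> None:
        # Parentheses are regex metacharacters, hence the escaped pattern.
        with pytest.raises(ValueError, match=r"truncate limit \(-1\) is invalid"):
            set_truncate_limit(-1)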