From d785a2d67d0e45e636e014cbc22827668b748c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 12:07:07 +0200 Subject: [PATCH 01/19] SERP (MVP) --- setup.py | 3 +- zyte_spider_templates/spiders/base.py | 4 +- zyte_spider_templates/spiders/serp.py | 135 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 zyte_spider_templates/spiders/serp.py diff --git a/setup.py b/setup.py index 3871341..de219a4 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ "scrapy-poet>=0.21.0", "scrapy-spider-metadata>=0.1.2", "scrapy-zyte-api[provider]>=0.16.0", - "zyte-common-items>=0.13.0", + # "zyte-common-items>=0.13.0", + "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@serp", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index 846b87a..2fc0c5d 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -90,13 +90,13 @@ class BaseSpider(scrapy.Spider): def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) - if spider.args.geolocation: + if geolocation := getattr(spider.args, "geolocation", None): # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object # additional requests. for component in ("AUTOMAP", "PROVIDER"): default_params = spider.settings.getdict(f"ZYTE_API_{component}_PARAMS") - default_params["geolocation"] = spider.args.geolocation + default_params["geolocation"] = geolocation spider.settings.set( f"ZYTE_API_{component}_PARAMS", default_params, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py new file mode 100644 index 0000000..cbaeb34 --- /dev/null +++ b/zyte_spider_templates/spiders/serp.py @@ -0,0 +1,135 @@ +from typing import Any, Dict, Iterable + +import requests +import scrapy +from pydantic import BaseModel, ConfigDict, Field, model_validator +from scrapy import Request +from scrapy.crawler import Crawler +from scrapy_spider_metadata import Args +from w3lib.url import add_or_replace_parameter +from zyte_common_items import Serp + +from zyte_spider_templates.spiders.base import BaseSpider +from zyte_spider_templates.utils import get_domain + +from ..params import MaxRequestsParam, UrlParam, UrlsFileParam, UrlsParam +from ..utils import load_url_list +from .base import _INPUT_FIELDS + + +class SerpMaxPagesParam(BaseModel): + max_pages: int = Field( + title="Pages", + description="Maximum number of result pages to visit per input URL.", + default=1, + ) + + +class SerpSpiderParams( + MaxRequestsParam, + SerpMaxPagesParam, + UrlsFileParam, + UrlsParam, + UrlParam, + BaseModel, +): + model_config = ConfigDict( + json_schema_extra={ + "groups": [ + { + "id": "inputs", + "title": "Inputs", + "description": ( + "Input data that determines the start URLs of the crawl." 
+ ), + "widget": "exclusive", + }, + ], + }, + ) + + @model_validator(mode="after") + def single_input(self): + """Fields + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.url` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + form a mandatory, mutually-exclusive field group: one of them must be + defined, the rest must not be defined.""" + input_fields = set( + field for field in _INPUT_FIELDS if getattr(self, field, None) + ) + if not input_fields: + input_field_list = ", ".join(_INPUT_FIELDS) + raise ValueError( + f"No input parameter defined. Please, define one of: " + f"{input_field_list}." + ) + elif len(input_fields) > 1: + input_field_list = ", ".join( + f"{field} ({getattr(self, field)!r})" for field in input_fields + ) + raise ValueError( + f"Expected a single input parameter, got {len(input_fields)}: " + f"{input_field_list}." + ) + return self + + +class SerpSpider(Args[SerpSpiderParams], BaseSpider): + """Yield results from search engine result pages (SERP). + + See :class:`~zyte_spider_templates.spiders.ecommerce.SerpSpiderParams` + for supported parameters. + + .. seealso:: :ref:`serp`. + """ + + name = "serp" + + metadata: Dict[str, Any] = { + **BaseSpider.metadata, + "title": "SERP", + "description": "Template for spiders that extract search engine results.", + } + + @classmethod + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + spider = super().from_crawler(crawler, *args, **kwargs) + spider._init_input() + return spider + + def _init_input(self): + urls_file = self.args.urls_file + if urls_file: + response = requests.get(urls_file) + urls = load_url_list(response.text) + self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") + self.start_urls = urls + elif self.args.urls: + self.start_urls = self.args.urls + else: + self.start_urls = [self.args.url] + self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) + + def get_start_request(self, url): + return Request( + url=url, + callback=self.parse_serp, + meta={ + "crawling_logs": {"page_type": "serp"}, + "zyte_api": { + "serp": True, + }, + }, + ) + + def start_requests(self) -> Iterable[Request]: + for url in self.start_urls: + for start in range(0, self.args.max_pages * 10, 10): + if start: + url = add_or_replace_parameter(url, "start", str(start)) + yield self.get_start_request(url) + + def parse_serp(self, response) -> Iterable[Serp]: + yield Serp.from_dict(response.raw_api_response["serp"]) From 4c47efc5e06517295097d0b8394b8aaf051d4173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 15:45:17 +0200 Subject: [PATCH 02/19] Fix references and complete the docs --- docs/conf.py | 3 +++ docs/index.rst | 1 + docs/reference/index.rst | 5 +++++ docs/templates/index.rst | 3 +++ docs/templates/serp.rst | 19 +++++++++++++++++++ zyte_spider_templates/__init__.py | 1 + zyte_spider_templates/spiders/serp.py | 8 +++++--- 7 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 docs/templates/serp.rst diff --git a/docs/conf.py b/docs/conf.py index 5a610e3..ac67ce5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,8 +48,11 @@ ), } +autodoc_pydantic_model_show_config_summary = False autodoc_pydantic_model_show_field_summary = False autodoc_pydantic_model_show_json = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False # sphinx-reredirects redirects = { diff --git a/docs/index.rst 
b/docs/index.rst index d344faa..d26b6ca 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce + SERP .. toctree:: :caption: Customization diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 81826cb..14a158e 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -9,6 +9,8 @@ Spiders .. autoclass:: zyte_spider_templates.EcommerceSpider +.. autoclass:: zyte_spider_templates.SerpSpider + Pages ===== @@ -41,3 +43,6 @@ Parameter mixins :exclude-members: model_computed_fields .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam + :exclude-members: model_computed_fields diff --git a/docs/templates/index.rst b/docs/templates/index.rst index c70a7de..058187c 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -29,3 +29,6 @@ Spider template list :ref:`E-commerce ` Get products from an e-commerce website. + +:ref:`SERP ` + Get search engine result pages. diff --git a/docs/templates/serp.rst b/docs/templates/serp.rst new file mode 100644 index 0000000..496926f --- /dev/null +++ b/docs/templates/serp.rst @@ -0,0 +1,19 @@ +.. _serp: + +=============================== +SERP spider template (``serp``) +=============================== + +Basic use +========= + +.. code-block:: shell + + scrapy crawl serp -a url="https://www.google.com/search?q=foo" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpSpiderParams + :inherited-members: BaseModel + :exclude-members: model_computed_fields diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index e3de8c9..6b6d292 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,2 +1,3 @@ from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider +from .spiders.serp import SerpSpider diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index cbaeb34..d159312 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -34,6 +34,8 @@ class SerpSpiderParams( BaseModel, ): model_config = ConfigDict( + # https://github.com/pydantic/pydantic/discussions/7763#discussioncomment-10338857 + protected_namespaces=(), json_schema_extra={ "groups": [ { @@ -51,9 +53,9 @@ class SerpSpiderParams( @model_validator(mode="after") def single_input(self): """Fields - :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.url` + :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.url` and - :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.urls_file` form a mandatory, mutually-exclusive field group: one of them must be defined, the rest must not be defined.""" input_fields = set( @@ -79,7 +81,7 @@ def single_input(self): class SerpSpider(Args[SerpSpiderParams], BaseSpider): """Yield results from search engine result pages (SERP). - See :class:`~zyte_spider_templates.spiders.ecommerce.SerpSpiderParams` + See :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams` for supported parameters. .. seealso:: :ref:`serp`. 
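
At this point in the series the new ``serp`` template can also be driven programmatically rather than through ``scrapy crawl``. A minimal sketch, assuming a Scrapy project already configured for zyte-spider-templates (scrapy-poet, scrapy-zyte-api and a Zyte API key); it is illustrative only and not part of the patches:

.. code-block:: python

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from zyte_spider_templates import SerpSpider

    # Load the project settings (Zyte API key, scrapy-poet and
    # scrapy-zyte-api configuration) and run the spider with the same
    # parameters as the CLI example in docs/templates/serp.rst, plus
    # max_pages to cover the first three result pages.
    process = CrawlerProcess(get_project_settings())
    process.crawl(
        SerpSpider,
        url="https://www.google.com/search?q=foo",
        max_pages=3,
    )
    process.start()

The keyword arguments passed to ``crawl()`` are the same spider arguments that ``scrapy crawl serp -a url=... -a max_pages=...`` accepts.
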
From d10d75e2c3e1f6309985a2577f1a7a8919b3cc0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 28 Aug 2024 20:56:31 +0200 Subject: [PATCH 03/19] Customize UI strings for SERP and add tests --- tests/__init__.py | 4 + tests/test_ecommerce.py | 65 +----- tests/test_params.py | 51 +++++ tests/test_serp.py | 220 +++++++++++++++++++++ tests/utils.py | 16 ++ zyte_spider_templates/params.py | 134 ++++++++----- zyte_spider_templates/spiders/ecommerce.py | 18 +- zyte_spider_templates/spiders/serp.py | 69 ++++--- 8 files changed, 419 insertions(+), 158 deletions(-) create mode 100644 tests/test_params.py create mode 100644 tests/test_serp.py create mode 100644 tests/utils.py diff --git a/tests/__init__.py b/tests/__init__.py index 5e99e9c..2aa5953 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,11 @@ from typing import Any, Dict, Optional +import pytest from scrapy.utils.test import TestSpider +# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting +pytest.register_assert_rewrite("tests.utils") + # scrapy.utils.test.get_crawler alternative that does not freeze settings. def get_crawler(*, settings: Optional[Dict[str, Any]] = None): diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 7a8a9c7..987d93f 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -1,6 +1,4 @@ -import json import logging -import re from unittest.mock import MagicMock, call, patch import pytest @@ -11,7 +9,6 @@ from scrapy_spider_metadata import get_spider_metadata from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request -from zyte_spider_templates import BaseSpiderParams from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS, GEOLOCATION_OPTIONS_WITH_CODE, @@ -24,6 +21,7 @@ from . import get_crawler from .test_utils import URL_TO_DOMAIN +from .utils import assertEqualJson def test_parameters(): @@ -362,21 +360,6 @@ def test_arguments(): assert spider.allowed_domains == ["example.com"] -def assertEqualJson(actual, expected): - """Compare the JSON representation of 2 Python objects. - - This allows to take into account things like the order of key-value pairs - in dictionaries, which would not be taken into account when comparing - dictionaries directly. - - It also generates a better diff in pytest output when enums are involved, - e.g. geolocation values. 
- """ - actual_json = json.dumps(actual, indent=2) - expected_json = json.dumps(expected, indent=2) - assert actual_json == expected_json - - def test_metadata(): actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True) expected_metadata = { @@ -558,52 +541,6 @@ def test_metadata(): assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) -@pytest.mark.parametrize( - "valid,url", - [ - (False, ""), - (False, "http://"), - (False, "http:/example.com"), - (False, "ftp://example.com"), - (False, "example.com"), - (False, "//example.com"), - (False, "http://foo:bar@example.com"), - (False, " http://example.com"), - (False, "http://example.com "), - (False, "http://examp le.com"), - (False, "https://example.com:232323"), - (True, "http://example.com"), - (True, "http://bücher.example"), - (True, "http://xn--bcher-kva.example"), - (True, "https://i❤.ws"), - (True, "https://example.com"), - (True, "https://example.com/"), - (True, "https://example.com:2323"), - (True, "https://example.com:2323/"), - (True, "https://example.com:2323/foo"), - (True, "https://example.com/f"), - (True, "https://example.com/foo"), - (True, "https://example.com/foo/"), - (True, "https://example.com/foo/bar"), - (True, "https://example.com/foo/bar/"), - (True, "https://example.com/foo/bar?baz"), - (True, "https://example.com/foo/bar/?baz"), - (True, "https://example.com?foo"), - (True, "https://example.com?foo=bar"), - (True, "https://example.com/?foo=bar&baz"), - (True, "https://example.com/?foo=bar&baz#"), - (True, "https://example.com/?foo=bar&baz#frag"), - (True, "https://example.com#"), - (True, "https://example.com/#"), - (True, "https://example.com/&"), - (True, "https://example.com/&#"), - ], -) -def test_validation_url(url, valid): - url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern - assert bool(re.match(url_re, url)) == valid - - def test_get_parse_product_request(): base_kwargs = { "url": "https://example.com", diff --git a/tests/test_params.py b/tests/test_params.py new file mode 100644 index 0000000..df08a19 --- /dev/null +++ b/tests/test_params.py @@ -0,0 +1,51 @@ +import re + +import pytest + +from zyte_spider_templates.params import URL_FIELD_KWARGS + + +@pytest.mark.parametrize( + "valid,url", + [ + (False, ""), + (False, "http://"), + (False, "http:/example.com"), + (False, "ftp://example.com"), + (False, "example.com"), + (False, "//example.com"), + (False, "http://foo:bar@example.com"), + (False, " http://example.com"), + (False, "http://example.com "), + (False, "http://examp le.com"), + (False, "https://example.com:232323"), + (True, "http://example.com"), + (True, "http://bücher.example"), + (True, "http://xn--bcher-kva.example"), + (True, "https://i❤.ws"), + (True, "https://example.com"), + (True, "https://example.com/"), + (True, "https://example.com:2323"), + (True, "https://example.com:2323/"), + (True, "https://example.com:2323/foo"), + (True, "https://example.com/f"), + (True, "https://example.com/foo"), + (True, "https://example.com/foo/"), + (True, "https://example.com/foo/bar"), + (True, "https://example.com/foo/bar/"), + (True, "https://example.com/foo/bar?baz"), + (True, "https://example.com/foo/bar/?baz"), + (True, "https://example.com?foo"), + (True, "https://example.com?foo=bar"), + (True, "https://example.com/?foo=bar&baz"), + (True, "https://example.com/?foo=bar&baz#"), + (True, "https://example.com/?foo=bar&baz#frag"), + (True, "https://example.com#"), + (True, "https://example.com/#"), + (True, "https://example.com/&"), + (True, 
"https://example.com/&#"), + ], +) +def test_url_pattern(url, valid): + assert isinstance(URL_FIELD_KWARGS["pattern"], str) + assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid diff --git a/tests/test_serp.py b/tests/test_serp.py new file mode 100644 index 0000000..beadc3d --- /dev/null +++ b/tests/test_serp.py @@ -0,0 +1,220 @@ +from unittest.mock import patch + +import pytest +import requests +from pydantic import ValidationError +from scrapy_spider_metadata import get_spider_metadata + +from zyte_spider_templates.spiders.serp import SerpSpider + +from . import get_crawler +from .test_utils import URL_TO_DOMAIN +from .utils import assertEqualJson + + +def test_parameters(): + with pytest.raises(ValidationError): + SerpSpider() + + SerpSpider(url="https://google.com/search?q=foo+bar") + SerpSpider(url="https://google.com/search?q=foo+bar", max_pages=10) + + with pytest.raises(ValidationError): + SerpSpider(url="https://google.com/search?q=foo+bar", max_pages="all") + + +def test_start_requests(): + url = "https://google.com/search?q=foo+bar" + crawler = get_crawler() + spider = SerpSpider.from_crawler(crawler, url=url) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == url + assert requests[0].callback == spider.parse_serp + + +def test_metadata(): + actual_metadata = get_spider_metadata(SerpSpider, normalize=True) + expected_metadata = { + "template": True, + "title": "SERP", + "description": "Template for spiders that extract Google search results.", + "param_schema": { + "groups": [ + { + "description": ( + "Input data that determines the start URLs of the crawl." + ), + "id": "inputs", + "title": "Inputs", + "widget": "exclusive", + }, + ], + "properties": { + "url": { + "default": "", + "description": ( + "Initial URL for the crawl. Enter the full URL including http(s), " + "you can copy and paste it from your browser. Example: https://google.com/search?q=foo+bar" + ), + "exclusiveRequired": True, + "group": "inputs", + "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", + "title": "URL", + "type": "string", + }, + "urls": { + "anyOf": [ + {"items": {"type": "string"}, "type": "array"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Initial URLs for the crawl, separated by new lines. Enter the " + "full URL including http(s), you can copy and paste it from your " + "browser. Example: https://google.com/search?q=foo+bar" + ), + "exclusiveRequired": True, + "group": "inputs", + "title": "URLs", + "widget": "textarea", + }, + "urls_file": { + "default": "", + "description": ( + "URL that point to a plain-text file with a list of " + "URLs to crawl, e.g. " + "https://example.com/url-list.txt. The linked list " + "must contain 1 URL per line." + ), + "exclusiveRequired": True, + "group": "inputs", + "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", + "title": "URLs file", + "type": "string", + }, + "max_pages": { + "default": 1, + "description": "Maximum number of result pages to visit per input URL.", + "title": "Pages", + "type": "integer", + }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." 
+ ), + "title": "Max Requests", + "widget": "request-limit", + }, + }, + "title": "SerpSpiderParams", + "type": "object", + }, + } + assertEqualJson(actual_metadata, expected_metadata) + + +@pytest.mark.parametrize("url,allowed_domain", URL_TO_DOMAIN) +def test_set_allowed_domains(url, allowed_domain): + crawler = get_crawler() + + kwargs = {"url": url} + spider = SerpSpider.from_crawler(crawler, **kwargs) + assert spider.allowed_domains == [allowed_domain] + + +def test_input_none(): + crawler = get_crawler() + with pytest.raises(ValueError): + SerpSpider.from_crawler(crawler) + + +def test_input_multiple(): + crawler = get_crawler() + with pytest.raises(ValueError): + SerpSpider.from_crawler( + crawler, + url="https://google.com/search?q=a", + urls=["https://google.com/search?q=b"], + ) + with pytest.raises(ValueError): + SerpSpider.from_crawler( + crawler, + url="https://google.com/search?q=a", + urls_file="https://example.com/input-urls.txt", + ) + with pytest.raises(ValueError): + SerpSpider.from_crawler( + crawler, + urls=["https://google.com/search?q=b"], + urls_file="https://example.com/input-urls.txt", + ) + + +def test_url_invalid(): + crawler = get_crawler() + with pytest.raises(ValueError): + SerpSpider.from_crawler(crawler, url="foo") + + +def test_urls(caplog): + crawler = get_crawler() + url = "https://google.com/search?q=foo+bar" + + spider = SerpSpider.from_crawler(crawler, urls=[url]) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_serp + + spider = SerpSpider.from_crawler(crawler, urls=url) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_serp + + caplog.clear() + spider = SerpSpider.from_crawler( + crawler, + urls="https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\nfoo\n\n", + ) + assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text + start_requests = list(spider.start_requests()) + assert len(start_requests) == 3 + assert all(request.callback == spider.parse_serp for request in start_requests) + assert start_requests[0].url == "https://google.com/search?q=a" + assert start_requests[1].url == "https://google.com/search?q=b" + assert start_requests[2].url == "https://google.com/search?q=c" + + caplog.clear() + with pytest.raises(ValueError): + spider = SerpSpider.from_crawler( + crawler, + urls="foo\nbar", + ) + assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text + assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text + + +def test_urls_file(): + crawler = get_crawler() + url = "https://example.com/input-urls.txt" + + with patch("zyte_spider_templates.spiders.serp.requests.get") as mock_get: + response = requests.Response() + response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" + mock_get.return_value = response + spider = SerpSpider.from_crawler(crawler, urls_file=url) + mock_get.assert_called_with(url) + + start_requests = list(spider.start_requests()) + assert len(start_requests) == 3 + assert start_requests[0].url == "https://google.com/search?q=a" + assert start_requests[1].url == "https://google.com/search?q=b" + assert start_requests[2].url == "https://google.com/search?q=c" diff --git a/tests/utils.py b/tests/utils.py 
new file mode 100644 index 0000000..2fd7261 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,16 @@ +import json + + +def assertEqualJson(actual, expected): + """Compare the JSON representation of 2 Python objects. + + This allows to take into account things like the order of key-value pairs + in dictionaries, which would not be taken into account when comparing + dictionaries directly. + + It also generates a better diff in pytest output when enums are involved, + e.g. geolocation values. + """ + actual_json = json.dumps(actual, indent=2) + expected_json = json.dumps(expected, indent=2) + assert actual_json == expected_json diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index d9245a8..be87ea8 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -4,6 +4,7 @@ from logging import getLogger from typing import Dict, List, Optional, Union +import requests from pydantic import BaseModel, ConfigDict, Field, field_validator from zyte_spider_templates._geolocations import ( @@ -12,7 +13,7 @@ ) from zyte_spider_templates.documentation import document_enum -from .utils import _URL_PATTERN +from .utils import _URL_PATTERN, get_domain, load_url_list logger = getLogger(__name__) @@ -100,66 +101,93 @@ class UrlsFileParam(BaseModel): ) +def parse_input_params(spider): + urls_file = spider.args.urls_file + if urls_file: + response = requests.get(urls_file) + urls = load_url_list(response.text) + spider.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") + spider.start_urls = urls + elif spider.args.urls: + spider.start_urls = spider.args.urls + else: + spider.start_urls = [spider.args.url] + spider.allowed_domains = list(set(get_domain(url) for url in spider.start_urls)) + + +URL_FIELD_KWARGS = { + "title": "URL", + "description": ( + "Initial URL for the crawl. Enter the full URL including http(s), " + "you can copy and paste it from your browser. Example: " + "https://toscrape.com/" + ), + "pattern": _URL_PATTERN, + "default": "", + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + }, +} + + class UrlParam(BaseModel): - url: str = Field( - title="URL", - description="Initial URL for the crawl. Enter the full URL including http(s), " - "you can copy and paste it from your browser. Example: https://toscrape.com/", - pattern=_URL_PATTERN, - default="", - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - }, - ) + url: str = Field(**URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + + +URLS_FIELD_KWARGS = { + "title": "URLs", + "description": ( + "Initial URLs for the crawl, separated by new lines. Enter the " + "full URL including http(s), you can copy and paste it from your " + "browser. Example: https://toscrape.com/" + ), + "default": None, + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + "widget": "textarea", + }, +} + + +def validate_url_list(value: Union[List[str], str]) -> List[str]: + """Validate a list of URLs. + + If a string is received as input, it is split into multiple strings + on new lines. + + List items that do not match a URL pattern trigger a warning and are + removed from the list. If all URLs are invalid, validation fails. + """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + v = v.strip() + if not v: + continue + if not re.search(_URL_PATTERN, v): + logger.warning( + f"{v!r}, from the 'urls' spider argument, is not a " + f"valid URL and will be ignored." 
+ ) + continue + result.append(v) + if not result: + raise ValueError(f"No valid URL found in {value!r}") + return result class UrlsParam(BaseModel): - urls: Optional[List[str]] = Field( - title="URLs", - description=( - "Initial URLs for the crawl, separated by new lines. Enter the " - "full URL including http(s), you can copy and paste it from your " - "browser. Example: https://toscrape.com/" - ), - default=None, - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - "widget": "textarea", - }, - ) + urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] @field_validator("urls", mode="before") @classmethod def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: - """Validate a list of URLs. - - If a string is received as input, it is split into multiple strings - on new lines. - - List items that do not match a URL pattern trigger a warning and are - removed from the list. If all URLs are invalid, validation fails. - """ - if isinstance(value, str): - value = value.split("\n") - if not value: - return value - result = [] - for v in value: - v = v.strip() - if not v: - continue - if not re.search(_URL_PATTERN, v): - logger.warning( - f"{v!r}, from the 'urls' spider argument, is not a " - f"valid URL and will be ignored." - ) - continue - result.append(v) - if not result: - raise ValueError(f"No valid URL found in {value!r}") - return result + return validate_url_list(value) class PostalAddress(BaseModel): diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index bfcb672..025e174 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -1,7 +1,6 @@ from enum import Enum from typing import Any, Callable, Dict, Iterable, Optional, Union -import requests import scrapy from pydantic import BaseModel, Field from scrapy import Request @@ -11,6 +10,7 @@ from zyte_common_items import ProbabilityRequest, Product, ProductNavigation from zyte_spider_templates.heuristics import is_homepage +from zyte_spider_templates.params import parse_input_params from zyte_spider_templates.spiders.base import ( ARG_SETTING_PRIORITY, BaseSpider, @@ -19,7 +19,6 @@ from zyte_spider_templates.utils import get_domain from ..documentation import document_enum -from ..utils import load_url_list @document_enum @@ -126,23 +125,10 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs) - spider._init_input() + parse_input_params(spider) spider._init_extract_from() return spider - def _init_input(self): - urls_file = self.args.urls_file - if urls_file: - response = requests.get(urls_file) - urls = load_url_list(response.text) - self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") - self.start_urls = urls - elif self.args.urls: - self.start_urls = self.args.urls - else: - self.start_urls = [self.args.url] - self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) - def _init_extract_from(self): if self.args.extract_from is not None: self.settings.set( diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index d159312..45fb9c7 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,19 +1,24 @@ -from typing import Any, Dict, Iterable +from copy import deepcopy +from typing 
import Any, Dict, Iterable, List, Optional, Union -import requests import scrapy -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from scrapy import Request from scrapy.crawler import Crawler from scrapy_spider_metadata import Args from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp +from zyte_spider_templates.params import parse_input_params from zyte_spider_templates.spiders.base import BaseSpider -from zyte_spider_templates.utils import get_domain -from ..params import MaxRequestsParam, UrlParam, UrlsFileParam, UrlsParam -from ..utils import load_url_list +from ..params import ( + URL_FIELD_KWARGS, + URLS_FIELD_KWARGS, + MaxRequestsParam, + UrlsFileParam, + validate_url_list, +) from .base import _INPUT_FIELDS @@ -25,12 +30,39 @@ class SerpMaxPagesParam(BaseModel): ) +SERP_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) +assert isinstance(SERP_URL_FIELD_KWARGS["description"], str) +SERP_URL_FIELD_KWARGS["description"] = SERP_URL_FIELD_KWARGS["description"].replace( + "https://toscrape.com/", "https://google.com/search?q=foo+bar" +) + + +class SerpUrlParam(BaseModel): + url: str = Field(**SERP_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + + +SERP_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) +assert isinstance(SERP_URLS_FIELD_KWARGS["description"], str) +SERP_URLS_FIELD_KWARGS["description"] = SERP_URLS_FIELD_KWARGS["description"].replace( + "https://toscrape.com/", "https://google.com/search?q=foo+bar" +) + + +class SerpUrlsParam(BaseModel): + urls: Optional[List[str]] = Field(**SERP_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] + + @field_validator("urls", mode="before") + @classmethod + def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: + return validate_url_list(value) + + class SerpSpiderParams( MaxRequestsParam, SerpMaxPagesParam, UrlsFileParam, - UrlsParam, - UrlParam, + SerpUrlsParam, + SerpUrlParam, BaseModel, ): model_config = ConfigDict( @@ -53,9 +85,9 @@ class SerpSpiderParams( @model_validator(mode="after") def single_input(self): """Fields - :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.url` + :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams.url` and - :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.urls_file` + :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams.urls_file` form a mandatory, mutually-exclusive field group: one of them must be defined, the rest must not be defined.""" input_fields = set( @@ -92,28 +124,15 @@ class SerpSpider(Args[SerpSpiderParams], BaseSpider): metadata: Dict[str, Any] = { **BaseSpider.metadata, "title": "SERP", - "description": "Template for spiders that extract search engine results.", + "description": "Template for spiders that extract Google search results.", } @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) - spider._init_input() + parse_input_params(spider) return spider - def _init_input(self): - urls_file = self.args.urls_file - if urls_file: - response = requests.get(urls_file) - urls = load_url_list(response.text) - self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") - self.start_urls = urls - elif self.args.urls: - self.start_urls = self.args.urls - else: - self.start_urls = [self.args.url] - self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) - def get_start_request(self, 
url): return Request( url=url, From ff97f077c54fa402a0afb2ab6ca1757322197007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 28 Aug 2024 21:02:16 +0200 Subject: [PATCH 04/19] Fix requests mocking --- tests/test_ecommerce.py | 2 +- tests/test_serp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 987d93f..c59cb18 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -755,7 +755,7 @@ def test_urls_file(): crawler = get_crawler() url = "https://example.com" - with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get: + with patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() response._content = ( b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" diff --git a/tests/test_serp.py b/tests/test_serp.py index beadc3d..fbe7f02 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -206,7 +206,7 @@ def test_urls_file(): crawler = get_crawler() url = "https://example.com/input-urls.txt" - with patch("zyte_spider_templates.spiders.serp.requests.get") as mock_get: + with patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" mock_get.return_value = response From 3a44330e95de397962a63cb26d68ea24523ffdce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 29 Aug 2024 14:03:28 +0200 Subject: [PATCH 05/19] Enable the aggressive retry policy by default for the SERP spider --- zyte_spider_templates/spiders/serp.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 45fb9c7..1dc0c33 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from scrapy import Request from scrapy.crawler import Crawler +from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings from scrapy_spider_metadata import Args from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp @@ -127,6 +128,20 @@ class SerpSpider(Args[SerpSpiderParams], BaseSpider): "description": "Template for spiders that extract Google search results.", } + @classmethod + def update_settings(cls, settings: BaseSettings) -> None: + super().update_settings(settings) + retry_policy_setting_priority = settings.getpriority("ZYTE_API_RETRY_POLICY") + if ( + retry_policy_setting_priority is None + or retry_policy_setting_priority < SETTINGS_PRIORITIES["spider"] + ): + settings.set( + "ZYTE_API_RETRY_POLICY", + "zyte_api.aggressive_retrying", + priority="spider", + ) + @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) From 1bc4a29357059662136286582bb9ea8f2c5d59c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Sep 2024 19:57:16 +0200 Subject: [PATCH 06/19] Make the SERP spider more Google-specific, in line with the current actual implementation --- docs/index.rst | 2 +- docs/reference/index.rst | 2 +- docs/templates/google-search.rst | 19 +++++++++++++ docs/templates/index.rst | 4 +-- docs/templates/serp.rst | 19 ------------- tests/test_serp.py | 40 +++++++++++++-------------- 
zyte_spider_templates/__init__.py | 2 +- zyte_spider_templates/spiders/serp.py | 14 +++++----- 8 files changed, 51 insertions(+), 51 deletions(-) create mode 100644 docs/templates/google-search.rst delete mode 100644 docs/templates/serp.rst diff --git a/docs/index.rst b/docs/index.rst index d26b6ca..1083299 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,7 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce - SERP + Google search .. toctree:: :caption: Customization diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 14a158e..dd368dd 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -9,7 +9,7 @@ Spiders .. autoclass:: zyte_spider_templates.EcommerceSpider -.. autoclass:: zyte_spider_templates.SerpSpider +.. autoclass:: zyte_spider_templates.GoogleSearchSpider Pages diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst new file mode 100644 index 0000000..2bf9a6b --- /dev/null +++ b/docs/templates/google-search.rst @@ -0,0 +1,19 @@ +.. _google-search: + +================================================= +Google search spider template (``google_search``) +================================================= + +Basic use +========= + +.. code-block:: shell + + scrapy crawl google_search -a url="https://www.google.com/search?q=foo" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams + :inherited-members: BaseModel + :exclude-members: model_computed_fields diff --git a/docs/templates/index.rst b/docs/templates/index.rst index 058187c..ea86c6d 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -30,5 +30,5 @@ Spider template list :ref:`E-commerce ` Get products from an e-commerce website. -:ref:`SERP ` - Get search engine result pages. +:ref:`Google Search ` + Get Google search results. diff --git a/docs/templates/serp.rst b/docs/templates/serp.rst deleted file mode 100644 index 496926f..0000000 --- a/docs/templates/serp.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. _serp: - -=============================== -SERP spider template (``serp``) -=============================== - -Basic use -========= - -.. code-block:: shell - - scrapy crawl serp -a url="https://www.google.com/search?q=foo" - -Parameters -========== - -.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpSpiderParams - :inherited-members: BaseModel - :exclude-members: model_computed_fields diff --git a/tests/test_serp.py b/tests/test_serp.py index fbe7f02..09d6e08 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -5,7 +5,7 @@ from pydantic import ValidationError from scrapy_spider_metadata import get_spider_metadata -from zyte_spider_templates.spiders.serp import SerpSpider +from zyte_spider_templates.spiders.serp import GoogleSearchSpider from . 
import get_crawler from .test_utils import URL_TO_DOMAIN @@ -14,19 +14,19 @@ def test_parameters(): with pytest.raises(ValidationError): - SerpSpider() + GoogleSearchSpider() - SerpSpider(url="https://google.com/search?q=foo+bar") - SerpSpider(url="https://google.com/search?q=foo+bar", max_pages=10) + GoogleSearchSpider(url="https://google.com/search?q=foo+bar") + GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages=10) with pytest.raises(ValidationError): - SerpSpider(url="https://google.com/search?q=foo+bar", max_pages="all") + GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages="all") def test_start_requests(): url = "https://google.com/search?q=foo+bar" crawler = get_crawler() - spider = SerpSpider.from_crawler(crawler, url=url) + spider = GoogleSearchSpider.from_crawler(crawler, url=url) requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == url @@ -34,10 +34,10 @@ def test_start_requests(): def test_metadata(): - actual_metadata = get_spider_metadata(SerpSpider, normalize=True) + actual_metadata = get_spider_metadata(GoogleSearchSpider, normalize=True) expected_metadata = { "template": True, - "title": "SERP", + "title": "Google Search Results", "description": "Template for spiders that extract Google search results.", "param_schema": { "groups": [ @@ -113,7 +113,7 @@ def test_metadata(): "widget": "request-limit", }, }, - "title": "SerpSpiderParams", + "title": "GoogleSearchSpiderParams", "type": "object", }, } @@ -125,32 +125,32 @@ def test_set_allowed_domains(url, allowed_domain): crawler = get_crawler() kwargs = {"url": url} - spider = SerpSpider.from_crawler(crawler, **kwargs) + spider = GoogleSearchSpider.from_crawler(crawler, **kwargs) assert spider.allowed_domains == [allowed_domain] def test_input_none(): crawler = get_crawler() with pytest.raises(ValueError): - SerpSpider.from_crawler(crawler) + GoogleSearchSpider.from_crawler(crawler) def test_input_multiple(): crawler = get_crawler() with pytest.raises(ValueError): - SerpSpider.from_crawler( + GoogleSearchSpider.from_crawler( crawler, url="https://google.com/search?q=a", urls=["https://google.com/search?q=b"], ) with pytest.raises(ValueError): - SerpSpider.from_crawler( + GoogleSearchSpider.from_crawler( crawler, url="https://google.com/search?q=a", urls_file="https://example.com/input-urls.txt", ) with pytest.raises(ValueError): - SerpSpider.from_crawler( + GoogleSearchSpider.from_crawler( crawler, urls=["https://google.com/search?q=b"], urls_file="https://example.com/input-urls.txt", @@ -160,27 +160,27 @@ def test_input_multiple(): def test_url_invalid(): crawler = get_crawler() with pytest.raises(ValueError): - SerpSpider.from_crawler(crawler, url="foo") + GoogleSearchSpider.from_crawler(crawler, url="foo") def test_urls(caplog): crawler = get_crawler() url = "https://google.com/search?q=foo+bar" - spider = SerpSpider.from_crawler(crawler, urls=[url]) + spider = GoogleSearchSpider.from_crawler(crawler, urls=[url]) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_serp - spider = SerpSpider.from_crawler(crawler, urls=url) + spider = GoogleSearchSpider.from_crawler(crawler, urls=url) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_serp caplog.clear() - spider = SerpSpider.from_crawler( + spider = 
GoogleSearchSpider.from_crawler( crawler, urls="https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\nfoo\n\n", ) @@ -194,7 +194,7 @@ def test_urls(caplog): caplog.clear() with pytest.raises(ValueError): - spider = SerpSpider.from_crawler( + spider = GoogleSearchSpider.from_crawler( crawler, urls="foo\nbar", ) @@ -210,7 +210,7 @@ def test_urls_file(): response = requests.Response() response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" mock_get.return_value = response - spider = SerpSpider.from_crawler(crawler, urls_file=url) + spider = GoogleSearchSpider.from_crawler(crawler, urls_file=url) mock_get.assert_called_with(url) start_requests = list(spider.start_requests()) diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index 6b6d292..75bfbde 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,3 +1,3 @@ from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider -from .spiders.serp import SerpSpider +from .spiders.serp import GoogleSearchSpider diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 9d093ea..857e6d5 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -57,7 +57,7 @@ def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: return validate_url_list(value) -class SerpSpiderParams( +class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, UrlsFileParam, @@ -76,20 +76,20 @@ class SerpSpiderParams( ) -class SerpSpider(Args[SerpSpiderParams], BaseSpider): - """Yield results from search engine result pages (SERP). +class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): + """Yield results from Google searches. - See :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams` + See :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams` for supported parameters. - .. seealso:: :ref:`serp`. + .. seealso:: :ref:`google-search`. 
""" - name = "serp" + name = "google_search" metadata: Dict[str, Any] = { **BaseSpider.metadata, - "title": "SERP", + "title": "Google Search Results", "description": "Template for spiders that extract Google search results.", } From 8f3ab3eb37275c5c6e65320f50184b534c4a0cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Sep 2024 20:28:49 +0200 Subject: [PATCH 07/19] Add a mandatory search keywords field, and set a default input URL --- zyte_spider_templates/spiders/serp.py | 61 ++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 857e6d5..bb46afb 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,5 +1,6 @@ from copy import deepcopy from typing import Any, Dict, Iterable, List, Optional, Union +from urllib.parse import urlparse, urlunparse import scrapy from pydantic import BaseModel, ConfigDict, Field, field_validator @@ -22,6 +23,35 @@ from .base import INPUT_GROUP, BaseSpider +class SearchKeywordsParam(BaseModel): + search_keywords: Optional[List[str]] = Field( + title="Search Keywords", + description=("Search keywords to use on the specified input Google URLs."), + default=None, + json_schema_extra={ + "widget": "textarea", + }, + ) + + @field_validator("search_keywords", mode="before") + @classmethod + def validate_search_keywords(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search keywords. + If a string is received as input, it is split into multiple strings + on new lines. + """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + if not (v := v.strip()): + continue + result.append(v) + return result + + class SerpMaxPagesParam(BaseModel): max_pages: int = Field( title="Pages", @@ -30,15 +60,16 @@ class SerpMaxPagesParam(BaseModel): ) -SERP_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) -assert isinstance(SERP_URL_FIELD_KWARGS["description"], str) -SERP_URL_FIELD_KWARGS["description"] = SERP_URL_FIELD_KWARGS["description"].replace( +GOOGLE_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) +assert isinstance(GOOGLE_URL_FIELD_KWARGS["description"], str) +GOOGLE_URL_FIELD_KWARGS["default"] = "https://www.google.com/" +GOOGLE_URL_FIELD_KWARGS["description"] = GOOGLE_URL_FIELD_KWARGS["description"].replace( "https://toscrape.com/", "https://google.com/search?q=foo+bar" ) -class SerpUrlParam(BaseModel): - url: str = Field(**SERP_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] +class GoogleUrlParam(BaseModel): + url: str = Field(**GOOGLE_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] SERP_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) @@ -60,9 +91,10 @@ def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, + SearchKeywordsParam, UrlsFileParam, SerpUrlsParam, - SerpUrlParam, + GoogleUrlParam, BaseModel, ): model_config = ConfigDict( @@ -126,11 +158,20 @@ def get_start_request(self, url): ) def start_requests(self) -> Iterable[Request]: + search_keywords = self.args.search_keywords + if not search_keywords: + raise ValueError("No search keywords specified.") + for url in self.start_urls: - for start in range(0, self.args.max_pages * 10, 10): - if start: - url = add_or_replace_parameter(url, "start", str(start)) - yield self.get_start_request(url) + url = urlunparse(urlparse(url)._replace(path="/search")) + for 
search_keyword in search_keywords: + search_url = add_or_replace_parameter(url, "q", search_keyword) + for start in range(0, self.args.max_pages * 10, 10): + if start: + search_url = add_or_replace_parameter( + search_url, "start", str(start) + ) + yield self.get_start_request(search_url) def parse_serp(self, response) -> Iterable[Serp]: yield Serp.from_dict(response.raw_api_response["serp"]) From b0786e6a8b443b3bd9fce63ea588131090ab00f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Sep 2024 21:20:19 +0200 Subject: [PATCH 08/19] Improve the SERP implementation, get all tests to pass --- docs/templates/google-search.rst | 2 +- tests/test_serp.py | 88 +++++++++++++++++---------- zyte_spider_templates/params.py | 31 +++++----- zyte_spider_templates/spiders/serp.py | 68 +++++++++++++++------ 4 files changed, 126 insertions(+), 63 deletions(-) diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst index 2bf9a6b..e8a9053 100644 --- a/docs/templates/google-search.rst +++ b/docs/templates/google-search.rst @@ -9,7 +9,7 @@ Basic use .. code-block:: shell - scrapy crawl google_search -a url="https://www.google.com/search?q=foo" + scrapy crawl google_search -a search_keywords="foo bar" Parameters ========== diff --git a/tests/test_serp.py b/tests/test_serp.py index 09d6e08..177ff53 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -16,20 +16,25 @@ def test_parameters(): with pytest.raises(ValidationError): GoogleSearchSpider() - GoogleSearchSpider(url="https://google.com/search?q=foo+bar") - GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages=10) + with pytest.raises(ValidationError): + GoogleSearchSpider(url="https://www.google.com/") + + GoogleSearchSpider(search_keywords="foo bar") + GoogleSearchSpider(url="https://www.google.cat/", search_keywords="foo bar") + GoogleSearchSpider( + url="https://www.google.cat/", search_keywords="foo bar", max_pages=10 + ) with pytest.raises(ValidationError): - GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages="all") + GoogleSearchSpider(search_keywords="foo bar", max_pages="all") def test_start_requests(): - url = "https://google.com/search?q=foo+bar" crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, url=url) + spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar") requests = list(spider.start_requests()) assert len(requests) == 1 - assert requests[0].url == url + assert requests[0].url == "https://www.google.com/search?q=foo+bar" assert requests[0].callback == spider.parse_serp @@ -52,10 +57,9 @@ def test_metadata(): ], "properties": { "url": { - "default": "", + "default": "https://www.google.com/", "description": ( - "Initial URL for the crawl. Enter the full URL including http(s), " - "you can copy and paste it from your browser. Example: https://google.com/search?q=foo+bar" + "Target Google URL. Defaults to https://www.google.com/." ), "exclusiveRequired": True, "group": "inputs", @@ -70,9 +74,7 @@ def test_metadata(): ], "default": None, "description": ( - "Initial URLs for the crawl, separated by new lines. Enter the " - "full URL including http(s), you can copy and paste it from your " - "browser. Example: https://google.com/search?q=foo+bar" + "Target Google URLs. Defaults to https://www.google.com/." ), "exclusiveRequired": True, "group": "inputs", @@ -83,9 +85,10 @@ def test_metadata(): "default": "", "description": ( "URL that point to a plain-text file with a list of " - "URLs to crawl, e.g. 
" + "target Google URLs, e.g. " "https://example.com/url-list.txt. The linked list " - "must contain 1 URL per line." + "must contain 1 Google URL (e.g. " + "https://www.google.com/) per line." ), "exclusiveRequired": True, "group": "inputs", @@ -93,6 +96,15 @@ def test_metadata(): "title": "URLs file", "type": "string", }, + "search_keywords": { + "anyOf": [ + {"items": {"type": "string"}, "type": "array"}, + {"type": "null"}, + ], + "description": "Search keywords to use on the specified input Google URLs.", + "title": "Search Keywords", + "widget": "textarea", + }, "max_pages": { "default": 1, "description": "Maximum number of result pages to visit per input URL.", @@ -113,6 +125,7 @@ def test_metadata(): "widget": "request-limit", }, }, + "required": ["search_keywords"], "title": "GoogleSearchSpiderParams", "type": "object", }, @@ -125,7 +138,9 @@ def test_set_allowed_domains(url, allowed_domain): crawler = get_crawler() kwargs = {"url": url} - spider = GoogleSearchSpider.from_crawler(crawler, **kwargs) + spider = GoogleSearchSpider.from_crawler( + crawler, **kwargs, search_keywords="foo bar" + ) assert spider.allowed_domains == [allowed_domain] @@ -140,20 +155,23 @@ def test_input_multiple(): with pytest.raises(ValueError): GoogleSearchSpider.from_crawler( crawler, - url="https://google.com/search?q=a", - urls=["https://google.com/search?q=b"], + url="https://www.google.com/search?q=a", + urls=["https://www.google.com/search?q=b"], + search_keywords="foo bar", ) with pytest.raises(ValueError): GoogleSearchSpider.from_crawler( crawler, - url="https://google.com/search?q=a", + url="https://www.google.com/search?q=a", urls_file="https://example.com/input-urls.txt", + search_keywords="foo bar", ) with pytest.raises(ValueError): GoogleSearchSpider.from_crawler( crawler, - urls=["https://google.com/search?q=b"], + urls=["https://www.google.com/search?q=b"], urls_file="https://example.com/input-urls.txt", + search_keywords="foo bar", ) @@ -165,15 +183,19 @@ def test_url_invalid(): def test_urls(caplog): crawler = get_crawler() - url = "https://google.com/search?q=foo+bar" + url = "https://www.google.com/search?q=foo+bar" - spider = GoogleSearchSpider.from_crawler(crawler, urls=[url]) + spider = GoogleSearchSpider.from_crawler( + crawler, urls=[url], search_keywords="foo bar" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_serp - spider = GoogleSearchSpider.from_crawler(crawler, urls=url) + spider = GoogleSearchSpider.from_crawler( + crawler, urls=url, search_keywords="foo bar" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url @@ -182,21 +204,23 @@ def test_urls(caplog): caplog.clear() spider = GoogleSearchSpider.from_crawler( crawler, - urls="https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\nfoo\n\n", + urls="https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\nfoo\n\n", + search_keywords="foo bar", ) assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text start_requests = list(spider.start_requests()) assert len(start_requests) == 3 assert all(request.callback == spider.parse_serp for request in start_requests) - assert start_requests[0].url == "https://google.com/search?q=a" - assert start_requests[1].url == "https://google.com/search?q=b" - assert start_requests[2].url == "https://google.com/search?q=c" + 
assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" + assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" + assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" caplog.clear() with pytest.raises(ValueError): spider = GoogleSearchSpider.from_crawler( crawler, urls="foo\nbar", + search_keywords="foo bar", ) assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text @@ -208,13 +232,15 @@ def test_urls_file(): with patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() - response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" + response._content = b"https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\n\n" mock_get.return_value = response - spider = GoogleSearchSpider.from_crawler(crawler, urls_file=url) + spider = GoogleSearchSpider.from_crawler( + crawler, urls_file=url, search_keywords="foo bar" + ) mock_get.assert_called_with(url) start_requests = list(spider.start_requests()) assert len(start_requests) == 3 - assert start_requests[0].url == "https://google.com/search?q=a" - assert start_requests[1].url == "https://google.com/search?q=b" - assert start_requests[2].url == "https://google.com/search?q=c" + assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" + assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" + assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index f9d55de..030e5f1 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -120,21 +120,24 @@ def validate_input_group(model): return model +URLS_FILE_FIELD_KWARGS = { + "title": "URLs file", + "description": ( + "URL that point to a plain-text file with a list of URLs to " + "crawl, e.g. https://example.com/url-list.txt. The linked list " + "must contain 1 URL per line." + ), + "pattern": _URL_PATTERN, + "default": "", + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + }, +} + + class UrlsFileParam(BaseModel): - urls_file: str = Field( - title="URLs file", - description=( - "URL that point to a plain-text file with a list of URLs to " - "crawl, e.g. https://example.com/url-list.txt. The linked list " - "must contain 1 URL per line." 
- ), - pattern=_URL_PATTERN, - default="", - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - }, - ) + urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] @model_validator(mode="after") def input_group(self): diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index bb46afb..a57ebea 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -3,7 +3,7 @@ from urllib.parse import urlparse, urlunparse import scrapy -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from scrapy import Request from scrapy.crawler import Crawler from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings @@ -14,10 +14,11 @@ from zyte_spider_templates.params import parse_input_params from ..params import ( + INPUT_GROUP_FIELDS, URL_FIELD_KWARGS, URLS_FIELD_KWARGS, + URLS_FILE_FIELD_KWARGS, MaxRequestsParam, - UrlsFileParam, validate_url_list, ) from .base import INPUT_GROUP, BaseSpider @@ -27,7 +28,6 @@ class SearchKeywordsParam(BaseModel): search_keywords: Optional[List[str]] = Field( title="Search Keywords", description=("Search keywords to use on the specified input Google URLs."), - default=None, json_schema_extra={ "widget": "textarea", }, @@ -43,7 +43,7 @@ def validate_search_keywords(cls, value: Union[List[str], str]) -> List[str]: if isinstance(value, str): value = value.split("\n") if not value: - return value + raise ValueError("The search_keywords parameter value is missing or empty.") result = [] for v in value: if not (v := v.strip()): @@ -61,26 +61,24 @@ class SerpMaxPagesParam(BaseModel): GOOGLE_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) -assert isinstance(GOOGLE_URL_FIELD_KWARGS["description"], str) GOOGLE_URL_FIELD_KWARGS["default"] = "https://www.google.com/" -GOOGLE_URL_FIELD_KWARGS["description"] = GOOGLE_URL_FIELD_KWARGS["description"].replace( - "https://toscrape.com/", "https://google.com/search?q=foo+bar" -) +GOOGLE_URL_FIELD_KWARGS[ + "description" +] = "Target Google URL. Defaults to https://www.google.com/." class GoogleUrlParam(BaseModel): url: str = Field(**GOOGLE_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] -SERP_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) -assert isinstance(SERP_URLS_FIELD_KWARGS["description"], str) -SERP_URLS_FIELD_KWARGS["description"] = SERP_URLS_FIELD_KWARGS["description"].replace( - "https://toscrape.com/", "https://google.com/search?q=foo+bar" -) +GOOGLE_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) +GOOGLE_URLS_FIELD_KWARGS[ + "description" +] = "Target Google URLs. Defaults to https://www.google.com/." -class SerpUrlsParam(BaseModel): - urls: Optional[List[str]] = Field(**SERP_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] +class GoogleUrlsParam(BaseModel): + urls: Optional[List[str]] = Field(**GOOGLE_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] @field_validator("urls", mode="before") @classmethod @@ -88,12 +86,24 @@ def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: return validate_url_list(value) +GOOGLE_URLS_FILE_FIELD_KWARGS = deepcopy(URLS_FILE_FIELD_KWARGS) +GOOGLE_URLS_FILE_FIELD_KWARGS["description"] = ( + "URL that point to a plain-text file with a list of target Google URLs, " + "e.g. https://example.com/url-list.txt. The linked list must contain 1 " + "Google URL (e.g. https://www.google.com/) per line." 
+) + + +class GoogleUrlsFileParam(BaseModel): + urls_file: str = Field(**GOOGLE_URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] + + class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, SearchKeywordsParam, - UrlsFileParam, - SerpUrlsParam, + GoogleUrlsFileParam, + GoogleUrlsParam, GoogleUrlParam, BaseModel, ): @@ -107,6 +117,30 @@ class GoogleSearchSpiderParams( }, ) + @model_validator(mode="after") + def input_group(self): + input_fields = set( + field for field in INPUT_GROUP_FIELDS if getattr(self, field, None) + ) + if not input_fields: + input_field_list = ", ".join(INPUT_GROUP_FIELDS) + raise ValueError( + f"No input parameter defined. Please, define one of: " + f"{input_field_list}." + ) + elif ( + len(input_fields) > 1 + and getattr(self, "url", None) != GOOGLE_URL_FIELD_KWARGS["default"] + ): + input_field_list = ", ".join( + f"{field} ({getattr(self, field)!r})" for field in input_fields + ) + raise ValueError( + f"Expected a single input parameter, got {len(input_fields)}: " + f"{input_field_list}." + ) + return self + class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): """Yield results from Google searches. From bcf556620ac0e0bc1cc769f49fd1996cfddf4985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 13:20:11 +0200 Subject: [PATCH 09/19] Use a domain drop-down list --- .../spiders/_google_domains.py | 193 ++++++++++++++++++ zyte_spider_templates/spiders/serp.py | 125 ++---------- 2 files changed, 214 insertions(+), 104 deletions(-) create mode 100644 zyte_spider_templates/spiders/_google_domains.py diff --git a/zyte_spider_templates/spiders/_google_domains.py b/zyte_spider_templates/spiders/_google_domains.py new file mode 100644 index 0000000..b38d582 --- /dev/null +++ b/zyte_spider_templates/spiders/_google_domains.py @@ -0,0 +1,193 @@ +from enum import Enum + + +# https://www.google.com/supported_domains +# Sorted alphabetically, except for keeping the main domain first. 
+class GoogleDomain(str, Enum): + google_com: str = "google.com" + google_ad: str = "google.ad" + google_ae: str = "google.ae" + google_al: str = "google.al" + google_am: str = "google.am" + google_as: str = "google.as" + google_at: str = "google.at" + google_az: str = "google.az" + google_ba: str = "google.ba" + google_be: str = "google.be" + google_bf: str = "google.bf" + google_bg: str = "google.bg" + google_bi: str = "google.bi" + google_bj: str = "google.bj" + google_bs: str = "google.bs" + google_bt: str = "google.bt" + google_by: str = "google.by" + google_ca: str = "google.ca" + google_cat: str = "google.cat" + google_cd: str = "google.cd" + google_cf: str = "google.cf" + google_cg: str = "google.cg" + google_ch: str = "google.ch" + google_ci: str = "google.ci" + google_cl: str = "google.cl" + google_cm: str = "google.cm" + google_cn: str = "google.cn" + google_co_ao: str = "google.co.ao" + google_co_bw: str = "google.co.bw" + google_co_ck: str = "google.co.ck" + google_co_cr: str = "google.co.cr" + google_co_id: str = "google.co.id" + google_co_il: str = "google.co.il" + google_co_in: str = "google.co.in" + google_co_jp: str = "google.co.jp" + google_co_ke: str = "google.co.ke" + google_co_kr: str = "google.co.kr" + google_co_ls: str = "google.co.ls" + google_co_ma: str = "google.co.ma" + google_co_mz: str = "google.co.mz" + google_co_nz: str = "google.co.nz" + google_co_th: str = "google.co.th" + google_co_tz: str = "google.co.tz" + google_co_ug: str = "google.co.ug" + google_co_uk: str = "google.co.uk" + google_co_uz: str = "google.co.uz" + google_co_ve: str = "google.co.ve" + google_co_vi: str = "google.co.vi" + google_co_za: str = "google.co.za" + google_co_zm: str = "google.co.zm" + google_co_zw: str = "google.co.zw" + google_com_af: str = "google.com.af" + google_com_ag: str = "google.com.ag" + google_com_ar: str = "google.com.ar" + google_com_au: str = "google.com.au" + google_com_bd: str = "google.com.bd" + google_com_bh: str = "google.com.bh" + google_com_bn: str = "google.com.bn" + google_com_bo: str = "google.com.bo" + google_com_br: str = "google.com.br" + google_com_bz: str = "google.com.bz" + google_com_co: str = "google.com.co" + google_com_cu: str = "google.com.cu" + google_com_cy: str = "google.com.cy" + google_com_do: str = "google.com.do" + google_com_ec: str = "google.com.ec" + google_com_eg: str = "google.com.eg" + google_com_et: str = "google.com.et" + google_com_fj: str = "google.com.fj" + google_com_gh: str = "google.com.gh" + google_com_gi: str = "google.com.gi" + google_com_gt: str = "google.com.gt" + google_com_hk: str = "google.com.hk" + google_com_jm: str = "google.com.jm" + google_com_kh: str = "google.com.kh" + google_com_kw: str = "google.com.kw" + google_com_lb: str = "google.com.lb" + google_com_ly: str = "google.com.ly" + google_com_mm: str = "google.com.mm" + google_com_mt: str = "google.com.mt" + google_com_mx: str = "google.com.mx" + google_com_my: str = "google.com.my" + google_com_na: str = "google.com.na" + google_com_ng: str = "google.com.ng" + google_com_ni: str = "google.com.ni" + google_com_np: str = "google.com.np" + google_com_om: str = "google.com.om" + google_com_pa: str = "google.com.pa" + google_com_pe: str = "google.com.pe" + google_com_pg: str = "google.com.pg" + google_com_ph: str = "google.com.ph" + google_com_pk: str = "google.com.pk" + google_com_pr: str = "google.com.pr" + google_com_py: str = "google.com.py" + google_com_qa: str = "google.com.qa" + google_com_sa: str = "google.com.sa" + google_com_sb: str = "google.com.sb" 
+ google_com_sg: str = "google.com.sg" + google_com_sl: str = "google.com.sl" + google_com_sv: str = "google.com.sv" + google_com_tj: str = "google.com.tj" + google_com_tr: str = "google.com.tr" + google_com_tw: str = "google.com.tw" + google_com_ua: str = "google.com.ua" + google_com_uy: str = "google.com.uy" + google_com_vc: str = "google.com.vc" + google_com_vn: str = "google.com.vn" + google_cv: str = "google.cv" + google_cz: str = "google.cz" + google_de: str = "google.de" + google_dj: str = "google.dj" + google_dk: str = "google.dk" + google_dm: str = "google.dm" + google_dz: str = "google.dz" + google_ee: str = "google.ee" + google_es: str = "google.es" + google_fi: str = "google.fi" + google_fm: str = "google.fm" + google_fr: str = "google.fr" + google_ga: str = "google.ga" + google_ge: str = "google.ge" + google_gg: str = "google.gg" + google_gl: str = "google.gl" + google_gm: str = "google.gm" + google_gr: str = "google.gr" + google_gy: str = "google.gy" + google_hn: str = "google.hn" + google_hr: str = "google.hr" + google_ht: str = "google.ht" + google_hu: str = "google.hu" + google_ie: str = "google.ie" + google_im: str = "google.im" + google_iq: str = "google.iq" + google_is: str = "google.is" + google_it: str = "google.it" + google_je: str = "google.je" + google_jo: str = "google.jo" + google_kg: str = "google.kg" + google_ki: str = "google.ki" + google_kz: str = "google.kz" + google_la: str = "google.la" + google_li: str = "google.li" + google_lk: str = "google.lk" + google_lt: str = "google.lt" + google_lu: str = "google.lu" + google_lv: str = "google.lv" + google_md: str = "google.md" + google_me: str = "google.me" + google_mg: str = "google.mg" + google_mk: str = "google.mk" + google_ml: str = "google.ml" + google_mn: str = "google.mn" + google_mu: str = "google.mu" + google_mv: str = "google.mv" + google_mw: str = "google.mw" + google_ne: str = "google.ne" + google_nl: str = "google.nl" + google_no: str = "google.no" + google_nr: str = "google.nr" + google_nu: str = "google.nu" + google_pl: str = "google.pl" + google_pn: str = "google.pn" + google_ps: str = "google.ps" + google_pt: str = "google.pt" + google_ro: str = "google.ro" + google_rs: str = "google.rs" + google_ru: str = "google.ru" + google_rw: str = "google.rw" + google_sc: str = "google.sc" + google_se: str = "google.se" + google_sh: str = "google.sh" + google_si: str = "google.si" + google_sk: str = "google.sk" + google_sm: str = "google.sm" + google_sn: str = "google.sn" + google_so: str = "google.so" + google_sr: str = "google.sr" + google_st: str = "google.st" + google_td: str = "google.td" + google_tg: str = "google.tg" + google_tl: str = "google.tl" + google_tm: str = "google.tm" + google_tn: str = "google.tn" + google_to: str = "google.to" + google_tt: str = "google.tt" + google_vu: str = "google.vu" + google_ws: str = "google.ws" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index a57ebea..56d6049 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,27 +1,15 @@ -from copy import deepcopy from typing import Any, Dict, Iterable, List, Optional, Union -from urllib.parse import urlparse, urlunparse -import scrapy -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import BaseModel, Field, field_validator from scrapy import Request -from scrapy.crawler import Crawler from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings from scrapy_spider_metadata import Args 
from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp -from zyte_spider_templates.params import parse_input_params - -from ..params import ( - INPUT_GROUP_FIELDS, - URL_FIELD_KWARGS, - URLS_FIELD_KWARGS, - URLS_FILE_FIELD_KWARGS, - MaxRequestsParam, - validate_url_list, -) -from .base import INPUT_GROUP, BaseSpider +from ..params import MaxRequestsParam +from ._google_domains import GoogleDomain +from .base import BaseSpider class SearchKeywordsParam(BaseModel): @@ -60,86 +48,22 @@ class SerpMaxPagesParam(BaseModel): ) -GOOGLE_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) -GOOGLE_URL_FIELD_KWARGS["default"] = "https://www.google.com/" -GOOGLE_URL_FIELD_KWARGS[ - "description" -] = "Target Google URL. Defaults to https://www.google.com/." - - -class GoogleUrlParam(BaseModel): - url: str = Field(**GOOGLE_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] - - -GOOGLE_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) -GOOGLE_URLS_FIELD_KWARGS[ - "description" -] = "Target Google URLs. Defaults to https://www.google.com/." - - -class GoogleUrlsParam(BaseModel): - urls: Optional[List[str]] = Field(**GOOGLE_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] - - @field_validator("urls", mode="before") - @classmethod - def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: - return validate_url_list(value) - - -GOOGLE_URLS_FILE_FIELD_KWARGS = deepcopy(URLS_FILE_FIELD_KWARGS) -GOOGLE_URLS_FILE_FIELD_KWARGS["description"] = ( - "URL that point to a plain-text file with a list of target Google URLs, " - "e.g. https://example.com/url-list.txt. The linked list must contain 1 " - "Google URL (e.g. https://www.google.com/) per line." -) - - -class GoogleUrlsFileParam(BaseModel): - urls_file: str = Field(**GOOGLE_URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] +class GoogleDomainParam(BaseModel): + domain: GoogleDomain = Field( + title="Domain", + description="Target Google domain.", + default=GoogleDomain.google_com, + ) class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, SearchKeywordsParam, - GoogleUrlsFileParam, - GoogleUrlsParam, - GoogleUrlParam, + GoogleDomainParam, BaseModel, ): - model_config = ConfigDict( - # https://github.com/pydantic/pydantic/discussions/7763#discussioncomment-10338857 - protected_namespaces=(), - json_schema_extra={ - "groups": [ - INPUT_GROUP, - ], - }, - ) - - @model_validator(mode="after") - def input_group(self): - input_fields = set( - field for field in INPUT_GROUP_FIELDS if getattr(self, field, None) - ) - if not input_fields: - input_field_list = ", ".join(INPUT_GROUP_FIELDS) - raise ValueError( - f"No input parameter defined. Please, define one of: " - f"{input_field_list}." - ) - elif ( - len(input_fields) > 1 - and getattr(self, "url", None) != GOOGLE_URL_FIELD_KWARGS["default"] - ): - input_field_list = ", ".join( - f"{field} ({getattr(self, field)!r})" for field in input_fields - ) - raise ValueError( - f"Expected a single input parameter, got {len(input_fields)}: " - f"{input_field_list}." 
- ) - return self + pass class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): @@ -173,12 +97,6 @@ def update_settings(cls, settings: BaseSettings) -> None: priority="spider", ) - @classmethod - def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: - spider = super().from_crawler(crawler, *args, **kwargs) - parse_input_params(spider) - return spider - def get_start_request(self, url): return Request( url=url, @@ -196,16 +114,15 @@ def start_requests(self) -> Iterable[Request]: if not search_keywords: raise ValueError("No search keywords specified.") - for url in self.start_urls: - url = urlunparse(urlparse(url)._replace(path="/search")) - for search_keyword in search_keywords: - search_url = add_or_replace_parameter(url, "q", search_keyword) - for start in range(0, self.args.max_pages * 10, 10): - if start: - search_url = add_or_replace_parameter( - search_url, "start", str(start) - ) - yield self.get_start_request(search_url) + url = f"https://www.{self.args.domain.value}/search" + for search_keyword in search_keywords: + search_url = add_or_replace_parameter(url, "q", search_keyword) + for start in range(0, self.args.max_pages * 10, 10): + if start: + search_url = add_or_replace_parameter( + search_url, "start", str(start) + ) + yield self.get_start_request(search_url) def parse_serp(self, response) -> Iterable[Serp]: yield Serp.from_dict(response.raw_api_response["serp"]) From 89c1b7f459a339faa30ffd0844a8c04b0d874977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 13:37:39 +0200 Subject: [PATCH 10/19] Improve the search_keywords tooltip and update tests --- tests/test_serp.py | 381 +++++++++++++++----------- zyte_spider_templates/spiders/serp.py | 5 +- 2 files changed, 229 insertions(+), 157 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 177ff53..99be79e 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -1,14 +1,10 @@ -from unittest.mock import patch - import pytest -import requests from pydantic import ValidationError from scrapy_spider_metadata import get_spider_metadata from zyte_spider_templates.spiders.serp import GoogleSearchSpider from . import get_crawler -from .test_utils import URL_TO_DOMAIN from .utils import assertEqualJson @@ -17,13 +13,14 @@ def test_parameters(): GoogleSearchSpider() with pytest.raises(ValidationError): - GoogleSearchSpider(url="https://www.google.com/") + GoogleSearchSpider(domain="google.com") GoogleSearchSpider(search_keywords="foo bar") - GoogleSearchSpider(url="https://www.google.cat/", search_keywords="foo bar") - GoogleSearchSpider( - url="https://www.google.cat/", search_keywords="foo bar", max_pages=10 - ) + GoogleSearchSpider(domain="google.cat", search_keywords="foo bar") + GoogleSearchSpider(domain="google.cat", search_keywords="foo bar", max_pages=10) + + with pytest.raises(ValidationError): + GoogleSearchSpider(domain="google.foo", search_keywords="foo bar") with pytest.raises(ValidationError): GoogleSearchSpider(search_keywords="foo bar", max_pages="all") @@ -45,55 +42,200 @@ def test_metadata(): "title": "Google Search Results", "description": "Template for spiders that extract Google search results.", "param_schema": { - "groups": [ - { - "description": ( - "Input data that determines the start URLs of the crawl." - ), - "id": "inputs", - "title": "Inputs", - "widget": "exclusive", - }, - ], "properties": { - "url": { - "default": "https://www.google.com/", - "description": ( - "Target Google URL. 
Defaults to https://www.google.com/." - ), - "exclusiveRequired": True, - "group": "inputs", - "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", - "title": "URL", - "type": "string", - }, - "urls": { - "anyOf": [ - {"items": {"type": "string"}, "type": "array"}, - {"type": "null"}, + "domain": { + "default": "google.com", + "description": "Target Google domain.", + "title": "Domain", + "enum": [ + "google.com", + "google.ad", + "google.ae", + "google.al", + "google.am", + "google.as", + "google.at", + "google.az", + "google.ba", + "google.be", + "google.bf", + "google.bg", + "google.bi", + "google.bj", + "google.bs", + "google.bt", + "google.by", + "google.ca", + "google.cat", + "google.cd", + "google.cf", + "google.cg", + "google.ch", + "google.ci", + "google.cl", + "google.cm", + "google.cn", + "google.co.ao", + "google.co.bw", + "google.co.ck", + "google.co.cr", + "google.co.id", + "google.co.il", + "google.co.in", + "google.co.jp", + "google.co.ke", + "google.co.kr", + "google.co.ls", + "google.co.ma", + "google.co.mz", + "google.co.nz", + "google.co.th", + "google.co.tz", + "google.co.ug", + "google.co.uk", + "google.co.uz", + "google.co.ve", + "google.co.vi", + "google.co.za", + "google.co.zm", + "google.co.zw", + "google.com.af", + "google.com.ag", + "google.com.ar", + "google.com.au", + "google.com.bd", + "google.com.bh", + "google.com.bn", + "google.com.bo", + "google.com.br", + "google.com.bz", + "google.com.co", + "google.com.cu", + "google.com.cy", + "google.com.do", + "google.com.ec", + "google.com.eg", + "google.com.et", + "google.com.fj", + "google.com.gh", + "google.com.gi", + "google.com.gt", + "google.com.hk", + "google.com.jm", + "google.com.kh", + "google.com.kw", + "google.com.lb", + "google.com.ly", + "google.com.mm", + "google.com.mt", + "google.com.mx", + "google.com.my", + "google.com.na", + "google.com.ng", + "google.com.ni", + "google.com.np", + "google.com.om", + "google.com.pa", + "google.com.pe", + "google.com.pg", + "google.com.ph", + "google.com.pk", + "google.com.pr", + "google.com.py", + "google.com.qa", + "google.com.sa", + "google.com.sb", + "google.com.sg", + "google.com.sl", + "google.com.sv", + "google.com.tj", + "google.com.tr", + "google.com.tw", + "google.com.ua", + "google.com.uy", + "google.com.vc", + "google.com.vn", + "google.cv", + "google.cz", + "google.de", + "google.dj", + "google.dk", + "google.dm", + "google.dz", + "google.ee", + "google.es", + "google.fi", + "google.fm", + "google.fr", + "google.ga", + "google.ge", + "google.gg", + "google.gl", + "google.gm", + "google.gr", + "google.gy", + "google.hn", + "google.hr", + "google.ht", + "google.hu", + "google.ie", + "google.im", + "google.iq", + "google.is", + "google.it", + "google.je", + "google.jo", + "google.kg", + "google.ki", + "google.kz", + "google.la", + "google.li", + "google.lk", + "google.lt", + "google.lu", + "google.lv", + "google.md", + "google.me", + "google.mg", + "google.mk", + "google.ml", + "google.mn", + "google.mu", + "google.mv", + "google.mw", + "google.ne", + "google.nl", + "google.no", + "google.nr", + "google.nu", + "google.pl", + "google.pn", + "google.ps", + "google.pt", + "google.ro", + "google.rs", + "google.ru", + "google.rw", + "google.sc", + "google.se", + "google.sh", + "google.si", + "google.sk", + "google.sm", + "google.sn", + "google.so", + "google.sr", + "google.st", + "google.td", + "google.tg", + "google.tl", + "google.tm", + "google.tn", + "google.to", + "google.tt", + "google.vu", + "google.ws", ], - "default": None, - 
"description": ( - "Target Google URLs. Defaults to https://www.google.com/." - ), - "exclusiveRequired": True, - "group": "inputs", - "title": "URLs", - "widget": "textarea", - }, - "urls_file": { - "default": "", - "description": ( - "URL that point to a plain-text file with a list of " - "target Google URLs, e.g. " - "https://example.com/url-list.txt. The linked list " - "must contain 1 Google URL (e.g. " - "https://www.google.com/) per line." - ), - "exclusiveRequired": True, - "group": "inputs", - "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", - "title": "URLs file", "type": "string", }, "search_keywords": { @@ -101,7 +243,11 @@ def test_metadata(): {"items": {"type": "string"}, "type": "array"}, {"type": "null"}, ], - "description": "Search keywords to use on the specified input Google URLs.", + "description": ( + "Keywords to search for. Use multiple lines to " + "trigger multiple searches for different search " + "keywords." + ), "title": "Search Keywords", "widget": "textarea", }, @@ -133,114 +279,37 @@ def test_metadata(): assertEqualJson(actual_metadata, expected_metadata) -@pytest.mark.parametrize("url,allowed_domain", URL_TO_DOMAIN) -def test_set_allowed_domains(url, allowed_domain): - crawler = get_crawler() - - kwargs = {"url": url} - spider = GoogleSearchSpider.from_crawler( - crawler, **kwargs, search_keywords="foo bar" - ) - assert spider.allowed_domains == [allowed_domain] - - def test_input_none(): crawler = get_crawler() with pytest.raises(ValueError): GoogleSearchSpider.from_crawler(crawler) -def test_input_multiple(): - crawler = get_crawler() - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler( - crawler, - url="https://www.google.com/search?q=a", - urls=["https://www.google.com/search?q=b"], - search_keywords="foo bar", - ) - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler( - crawler, - url="https://www.google.com/search?q=a", - urls_file="https://example.com/input-urls.txt", - search_keywords="foo bar", - ) - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler( - crawler, - urls=["https://www.google.com/search?q=b"], - urls_file="https://example.com/input-urls.txt", - search_keywords="foo bar", - ) - - -def test_url_invalid(): - crawler = get_crawler() - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler(crawler, url="foo") - - -def test_urls(caplog): +@pytest.mark.parametrize( + ("input_domain", "expected_domain"), + ( + (None, "google.com"), + ("google.com", "google.com"), + ("google.cat", "google.cat"), + ), +) +def test_domain(input_domain, expected_domain): crawler = get_crawler() - url = "https://www.google.com/search?q=foo+bar" - - spider = GoogleSearchSpider.from_crawler( - crawler, urls=[url], search_keywords="foo bar" - ) - start_requests = list(spider.start_requests()) - assert len(start_requests) == 1 - assert start_requests[0].url == url - assert start_requests[0].callback == spider.parse_serp - - spider = GoogleSearchSpider.from_crawler( - crawler, urls=url, search_keywords="foo bar" - ) - start_requests = list(spider.start_requests()) - assert len(start_requests) == 1 - assert start_requests[0].url == url - assert start_requests[0].callback == spider.parse_serp - - caplog.clear() + kwargs = {} + if input_domain: + kwargs["domain"] = input_domain spider = GoogleSearchSpider.from_crawler( - crawler, - urls="https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\nfoo\n\n", - search_keywords="foo bar", + crawler, search_keywords="foo bar", **kwargs ) - 
assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text - start_requests = list(spider.start_requests()) - assert len(start_requests) == 3 - assert all(request.callback == spider.parse_serp for request in start_requests) - assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" - assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" - assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" - - caplog.clear() - with pytest.raises(ValueError): - spider = GoogleSearchSpider.from_crawler( - crawler, - urls="foo\nbar", - search_keywords="foo bar", - ) - assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text - assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == f"https://www.{expected_domain}/search?q=foo+bar" -def test_urls_file(): +def test_search_keywords(): crawler = get_crawler() - url = "https://example.com/input-urls.txt" - - with patch("zyte_spider_templates.params.requests.get") as mock_get: - response = requests.Response() - response._content = b"https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\n\n" - mock_get.return_value = response - spider = GoogleSearchSpider.from_crawler( - crawler, urls_file=url, search_keywords="foo bar" - ) - mock_get.assert_called_with(url) - - start_requests = list(spider.start_requests()) - assert len(start_requests) == 3 - assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" - assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" - assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" + spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar\nbaz") + requests = list(spider.start_requests()) + assert len(requests) == 2 + assert requests[0].url == "https://www.google.com/search?q=foo+bar" + assert requests[1].url == "https://www.google.com/search?q=baz" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 56d6049..0d22d41 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -15,7 +15,10 @@ class SearchKeywordsParam(BaseModel): search_keywords: Optional[List[str]] = Field( title="Search Keywords", - description=("Search keywords to use on the specified input Google URLs."), + description=( + "Keywords to search for. Use multiple lines to trigger multiple " + "searches for different search keywords." + ), json_schema_extra={ "widget": "textarea", }, From c3a2f2369ce34aa1041fffd60a066396b77b59f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 13:57:57 +0200 Subject: [PATCH 11/19] =?UTF-8?q?search=20keywords=20=E2=86=92=20search=20?= =?UTF-8?q?queries?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/templates/google-search.rst | 2 +- tests/test_serp.py | 29 +++++++++++++------------- zyte_spider_templates/spiders/serp.py | 30 +++++++++++++-------------- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst index e8a9053..a8ba77c 100644 --- a/docs/templates/google-search.rst +++ b/docs/templates/google-search.rst @@ -9,7 +9,7 @@ Basic use .. 
code-block:: shell - scrapy crawl google_search -a search_keywords="foo bar" + scrapy crawl google_search -a search_queries="foo bar" Parameters ========== diff --git a/tests/test_serp.py b/tests/test_serp.py index 99be79e..90472ad 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -15,20 +15,20 @@ def test_parameters(): with pytest.raises(ValidationError): GoogleSearchSpider(domain="google.com") - GoogleSearchSpider(search_keywords="foo bar") - GoogleSearchSpider(domain="google.cat", search_keywords="foo bar") - GoogleSearchSpider(domain="google.cat", search_keywords="foo bar", max_pages=10) + GoogleSearchSpider(search_queries="foo bar") + GoogleSearchSpider(domain="google.cat", search_queries="foo bar") + GoogleSearchSpider(domain="google.cat", search_queries="foo bar", max_pages=10) with pytest.raises(ValidationError): - GoogleSearchSpider(domain="google.foo", search_keywords="foo bar") + GoogleSearchSpider(domain="google.foo", search_queries="foo bar") with pytest.raises(ValidationError): - GoogleSearchSpider(search_keywords="foo bar", max_pages="all") + GoogleSearchSpider(search_queries="foo bar", max_pages="all") def test_start_requests(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar") + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == "https://www.google.com/search?q=foo+bar" @@ -238,17 +238,16 @@ def test_metadata(): ], "type": "string", }, - "search_keywords": { + "search_queries": { "anyOf": [ {"items": {"type": "string"}, "type": "array"}, {"type": "null"}, ], "description": ( - "Keywords to search for. Use multiple lines to " - "trigger multiple searches for different search " - "keywords." + "Input 1 search query per line. A search query is a " + "string of search keywords (e.g. foo bar)." ), - "title": "Search Keywords", + "title": "Search Queries", "widget": "textarea", }, "max_pages": { @@ -271,7 +270,7 @@ def test_metadata(): "widget": "request-limit", }, }, - "required": ["search_keywords"], + "required": ["search_queries"], "title": "GoogleSearchSpiderParams", "type": "object", }, @@ -299,16 +298,16 @@ def test_domain(input_domain, expected_domain): if input_domain: kwargs["domain"] = input_domain spider = GoogleSearchSpider.from_crawler( - crawler, search_keywords="foo bar", **kwargs + crawler, search_queries="foo bar", **kwargs ) requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == f"https://www.{expected_domain}/search?q=foo+bar" -def test_search_keywords(): +def test_search_queries(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar\nbaz") + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar\nbaz") requests = list(spider.start_requests()) assert len(requests) == 2 assert requests[0].url == "https://www.google.com/search?q=foo+bar" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 0d22d41..8181607 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -12,29 +12,29 @@ from .base import BaseSpider -class SearchKeywordsParam(BaseModel): - search_keywords: Optional[List[str]] = Field( - title="Search Keywords", +class SearchQueriesParam(BaseModel): + search_queries: Optional[List[str]] = Field( + title="Search Queries", description=( - "Keywords to search for. 
Use multiple lines to trigger multiple " - "searches for different search keywords." + "Input 1 search query per line. A search query is a string of " + "search keywords (e.g. foo bar)." ), json_schema_extra={ "widget": "textarea", }, ) - @field_validator("search_keywords", mode="before") + @field_validator("search_queries", mode="before") @classmethod - def validate_search_keywords(cls, value: Union[List[str], str]) -> List[str]: - """Validate a list of search keywords. + def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search queries. If a string is received as input, it is split into multiple strings on new lines. """ if isinstance(value, str): value = value.split("\n") if not value: - raise ValueError("The search_keywords parameter value is missing or empty.") + raise ValueError("The search_queries parameter value is missing or empty.") result = [] for v in value: if not (v := v.strip()): @@ -62,7 +62,7 @@ class GoogleDomainParam(BaseModel): class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, - SearchKeywordsParam, + SearchQueriesParam, GoogleDomainParam, BaseModel, ): @@ -113,13 +113,13 @@ def get_start_request(self, url): ) def start_requests(self) -> Iterable[Request]: - search_keywords = self.args.search_keywords - if not search_keywords: - raise ValueError("No search keywords specified.") + search_queries = self.args.search_queries + if not search_queries: + raise ValueError("No search queries specified.") url = f"https://www.{self.args.domain.value}/search" - for search_keyword in search_keywords: - search_url = add_or_replace_parameter(url, "q", search_keyword) + for search_query in search_queries: + search_url = add_or_replace_parameter(url, "q", search_query) for start in range(0, self.args.max_pages * 10, 10): if start: search_url = add_or_replace_parameter( From 25d4cf71cac4c204df1516a92a6ceeb2a2ba65a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 14:15:45 +0200 Subject: [PATCH 12/19] Fix metadata JSON schema comparison --- tests/test_ecommerce.py | 6 +++--- tests/test_serp.py | 4 ++-- tests/utils.py | 17 ++++++++++------- zyte_spider_templates/spiders/ecommerce.py | 2 +- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index c7bb88b..adf6487 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -21,7 +21,7 @@ from . import get_crawler from .test_utils import URL_TO_DOMAIN -from .utils import assertEqualJson +from .utils import assertEqualSpiderMetadata def test_parameters(): @@ -463,7 +463,7 @@ def test_metadata(): "title": "Pagination Only", }, }, - "title": "Crawl strategy", + "title": "Crawl Strategy", "enum": [ "automatic", "full", @@ -533,7 +533,7 @@ def test_metadata(): "type": "object", }, } - assertEqualJson(actual_metadata, expected_metadata) + assertEqualSpiderMetadata(actual_metadata, expected_metadata) geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] assert geolocation["enum"][0] == "AF" diff --git a/tests/test_serp.py b/tests/test_serp.py index 90472ad..3ba9dd7 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -5,7 +5,7 @@ from zyte_spider_templates.spiders.serp import GoogleSearchSpider from . 
import get_crawler -from .utils import assertEqualJson +from .utils import assertEqualSpiderMetadata def test_parameters(): @@ -275,7 +275,7 @@ def test_metadata(): "type": "object", }, } - assertEqualJson(actual_metadata, expected_metadata) + assertEqualSpiderMetadata(actual_metadata, expected_metadata) def test_input_none(): diff --git a/tests/utils.py b/tests/utils.py index 2fd7261..c18cb9b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,16 +1,19 @@ import json -def assertEqualJson(actual, expected): - """Compare the JSON representation of 2 Python objects. +def assertEqualSpiderMetadata(actual, expected): + """Compare 2 JSON schemas of spider metadata. - This allows to take into account things like the order of key-value pairs - in dictionaries, which would not be taken into account when comparing - dictionaries directly. + The parameter order in the parameter schema is taken into account, given + how it affects the UI, while the order of other object keys may be + different. It also generates a better diff in pytest output when enums are involved, e.g. geolocation values. """ - actual_json = json.dumps(actual, indent=2) - expected_json = json.dumps(expected, indent=2) + assert tuple(actual["param_schema"]["properties"]) == tuple( + expected["param_schema"]["properties"] + ) + actual_json = json.dumps(actual, indent=2, sort_keys=True) + expected_json = json.dumps(expected, indent=2, sort_keys=True) assert actual_json == expected_json diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 0a1aa6f..3868649 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -61,7 +61,7 @@ class EcommerceCrawlStrategy(str, Enum): class EcommerceCrawlStrategyParam(BaseModel): crawl_strategy: EcommerceCrawlStrategy = Field( - title="Crawl strategy", + title="Crawl Strategy", description="Determines how the start URL and follow-up URLs are crawled.", default=EcommerceCrawlStrategy.automatic, json_schema_extra={ From d8fe94c30147877f8545a95525946287bb71f9b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 14:24:19 +0200 Subject: [PATCH 13/19] =?UTF-8?q?Min=20zyte-common-items:=200.13.0=20?= =?UTF-8?q?=E2=86=92=200.22.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.rst | 6 ++++++ setup.py | 3 +-- tox.ini | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index abce60b..7e92b08 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +Unreleased +---------- + +* Now requires ``zyte-common-items >= 0.22.0``. 
+ + 0.8.0 (2024-08-21) ------------------ diff --git a/setup.py b/setup.py index de219a4..e5f8e9b 100644 --- a/setup.py +++ b/setup.py @@ -18,8 +18,7 @@ "scrapy-poet>=0.21.0", "scrapy-spider-metadata>=0.1.2", "scrapy-zyte-api[provider]>=0.16.0", - # "zyte-common-items>=0.13.0", - "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@serp", + "zyte-common-items>=0.22.0", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tox.ini b/tox.ini index ce4287d..a88f936 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ deps = scrapy-poet==0.21.0 scrapy-spider-metadata==0.1.2 scrapy-zyte-api[provider]==0.16.0 - zyte-common-items==0.13.0 + zyte-common-items==0.22.0 [testenv:mypy] deps = From b94b7b44122413cc42561d2fa93dbede5a183568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 10 Sep 2024 15:58:32 +0200 Subject: [PATCH 14/19] Remove potentially confusing search keyword references --- tests/test_serp.py | 5 +---- zyte_spider_templates/spiders/serp.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 3ba9dd7..2ea229d 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -243,10 +243,7 @@ def test_metadata(): {"items": {"type": "string"}, "type": "array"}, {"type": "null"}, ], - "description": ( - "Input 1 search query per line. A search query is a " - "string of search keywords (e.g. foo bar)." - ), + "description": "Input 1 search query per line (e.g. foo bar).", "title": "Search Queries", "widget": "textarea", }, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 8181607..e942964 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -15,10 +15,7 @@ class SearchQueriesParam(BaseModel): search_queries: Optional[List[str]] = Field( title="Search Queries", - description=( - "Input 1 search query per line. A search query is a string of " - "search keywords (e.g. foo bar)." - ), + description="Input 1 search query per line (e.g. foo bar).", json_schema_extra={ "widget": "textarea", }, From d7b724aa154ec71d3aa977574a8e770471551672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 00:13:24 +0200 Subject: [PATCH 15/19] Make crawl logging more flexible for new page types --- zyte_spider_templates/middlewares.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 5a40872..68e4987 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -1,6 +1,7 @@ import json import logging import warnings +from collections import defaultdict from datetime import datetime from typing import Any, Dict from warnings import warn @@ -28,6 +29,9 @@ class CrawlingLogsMiddleware: the fingerprints logged in Scrapy Cloud's request data. """ + # Deprecated in practice, but there is no good way to deprecate it, since + # class properties that also work for class instances are not a thing. 
+ # https://stackoverflow.com/q/128573 valid_page_types = [ "product", "nextPage", @@ -35,6 +39,7 @@ class CrawlingLogsMiddleware: "productNavigation", "productNavigation-heuristics", ] + unknown_page_type = "unknown" @classmethod @@ -82,12 +87,9 @@ def crawl_logs(self, response, result): "probability" ), }, - "to_crawl": {}, + "to_crawl": defaultdict(list), } - for page_type in self.valid_page_types + [self.unknown_page_type]: - data["to_crawl"][page_type] = [] - if result: for entry in result: if not isinstance(entry, Request): @@ -104,14 +106,17 @@ def crawl_logs(self, response, result): ) page_type = crawling_logs.get("page_type") - if page_type not in self.valid_page_types: + if not page_type: page_type = self.unknown_page_type data["to_crawl"][page_type].append(crawling_logs) - summary = ["Number of Requests per page type:"] - for page_type, requests in data["to_crawl"].items(): - summary.append(f"- {page_type}: {len(requests)}") + if data["to_crawl"]: + summary = ["Number of Requests per page type:"] + for page_type, requests in data["to_crawl"].items(): + summary.append(f"- {page_type}: {len(requests)}") + else: + summary = ["Nothing to crawl."] report = [ f"Crawling Logs for {response.url} (parsed as: {current_page_type}):", From a9d5588f236be421d7c4d8bac78be78f6d2d0fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 00:23:40 +0200 Subject: [PATCH 16/19] Update test expectations --- tests/test_middlewares.py | 42 ++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 6fc03ea..9b808bb 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -36,13 +36,7 @@ def results_gen(): crawl_logs = middleware.crawl_logs(response, results_gen()) assert crawl_logs == ( "Crawling Logs for https://example.com (parsed as: None):\n" - "Number of Requests per page type:\n" - "- product: 0\n" - "- nextPage: 0\n" - "- subCategories: 0\n" - "- productNavigation: 0\n" - "- productNavigation-heuristics: 0\n" - "- unknown: 0\n" + "Nothing to crawl.\n" "Structured Logs:\n" "{\n" ' "time": "2023-10-10 20:09:29",\n' @@ -53,14 +47,7 @@ def results_gen(): ' "page_type": null,\n' ' "probability": null\n' " },\n" - ' "to_crawl": {\n' - ' "product": [],\n' - ' "nextPage": [],\n' - ' "subCategories": [],\n' - ' "productNavigation": [],\n' - ' "productNavigation-heuristics": [],\n' - ' "unknown": []\n' - " }\n" + ' "to_crawl": {}\n' "}" ) @@ -131,15 +118,19 @@ def test_crawling_logs_middleware(): }, }, ) - unknown_request = Request( - "https://example.com/other-unknown", + custom_request = Request( + "https://example.com/custom-page-type", meta={ "crawling_logs": { - "name": "Unknown Page", + "name": "Custom Page", "page_type": "some other page_type", + "foo": "bar", }, }, ) + unknown_request = Request( + "https://example.com/other-unknown", + ) request_fingerprint = get_fingerprinter(crawler) fingerprint = request_fingerprint(request) @@ -150,6 +141,7 @@ def test_crawling_logs_middleware(): product_navigation_heuristics_request_fp = request_fingerprint( product_navigation_heuristics_request ) + custom_request_fp = request_fingerprint(custom_request) unknown_request_fp = request_fingerprint(unknown_request) def results_gen(): @@ -158,6 +150,7 @@ def results_gen(): yield subcategory_request yield product_navigation_request yield product_navigation_heuristics_request + yield custom_request yield unknown_request crawl_logs = middleware.crawl_logs(response, 
results_gen()) @@ -169,6 +162,7 @@ def results_gen(): "- subCategories: 1\n" "- productNavigation: 1\n" "- productNavigation-heuristics: 1\n" + "- some other page_type: 1\n" "- unknown: 1\n" "Structured Logs:\n" "{\n" @@ -231,10 +225,18 @@ def results_gen(): f' "request_fingerprint": "{product_navigation_heuristics_request_fp}"\n' " }\n" " ],\n" - ' "unknown": [\n' + ' "some other page_type": [\n' " {\n" - ' "name": "Unknown Page",\n' + ' "name": "Custom Page",\n' ' "page_type": "some other page_type",\n' + ' "foo": "bar",\n' + ' "request_url": "https://example.com/custom-page-type",\n' + ' "request_priority": 0,\n' + f' "request_fingerprint": "{custom_request_fp}"\n' + " }\n" + " ],\n" + ' "unknown": [\n' + " {\n" ' "request_url": "https://example.com/other-unknown",\n' ' "request_priority": 0,\n' f' "request_fingerprint": "{unknown_request_fp}"\n' From 7aa70b2763ef3f837e19029943f952bb13f06881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 09:54:14 +0200 Subject: [PATCH 17/19] Apply feedback --- tests/test_ecommerce.py | 2 +- tests/test_serp.py | 6 ++++-- zyte_spider_templates/params.py | 2 +- zyte_spider_templates/spiders/serp.py | 13 ++++++------- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index adf6487..ae77049 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -411,7 +411,7 @@ def test_metadata(): "description": ( "URL that point to a plain-text file with a list of " "URLs to crawl, e.g. " - "https://example.com/url-list.txt. The linked list " + "https://example.com/url-list.txt. The linked file " "must contain 1 URL per line." ), "exclusiveRequired": True, diff --git a/tests/test_serp.py b/tests/test_serp.py index 2ea229d..e8ec9fe 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -249,8 +249,10 @@ def test_metadata(): }, "max_pages": { "default": 1, - "description": "Maximum number of result pages to visit per input URL.", - "title": "Pages", + "description": ( + "Maximum number of result pages to visit per search query." + ), + "title": "Max Pages", "type": "integer", }, "max_requests": { diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index 030e5f1..f3190ab 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -124,7 +124,7 @@ def validate_input_group(model): "title": "URLs file", "description": ( "URL that point to a plain-text file with a list of URLs to " - "crawl, e.g. https://example.com/url-list.txt. The linked list " + "crawl, e.g. https://example.com/url-list.txt. The linked file " "must contain 1 URL per line." 
), "pattern": _URL_PATTERN, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index e942964..cbf9554 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -30,20 +30,19 @@ def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: """ if isinstance(value, str): value = value.split("\n") - if not value: - raise ValueError("The search_queries parameter value is missing or empty.") result = [] for v in value: - if not (v := v.strip()): - continue - result.append(v) + if v := v.strip(): + result.append(v) + if not result: + raise ValueError("The search_queries parameter value is missing or empty.") return result class SerpMaxPagesParam(BaseModel): max_pages: int = Field( - title="Pages", - description="Maximum number of result pages to visit per input URL.", + title="Max Pages", + description="Maximum number of result pages to visit per search query.", default=1, ) From 916b58c461b1d90c01cf2ca5414024b02702498e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 10:13:12 +0200 Subject: [PATCH 18/19] Release notes for 0.9.0 --- CHANGES.rst | 20 ++++++++++++++++++-- docs/_ext/__init__.py | 41 +++++++++++++++++++++++++++++++++++++++++ docs/conf.py | 4 ++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7e92b08..a330408 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,11 +1,27 @@ Changes ======= -Unreleased ----------- +0.9.0 (2024-09-NN) +------------------ * Now requires ``zyte-common-items >= 0.22.0``. +* New :ref:`Google Search spider template `, built on top of + Zyte API’s :http:`request:serp`. + +* The heuristics of the :ref:`e-commerce spider template ` to + ignore certain URLs when following category links now also handles + subdomains. For example, before https://example.com/blog was ignored, now + https://blog.example.com is also ignored. + +* In the :ref:`spider parameters JSON schema `, the + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` + parameter of the :ref:`e-commerce spider template ` switches + position, from being the last parameter to being between + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. + 0.8.0 (2024-08-21) ------------------ diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py index 5a3839e..4181427 100644 --- a/docs/_ext/__init__.py +++ b/docs/_ext/__init__.py @@ -1,4 +1,45 @@ +import re + +from docutils import nodes +from docutils.parsers.rst.roles import set_classes + + +def http_api_reference_role( + name, rawtext, text, lineno, inliner, options={}, content=[] +): + match = re.search( + r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text + ) + if match: + display_text = match[1] + reference = match[2] + else: + display_text = None + reference = text + if reference.startswith("request:"): + request_or_response = "request" + elif reference.startswith("response:"): + request_or_response = "response/200" + else: + raise ValueError( + f":http: directive reference must start with request: or " + f"response:, got {reference} from {text!r}." 
+ ) + + field = reference.split(":", maxsplit=1)[1] + if not display_text: + display_text = field + refuri = ( + f"https://docs.zyte.com/zyte-api/usage/reference.html" + f"#operation/extract/{request_or_response}/{field}" + ) + set_classes(options) + node = nodes.reference(rawtext, display_text, refuri=refuri, **options) + return [node], [] + + def setup(app): + app.add_role("http", http_api_reference_role) # https://stackoverflow.com/a/13663325 # # Scrapy’s diff --git a/docs/conf.py b/docs/conf.py index ac67ce5..ff0ef7f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,6 +34,10 @@ "https://scrapy-poet.readthedocs.io/en/stable", None, ), + "scrapy-spider-metadata": ( + "https://scrapy-spider-metadata.readthedocs.io/en/latest", + None, + ), "scrapy-zyte-api": ( "https://scrapy-zyte-api.readthedocs.io/en/stable", None, From 5c5502eebdaa1ec70d9b499403932560df6c4cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 11:08:48 +0200 Subject: [PATCH 19/19] Remove valid_page_types --- CHANGES.rst | 3 +++ zyte_spider_templates/middlewares.py | 11 ----------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a330408..b64a334 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,6 +22,9 @@ Changes and :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. +* Removed the ``valid_page_types`` attribute of + :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. + 0.8.0 (2024-08-21) ------------------ diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 68e4987..2cd8019 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -29,17 +29,6 @@ class CrawlingLogsMiddleware: the fingerprints logged in Scrapy Cloud's request data. """ - # Deprecated in practice, but there is no good way to deprecate it, since - # class properties that also work for class instances are not a thing. - # https://stackoverflow.com/q/128573 - valid_page_types = [ - "product", - "nextPage", - "subCategories", - "productNavigation", - "productNavigation-heuristics", - ] - unknown_page_type = "unknown" @classmethod
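
A minimal usage sketch of the resulting google_search spider, combining the parameters defined in the patches above (search_queries is required; domain defaults to google.com and max_pages to 1; the google.cat domain and page count below are illustrative, not part of the patches):

    scrapy crawl google_search -a search_queries="foo bar" -a domain=google.cat -a max_pages=3

With these arguments the spider builds one serp-enabled Zyte API request per query and result page: https://www.google.cat/search?q=foo+bar first, then the same URL with start=10 and start=20 appended, as per the start_requests pagination loop shown earlier in the series.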