From d785a2d67d0e45e636e014cbc22827668b748c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 12:07:07 +0200 Subject: [PATCH 01/19] SERP (MVP) --- setup.py | 3 +- zyte_spider_templates/spiders/base.py | 4 +- zyte_spider_templates/spiders/serp.py | 135 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 zyte_spider_templates/spiders/serp.py diff --git a/setup.py b/setup.py index 3871341..de219a4 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ "scrapy-poet>=0.21.0", "scrapy-spider-metadata>=0.1.2", "scrapy-zyte-api[provider]>=0.16.0", - "zyte-common-items>=0.13.0", + # "zyte-common-items>=0.13.0", + "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@serp", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index 846b87a..2fc0c5d 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -90,13 +90,13 @@ class BaseSpider(scrapy.Spider): def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) - if spider.args.geolocation: + if geolocation := getattr(spider.args, "geolocation", None): # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object # additional requests. for component in ("AUTOMAP", "PROVIDER"): default_params = spider.settings.getdict(f"ZYTE_API_{component}_PARAMS") - default_params["geolocation"] = spider.args.geolocation + default_params["geolocation"] = geolocation spider.settings.set( f"ZYTE_API_{component}_PARAMS", default_params, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py new file mode 100644 index 0000000..cbaeb34 --- /dev/null +++ b/zyte_spider_templates/spiders/serp.py @@ -0,0 +1,135 @@ +from typing import Any, Dict, Iterable + +import requests +import scrapy +from pydantic import BaseModel, ConfigDict, Field, model_validator +from scrapy import Request +from scrapy.crawler import Crawler +from scrapy_spider_metadata import Args +from w3lib.url import add_or_replace_parameter +from zyte_common_items import Serp + +from zyte_spider_templates.spiders.base import BaseSpider +from zyte_spider_templates.utils import get_domain + +from ..params import MaxRequestsParam, UrlParam, UrlsFileParam, UrlsParam +from ..utils import load_url_list +from .base import _INPUT_FIELDS + + +class SerpMaxPagesParam(BaseModel): + max_pages: int = Field( + title="Pages", + description="Maximum number of result pages to visit per input URL.", + default=1, + ) + + +class SerpSpiderParams( + MaxRequestsParam, + SerpMaxPagesParam, + UrlsFileParam, + UrlsParam, + UrlParam, + BaseModel, +): + model_config = ConfigDict( + json_schema_extra={ + "groups": [ + { + "id": "inputs", + "title": "Inputs", + "description": ( + "Input data that determines the start URLs of the crawl." 
+ ), + "widget": "exclusive", + }, + ], + }, + ) + + @model_validator(mode="after") + def single_input(self): + """Fields + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.url` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + form a mandatory, mutually-exclusive field group: one of them must be + defined, the rest must not be defined.""" + input_fields = set( + field for field in _INPUT_FIELDS if getattr(self, field, None) + ) + if not input_fields: + input_field_list = ", ".join(_INPUT_FIELDS) + raise ValueError( + f"No input parameter defined. Please, define one of: " + f"{input_field_list}." + ) + elif len(input_fields) > 1: + input_field_list = ", ".join( + f"{field} ({getattr(self, field)!r})" for field in input_fields + ) + raise ValueError( + f"Expected a single input parameter, got {len(input_fields)}: " + f"{input_field_list}." + ) + return self + + +class SerpSpider(Args[SerpSpiderParams], BaseSpider): + """Yield results from search engine result pages (SERP). + + See :class:`~zyte_spider_templates.spiders.ecommerce.SerpSpiderParams` + for supported parameters. + + .. seealso:: :ref:`serp`. + """ + + name = "serp" + + metadata: Dict[str, Any] = { + **BaseSpider.metadata, + "title": "SERP", + "description": "Template for spiders that extract search engine results.", + } + + @classmethod + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + spider = super().from_crawler(crawler, *args, **kwargs) + spider._init_input() + return spider + + def _init_input(self): + urls_file = self.args.urls_file + if urls_file: + response = requests.get(urls_file) + urls = load_url_list(response.text) + self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") + self.start_urls = urls + elif self.args.urls: + self.start_urls = self.args.urls + else: + self.start_urls = [self.args.url] + self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) + + def get_start_request(self, url): + return Request( + url=url, + callback=self.parse_serp, + meta={ + "crawling_logs": {"page_type": "serp"}, + "zyte_api": { + "serp": True, + }, + }, + ) + + def start_requests(self) -> Iterable[Request]: + for url in self.start_urls: + for start in range(0, self.args.max_pages * 10, 10): + if start: + url = add_or_replace_parameter(url, "start", str(start)) + yield self.get_start_request(url) + + def parse_serp(self, response) -> Iterable[Serp]: + yield Serp.from_dict(response.raw_api_response["serp"]) From 4c47efc5e06517295097d0b8394b8aaf051d4173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 15:45:17 +0200 Subject: [PATCH 02/19] Fix references and complete the docs --- docs/conf.py | 3 +++ docs/index.rst | 1 + docs/reference/index.rst | 5 +++++ docs/templates/index.rst | 3 +++ docs/templates/serp.rst | 19 +++++++++++++++++++ zyte_spider_templates/__init__.py | 1 + zyte_spider_templates/spiders/serp.py | 8 +++++--- 7 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 docs/templates/serp.rst diff --git a/docs/conf.py b/docs/conf.py index 5a610e3..ac67ce5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,8 +48,11 @@ ), } +autodoc_pydantic_model_show_config_summary = False autodoc_pydantic_model_show_field_summary = False autodoc_pydantic_model_show_json = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False # sphinx-reredirects redirects = { diff --git a/docs/index.rst 
b/docs/index.rst index d344faa..d26b6ca 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce + SERP .. toctree:: :caption: Customization diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 81826cb..14a158e 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -9,6 +9,8 @@ Spiders .. autoclass:: zyte_spider_templates.EcommerceSpider +.. autoclass:: zyte_spider_templates.SerpSpider + Pages ===== @@ -41,3 +43,6 @@ Parameter mixins :exclude-members: model_computed_fields .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam + :exclude-members: model_computed_fields diff --git a/docs/templates/index.rst b/docs/templates/index.rst index c70a7de..058187c 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -29,3 +29,6 @@ Spider template list :ref:`E-commerce ` Get products from an e-commerce website. + +:ref:`SERP ` + Get search engine result pages. diff --git a/docs/templates/serp.rst b/docs/templates/serp.rst new file mode 100644 index 0000000..496926f --- /dev/null +++ b/docs/templates/serp.rst @@ -0,0 +1,19 @@ +.. _serp: + +=============================== +SERP spider template (``serp``) +=============================== + +Basic use +========= + +.. code-block:: shell + + scrapy crawl serp -a url="https://www.google.com/search?q=foo" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpSpiderParams + :inherited-members: BaseModel + :exclude-members: model_computed_fields diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index e3de8c9..6b6d292 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,2 +1,3 @@ from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider +from .spiders.serp import SerpSpider diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index cbaeb34..d159312 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -34,6 +34,8 @@ class SerpSpiderParams( BaseModel, ): model_config = ConfigDict( + # https://github.com/pydantic/pydantic/discussions/7763#discussioncomment-10338857 + protected_namespaces=(), json_schema_extra={ "groups": [ { @@ -51,9 +53,9 @@ class SerpSpiderParams( @model_validator(mode="after") def single_input(self): """Fields - :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.url` + :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.url` and - :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.urls_file` form a mandatory, mutually-exclusive field group: one of them must be defined, the rest must not be defined.""" input_fields = set( @@ -79,7 +81,7 @@ def single_input(self): class SerpSpider(Args[SerpSpiderParams], BaseSpider): """Yield results from search engine result pages (SERP). - See :class:`~zyte_spider_templates.spiders.ecommerce.SerpSpiderParams` + See :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams` for supported parameters. .. seealso:: :ref:`serp`. 
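
At this point in the series the new ``serp`` template can also be driven programmatically rather than through ``scrapy crawl``. A minimal sketch, assuming a Scrapy project already configured for zyte-spider-templates (scrapy-poet, scrapy-zyte-api and a Zyte API key); it is illustrative only and not part of the patches:

.. code-block:: python

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from zyte_spider_templates import SerpSpider

    # Load the project settings (Zyte API key, scrapy-poet and
    # scrapy-zyte-api configuration) and run the spider with the same
    # parameters as the CLI example in docs/templates/serp.rst, plus
    # max_pages to cover the first three result pages.
    process = CrawlerProcess(get_project_settings())
    process.crawl(
        SerpSpider,
        url="https://www.google.com/search?q=foo",
        max_pages=3,
    )
    process.start()

The keyword arguments passed to ``crawl()`` are the same spider arguments that ``scrapy crawl serp -a url=... -a max_pages=...`` accepts.
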
From d10d75e2c3e1f6309985a2577f1a7a8919b3cc0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 28 Aug 2024 20:56:31 +0200 Subject: [PATCH 03/19] Customize UI strings for SERP and add tests --- tests/__init__.py | 4 + tests/test_ecommerce.py | 65 +----- tests/test_params.py | 51 +++++ tests/test_serp.py | 220 +++++++++++++++++++++ tests/utils.py | 16 ++ zyte_spider_templates/params.py | 134 ++++++++----- zyte_spider_templates/spiders/ecommerce.py | 18 +- zyte_spider_templates/spiders/serp.py | 69 ++++--- 8 files changed, 419 insertions(+), 158 deletions(-) create mode 100644 tests/test_params.py create mode 100644 tests/test_serp.py create mode 100644 tests/utils.py diff --git a/tests/__init__.py b/tests/__init__.py index 5e99e9c..2aa5953 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,11 @@ from typing import Any, Dict, Optional +import pytest from scrapy.utils.test import TestSpider +# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting +pytest.register_assert_rewrite("tests.utils") + # scrapy.utils.test.get_crawler alternative that does not freeze settings. def get_crawler(*, settings: Optional[Dict[str, Any]] = None): diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 7a8a9c7..987d93f 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -1,6 +1,4 @@ -import json import logging -import re from unittest.mock import MagicMock, call, patch import pytest @@ -11,7 +9,6 @@ from scrapy_spider_metadata import get_spider_metadata from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request -from zyte_spider_templates import BaseSpiderParams from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS, GEOLOCATION_OPTIONS_WITH_CODE, @@ -24,6 +21,7 @@ from . import get_crawler from .test_utils import URL_TO_DOMAIN +from .utils import assertEqualJson def test_parameters(): @@ -362,21 +360,6 @@ def test_arguments(): assert spider.allowed_domains == ["example.com"] -def assertEqualJson(actual, expected): - """Compare the JSON representation of 2 Python objects. - - This allows to take into account things like the order of key-value pairs - in dictionaries, which would not be taken into account when comparing - dictionaries directly. - - It also generates a better diff in pytest output when enums are involved, - e.g. geolocation values. 
- """ - actual_json = json.dumps(actual, indent=2) - expected_json = json.dumps(expected, indent=2) - assert actual_json == expected_json - - def test_metadata(): actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True) expected_metadata = { @@ -558,52 +541,6 @@ def test_metadata(): assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) -@pytest.mark.parametrize( - "valid,url", - [ - (False, ""), - (False, "http://"), - (False, "http:/example.com"), - (False, "ftp://example.com"), - (False, "example.com"), - (False, "//example.com"), - (False, "http://foo:bar@example.com"), - (False, " http://example.com"), - (False, "http://example.com "), - (False, "http://examp le.com"), - (False, "https://example.com:232323"), - (True, "http://example.com"), - (True, "http://bücher.example"), - (True, "http://xn--bcher-kva.example"), - (True, "https://i❤.ws"), - (True, "https://example.com"), - (True, "https://example.com/"), - (True, "https://example.com:2323"), - (True, "https://example.com:2323/"), - (True, "https://example.com:2323/foo"), - (True, "https://example.com/f"), - (True, "https://example.com/foo"), - (True, "https://example.com/foo/"), - (True, "https://example.com/foo/bar"), - (True, "https://example.com/foo/bar/"), - (True, "https://example.com/foo/bar?baz"), - (True, "https://example.com/foo/bar/?baz"), - (True, "https://example.com?foo"), - (True, "https://example.com?foo=bar"), - (True, "https://example.com/?foo=bar&baz"), - (True, "https://example.com/?foo=bar&baz#"), - (True, "https://example.com/?foo=bar&baz#frag"), - (True, "https://example.com#"), - (True, "https://example.com/#"), - (True, "https://example.com/&"), - (True, "https://example.com/&#"), - ], -) -def test_validation_url(url, valid): - url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern - assert bool(re.match(url_re, url)) == valid - - def test_get_parse_product_request(): base_kwargs = { "url": "https://example.com", diff --git a/tests/test_params.py b/tests/test_params.py new file mode 100644 index 0000000..df08a19 --- /dev/null +++ b/tests/test_params.py @@ -0,0 +1,51 @@ +import re + +import pytest + +from zyte_spider_templates.params import URL_FIELD_KWARGS + + +@pytest.mark.parametrize( + "valid,url", + [ + (False, ""), + (False, "http://"), + (False, "http:/example.com"), + (False, "ftp://example.com"), + (False, "example.com"), + (False, "//example.com"), + (False, "http://foo:bar@example.com"), + (False, " http://example.com"), + (False, "http://example.com "), + (False, "http://examp le.com"), + (False, "https://example.com:232323"), + (True, "http://example.com"), + (True, "http://bücher.example"), + (True, "http://xn--bcher-kva.example"), + (True, "https://i❤.ws"), + (True, "https://example.com"), + (True, "https://example.com/"), + (True, "https://example.com:2323"), + (True, "https://example.com:2323/"), + (True, "https://example.com:2323/foo"), + (True, "https://example.com/f"), + (True, "https://example.com/foo"), + (True, "https://example.com/foo/"), + (True, "https://example.com/foo/bar"), + (True, "https://example.com/foo/bar/"), + (True, "https://example.com/foo/bar?baz"), + (True, "https://example.com/foo/bar/?baz"), + (True, "https://example.com?foo"), + (True, "https://example.com?foo=bar"), + (True, "https://example.com/?foo=bar&baz"), + (True, "https://example.com/?foo=bar&baz#"), + (True, "https://example.com/?foo=bar&baz#frag"), + (True, "https://example.com#"), + (True, "https://example.com/#"), + (True, "https://example.com/&"), + (True, 
"https://example.com/&#"), + ], +) +def test_url_pattern(url, valid): + assert isinstance(URL_FIELD_KWARGS["pattern"], str) + assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid diff --git a/tests/test_serp.py b/tests/test_serp.py new file mode 100644 index 0000000..beadc3d --- /dev/null +++ b/tests/test_serp.py @@ -0,0 +1,220 @@ +from unittest.mock import patch + +import pytest +import requests +from pydantic import ValidationError +from scrapy_spider_metadata import get_spider_metadata + +from zyte_spider_templates.spiders.serp import SerpSpider + +from . import get_crawler +from .test_utils import URL_TO_DOMAIN +from .utils import assertEqualJson + + +def test_parameters(): + with pytest.raises(ValidationError): + SerpSpider() + + SerpSpider(url="https://google.com/search?q=foo+bar") + SerpSpider(url="https://google.com/search?q=foo+bar", max_pages=10) + + with pytest.raises(ValidationError): + SerpSpider(url="https://google.com/search?q=foo+bar", max_pages="all") + + +def test_start_requests(): + url = "https://google.com/search?q=foo+bar" + crawler = get_crawler() + spider = SerpSpider.from_crawler(crawler, url=url) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == url + assert requests[0].callback == spider.parse_serp + + +def test_metadata(): + actual_metadata = get_spider_metadata(SerpSpider, normalize=True) + expected_metadata = { + "template": True, + "title": "SERP", + "description": "Template for spiders that extract Google search results.", + "param_schema": { + "groups": [ + { + "description": ( + "Input data that determines the start URLs of the crawl." + ), + "id": "inputs", + "title": "Inputs", + "widget": "exclusive", + }, + ], + "properties": { + "url": { + "default": "", + "description": ( + "Initial URL for the crawl. Enter the full URL including http(s), " + "you can copy and paste it from your browser. Example: https://google.com/search?q=foo+bar" + ), + "exclusiveRequired": True, + "group": "inputs", + "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", + "title": "URL", + "type": "string", + }, + "urls": { + "anyOf": [ + {"items": {"type": "string"}, "type": "array"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Initial URLs for the crawl, separated by new lines. Enter the " + "full URL including http(s), you can copy and paste it from your " + "browser. Example: https://google.com/search?q=foo+bar" + ), + "exclusiveRequired": True, + "group": "inputs", + "title": "URLs", + "widget": "textarea", + }, + "urls_file": { + "default": "", + "description": ( + "URL that point to a plain-text file with a list of " + "URLs to crawl, e.g. " + "https://example.com/url-list.txt. The linked list " + "must contain 1 URL per line." + ), + "exclusiveRequired": True, + "group": "inputs", + "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", + "title": "URLs file", + "type": "string", + }, + "max_pages": { + "default": 1, + "description": "Maximum number of result pages to visit per input URL.", + "title": "Pages", + "type": "integer", + }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." 
+ ), + "title": "Max Requests", + "widget": "request-limit", + }, + }, + "title": "SerpSpiderParams", + "type": "object", + }, + } + assertEqualJson(actual_metadata, expected_metadata) + + +@pytest.mark.parametrize("url,allowed_domain", URL_TO_DOMAIN) +def test_set_allowed_domains(url, allowed_domain): + crawler = get_crawler() + + kwargs = {"url": url} + spider = SerpSpider.from_crawler(crawler, **kwargs) + assert spider.allowed_domains == [allowed_domain] + + +def test_input_none(): + crawler = get_crawler() + with pytest.raises(ValueError): + SerpSpider.from_crawler(crawler) + + +def test_input_multiple(): + crawler = get_crawler() + with pytest.raises(ValueError): + SerpSpider.from_crawler( + crawler, + url="https://google.com/search?q=a", + urls=["https://google.com/search?q=b"], + ) + with pytest.raises(ValueError): + SerpSpider.from_crawler( + crawler, + url="https://google.com/search?q=a", + urls_file="https://example.com/input-urls.txt", + ) + with pytest.raises(ValueError): + SerpSpider.from_crawler( + crawler, + urls=["https://google.com/search?q=b"], + urls_file="https://example.com/input-urls.txt", + ) + + +def test_url_invalid(): + crawler = get_crawler() + with pytest.raises(ValueError): + SerpSpider.from_crawler(crawler, url="foo") + + +def test_urls(caplog): + crawler = get_crawler() + url = "https://google.com/search?q=foo+bar" + + spider = SerpSpider.from_crawler(crawler, urls=[url]) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_serp + + spider = SerpSpider.from_crawler(crawler, urls=url) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_serp + + caplog.clear() + spider = SerpSpider.from_crawler( + crawler, + urls="https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\nfoo\n\n", + ) + assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text + start_requests = list(spider.start_requests()) + assert len(start_requests) == 3 + assert all(request.callback == spider.parse_serp for request in start_requests) + assert start_requests[0].url == "https://google.com/search?q=a" + assert start_requests[1].url == "https://google.com/search?q=b" + assert start_requests[2].url == "https://google.com/search?q=c" + + caplog.clear() + with pytest.raises(ValueError): + spider = SerpSpider.from_crawler( + crawler, + urls="foo\nbar", + ) + assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text + assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text + + +def test_urls_file(): + crawler = get_crawler() + url = "https://example.com/input-urls.txt" + + with patch("zyte_spider_templates.spiders.serp.requests.get") as mock_get: + response = requests.Response() + response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" + mock_get.return_value = response + spider = SerpSpider.from_crawler(crawler, urls_file=url) + mock_get.assert_called_with(url) + + start_requests = list(spider.start_requests()) + assert len(start_requests) == 3 + assert start_requests[0].url == "https://google.com/search?q=a" + assert start_requests[1].url == "https://google.com/search?q=b" + assert start_requests[2].url == "https://google.com/search?q=c" diff --git a/tests/utils.py b/tests/utils.py 
new file mode 100644 index 0000000..2fd7261 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,16 @@ +import json + + +def assertEqualJson(actual, expected): + """Compare the JSON representation of 2 Python objects. + + This allows to take into account things like the order of key-value pairs + in dictionaries, which would not be taken into account when comparing + dictionaries directly. + + It also generates a better diff in pytest output when enums are involved, + e.g. geolocation values. + """ + actual_json = json.dumps(actual, indent=2) + expected_json = json.dumps(expected, indent=2) + assert actual_json == expected_json diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index d9245a8..be87ea8 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -4,6 +4,7 @@ from logging import getLogger from typing import Dict, List, Optional, Union +import requests from pydantic import BaseModel, ConfigDict, Field, field_validator from zyte_spider_templates._geolocations import ( @@ -12,7 +13,7 @@ ) from zyte_spider_templates.documentation import document_enum -from .utils import _URL_PATTERN +from .utils import _URL_PATTERN, get_domain, load_url_list logger = getLogger(__name__) @@ -100,66 +101,93 @@ class UrlsFileParam(BaseModel): ) +def parse_input_params(spider): + urls_file = spider.args.urls_file + if urls_file: + response = requests.get(urls_file) + urls = load_url_list(response.text) + spider.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") + spider.start_urls = urls + elif spider.args.urls: + spider.start_urls = spider.args.urls + else: + spider.start_urls = [spider.args.url] + spider.allowed_domains = list(set(get_domain(url) for url in spider.start_urls)) + + +URL_FIELD_KWARGS = { + "title": "URL", + "description": ( + "Initial URL for the crawl. Enter the full URL including http(s), " + "you can copy and paste it from your browser. Example: " + "https://toscrape.com/" + ), + "pattern": _URL_PATTERN, + "default": "", + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + }, +} + + class UrlParam(BaseModel): - url: str = Field( - title="URL", - description="Initial URL for the crawl. Enter the full URL including http(s), " - "you can copy and paste it from your browser. Example: https://toscrape.com/", - pattern=_URL_PATTERN, - default="", - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - }, - ) + url: str = Field(**URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + + +URLS_FIELD_KWARGS = { + "title": "URLs", + "description": ( + "Initial URLs for the crawl, separated by new lines. Enter the " + "full URL including http(s), you can copy and paste it from your " + "browser. Example: https://toscrape.com/" + ), + "default": None, + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + "widget": "textarea", + }, +} + + +def validate_url_list(value: Union[List[str], str]) -> List[str]: + """Validate a list of URLs. + + If a string is received as input, it is split into multiple strings + on new lines. + + List items that do not match a URL pattern trigger a warning and are + removed from the list. If all URLs are invalid, validation fails. + """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + v = v.strip() + if not v: + continue + if not re.search(_URL_PATTERN, v): + logger.warning( + f"{v!r}, from the 'urls' spider argument, is not a " + f"valid URL and will be ignored." 
+ ) + continue + result.append(v) + if not result: + raise ValueError(f"No valid URL found in {value!r}") + return result class UrlsParam(BaseModel): - urls: Optional[List[str]] = Field( - title="URLs", - description=( - "Initial URLs for the crawl, separated by new lines. Enter the " - "full URL including http(s), you can copy and paste it from your " - "browser. Example: https://toscrape.com/" - ), - default=None, - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - "widget": "textarea", - }, - ) + urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] @field_validator("urls", mode="before") @classmethod def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: - """Validate a list of URLs. - - If a string is received as input, it is split into multiple strings - on new lines. - - List items that do not match a URL pattern trigger a warning and are - removed from the list. If all URLs are invalid, validation fails. - """ - if isinstance(value, str): - value = value.split("\n") - if not value: - return value - result = [] - for v in value: - v = v.strip() - if not v: - continue - if not re.search(_URL_PATTERN, v): - logger.warning( - f"{v!r}, from the 'urls' spider argument, is not a " - f"valid URL and will be ignored." - ) - continue - result.append(v) - if not result: - raise ValueError(f"No valid URL found in {value!r}") - return result + return validate_url_list(value) class PostalAddress(BaseModel): diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index bfcb672..025e174 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -1,7 +1,6 @@ from enum import Enum from typing import Any, Callable, Dict, Iterable, Optional, Union -import requests import scrapy from pydantic import BaseModel, Field from scrapy import Request @@ -11,6 +10,7 @@ from zyte_common_items import ProbabilityRequest, Product, ProductNavigation from zyte_spider_templates.heuristics import is_homepage +from zyte_spider_templates.params import parse_input_params from zyte_spider_templates.spiders.base import ( ARG_SETTING_PRIORITY, BaseSpider, @@ -19,7 +19,6 @@ from zyte_spider_templates.utils import get_domain from ..documentation import document_enum -from ..utils import load_url_list @document_enum @@ -126,23 +125,10 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs) - spider._init_input() + parse_input_params(spider) spider._init_extract_from() return spider - def _init_input(self): - urls_file = self.args.urls_file - if urls_file: - response = requests.get(urls_file) - urls = load_url_list(response.text) - self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") - self.start_urls = urls - elif self.args.urls: - self.start_urls = self.args.urls - else: - self.start_urls = [self.args.url] - self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) - def _init_extract_from(self): if self.args.extract_from is not None: self.settings.set( diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index d159312..45fb9c7 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,19 +1,24 @@ -from typing import Any, Dict, Iterable +from copy import deepcopy +from typing 
import Any, Dict, Iterable, List, Optional, Union -import requests import scrapy -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from scrapy import Request from scrapy.crawler import Crawler from scrapy_spider_metadata import Args from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp +from zyte_spider_templates.params import parse_input_params from zyte_spider_templates.spiders.base import BaseSpider -from zyte_spider_templates.utils import get_domain -from ..params import MaxRequestsParam, UrlParam, UrlsFileParam, UrlsParam -from ..utils import load_url_list +from ..params import ( + URL_FIELD_KWARGS, + URLS_FIELD_KWARGS, + MaxRequestsParam, + UrlsFileParam, + validate_url_list, +) from .base import _INPUT_FIELDS @@ -25,12 +30,39 @@ class SerpMaxPagesParam(BaseModel): ) +SERP_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) +assert isinstance(SERP_URL_FIELD_KWARGS["description"], str) +SERP_URL_FIELD_KWARGS["description"] = SERP_URL_FIELD_KWARGS["description"].replace( + "https://toscrape.com/", "https://google.com/search?q=foo+bar" +) + + +class SerpUrlParam(BaseModel): + url: str = Field(**SERP_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + + +SERP_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) +assert isinstance(SERP_URLS_FIELD_KWARGS["description"], str) +SERP_URLS_FIELD_KWARGS["description"] = SERP_URLS_FIELD_KWARGS["description"].replace( + "https://toscrape.com/", "https://google.com/search?q=foo+bar" +) + + +class SerpUrlsParam(BaseModel): + urls: Optional[List[str]] = Field(**SERP_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] + + @field_validator("urls", mode="before") + @classmethod + def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: + return validate_url_list(value) + + class SerpSpiderParams( MaxRequestsParam, SerpMaxPagesParam, UrlsFileParam, - UrlsParam, - UrlParam, + SerpUrlsParam, + SerpUrlParam, BaseModel, ): model_config = ConfigDict( @@ -53,9 +85,9 @@ class SerpSpiderParams( @model_validator(mode="after") def single_input(self): """Fields - :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.url` + :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams.url` and - :class:`~zyte_spider_templates.spiders.serp.EcommerceSpiderParams.urls_file` + :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams.urls_file` form a mandatory, mutually-exclusive field group: one of them must be defined, the rest must not be defined.""" input_fields = set( @@ -92,28 +124,15 @@ class SerpSpider(Args[SerpSpiderParams], BaseSpider): metadata: Dict[str, Any] = { **BaseSpider.metadata, "title": "SERP", - "description": "Template for spiders that extract search engine results.", + "description": "Template for spiders that extract Google search results.", } @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) - spider._init_input() + parse_input_params(spider) return spider - def _init_input(self): - urls_file = self.args.urls_file - if urls_file: - response = requests.get(urls_file) - urls = load_url_list(response.text) - self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") - self.start_urls = urls - elif self.args.urls: - self.start_urls = self.args.urls - else: - self.start_urls = [self.args.url] - self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) - def get_start_request(self, 
url): return Request( url=url, From ff97f077c54fa402a0afb2ab6ca1757322197007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 28 Aug 2024 21:02:16 +0200 Subject: [PATCH 04/19] Fix requests mocking --- tests/test_ecommerce.py | 2 +- tests/test_serp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 987d93f..c59cb18 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -755,7 +755,7 @@ def test_urls_file(): crawler = get_crawler() url = "https://example.com" - with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get: + with patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() response._content = ( b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" diff --git a/tests/test_serp.py b/tests/test_serp.py index beadc3d..fbe7f02 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -206,7 +206,7 @@ def test_urls_file(): crawler = get_crawler() url = "https://example.com/input-urls.txt" - with patch("zyte_spider_templates.spiders.serp.requests.get") as mock_get: + with patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" mock_get.return_value = response From 3a44330e95de397962a63cb26d68ea24523ffdce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 29 Aug 2024 14:03:28 +0200 Subject: [PATCH 05/19] Enable the aggressive retry policy by default for the SERP spider --- zyte_spider_templates/spiders/serp.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 45fb9c7..1dc0c33 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from scrapy import Request from scrapy.crawler import Crawler +from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings from scrapy_spider_metadata import Args from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp @@ -127,6 +128,20 @@ class SerpSpider(Args[SerpSpiderParams], BaseSpider): "description": "Template for spiders that extract Google search results.", } + @classmethod + def update_settings(cls, settings: BaseSettings) -> None: + super().update_settings(settings) + retry_policy_setting_priority = settings.getpriority("ZYTE_API_RETRY_POLICY") + if ( + retry_policy_setting_priority is None + or retry_policy_setting_priority < SETTINGS_PRIORITIES["spider"] + ): + settings.set( + "ZYTE_API_RETRY_POLICY", + "zyte_api.aggressive_retrying", + priority="spider", + ) + @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) From 1bc4a29357059662136286582bb9ea8f2c5d59c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Sep 2024 19:57:16 +0200 Subject: [PATCH 06/19] Make the SERP spider more Google-specific, in line with the current actual implementation --- docs/index.rst | 2 +- docs/reference/index.rst | 2 +- docs/templates/google-search.rst | 19 +++++++++++++ docs/templates/index.rst | 4 +-- docs/templates/serp.rst | 19 ------------- tests/test_serp.py | 40 +++++++++++++-------------- 
zyte_spider_templates/__init__.py | 2 +- zyte_spider_templates/spiders/serp.py | 14 +++++----- 8 files changed, 51 insertions(+), 51 deletions(-) create mode 100644 docs/templates/google-search.rst delete mode 100644 docs/templates/serp.rst diff --git a/docs/index.rst b/docs/index.rst index d26b6ca..1083299 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,7 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce - SERP + Google search .. toctree:: :caption: Customization diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 14a158e..dd368dd 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -9,7 +9,7 @@ Spiders .. autoclass:: zyte_spider_templates.EcommerceSpider -.. autoclass:: zyte_spider_templates.SerpSpider +.. autoclass:: zyte_spider_templates.GoogleSearchSpider Pages diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst new file mode 100644 index 0000000..2bf9a6b --- /dev/null +++ b/docs/templates/google-search.rst @@ -0,0 +1,19 @@ +.. _google-search: + +================================================= +Google search spider template (``google_search``) +================================================= + +Basic use +========= + +.. code-block:: shell + + scrapy crawl google_search -a url="https://www.google.com/search?q=foo" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams + :inherited-members: BaseModel + :exclude-members: model_computed_fields diff --git a/docs/templates/index.rst b/docs/templates/index.rst index 058187c..ea86c6d 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -30,5 +30,5 @@ Spider template list :ref:`E-commerce ` Get products from an e-commerce website. -:ref:`SERP ` - Get search engine result pages. +:ref:`Google Search ` + Get Google search results. diff --git a/docs/templates/serp.rst b/docs/templates/serp.rst deleted file mode 100644 index 496926f..0000000 --- a/docs/templates/serp.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. _serp: - -=============================== -SERP spider template (``serp``) -=============================== - -Basic use -========= - -.. code-block:: shell - - scrapy crawl serp -a url="https://www.google.com/search?q=foo" - -Parameters -========== - -.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpSpiderParams - :inherited-members: BaseModel - :exclude-members: model_computed_fields diff --git a/tests/test_serp.py b/tests/test_serp.py index fbe7f02..09d6e08 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -5,7 +5,7 @@ from pydantic import ValidationError from scrapy_spider_metadata import get_spider_metadata -from zyte_spider_templates.spiders.serp import SerpSpider +from zyte_spider_templates.spiders.serp import GoogleSearchSpider from . 
import get_crawler from .test_utils import URL_TO_DOMAIN @@ -14,19 +14,19 @@ def test_parameters(): with pytest.raises(ValidationError): - SerpSpider() + GoogleSearchSpider() - SerpSpider(url="https://google.com/search?q=foo+bar") - SerpSpider(url="https://google.com/search?q=foo+bar", max_pages=10) + GoogleSearchSpider(url="https://google.com/search?q=foo+bar") + GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages=10) with pytest.raises(ValidationError): - SerpSpider(url="https://google.com/search?q=foo+bar", max_pages="all") + GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages="all") def test_start_requests(): url = "https://google.com/search?q=foo+bar" crawler = get_crawler() - spider = SerpSpider.from_crawler(crawler, url=url) + spider = GoogleSearchSpider.from_crawler(crawler, url=url) requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == url @@ -34,10 +34,10 @@ def test_start_requests(): def test_metadata(): - actual_metadata = get_spider_metadata(SerpSpider, normalize=True) + actual_metadata = get_spider_metadata(GoogleSearchSpider, normalize=True) expected_metadata = { "template": True, - "title": "SERP", + "title": "Google Search Results", "description": "Template for spiders that extract Google search results.", "param_schema": { "groups": [ @@ -113,7 +113,7 @@ def test_metadata(): "widget": "request-limit", }, }, - "title": "SerpSpiderParams", + "title": "GoogleSearchSpiderParams", "type": "object", }, } @@ -125,32 +125,32 @@ def test_set_allowed_domains(url, allowed_domain): crawler = get_crawler() kwargs = {"url": url} - spider = SerpSpider.from_crawler(crawler, **kwargs) + spider = GoogleSearchSpider.from_crawler(crawler, **kwargs) assert spider.allowed_domains == [allowed_domain] def test_input_none(): crawler = get_crawler() with pytest.raises(ValueError): - SerpSpider.from_crawler(crawler) + GoogleSearchSpider.from_crawler(crawler) def test_input_multiple(): crawler = get_crawler() with pytest.raises(ValueError): - SerpSpider.from_crawler( + GoogleSearchSpider.from_crawler( crawler, url="https://google.com/search?q=a", urls=["https://google.com/search?q=b"], ) with pytest.raises(ValueError): - SerpSpider.from_crawler( + GoogleSearchSpider.from_crawler( crawler, url="https://google.com/search?q=a", urls_file="https://example.com/input-urls.txt", ) with pytest.raises(ValueError): - SerpSpider.from_crawler( + GoogleSearchSpider.from_crawler( crawler, urls=["https://google.com/search?q=b"], urls_file="https://example.com/input-urls.txt", @@ -160,27 +160,27 @@ def test_input_multiple(): def test_url_invalid(): crawler = get_crawler() with pytest.raises(ValueError): - SerpSpider.from_crawler(crawler, url="foo") + GoogleSearchSpider.from_crawler(crawler, url="foo") def test_urls(caplog): crawler = get_crawler() url = "https://google.com/search?q=foo+bar" - spider = SerpSpider.from_crawler(crawler, urls=[url]) + spider = GoogleSearchSpider.from_crawler(crawler, urls=[url]) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_serp - spider = SerpSpider.from_crawler(crawler, urls=url) + spider = GoogleSearchSpider.from_crawler(crawler, urls=url) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_serp caplog.clear() - spider = SerpSpider.from_crawler( + spider = 
GoogleSearchSpider.from_crawler( crawler, urls="https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\nfoo\n\n", ) @@ -194,7 +194,7 @@ def test_urls(caplog): caplog.clear() with pytest.raises(ValueError): - spider = SerpSpider.from_crawler( + spider = GoogleSearchSpider.from_crawler( crawler, urls="foo\nbar", ) @@ -210,7 +210,7 @@ def test_urls_file(): response = requests.Response() response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" mock_get.return_value = response - spider = SerpSpider.from_crawler(crawler, urls_file=url) + spider = GoogleSearchSpider.from_crawler(crawler, urls_file=url) mock_get.assert_called_with(url) start_requests = list(spider.start_requests()) diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index 6b6d292..75bfbde 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,3 +1,3 @@ from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider -from .spiders.serp import SerpSpider +from .spiders.serp import GoogleSearchSpider diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 9d093ea..857e6d5 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -57,7 +57,7 @@ def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: return validate_url_list(value) -class SerpSpiderParams( +class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, UrlsFileParam, @@ -76,20 +76,20 @@ class SerpSpiderParams( ) -class SerpSpider(Args[SerpSpiderParams], BaseSpider): - """Yield results from search engine result pages (SERP). +class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): + """Yield results from Google searches. - See :class:`~zyte_spider_templates.spiders.serp.SerpSpiderParams` + See :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams` for supported parameters. - .. seealso:: :ref:`serp`. + .. seealso:: :ref:`google-search`. 
""" - name = "serp" + name = "google_search" metadata: Dict[str, Any] = { **BaseSpider.metadata, - "title": "SERP", + "title": "Google Search Results", "description": "Template for spiders that extract Google search results.", } From 8f3ab3eb37275c5c6e65320f50184b534c4a0cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Sep 2024 20:28:49 +0200 Subject: [PATCH 07/19] Add a mandatory search keywords field, and set a default input URL --- zyte_spider_templates/spiders/serp.py | 61 ++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 857e6d5..bb46afb 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,5 +1,6 @@ from copy import deepcopy from typing import Any, Dict, Iterable, List, Optional, Union +from urllib.parse import urlparse, urlunparse import scrapy from pydantic import BaseModel, ConfigDict, Field, field_validator @@ -22,6 +23,35 @@ from .base import INPUT_GROUP, BaseSpider +class SearchKeywordsParam(BaseModel): + search_keywords: Optional[List[str]] = Field( + title="Search Keywords", + description=("Search keywords to use on the specified input Google URLs."), + default=None, + json_schema_extra={ + "widget": "textarea", + }, + ) + + @field_validator("search_keywords", mode="before") + @classmethod + def validate_search_keywords(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search keywords. + If a string is received as input, it is split into multiple strings + on new lines. + """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + if not (v := v.strip()): + continue + result.append(v) + return result + + class SerpMaxPagesParam(BaseModel): max_pages: int = Field( title="Pages", @@ -30,15 +60,16 @@ class SerpMaxPagesParam(BaseModel): ) -SERP_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) -assert isinstance(SERP_URL_FIELD_KWARGS["description"], str) -SERP_URL_FIELD_KWARGS["description"] = SERP_URL_FIELD_KWARGS["description"].replace( +GOOGLE_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) +assert isinstance(GOOGLE_URL_FIELD_KWARGS["description"], str) +GOOGLE_URL_FIELD_KWARGS["default"] = "https://www.google.com/" +GOOGLE_URL_FIELD_KWARGS["description"] = GOOGLE_URL_FIELD_KWARGS["description"].replace( "https://toscrape.com/", "https://google.com/search?q=foo+bar" ) -class SerpUrlParam(BaseModel): - url: str = Field(**SERP_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] +class GoogleUrlParam(BaseModel): + url: str = Field(**GOOGLE_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] SERP_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) @@ -60,9 +91,10 @@ def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, + SearchKeywordsParam, UrlsFileParam, SerpUrlsParam, - SerpUrlParam, + GoogleUrlParam, BaseModel, ): model_config = ConfigDict( @@ -126,11 +158,20 @@ def get_start_request(self, url): ) def start_requests(self) -> Iterable[Request]: + search_keywords = self.args.search_keywords + if not search_keywords: + raise ValueError("No search keywords specified.") + for url in self.start_urls: - for start in range(0, self.args.max_pages * 10, 10): - if start: - url = add_or_replace_parameter(url, "start", str(start)) - yield self.get_start_request(url) + url = urlunparse(urlparse(url)._replace(path="/search")) + for 
search_keyword in search_keywords: + search_url = add_or_replace_parameter(url, "q", search_keyword) + for start in range(0, self.args.max_pages * 10, 10): + if start: + search_url = add_or_replace_parameter( + search_url, "start", str(start) + ) + yield self.get_start_request(search_url) def parse_serp(self, response) -> Iterable[Serp]: yield Serp.from_dict(response.raw_api_response["serp"]) From b0786e6a8b443b3bd9fce63ea588131090ab00f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Sep 2024 21:20:19 +0200 Subject: [PATCH 08/19] Improve the SERP implementation, get all tests to pass --- docs/templates/google-search.rst | 2 +- tests/test_serp.py | 88 +++++++++++++++++---------- zyte_spider_templates/params.py | 31 +++++----- zyte_spider_templates/spiders/serp.py | 68 +++++++++++++++------ 4 files changed, 126 insertions(+), 63 deletions(-) diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst index 2bf9a6b..e8a9053 100644 --- a/docs/templates/google-search.rst +++ b/docs/templates/google-search.rst @@ -9,7 +9,7 @@ Basic use .. code-block:: shell - scrapy crawl google_search -a url="https://www.google.com/search?q=foo" + scrapy crawl google_search -a search_keywords="foo bar" Parameters ========== diff --git a/tests/test_serp.py b/tests/test_serp.py index 09d6e08..177ff53 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -16,20 +16,25 @@ def test_parameters(): with pytest.raises(ValidationError): GoogleSearchSpider() - GoogleSearchSpider(url="https://google.com/search?q=foo+bar") - GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages=10) + with pytest.raises(ValidationError): + GoogleSearchSpider(url="https://www.google.com/") + + GoogleSearchSpider(search_keywords="foo bar") + GoogleSearchSpider(url="https://www.google.cat/", search_keywords="foo bar") + GoogleSearchSpider( + url="https://www.google.cat/", search_keywords="foo bar", max_pages=10 + ) with pytest.raises(ValidationError): - GoogleSearchSpider(url="https://google.com/search?q=foo+bar", max_pages="all") + GoogleSearchSpider(search_keywords="foo bar", max_pages="all") def test_start_requests(): - url = "https://google.com/search?q=foo+bar" crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, url=url) + spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar") requests = list(spider.start_requests()) assert len(requests) == 1 - assert requests[0].url == url + assert requests[0].url == "https://www.google.com/search?q=foo+bar" assert requests[0].callback == spider.parse_serp @@ -52,10 +57,9 @@ def test_metadata(): ], "properties": { "url": { - "default": "", + "default": "https://www.google.com/", "description": ( - "Initial URL for the crawl. Enter the full URL including http(s), " - "you can copy and paste it from your browser. Example: https://google.com/search?q=foo+bar" + "Target Google URL. Defaults to https://www.google.com/." ), "exclusiveRequired": True, "group": "inputs", @@ -70,9 +74,7 @@ def test_metadata(): ], "default": None, "description": ( - "Initial URLs for the crawl, separated by new lines. Enter the " - "full URL including http(s), you can copy and paste it from your " - "browser. Example: https://google.com/search?q=foo+bar" + "Target Google URLs. Defaults to https://www.google.com/." ), "exclusiveRequired": True, "group": "inputs", @@ -83,9 +85,10 @@ def test_metadata(): "default": "", "description": ( "URL that point to a plain-text file with a list of " - "URLs to crawl, e.g. 
" + "target Google URLs, e.g. " "https://example.com/url-list.txt. The linked list " - "must contain 1 URL per line." + "must contain 1 Google URL (e.g. " + "https://www.google.com/) per line." ), "exclusiveRequired": True, "group": "inputs", @@ -93,6 +96,15 @@ def test_metadata(): "title": "URLs file", "type": "string", }, + "search_keywords": { + "anyOf": [ + {"items": {"type": "string"}, "type": "array"}, + {"type": "null"}, + ], + "description": "Search keywords to use on the specified input Google URLs.", + "title": "Search Keywords", + "widget": "textarea", + }, "max_pages": { "default": 1, "description": "Maximum number of result pages to visit per input URL.", @@ -113,6 +125,7 @@ def test_metadata(): "widget": "request-limit", }, }, + "required": ["search_keywords"], "title": "GoogleSearchSpiderParams", "type": "object", }, @@ -125,7 +138,9 @@ def test_set_allowed_domains(url, allowed_domain): crawler = get_crawler() kwargs = {"url": url} - spider = GoogleSearchSpider.from_crawler(crawler, **kwargs) + spider = GoogleSearchSpider.from_crawler( + crawler, **kwargs, search_keywords="foo bar" + ) assert spider.allowed_domains == [allowed_domain] @@ -140,20 +155,23 @@ def test_input_multiple(): with pytest.raises(ValueError): GoogleSearchSpider.from_crawler( crawler, - url="https://google.com/search?q=a", - urls=["https://google.com/search?q=b"], + url="https://www.google.com/search?q=a", + urls=["https://www.google.com/search?q=b"], + search_keywords="foo bar", ) with pytest.raises(ValueError): GoogleSearchSpider.from_crawler( crawler, - url="https://google.com/search?q=a", + url="https://www.google.com/search?q=a", urls_file="https://example.com/input-urls.txt", + search_keywords="foo bar", ) with pytest.raises(ValueError): GoogleSearchSpider.from_crawler( crawler, - urls=["https://google.com/search?q=b"], + urls=["https://www.google.com/search?q=b"], urls_file="https://example.com/input-urls.txt", + search_keywords="foo bar", ) @@ -165,15 +183,19 @@ def test_url_invalid(): def test_urls(caplog): crawler = get_crawler() - url = "https://google.com/search?q=foo+bar" + url = "https://www.google.com/search?q=foo+bar" - spider = GoogleSearchSpider.from_crawler(crawler, urls=[url]) + spider = GoogleSearchSpider.from_crawler( + crawler, urls=[url], search_keywords="foo bar" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_serp - spider = GoogleSearchSpider.from_crawler(crawler, urls=url) + spider = GoogleSearchSpider.from_crawler( + crawler, urls=url, search_keywords="foo bar" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url @@ -182,21 +204,23 @@ def test_urls(caplog): caplog.clear() spider = GoogleSearchSpider.from_crawler( crawler, - urls="https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\nfoo\n\n", + urls="https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\nfoo\n\n", + search_keywords="foo bar", ) assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text start_requests = list(spider.start_requests()) assert len(start_requests) == 3 assert all(request.callback == spider.parse_serp for request in start_requests) - assert start_requests[0].url == "https://google.com/search?q=a" - assert start_requests[1].url == "https://google.com/search?q=b" - assert start_requests[2].url == "https://google.com/search?q=c" + 
assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" + assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" + assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" caplog.clear() with pytest.raises(ValueError): spider = GoogleSearchSpider.from_crawler( crawler, urls="foo\nbar", + search_keywords="foo bar", ) assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text @@ -208,13 +232,15 @@ def test_urls_file(): with patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() - response._content = b"https://google.com/search?q=a\n \nhttps://google.com/search?q=b\nhttps://google.com/search?q=c\n\n" + response._content = b"https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\n\n" mock_get.return_value = response - spider = GoogleSearchSpider.from_crawler(crawler, urls_file=url) + spider = GoogleSearchSpider.from_crawler( + crawler, urls_file=url, search_keywords="foo bar" + ) mock_get.assert_called_with(url) start_requests = list(spider.start_requests()) assert len(start_requests) == 3 - assert start_requests[0].url == "https://google.com/search?q=a" - assert start_requests[1].url == "https://google.com/search?q=b" - assert start_requests[2].url == "https://google.com/search?q=c" + assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" + assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" + assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index f9d55de..030e5f1 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -120,21 +120,24 @@ def validate_input_group(model): return model +URLS_FILE_FIELD_KWARGS = { + "title": "URLs file", + "description": ( + "URL that point to a plain-text file with a list of URLs to " + "crawl, e.g. https://example.com/url-list.txt. The linked list " + "must contain 1 URL per line." + ), + "pattern": _URL_PATTERN, + "default": "", + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + }, +} + + class UrlsFileParam(BaseModel): - urls_file: str = Field( - title="URLs file", - description=( - "URL that point to a plain-text file with a list of URLs to " - "crawl, e.g. https://example.com/url-list.txt. The linked list " - "must contain 1 URL per line." 
- ), - pattern=_URL_PATTERN, - default="", - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - }, - ) + urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] @model_validator(mode="after") def input_group(self): diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index bb46afb..a57ebea 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -3,7 +3,7 @@ from urllib.parse import urlparse, urlunparse import scrapy -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from scrapy import Request from scrapy.crawler import Crawler from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings @@ -14,10 +14,11 @@ from zyte_spider_templates.params import parse_input_params from ..params import ( + INPUT_GROUP_FIELDS, URL_FIELD_KWARGS, URLS_FIELD_KWARGS, + URLS_FILE_FIELD_KWARGS, MaxRequestsParam, - UrlsFileParam, validate_url_list, ) from .base import INPUT_GROUP, BaseSpider @@ -27,7 +28,6 @@ class SearchKeywordsParam(BaseModel): search_keywords: Optional[List[str]] = Field( title="Search Keywords", description=("Search keywords to use on the specified input Google URLs."), - default=None, json_schema_extra={ "widget": "textarea", }, @@ -43,7 +43,7 @@ def validate_search_keywords(cls, value: Union[List[str], str]) -> List[str]: if isinstance(value, str): value = value.split("\n") if not value: - return value + raise ValueError("The search_keywords parameter value is missing or empty.") result = [] for v in value: if not (v := v.strip()): @@ -61,26 +61,24 @@ class SerpMaxPagesParam(BaseModel): GOOGLE_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) -assert isinstance(GOOGLE_URL_FIELD_KWARGS["description"], str) GOOGLE_URL_FIELD_KWARGS["default"] = "https://www.google.com/" -GOOGLE_URL_FIELD_KWARGS["description"] = GOOGLE_URL_FIELD_KWARGS["description"].replace( - "https://toscrape.com/", "https://google.com/search?q=foo+bar" -) +GOOGLE_URL_FIELD_KWARGS[ + "description" +] = "Target Google URL. Defaults to https://www.google.com/." class GoogleUrlParam(BaseModel): url: str = Field(**GOOGLE_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] -SERP_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) -assert isinstance(SERP_URLS_FIELD_KWARGS["description"], str) -SERP_URLS_FIELD_KWARGS["description"] = SERP_URLS_FIELD_KWARGS["description"].replace( - "https://toscrape.com/", "https://google.com/search?q=foo+bar" -) +GOOGLE_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) +GOOGLE_URLS_FIELD_KWARGS[ + "description" +] = "Target Google URLs. Defaults to https://www.google.com/." -class SerpUrlsParam(BaseModel): - urls: Optional[List[str]] = Field(**SERP_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] +class GoogleUrlsParam(BaseModel): + urls: Optional[List[str]] = Field(**GOOGLE_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] @field_validator("urls", mode="before") @classmethod @@ -88,12 +86,24 @@ def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: return validate_url_list(value) +GOOGLE_URLS_FILE_FIELD_KWARGS = deepcopy(URLS_FILE_FIELD_KWARGS) +GOOGLE_URLS_FILE_FIELD_KWARGS["description"] = ( + "URL that point to a plain-text file with a list of target Google URLs, " + "e.g. https://example.com/url-list.txt. The linked list must contain 1 " + "Google URL (e.g. https://www.google.com/) per line." 
+) + + +class GoogleUrlsFileParam(BaseModel): + urls_file: str = Field(**GOOGLE_URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] + + class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, SearchKeywordsParam, - UrlsFileParam, - SerpUrlsParam, + GoogleUrlsFileParam, + GoogleUrlsParam, GoogleUrlParam, BaseModel, ): @@ -107,6 +117,30 @@ class GoogleSearchSpiderParams( }, ) + @model_validator(mode="after") + def input_group(self): + input_fields = set( + field for field in INPUT_GROUP_FIELDS if getattr(self, field, None) + ) + if not input_fields: + input_field_list = ", ".join(INPUT_GROUP_FIELDS) + raise ValueError( + f"No input parameter defined. Please, define one of: " + f"{input_field_list}." + ) + elif ( + len(input_fields) > 1 + and getattr(self, "url", None) != GOOGLE_URL_FIELD_KWARGS["default"] + ): + input_field_list = ", ".join( + f"{field} ({getattr(self, field)!r})" for field in input_fields + ) + raise ValueError( + f"Expected a single input parameter, got {len(input_fields)}: " + f"{input_field_list}." + ) + return self + class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): """Yield results from Google searches. From bcf556620ac0e0bc1cc769f49fd1996cfddf4985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 13:20:11 +0200 Subject: [PATCH 09/19] Use a domain drop-down list --- .../spiders/_google_domains.py | 193 ++++++++++++++++++ zyte_spider_templates/spiders/serp.py | 125 ++---------- 2 files changed, 214 insertions(+), 104 deletions(-) create mode 100644 zyte_spider_templates/spiders/_google_domains.py diff --git a/zyte_spider_templates/spiders/_google_domains.py b/zyte_spider_templates/spiders/_google_domains.py new file mode 100644 index 0000000..b38d582 --- /dev/null +++ b/zyte_spider_templates/spiders/_google_domains.py @@ -0,0 +1,193 @@ +from enum import Enum + + +# https://www.google.com/supported_domains +# Sorted alphabetically, except for keeping the main domain first. 
+class GoogleDomain(str, Enum): + google_com: str = "google.com" + google_ad: str = "google.ad" + google_ae: str = "google.ae" + google_al: str = "google.al" + google_am: str = "google.am" + google_as: str = "google.as" + google_at: str = "google.at" + google_az: str = "google.az" + google_ba: str = "google.ba" + google_be: str = "google.be" + google_bf: str = "google.bf" + google_bg: str = "google.bg" + google_bi: str = "google.bi" + google_bj: str = "google.bj" + google_bs: str = "google.bs" + google_bt: str = "google.bt" + google_by: str = "google.by" + google_ca: str = "google.ca" + google_cat: str = "google.cat" + google_cd: str = "google.cd" + google_cf: str = "google.cf" + google_cg: str = "google.cg" + google_ch: str = "google.ch" + google_ci: str = "google.ci" + google_cl: str = "google.cl" + google_cm: str = "google.cm" + google_cn: str = "google.cn" + google_co_ao: str = "google.co.ao" + google_co_bw: str = "google.co.bw" + google_co_ck: str = "google.co.ck" + google_co_cr: str = "google.co.cr" + google_co_id: str = "google.co.id" + google_co_il: str = "google.co.il" + google_co_in: str = "google.co.in" + google_co_jp: str = "google.co.jp" + google_co_ke: str = "google.co.ke" + google_co_kr: str = "google.co.kr" + google_co_ls: str = "google.co.ls" + google_co_ma: str = "google.co.ma" + google_co_mz: str = "google.co.mz" + google_co_nz: str = "google.co.nz" + google_co_th: str = "google.co.th" + google_co_tz: str = "google.co.tz" + google_co_ug: str = "google.co.ug" + google_co_uk: str = "google.co.uk" + google_co_uz: str = "google.co.uz" + google_co_ve: str = "google.co.ve" + google_co_vi: str = "google.co.vi" + google_co_za: str = "google.co.za" + google_co_zm: str = "google.co.zm" + google_co_zw: str = "google.co.zw" + google_com_af: str = "google.com.af" + google_com_ag: str = "google.com.ag" + google_com_ar: str = "google.com.ar" + google_com_au: str = "google.com.au" + google_com_bd: str = "google.com.bd" + google_com_bh: str = "google.com.bh" + google_com_bn: str = "google.com.bn" + google_com_bo: str = "google.com.bo" + google_com_br: str = "google.com.br" + google_com_bz: str = "google.com.bz" + google_com_co: str = "google.com.co" + google_com_cu: str = "google.com.cu" + google_com_cy: str = "google.com.cy" + google_com_do: str = "google.com.do" + google_com_ec: str = "google.com.ec" + google_com_eg: str = "google.com.eg" + google_com_et: str = "google.com.et" + google_com_fj: str = "google.com.fj" + google_com_gh: str = "google.com.gh" + google_com_gi: str = "google.com.gi" + google_com_gt: str = "google.com.gt" + google_com_hk: str = "google.com.hk" + google_com_jm: str = "google.com.jm" + google_com_kh: str = "google.com.kh" + google_com_kw: str = "google.com.kw" + google_com_lb: str = "google.com.lb" + google_com_ly: str = "google.com.ly" + google_com_mm: str = "google.com.mm" + google_com_mt: str = "google.com.mt" + google_com_mx: str = "google.com.mx" + google_com_my: str = "google.com.my" + google_com_na: str = "google.com.na" + google_com_ng: str = "google.com.ng" + google_com_ni: str = "google.com.ni" + google_com_np: str = "google.com.np" + google_com_om: str = "google.com.om" + google_com_pa: str = "google.com.pa" + google_com_pe: str = "google.com.pe" + google_com_pg: str = "google.com.pg" + google_com_ph: str = "google.com.ph" + google_com_pk: str = "google.com.pk" + google_com_pr: str = "google.com.pr" + google_com_py: str = "google.com.py" + google_com_qa: str = "google.com.qa" + google_com_sa: str = "google.com.sa" + google_com_sb: str = "google.com.sb" 
+ google_com_sg: str = "google.com.sg" + google_com_sl: str = "google.com.sl" + google_com_sv: str = "google.com.sv" + google_com_tj: str = "google.com.tj" + google_com_tr: str = "google.com.tr" + google_com_tw: str = "google.com.tw" + google_com_ua: str = "google.com.ua" + google_com_uy: str = "google.com.uy" + google_com_vc: str = "google.com.vc" + google_com_vn: str = "google.com.vn" + google_cv: str = "google.cv" + google_cz: str = "google.cz" + google_de: str = "google.de" + google_dj: str = "google.dj" + google_dk: str = "google.dk" + google_dm: str = "google.dm" + google_dz: str = "google.dz" + google_ee: str = "google.ee" + google_es: str = "google.es" + google_fi: str = "google.fi" + google_fm: str = "google.fm" + google_fr: str = "google.fr" + google_ga: str = "google.ga" + google_ge: str = "google.ge" + google_gg: str = "google.gg" + google_gl: str = "google.gl" + google_gm: str = "google.gm" + google_gr: str = "google.gr" + google_gy: str = "google.gy" + google_hn: str = "google.hn" + google_hr: str = "google.hr" + google_ht: str = "google.ht" + google_hu: str = "google.hu" + google_ie: str = "google.ie" + google_im: str = "google.im" + google_iq: str = "google.iq" + google_is: str = "google.is" + google_it: str = "google.it" + google_je: str = "google.je" + google_jo: str = "google.jo" + google_kg: str = "google.kg" + google_ki: str = "google.ki" + google_kz: str = "google.kz" + google_la: str = "google.la" + google_li: str = "google.li" + google_lk: str = "google.lk" + google_lt: str = "google.lt" + google_lu: str = "google.lu" + google_lv: str = "google.lv" + google_md: str = "google.md" + google_me: str = "google.me" + google_mg: str = "google.mg" + google_mk: str = "google.mk" + google_ml: str = "google.ml" + google_mn: str = "google.mn" + google_mu: str = "google.mu" + google_mv: str = "google.mv" + google_mw: str = "google.mw" + google_ne: str = "google.ne" + google_nl: str = "google.nl" + google_no: str = "google.no" + google_nr: str = "google.nr" + google_nu: str = "google.nu" + google_pl: str = "google.pl" + google_pn: str = "google.pn" + google_ps: str = "google.ps" + google_pt: str = "google.pt" + google_ro: str = "google.ro" + google_rs: str = "google.rs" + google_ru: str = "google.ru" + google_rw: str = "google.rw" + google_sc: str = "google.sc" + google_se: str = "google.se" + google_sh: str = "google.sh" + google_si: str = "google.si" + google_sk: str = "google.sk" + google_sm: str = "google.sm" + google_sn: str = "google.sn" + google_so: str = "google.so" + google_sr: str = "google.sr" + google_st: str = "google.st" + google_td: str = "google.td" + google_tg: str = "google.tg" + google_tl: str = "google.tl" + google_tm: str = "google.tm" + google_tn: str = "google.tn" + google_to: str = "google.to" + google_tt: str = "google.tt" + google_vu: str = "google.vu" + google_ws: str = "google.ws" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index a57ebea..56d6049 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,27 +1,15 @@ -from copy import deepcopy from typing import Any, Dict, Iterable, List, Optional, Union -from urllib.parse import urlparse, urlunparse -import scrapy -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import BaseModel, Field, field_validator from scrapy import Request -from scrapy.crawler import Crawler from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings from scrapy_spider_metadata import Args 
from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp -from zyte_spider_templates.params import parse_input_params - -from ..params import ( - INPUT_GROUP_FIELDS, - URL_FIELD_KWARGS, - URLS_FIELD_KWARGS, - URLS_FILE_FIELD_KWARGS, - MaxRequestsParam, - validate_url_list, -) -from .base import INPUT_GROUP, BaseSpider +from ..params import MaxRequestsParam +from ._google_domains import GoogleDomain +from .base import BaseSpider class SearchKeywordsParam(BaseModel): @@ -60,86 +48,22 @@ class SerpMaxPagesParam(BaseModel): ) -GOOGLE_URL_FIELD_KWARGS = deepcopy(URL_FIELD_KWARGS) -GOOGLE_URL_FIELD_KWARGS["default"] = "https://www.google.com/" -GOOGLE_URL_FIELD_KWARGS[ - "description" -] = "Target Google URL. Defaults to https://www.google.com/." - - -class GoogleUrlParam(BaseModel): - url: str = Field(**GOOGLE_URL_FIELD_KWARGS) # type: ignore[misc, arg-type] - - -GOOGLE_URLS_FIELD_KWARGS = deepcopy(URLS_FIELD_KWARGS) -GOOGLE_URLS_FIELD_KWARGS[ - "description" -] = "Target Google URLs. Defaults to https://www.google.com/." - - -class GoogleUrlsParam(BaseModel): - urls: Optional[List[str]] = Field(**GOOGLE_URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] - - @field_validator("urls", mode="before") - @classmethod - def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: - return validate_url_list(value) - - -GOOGLE_URLS_FILE_FIELD_KWARGS = deepcopy(URLS_FILE_FIELD_KWARGS) -GOOGLE_URLS_FILE_FIELD_KWARGS["description"] = ( - "URL that point to a plain-text file with a list of target Google URLs, " - "e.g. https://example.com/url-list.txt. The linked list must contain 1 " - "Google URL (e.g. https://www.google.com/) per line." -) - - -class GoogleUrlsFileParam(BaseModel): - urls_file: str = Field(**GOOGLE_URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] +class GoogleDomainParam(BaseModel): + domain: GoogleDomain = Field( + title="Domain", + description="Target Google domain.", + default=GoogleDomain.google_com, + ) class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, SearchKeywordsParam, - GoogleUrlsFileParam, - GoogleUrlsParam, - GoogleUrlParam, + GoogleDomainParam, BaseModel, ): - model_config = ConfigDict( - # https://github.com/pydantic/pydantic/discussions/7763#discussioncomment-10338857 - protected_namespaces=(), - json_schema_extra={ - "groups": [ - INPUT_GROUP, - ], - }, - ) - - @model_validator(mode="after") - def input_group(self): - input_fields = set( - field for field in INPUT_GROUP_FIELDS if getattr(self, field, None) - ) - if not input_fields: - input_field_list = ", ".join(INPUT_GROUP_FIELDS) - raise ValueError( - f"No input parameter defined. Please, define one of: " - f"{input_field_list}." - ) - elif ( - len(input_fields) > 1 - and getattr(self, "url", None) != GOOGLE_URL_FIELD_KWARGS["default"] - ): - input_field_list = ", ".join( - f"{field} ({getattr(self, field)!r})" for field in input_fields - ) - raise ValueError( - f"Expected a single input parameter, got {len(input_fields)}: " - f"{input_field_list}." 
- ) - return self + pass class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): @@ -173,12 +97,6 @@ def update_settings(cls, settings: BaseSettings) -> None: priority="spider", ) - @classmethod - def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: - spider = super().from_crawler(crawler, *args, **kwargs) - parse_input_params(spider) - return spider - def get_start_request(self, url): return Request( url=url, @@ -196,16 +114,15 @@ def start_requests(self) -> Iterable[Request]: if not search_keywords: raise ValueError("No search keywords specified.") - for url in self.start_urls: - url = urlunparse(urlparse(url)._replace(path="/search")) - for search_keyword in search_keywords: - search_url = add_or_replace_parameter(url, "q", search_keyword) - for start in range(0, self.args.max_pages * 10, 10): - if start: - search_url = add_or_replace_parameter( - search_url, "start", str(start) - ) - yield self.get_start_request(search_url) + url = f"https://www.{self.args.domain.value}/search" + for search_keyword in search_keywords: + search_url = add_or_replace_parameter(url, "q", search_keyword) + for start in range(0, self.args.max_pages * 10, 10): + if start: + search_url = add_or_replace_parameter( + search_url, "start", str(start) + ) + yield self.get_start_request(search_url) def parse_serp(self, response) -> Iterable[Serp]: yield Serp.from_dict(response.raw_api_response["serp"]) From 89c1b7f459a339faa30ffd0844a8c04b0d874977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 13:37:39 +0200 Subject: [PATCH 10/19] Improve the search_keywords tooltip and update tests --- tests/test_serp.py | 381 +++++++++++++++----------- zyte_spider_templates/spiders/serp.py | 5 +- 2 files changed, 229 insertions(+), 157 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 177ff53..99be79e 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -1,14 +1,10 @@ -from unittest.mock import patch - import pytest -import requests from pydantic import ValidationError from scrapy_spider_metadata import get_spider_metadata from zyte_spider_templates.spiders.serp import GoogleSearchSpider from . import get_crawler -from .test_utils import URL_TO_DOMAIN from .utils import assertEqualJson @@ -17,13 +13,14 @@ def test_parameters(): GoogleSearchSpider() with pytest.raises(ValidationError): - GoogleSearchSpider(url="https://www.google.com/") + GoogleSearchSpider(domain="google.com") GoogleSearchSpider(search_keywords="foo bar") - GoogleSearchSpider(url="https://www.google.cat/", search_keywords="foo bar") - GoogleSearchSpider( - url="https://www.google.cat/", search_keywords="foo bar", max_pages=10 - ) + GoogleSearchSpider(domain="google.cat", search_keywords="foo bar") + GoogleSearchSpider(domain="google.cat", search_keywords="foo bar", max_pages=10) + + with pytest.raises(ValidationError): + GoogleSearchSpider(domain="google.foo", search_keywords="foo bar") with pytest.raises(ValidationError): GoogleSearchSpider(search_keywords="foo bar", max_pages="all") @@ -45,55 +42,200 @@ def test_metadata(): "title": "Google Search Results", "description": "Template for spiders that extract Google search results.", "param_schema": { - "groups": [ - { - "description": ( - "Input data that determines the start URLs of the crawl." - ), - "id": "inputs", - "title": "Inputs", - "widget": "exclusive", - }, - ], "properties": { - "url": { - "default": "https://www.google.com/", - "description": ( - "Target Google URL. 
Defaults to https://www.google.com/." - ), - "exclusiveRequired": True, - "group": "inputs", - "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", - "title": "URL", - "type": "string", - }, - "urls": { - "anyOf": [ - {"items": {"type": "string"}, "type": "array"}, - {"type": "null"}, + "domain": { + "default": "google.com", + "description": "Target Google domain.", + "title": "Domain", + "enum": [ + "google.com", + "google.ad", + "google.ae", + "google.al", + "google.am", + "google.as", + "google.at", + "google.az", + "google.ba", + "google.be", + "google.bf", + "google.bg", + "google.bi", + "google.bj", + "google.bs", + "google.bt", + "google.by", + "google.ca", + "google.cat", + "google.cd", + "google.cf", + "google.cg", + "google.ch", + "google.ci", + "google.cl", + "google.cm", + "google.cn", + "google.co.ao", + "google.co.bw", + "google.co.ck", + "google.co.cr", + "google.co.id", + "google.co.il", + "google.co.in", + "google.co.jp", + "google.co.ke", + "google.co.kr", + "google.co.ls", + "google.co.ma", + "google.co.mz", + "google.co.nz", + "google.co.th", + "google.co.tz", + "google.co.ug", + "google.co.uk", + "google.co.uz", + "google.co.ve", + "google.co.vi", + "google.co.za", + "google.co.zm", + "google.co.zw", + "google.com.af", + "google.com.ag", + "google.com.ar", + "google.com.au", + "google.com.bd", + "google.com.bh", + "google.com.bn", + "google.com.bo", + "google.com.br", + "google.com.bz", + "google.com.co", + "google.com.cu", + "google.com.cy", + "google.com.do", + "google.com.ec", + "google.com.eg", + "google.com.et", + "google.com.fj", + "google.com.gh", + "google.com.gi", + "google.com.gt", + "google.com.hk", + "google.com.jm", + "google.com.kh", + "google.com.kw", + "google.com.lb", + "google.com.ly", + "google.com.mm", + "google.com.mt", + "google.com.mx", + "google.com.my", + "google.com.na", + "google.com.ng", + "google.com.ni", + "google.com.np", + "google.com.om", + "google.com.pa", + "google.com.pe", + "google.com.pg", + "google.com.ph", + "google.com.pk", + "google.com.pr", + "google.com.py", + "google.com.qa", + "google.com.sa", + "google.com.sb", + "google.com.sg", + "google.com.sl", + "google.com.sv", + "google.com.tj", + "google.com.tr", + "google.com.tw", + "google.com.ua", + "google.com.uy", + "google.com.vc", + "google.com.vn", + "google.cv", + "google.cz", + "google.de", + "google.dj", + "google.dk", + "google.dm", + "google.dz", + "google.ee", + "google.es", + "google.fi", + "google.fm", + "google.fr", + "google.ga", + "google.ge", + "google.gg", + "google.gl", + "google.gm", + "google.gr", + "google.gy", + "google.hn", + "google.hr", + "google.ht", + "google.hu", + "google.ie", + "google.im", + "google.iq", + "google.is", + "google.it", + "google.je", + "google.jo", + "google.kg", + "google.ki", + "google.kz", + "google.la", + "google.li", + "google.lk", + "google.lt", + "google.lu", + "google.lv", + "google.md", + "google.me", + "google.mg", + "google.mk", + "google.ml", + "google.mn", + "google.mu", + "google.mv", + "google.mw", + "google.ne", + "google.nl", + "google.no", + "google.nr", + "google.nu", + "google.pl", + "google.pn", + "google.ps", + "google.pt", + "google.ro", + "google.rs", + "google.ru", + "google.rw", + "google.sc", + "google.se", + "google.sh", + "google.si", + "google.sk", + "google.sm", + "google.sn", + "google.so", + "google.sr", + "google.st", + "google.td", + "google.tg", + "google.tl", + "google.tm", + "google.tn", + "google.to", + "google.tt", + "google.vu", + "google.ws", ], - "default": None, - 
"description": ( - "Target Google URLs. Defaults to https://www.google.com/." - ), - "exclusiveRequired": True, - "group": "inputs", - "title": "URLs", - "widget": "textarea", - }, - "urls_file": { - "default": "", - "description": ( - "URL that point to a plain-text file with a list of " - "target Google URLs, e.g. " - "https://example.com/url-list.txt. The linked list " - "must contain 1 Google URL (e.g. " - "https://www.google.com/) per line." - ), - "exclusiveRequired": True, - "group": "inputs", - "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", - "title": "URLs file", "type": "string", }, "search_keywords": { @@ -101,7 +243,11 @@ def test_metadata(): {"items": {"type": "string"}, "type": "array"}, {"type": "null"}, ], - "description": "Search keywords to use on the specified input Google URLs.", + "description": ( + "Keywords to search for. Use multiple lines to " + "trigger multiple searches for different search " + "keywords." + ), "title": "Search Keywords", "widget": "textarea", }, @@ -133,114 +279,37 @@ def test_metadata(): assertEqualJson(actual_metadata, expected_metadata) -@pytest.mark.parametrize("url,allowed_domain", URL_TO_DOMAIN) -def test_set_allowed_domains(url, allowed_domain): - crawler = get_crawler() - - kwargs = {"url": url} - spider = GoogleSearchSpider.from_crawler( - crawler, **kwargs, search_keywords="foo bar" - ) - assert spider.allowed_domains == [allowed_domain] - - def test_input_none(): crawler = get_crawler() with pytest.raises(ValueError): GoogleSearchSpider.from_crawler(crawler) -def test_input_multiple(): - crawler = get_crawler() - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler( - crawler, - url="https://www.google.com/search?q=a", - urls=["https://www.google.com/search?q=b"], - search_keywords="foo bar", - ) - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler( - crawler, - url="https://www.google.com/search?q=a", - urls_file="https://example.com/input-urls.txt", - search_keywords="foo bar", - ) - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler( - crawler, - urls=["https://www.google.com/search?q=b"], - urls_file="https://example.com/input-urls.txt", - search_keywords="foo bar", - ) - - -def test_url_invalid(): - crawler = get_crawler() - with pytest.raises(ValueError): - GoogleSearchSpider.from_crawler(crawler, url="foo") - - -def test_urls(caplog): +@pytest.mark.parametrize( + ("input_domain", "expected_domain"), + ( + (None, "google.com"), + ("google.com", "google.com"), + ("google.cat", "google.cat"), + ), +) +def test_domain(input_domain, expected_domain): crawler = get_crawler() - url = "https://www.google.com/search?q=foo+bar" - - spider = GoogleSearchSpider.from_crawler( - crawler, urls=[url], search_keywords="foo bar" - ) - start_requests = list(spider.start_requests()) - assert len(start_requests) == 1 - assert start_requests[0].url == url - assert start_requests[0].callback == spider.parse_serp - - spider = GoogleSearchSpider.from_crawler( - crawler, urls=url, search_keywords="foo bar" - ) - start_requests = list(spider.start_requests()) - assert len(start_requests) == 1 - assert start_requests[0].url == url - assert start_requests[0].callback == spider.parse_serp - - caplog.clear() + kwargs = {} + if input_domain: + kwargs["domain"] = input_domain spider = GoogleSearchSpider.from_crawler( - crawler, - urls="https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\nfoo\n\n", - search_keywords="foo bar", + crawler, search_keywords="foo bar", **kwargs ) - 
assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text - start_requests = list(spider.start_requests()) - assert len(start_requests) == 3 - assert all(request.callback == spider.parse_serp for request in start_requests) - assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" - assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" - assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" - - caplog.clear() - with pytest.raises(ValueError): - spider = GoogleSearchSpider.from_crawler( - crawler, - urls="foo\nbar", - search_keywords="foo bar", - ) - assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text - assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == f"https://www.{expected_domain}/search?q=foo+bar" -def test_urls_file(): +def test_search_keywords(): crawler = get_crawler() - url = "https://example.com/input-urls.txt" - - with patch("zyte_spider_templates.params.requests.get") as mock_get: - response = requests.Response() - response._content = b"https://www.google.com/\n \nhttps://www.google.cat/\nhttps://www.google.ie/\n\n" - mock_get.return_value = response - spider = GoogleSearchSpider.from_crawler( - crawler, urls_file=url, search_keywords="foo bar" - ) - mock_get.assert_called_with(url) - - start_requests = list(spider.start_requests()) - assert len(start_requests) == 3 - assert start_requests[0].url == "https://www.google.com/search?q=foo+bar" - assert start_requests[1].url == "https://www.google.cat/search?q=foo+bar" - assert start_requests[2].url == "https://www.google.ie/search?q=foo+bar" + spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar\nbaz") + requests = list(spider.start_requests()) + assert len(requests) == 2 + assert requests[0].url == "https://www.google.com/search?q=foo+bar" + assert requests[1].url == "https://www.google.com/search?q=baz" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 56d6049..0d22d41 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -15,7 +15,10 @@ class SearchKeywordsParam(BaseModel): search_keywords: Optional[List[str]] = Field( title="Search Keywords", - description=("Search keywords to use on the specified input Google URLs."), + description=( + "Keywords to search for. Use multiple lines to trigger multiple " + "searches for different search keywords." + ), json_schema_extra={ "widget": "textarea", }, From c3a2f2369ce34aa1041fffd60a066396b77b59f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 13:57:57 +0200 Subject: [PATCH 11/19] =?UTF-8?q?search=20keywords=20=E2=86=92=20search=20?= =?UTF-8?q?queries?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/templates/google-search.rst | 2 +- tests/test_serp.py | 29 +++++++++++++------------- zyte_spider_templates/spiders/serp.py | 30 +++++++++++++-------------- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst index e8a9053..a8ba77c 100644 --- a/docs/templates/google-search.rst +++ b/docs/templates/google-search.rst @@ -9,7 +9,7 @@ Basic use .. 
code-block:: shell - scrapy crawl google_search -a search_keywords="foo bar" + scrapy crawl google_search -a search_queries="foo bar" Parameters ========== diff --git a/tests/test_serp.py b/tests/test_serp.py index 99be79e..90472ad 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -15,20 +15,20 @@ def test_parameters(): with pytest.raises(ValidationError): GoogleSearchSpider(domain="google.com") - GoogleSearchSpider(search_keywords="foo bar") - GoogleSearchSpider(domain="google.cat", search_keywords="foo bar") - GoogleSearchSpider(domain="google.cat", search_keywords="foo bar", max_pages=10) + GoogleSearchSpider(search_queries="foo bar") + GoogleSearchSpider(domain="google.cat", search_queries="foo bar") + GoogleSearchSpider(domain="google.cat", search_queries="foo bar", max_pages=10) with pytest.raises(ValidationError): - GoogleSearchSpider(domain="google.foo", search_keywords="foo bar") + GoogleSearchSpider(domain="google.foo", search_queries="foo bar") with pytest.raises(ValidationError): - GoogleSearchSpider(search_keywords="foo bar", max_pages="all") + GoogleSearchSpider(search_queries="foo bar", max_pages="all") def test_start_requests(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar") + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == "https://www.google.com/search?q=foo+bar" @@ -238,17 +238,16 @@ def test_metadata(): ], "type": "string", }, - "search_keywords": { + "search_queries": { "anyOf": [ {"items": {"type": "string"}, "type": "array"}, {"type": "null"}, ], "description": ( - "Keywords to search for. Use multiple lines to " - "trigger multiple searches for different search " - "keywords." + "Input 1 search query per line. A search query is a " + "string of search keywords (e.g. foo bar)." ), - "title": "Search Keywords", + "title": "Search Queries", "widget": "textarea", }, "max_pages": { @@ -271,7 +270,7 @@ def test_metadata(): "widget": "request-limit", }, }, - "required": ["search_keywords"], + "required": ["search_queries"], "title": "GoogleSearchSpiderParams", "type": "object", }, @@ -299,16 +298,16 @@ def test_domain(input_domain, expected_domain): if input_domain: kwargs["domain"] = input_domain spider = GoogleSearchSpider.from_crawler( - crawler, search_keywords="foo bar", **kwargs + crawler, search_queries="foo bar", **kwargs ) requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == f"https://www.{expected_domain}/search?q=foo+bar" -def test_search_keywords(): +def test_search_queries(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_keywords="foo bar\nbaz") + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar\nbaz") requests = list(spider.start_requests()) assert len(requests) == 2 assert requests[0].url == "https://www.google.com/search?q=foo+bar" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 0d22d41..8181607 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -12,29 +12,29 @@ from .base import BaseSpider -class SearchKeywordsParam(BaseModel): - search_keywords: Optional[List[str]] = Field( - title="Search Keywords", +class SearchQueriesParam(BaseModel): + search_queries: Optional[List[str]] = Field( + title="Search Queries", description=( - "Keywords to search for. 
Use multiple lines to trigger multiple " - "searches for different search keywords." + "Input 1 search query per line. A search query is a string of " + "search keywords (e.g. foo bar)." ), json_schema_extra={ "widget": "textarea", }, ) - @field_validator("search_keywords", mode="before") + @field_validator("search_queries", mode="before") @classmethod - def validate_search_keywords(cls, value: Union[List[str], str]) -> List[str]: - """Validate a list of search keywords. + def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search queries. If a string is received as input, it is split into multiple strings on new lines. """ if isinstance(value, str): value = value.split("\n") if not value: - raise ValueError("The search_keywords parameter value is missing or empty.") + raise ValueError("The search_queries parameter value is missing or empty.") result = [] for v in value: if not (v := v.strip()): @@ -62,7 +62,7 @@ class GoogleDomainParam(BaseModel): class GoogleSearchSpiderParams( MaxRequestsParam, SerpMaxPagesParam, - SearchKeywordsParam, + SearchQueriesParam, GoogleDomainParam, BaseModel, ): @@ -113,13 +113,13 @@ def get_start_request(self, url): ) def start_requests(self) -> Iterable[Request]: - search_keywords = self.args.search_keywords - if not search_keywords: - raise ValueError("No search keywords specified.") + search_queries = self.args.search_queries + if not search_queries: + raise ValueError("No search queries specified.") url = f"https://www.{self.args.domain.value}/search" - for search_keyword in search_keywords: - search_url = add_or_replace_parameter(url, "q", search_keyword) + for search_query in search_queries: + search_url = add_or_replace_parameter(url, "q", search_query) for start in range(0, self.args.max_pages * 10, 10): if start: search_url = add_or_replace_parameter( From 25d4cf71cac4c204df1516a92a6ceeb2a2ba65a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 14:15:45 +0200 Subject: [PATCH 12/19] Fix metadata JSON schema comparison --- tests/test_ecommerce.py | 6 +++--- tests/test_serp.py | 4 ++-- tests/utils.py | 17 ++++++++++------- zyte_spider_templates/spiders/ecommerce.py | 2 +- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index c7bb88b..adf6487 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -21,7 +21,7 @@ from . import get_crawler from .test_utils import URL_TO_DOMAIN -from .utils import assertEqualJson +from .utils import assertEqualSpiderMetadata def test_parameters(): @@ -463,7 +463,7 @@ def test_metadata(): "title": "Pagination Only", }, }, - "title": "Crawl strategy", + "title": "Crawl Strategy", "enum": [ "automatic", "full", @@ -533,7 +533,7 @@ def test_metadata(): "type": "object", }, } - assertEqualJson(actual_metadata, expected_metadata) + assertEqualSpiderMetadata(actual_metadata, expected_metadata) geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] assert geolocation["enum"][0] == "AF" diff --git a/tests/test_serp.py b/tests/test_serp.py index 90472ad..3ba9dd7 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -5,7 +5,7 @@ from zyte_spider_templates.spiders.serp import GoogleSearchSpider from . 
import get_crawler -from .utils import assertEqualJson +from .utils import assertEqualSpiderMetadata def test_parameters(): @@ -275,7 +275,7 @@ def test_metadata(): "type": "object", }, } - assertEqualJson(actual_metadata, expected_metadata) + assertEqualSpiderMetadata(actual_metadata, expected_metadata) def test_input_none(): diff --git a/tests/utils.py b/tests/utils.py index 2fd7261..c18cb9b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,16 +1,19 @@ import json -def assertEqualJson(actual, expected): - """Compare the JSON representation of 2 Python objects. +def assertEqualSpiderMetadata(actual, expected): + """Compare 2 JSON schemas of spider metadata. - This allows to take into account things like the order of key-value pairs - in dictionaries, which would not be taken into account when comparing - dictionaries directly. + The parameter order in the parameter schema is taken into account, given + how it affects the UI, while the order of other object keys may be + different. It also generates a better diff in pytest output when enums are involved, e.g. geolocation values. """ - actual_json = json.dumps(actual, indent=2) - expected_json = json.dumps(expected, indent=2) + assert tuple(actual["param_schema"]["properties"]) == tuple( + expected["param_schema"]["properties"] + ) + actual_json = json.dumps(actual, indent=2, sort_keys=True) + expected_json = json.dumps(expected, indent=2, sort_keys=True) assert actual_json == expected_json diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 0a1aa6f..3868649 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -61,7 +61,7 @@ class EcommerceCrawlStrategy(str, Enum): class EcommerceCrawlStrategyParam(BaseModel): crawl_strategy: EcommerceCrawlStrategy = Field( - title="Crawl strategy", + title="Crawl Strategy", description="Determines how the start URL and follow-up URLs are crawled.", default=EcommerceCrawlStrategy.automatic, json_schema_extra={ From d8fe94c30147877f8545a95525946287bb71f9b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 14:24:19 +0200 Subject: [PATCH 13/19] =?UTF-8?q?Min=20zyte-common-items:=200.13.0=20?= =?UTF-8?q?=E2=86=92=200.22.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.rst | 6 ++++++ setup.py | 3 +-- tox.ini | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index abce60b..7e92b08 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +Unreleased +---------- + +* Now requires ``zyte-common-items >= 0.22.0``. 
+ + 0.8.0 (2024-08-21) ------------------ diff --git a/setup.py b/setup.py index de219a4..e5f8e9b 100644 --- a/setup.py +++ b/setup.py @@ -18,8 +18,7 @@ "scrapy-poet>=0.21.0", "scrapy-spider-metadata>=0.1.2", "scrapy-zyte-api[provider]>=0.16.0", - # "zyte-common-items>=0.13.0", - "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@serp", + "zyte-common-items>=0.22.0", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tox.ini b/tox.ini index ce4287d..a88f936 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ deps = scrapy-poet==0.21.0 scrapy-spider-metadata==0.1.2 scrapy-zyte-api[provider]==0.16.0 - zyte-common-items==0.13.0 + zyte-common-items==0.22.0 [testenv:mypy] deps = From b94b7b44122413cc42561d2fa93dbede5a183568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 10 Sep 2024 15:58:32 +0200 Subject: [PATCH 14/19] Remove potentially confusing search keyword references --- tests/test_serp.py | 5 +---- zyte_spider_templates/spiders/serp.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 3ba9dd7..2ea229d 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -243,10 +243,7 @@ def test_metadata(): {"items": {"type": "string"}, "type": "array"}, {"type": "null"}, ], - "description": ( - "Input 1 search query per line. A search query is a " - "string of search keywords (e.g. foo bar)." - ), + "description": "Input 1 search query per line (e.g. foo bar).", "title": "Search Queries", "widget": "textarea", }, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 8181607..e942964 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -15,10 +15,7 @@ class SearchQueriesParam(BaseModel): search_queries: Optional[List[str]] = Field( title="Search Queries", - description=( - "Input 1 search query per line. A search query is a string of " - "search keywords (e.g. foo bar)." - ), + description="Input 1 search query per line (e.g. foo bar).", json_schema_extra={ "widget": "textarea", }, From d7b724aa154ec71d3aa977574a8e770471551672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 00:13:24 +0200 Subject: [PATCH 15/19] Make crawl logging more flexible for new page types --- zyte_spider_templates/middlewares.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 5a40872..68e4987 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -1,6 +1,7 @@ import json import logging import warnings +from collections import defaultdict from datetime import datetime from typing import Any, Dict from warnings import warn @@ -28,6 +29,9 @@ class CrawlingLogsMiddleware: the fingerprints logged in Scrapy Cloud's request data. """ + # Deprecated in practice, but there is no good way to deprecate it, since + # class properties that also work for class instances are not a thing. 
+ # https://stackoverflow.com/q/128573 valid_page_types = [ "product", "nextPage", @@ -35,6 +39,7 @@ class CrawlingLogsMiddleware: "productNavigation", "productNavigation-heuristics", ] + unknown_page_type = "unknown" @classmethod @@ -82,12 +87,9 @@ def crawl_logs(self, response, result): "probability" ), }, - "to_crawl": {}, + "to_crawl": defaultdict(list), } - for page_type in self.valid_page_types + [self.unknown_page_type]: - data["to_crawl"][page_type] = [] - if result: for entry in result: if not isinstance(entry, Request): @@ -104,14 +106,17 @@ def crawl_logs(self, response, result): ) page_type = crawling_logs.get("page_type") - if page_type not in self.valid_page_types: + if not page_type: page_type = self.unknown_page_type data["to_crawl"][page_type].append(crawling_logs) - summary = ["Number of Requests per page type:"] - for page_type, requests in data["to_crawl"].items(): - summary.append(f"- {page_type}: {len(requests)}") + if data["to_crawl"]: + summary = ["Number of Requests per page type:"] + for page_type, requests in data["to_crawl"].items(): + summary.append(f"- {page_type}: {len(requests)}") + else: + summary = ["Nothing to crawl."] report = [ f"Crawling Logs for {response.url} (parsed as: {current_page_type}):", From a9d5588f236be421d7c4d8bac78be78f6d2d0fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 00:23:40 +0200 Subject: [PATCH 16/19] Update test expectations --- tests/test_middlewares.py | 42 ++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 6fc03ea..9b808bb 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -36,13 +36,7 @@ def results_gen(): crawl_logs = middleware.crawl_logs(response, results_gen()) assert crawl_logs == ( "Crawling Logs for https://example.com (parsed as: None):\n" - "Number of Requests per page type:\n" - "- product: 0\n" - "- nextPage: 0\n" - "- subCategories: 0\n" - "- productNavigation: 0\n" - "- productNavigation-heuristics: 0\n" - "- unknown: 0\n" + "Nothing to crawl.\n" "Structured Logs:\n" "{\n" ' "time": "2023-10-10 20:09:29",\n' @@ -53,14 +47,7 @@ def results_gen(): ' "page_type": null,\n' ' "probability": null\n' " },\n" - ' "to_crawl": {\n' - ' "product": [],\n' - ' "nextPage": [],\n' - ' "subCategories": [],\n' - ' "productNavigation": [],\n' - ' "productNavigation-heuristics": [],\n' - ' "unknown": []\n' - " }\n" + ' "to_crawl": {}\n' "}" ) @@ -131,15 +118,19 @@ def test_crawling_logs_middleware(): }, }, ) - unknown_request = Request( - "https://example.com/other-unknown", + custom_request = Request( + "https://example.com/custom-page-type", meta={ "crawling_logs": { - "name": "Unknown Page", + "name": "Custom Page", "page_type": "some other page_type", + "foo": "bar", }, }, ) + unknown_request = Request( + "https://example.com/other-unknown", + ) request_fingerprint = get_fingerprinter(crawler) fingerprint = request_fingerprint(request) @@ -150,6 +141,7 @@ def test_crawling_logs_middleware(): product_navigation_heuristics_request_fp = request_fingerprint( product_navigation_heuristics_request ) + custom_request_fp = request_fingerprint(custom_request) unknown_request_fp = request_fingerprint(unknown_request) def results_gen(): @@ -158,6 +150,7 @@ def results_gen(): yield subcategory_request yield product_navigation_request yield product_navigation_heuristics_request + yield custom_request yield unknown_request crawl_logs = middleware.crawl_logs(response, 
results_gen()) @@ -169,6 +162,7 @@ def results_gen(): "- subCategories: 1\n" "- productNavigation: 1\n" "- productNavigation-heuristics: 1\n" + "- some other page_type: 1\n" "- unknown: 1\n" "Structured Logs:\n" "{\n" @@ -231,10 +225,18 @@ def results_gen(): f' "request_fingerprint": "{product_navigation_heuristics_request_fp}"\n' " }\n" " ],\n" - ' "unknown": [\n' + ' "some other page_type": [\n' " {\n" - ' "name": "Unknown Page",\n' + ' "name": "Custom Page",\n' ' "page_type": "some other page_type",\n' + ' "foo": "bar",\n' + ' "request_url": "https://example.com/custom-page-type",\n' + ' "request_priority": 0,\n' + f' "request_fingerprint": "{custom_request_fp}"\n' + " }\n" + " ],\n" + ' "unknown": [\n' + " {\n" ' "request_url": "https://example.com/other-unknown",\n' ' "request_priority": 0,\n' f' "request_fingerprint": "{unknown_request_fp}"\n' From 7aa70b2763ef3f837e19029943f952bb13f06881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 09:54:14 +0200 Subject: [PATCH 17/19] Apply feedback --- tests/test_ecommerce.py | 2 +- tests/test_serp.py | 6 ++++-- zyte_spider_templates/params.py | 2 +- zyte_spider_templates/spiders/serp.py | 13 ++++++------- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index adf6487..ae77049 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -411,7 +411,7 @@ def test_metadata(): "description": ( "URL that point to a plain-text file with a list of " "URLs to crawl, e.g. " - "https://example.com/url-list.txt. The linked list " + "https://example.com/url-list.txt. The linked file " "must contain 1 URL per line." ), "exclusiveRequired": True, diff --git a/tests/test_serp.py b/tests/test_serp.py index 2ea229d..e8ec9fe 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -249,8 +249,10 @@ def test_metadata(): }, "max_pages": { "default": 1, - "description": "Maximum number of result pages to visit per input URL.", - "title": "Pages", + "description": ( + "Maximum number of result pages to visit per search query." + ), + "title": "Max Pages", "type": "integer", }, "max_requests": { diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index 030e5f1..f3190ab 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -124,7 +124,7 @@ def validate_input_group(model): "title": "URLs file", "description": ( "URL that point to a plain-text file with a list of URLs to " - "crawl, e.g. https://example.com/url-list.txt. The linked list " + "crawl, e.g. https://example.com/url-list.txt. The linked file " "must contain 1 URL per line." 
), "pattern": _URL_PATTERN, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index e942964..cbf9554 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -30,20 +30,19 @@ def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: """ if isinstance(value, str): value = value.split("\n") - if not value: - raise ValueError("The search_queries parameter value is missing or empty.") result = [] for v in value: - if not (v := v.strip()): - continue - result.append(v) + if v := v.strip(): + result.append(v) + if not result: + raise ValueError("The search_queries parameter value is missing or empty.") return result class SerpMaxPagesParam(BaseModel): max_pages: int = Field( - title="Pages", - description="Maximum number of result pages to visit per input URL.", + title="Max Pages", + description="Maximum number of result pages to visit per search query.", default=1, ) From 916b58c461b1d90c01cf2ca5414024b02702498e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 10:13:12 +0200 Subject: [PATCH 18/19] Release notes for 0.9.0 --- CHANGES.rst | 20 ++++++++++++++++++-- docs/_ext/__init__.py | 41 +++++++++++++++++++++++++++++++++++++++++ docs/conf.py | 4 ++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7e92b08..a330408 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,11 +1,27 @@ Changes ======= -Unreleased ----------- +0.9.0 (2024-09-NN) +------------------ * Now requires ``zyte-common-items >= 0.22.0``. +* New :ref:`Google Search spider template `, built on top of + Zyte API’s :http:`request:serp`. + +* The heuristics of the :ref:`e-commerce spider template ` to + ignore certain URLs when following category links now also handles + subdomains. For example, before https://example.com/blog was ignored, now + https://blog.example.com is also ignored. + +* In the :ref:`spider parameters JSON schema `, the + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` + parameter of the :ref:`e-commerce spider template ` switches + position, from being the last parameter to being between + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. + 0.8.0 (2024-08-21) ------------------ diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py index 5a3839e..4181427 100644 --- a/docs/_ext/__init__.py +++ b/docs/_ext/__init__.py @@ -1,4 +1,45 @@ +import re + +from docutils import nodes +from docutils.parsers.rst.roles import set_classes + + +def http_api_reference_role( + name, rawtext, text, lineno, inliner, options={}, content=[] +): + match = re.search( + r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text + ) + if match: + display_text = match[1] + reference = match[2] + else: + display_text = None + reference = text + if reference.startswith("request:"): + request_or_response = "request" + elif reference.startswith("response:"): + request_or_response = "response/200" + else: + raise ValueError( + f":http: directive reference must start with request: or " + f"response:, got {reference} from {text!r}." 
+ ) + + field = reference.split(":", maxsplit=1)[1] + if not display_text: + display_text = field + refuri = ( + f"https://docs.zyte.com/zyte-api/usage/reference.html" + f"#operation/extract/{request_or_response}/{field}" + ) + set_classes(options) + node = nodes.reference(rawtext, display_text, refuri=refuri, **options) + return [node], [] + + def setup(app): + app.add_role("http", http_api_reference_role) # https://stackoverflow.com/a/13663325 # # Scrapy’s diff --git a/docs/conf.py b/docs/conf.py index ac67ce5..ff0ef7f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,6 +34,10 @@ "https://scrapy-poet.readthedocs.io/en/stable", None, ), + "scrapy-spider-metadata": ( + "https://scrapy-spider-metadata.readthedocs.io/en/latest", + None, + ), "scrapy-zyte-api": ( "https://scrapy-zyte-api.readthedocs.io/en/stable", None, From 5c5502eebdaa1ec70d9b499403932560df6c4cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 Sep 2024 11:08:48 +0200 Subject: [PATCH 19/19] Remove valid_page_types --- CHANGES.rst | 3 +++ zyte_spider_templates/middlewares.py | 11 ----------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a330408..b64a334 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,6 +22,9 @@ Changes and :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. +* Removed the ``valid_page_types`` attribute of + :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. + 0.8.0 (2024-08-21) ------------------ diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 68e4987..2cd8019 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -29,17 +29,6 @@ class CrawlingLogsMiddleware: the fingerprints logged in Scrapy Cloud's request data. """ - # Deprecated in practice, but there is no good way to deprecate it, since - # class properties that also work for class instances are not a thing. - # https://stackoverflow.com/q/128573 - valid_page_types = [ - "product", - "nextPage", - "subCategories", - "productNavigation", - "productNavigation-heuristics", - ] - unknown_page_type = "unknown" @classmethod
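
A minimal usage sketch of the resulting google_search spider, combining the parameters defined in the patches above (search_queries is required; domain defaults to google.com and max_pages to 1; the google.cat domain and page count below are illustrative, not part of the patches):

    scrapy crawl google_search -a search_queries="foo bar" -a domain=google.cat -a max_pages=3

With these arguments the spider builds one serp-enabled Zyte API request per query and result page: https://www.google.cat/search?q=foo+bar first, then the same URL with start=10 and start=20 appended, as per the start_requests pagination loop shown earlier in the series.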