Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SERP (MVP) #62

Merged
merged 20 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changes
=======

Unreleased
----------

* Now requires ``zyte-common-items >= 0.22.0``.
kmike marked this conversation as resolved.
Show resolved Hide resolved


0.8.0 (2024-08-21)
------------------

Expand Down
3 changes: 3 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@
),
}

autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_show_validator_summary = False

# sphinx-reredirects
redirects = {
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ zyte-spider-templates documentation

templates/index
E-commerce <templates/e-commerce>
Google search <templates/google-search>

.. toctree::
:caption: Customization
Expand Down
5 changes: 5 additions & 0 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Spiders

.. autoclass:: zyte_spider_templates.EcommerceSpider

.. autoclass:: zyte_spider_templates.GoogleSearchSpider


Pages
=====
Expand Down Expand Up @@ -41,3 +43,6 @@ Parameter mixins
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
:exclude-members: model_computed_fields
19 changes: 19 additions & 0 deletions docs/templates/google-search.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.. _google-search:

=================================================
Google search spider template (``google_search``)
=================================================

Basic use
=========

.. code-block:: shell

scrapy crawl google_search -a search_queries="foo bar"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams
:inherited-members: BaseModel
:exclude-members: model_computed_fields
3 changes: 3 additions & 0 deletions docs/templates/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ Spider template list

:ref:`E-commerce <e-commerce>`
Get products from an e-commerce website.

:ref:`Google Search <google-search>`
Get Google search results.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"scrapy-poet>=0.21.0",
"scrapy-spider-metadata>=0.1.2",
"scrapy-zyte-api[provider]>=0.16.0",
"zyte-common-items>=0.13.0",
"zyte-common-items>=0.22.0",
],
classifiers=[
"Development Status :: 3 - Alpha",
Expand Down
4 changes: 4 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from typing import Any, Dict, Optional

import pytest
from scrapy.utils.test import TestSpider

# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("tests.utils")


# scrapy.utils.test.get_crawler alternative that does not freeze settings.
def get_crawler(*, settings: Optional[Dict[str, Any]] = None):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@

def test_deprecation():
with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"):
BaseSpiderParams(url="https://example.com")
BaseSpiderParams(url="https://example.com") # type: ignore[call-arg]
71 changes: 4 additions & 67 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import json
import logging
import re
from unittest.mock import MagicMock, call, patch

import pytest
Expand All @@ -11,7 +9,6 @@
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request

from zyte_spider_templates import BaseSpiderParams
from zyte_spider_templates._geolocations import (
GEOLOCATION_OPTIONS,
GEOLOCATION_OPTIONS_WITH_CODE,
Expand All @@ -24,6 +21,7 @@

from . import get_crawler
from .test_utils import URL_TO_DOMAIN
from .utils import assertEqualSpiderMetadata


def test_parameters():
Expand Down Expand Up @@ -362,21 +360,6 @@ def test_arguments():
assert spider.allowed_domains == ["example.com"]


def assertEqualJson(actual, expected):
"""Compare the JSON representation of 2 Python objects.

This allows to take into account things like the order of key-value pairs
in dictionaries, which would not be taken into account when comparing
dictionaries directly.

It also generates a better diff in pytest output when enums are involved,
e.g. geolocation values.
"""
actual_json = json.dumps(actual, indent=2)
expected_json = json.dumps(expected, indent=2)
assert actual_json == expected_json


def test_metadata():
actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
expected_metadata = {
Expand Down Expand Up @@ -480,7 +463,7 @@ def test_metadata():
"title": "Pagination Only",
},
},
"title": "Crawl strategy",
"title": "Crawl Strategy",
"enum": [
"automatic",
"full",
Expand Down Expand Up @@ -550,60 +533,14 @@ def test_metadata():
"type": "object",
},
}
assertEqualJson(actual_metadata, expected_metadata)
assertEqualSpiderMetadata(actual_metadata, expected_metadata)

geolocation = actual_metadata["param_schema"]["properties"]["geolocation"]
assert geolocation["enum"][0] == "AF"
assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"}
assert set(geolocation["enum"]) == set(geolocation["enumMeta"])


@pytest.mark.parametrize(
"valid,url",
[
(False, ""),
(False, "http://"),
(False, "http:/example.com"),
(False, "ftp://example.com"),
(False, "example.com"),
(False, "//example.com"),
(False, "http://foo:bar@example.com"),
(False, " http://example.com"),
(False, "http://example.com "),
(False, "http://examp le.com"),
(False, "https://example.com:232323"),
(True, "http://example.com"),
(True, "http://bücher.example"),
(True, "http://xn--bcher-kva.example"),
(True, "https://i❤.ws"),
(True, "https://example.com"),
(True, "https://example.com/"),
(True, "https://example.com:2323"),
(True, "https://example.com:2323/"),
(True, "https://example.com:2323/foo"),
(True, "https://example.com/f"),
(True, "https://example.com/foo"),
(True, "https://example.com/foo/"),
(True, "https://example.com/foo/bar"),
(True, "https://example.com/foo/bar/"),
(True, "https://example.com/foo/bar?baz"),
(True, "https://example.com/foo/bar/?baz"),
(True, "https://example.com?foo"),
(True, "https://example.com?foo=bar"),
(True, "https://example.com/?foo=bar&baz"),
(True, "https://example.com/?foo=bar&baz#"),
(True, "https://example.com/?foo=bar&baz#frag"),
(True, "https://example.com#"),
(True, "https://example.com/#"),
(True, "https://example.com/&"),
(True, "https://example.com/&#"),
],
)
def test_validation_url(url, valid):
url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
assert bool(re.match(url_re, url)) == valid


def test_get_parse_product_request():
base_kwargs = {
"url": "https://example.com",
Expand Down Expand Up @@ -818,7 +755,7 @@ def test_urls_file():
crawler = get_crawler()
url = "https://example.com"

with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get:
with patch("zyte_spider_templates.params.requests.get") as mock_get:
response = requests.Response()
response._content = (
b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n"
Expand Down
51 changes: 51 additions & 0 deletions tests/test_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import re

import pytest

from zyte_spider_templates.params import URL_FIELD_KWARGS


@pytest.mark.parametrize(
"valid,url",
[
(False, ""),
(False, "http://"),
(False, "http:/example.com"),
(False, "ftp://example.com"),
(False, "example.com"),
(False, "//example.com"),
(False, "http://foo:bar@example.com"),
(False, " http://example.com"),
(False, "http://example.com "),
(False, "http://examp le.com"),
(False, "https://example.com:232323"),
(True, "http://example.com"),
(True, "http://bücher.example"),
(True, "http://xn--bcher-kva.example"),
(True, "https://i❤.ws"),
(True, "https://example.com"),
(True, "https://example.com/"),
(True, "https://example.com:2323"),
(True, "https://example.com:2323/"),
(True, "https://example.com:2323/foo"),
(True, "https://example.com/f"),
(True, "https://example.com/foo"),
(True, "https://example.com/foo/"),
(True, "https://example.com/foo/bar"),
(True, "https://example.com/foo/bar/"),
(True, "https://example.com/foo/bar?baz"),
(True, "https://example.com/foo/bar/?baz"),
(True, "https://example.com?foo"),
(True, "https://example.com?foo=bar"),
(True, "https://example.com/?foo=bar&baz"),
(True, "https://example.com/?foo=bar&baz#"),
(True, "https://example.com/?foo=bar&baz#frag"),
(True, "https://example.com#"),
(True, "https://example.com/#"),
(True, "https://example.com/&"),
(True, "https://example.com/&#"),
],
)
def test_url_pattern(url, valid):
assert isinstance(URL_FIELD_KWARGS["pattern"], str)
assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid
Loading
Loading