diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1ad4fc80..9ee7a40f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,12 @@ Changelog ========= +TBD +=== + +* Use the ``web_poet.Unset`` sentinel value to represent fields which have not been + assigned any value. This differentiates them from fields set to ``None``. + 0.2.0 (2022-09-22) ================== diff --git a/README.rst b/README.rst index 6b6d955b..a30c3c95 100644 --- a/README.rst +++ b/README.rst @@ -20,12 +20,12 @@ zyte-common-items .. description starts -``zyte-common-items`` is a Python 3.7+ library of item classes used by Zyte_ to -normalize different types of data extracted from websites. - -It can be used in custom data extraction code for normalization purposes, -maximizing opportunities for code reuse. +``zyte-common-items`` is a Python 3.7+ library of item_ and `page object`_ +classes for web data extraction that we use at Zyte_ to maximize opportunities +for code reuse. +.. _item: https://docs.scrapy.org/en/latest/topics/items.html +.. _page object: https://web-poet.readthedocs.io/en/stable/ .. _Zyte: https://www.zyte.com/ .. description ends diff --git a/docs/conf.py b/docs/conf.py index 3c86a9da..91eaf1b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,6 +35,8 @@ def get_version_and_release(): autodoc_member_order = "groupwise" +intersphinx_disabled_reftypes = [] intersphinx_mapping = { "python": ("https://docs.python.org/3", None), + "web-poet": ("https://web-poet.readthedocs.io/en/stable", None), } diff --git a/docs/index.rst b/docs/index.rst index 7f2fd69b..79cda06f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,10 +7,27 @@ zyte-common-items |version| documentation :end-before: .. description ends .. toctree:: - :hidden: + :caption: Getting started + :maxdepth: 1 setup - usage + +.. toctree:: + :caption: Usage + :maxdepth: 1 + + usage/items + usage/pages + +.. toctree:: + :caption: Reference + :maxdepth: 1 + reference/index changelog + +.. 
toctree:: + :caption: Contributing + :maxdepth: 1 + contributing diff --git a/docs/reference/adapter.rst b/docs/reference/adapter.rst index 9add1131..f23fc23b 100644 --- a/docs/reference/adapter.rst +++ b/docs/reference/adapter.rst @@ -2,5 +2,4 @@ Adapter ======= -.. class:: zyte_common_items.ZyteItemAdapter -.. autoclass:: zyte_common_items.adapter.ZyteItemAdapter \ No newline at end of file +.. autoclass:: zyte_common_items.ZyteItemAdapter \ No newline at end of file diff --git a/docs/reference/components.rst b/docs/reference/components.rst index 5e230a59..ef089bc4 100644 --- a/docs/reference/components.rst +++ b/docs/reference/components.rst @@ -7,34 +7,26 @@ Components These classes are used to map data within :ref:`items `, and are not tied to any specific item type. -.. class:: zyte_common_items.AdditionalProperty(**kwargs) -.. autoclass:: zyte_common_items.components.AdditionalProperty(**kwargs) +.. autoclass:: zyte_common_items.AdditionalProperty(**kwargs) :members: -.. class:: zyte_common_items.AggregateRating(**kwargs) -.. autoclass:: zyte_common_items.components.AggregateRating(**kwargs) +.. autoclass:: zyte_common_items.AggregateRating(**kwargs) :members: -.. class:: zyte_common_items.Brand(**kwargs) -.. autoclass:: zyte_common_items.components.Brand(**kwargs) +.. autoclass:: zyte_common_items.Brand(**kwargs) :members: -.. class:: zyte_common_items.Breadcrumb(**kwargs) -.. autoclass:: zyte_common_items.components.Breadcrumb(**kwargs) +.. autoclass:: zyte_common_items.Breadcrumb(**kwargs) :members: -.. class:: zyte_common_items.Gtin(**kwargs) -.. autoclass:: zyte_common_items.components.Gtin(**kwargs) +.. autoclass:: zyte_common_items.Gtin(**kwargs) :members: -.. class:: zyte_common_items.Image(**kwargs) -.. autoclass:: zyte_common_items.components.Image(**kwargs) +.. autoclass:: zyte_common_items.Image(**kwargs) :members: -.. class:: zyte_common_items.Link(**kwargs) -.. autoclass:: zyte_common_items.components.Link(**kwargs) +.. 
autoclass:: zyte_common_items.Link(**kwargs) :members: -.. class:: zyte_common_items.Metadata(**kwargs) -.. autoclass:: zyte_common_items.components.Metadata(**kwargs) +.. autoclass:: zyte_common_items.Metadata(**kwargs) :members: diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 4a5fe362..482b7e91 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -5,5 +5,6 @@ Reference .. toctree:: items + pages components adapter diff --git a/docs/reference/items.rst b/docs/reference/items.rst index c3f712dc..24555180 100644 --- a/docs/reference/items.rst +++ b/docs/reference/items.rst @@ -1,32 +1,28 @@ -.. _items: +.. _item-api: -===== -Items -===== +======== +Item API +======== Product ======= -.. class:: zyte_common_items.Product(**kwargs) -.. autoclass:: zyte_common_items.items.Product(**kwargs) +.. autoclass:: zyte_common_items.Product(**kwargs) :members: :inherited-members: -.. class:: zyte_common_items.ProductVariant(**kwargs) -.. autoclass:: zyte_common_items.items.ProductVariant(**kwargs) +.. autoclass:: zyte_common_items.ProductVariant(**kwargs) :members: :inherited-members: Product List ============ -.. class:: zyte_common_items.ProductList(**kwargs) -.. autoclass:: zyte_common_items.items.ProductList(**kwargs) +.. autoclass:: zyte_common_items.ProductList(**kwargs) :members: :inherited-members: -.. class:: zyte_common_items.ProductFromList(**kwargs) -.. autoclass:: zyte_common_items.items.ProductFromList(**kwargs) +.. autoclass:: zyte_common_items.ProductFromList(**kwargs) :members: :inherited-members: @@ -34,10 +30,9 @@ Product List Custom items ============ -Subclass :class:`~zyte_common_items.base.Item` to create your own item classes. +Subclass :class:`~zyte_common_items.Item` to create your own item classes. -.. class:: zyte_common_items.Item(**kwargs) -.. autoclass:: zyte_common_items.base.Item(**kwargs) +.. autoclass:: zyte_common_items.Item(**kwargs) :members: .. 
attribute:: _unknown_fields_dict diff --git a/docs/reference/pages.rst b/docs/reference/pages.rst new file mode 100644 index 00000000..d2cdccf4 --- /dev/null +++ b/docs/reference/pages.rst @@ -0,0 +1,76 @@ +.. _page-object-api: + +=============== +Page object API +=============== + +Product +======= + +.. autoclass:: zyte_common_items.BaseProductPage(**kwargs) + :show-inheritance: + +.. autoclass:: zyte_common_items.ProductPage(**kwargs) + :show-inheritance: + + +Product List +============ + +.. autoclass:: zyte_common_items.BaseProductListPage(**kwargs) + :show-inheritance: + +.. autoclass:: zyte_common_items.ProductListPage(**kwargs) + :show-inheritance: + + +Custom page objects +=================== + +Subclass :class:`~zyte_common_items.Page` to create your own page object +classes that rely on :class:`~zyte_common_items.HttpResponse`. + +If you do not want :class:`~zyte_common_items.HttpResponse` as input, you can +inherit from :class:`~zyte_common_items.BasePage` instead. + +.. autoclass:: zyte_common_items.BasePage(**kwargs) + :show-inheritance: + + Base class for page object classes that have + :class:`~zyte_common_items.ResponseUrl` as a dependency. + + .. data:: metadata + :type: zyte_common_items.Metadata + + Data extraction process metadata. + + :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current + UTC date and time. + + :attr:`~zyte_common_items.Metadata.probability` is set to ``1.0``. + + .. data:: url + :type: web_poet.page_inputs.http.ResponseUrl + + Main URL from which the data has been extracted. + +.. autoclass:: zyte_common_items.Page(**kwargs) + :show-inheritance: + + Base class for page object classes that have + :class:`~zyte_common_items.HttpResponse` as a dependency. + + .. data:: metadata + :type: zyte_common_items.Metadata + + Data extraction process metadata. + + :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current + UTC date and time. 
+ + :attr:`~zyte_common_items.Metadata.probability` is set to ``1.0``. + + .. data:: url + :type: web_poet.page_inputs.http.ResponseUrl + + Main URL from which the data has been extracted. diff --git a/docs/usage.rst b/docs/usage/items.rst similarity index 96% rename from docs/usage.rst rename to docs/usage/items.rst index 516acae5..1298df40 100644 --- a/docs/usage.rst +++ b/docs/usage/items.rst @@ -1,7 +1,12 @@ +.. _items: + ===== -Usage +Items ===== +The :ref:`provided item classes <item-api>` can be used to map data extracted +from web pages, e.g. using :ref:`page objects <page-objects>`. + Creating items from dictionaries ================================ @@ -31,6 +36,7 @@ nested data, such as :class:`~zyte_common_items.components.Image` and >>> product.mainImage Image(url='https://example.com/image.png') >>> product.canonicalUrl +Unset >>> product.gtin [Gtin(type='gtin13', value='9504000059446')] diff --git a/docs/usage/pages.rst b/docs/usage/pages.rst new file mode 100644 index 00000000..749f4638 --- /dev/null +++ b/docs/usage/pages.rst @@ -0,0 +1,37 @@ +.. _page-objects: + +============ +Page objects +============ + +The :ref:`provided page object classes <page-object-api>` are good base classes +for custom page object classes that implement website-specific :doc:`page +objects <web-poet:index>`. + +They provide the following baseline: + +- They declare the :ref:`item class <item-api>` that they return, allowing for + their ``to_item`` method to automatically build an instance of it from + ``@field``-decorated methods. See :ref:`web-poet-fields`. + +- They provide a default implementation for their + :attr:`~zyte_common_items.Page.metadata` and + :attr:`~zyte_common_items.Page.url` fields. + +The following code shows a :class:`~zyte_common_items.ProductPage` subclass +whose ``to_item`` method returns an instance of +:class:`~zyte_common_items.Product` with +:attr:`~zyte_common_items.Product.metadata`, a +:attr:`~zyte_common_items.Product.name`, and a +:attr:`~zyte_common_items.Product.url`: + +.. 
code-block:: python + + from web_poet import field + from zyte_common_items import ProductPage + + class CustomProductPage(ProductPage): + + @field + def name(self): + return self.css("h1::text").get() diff --git a/setup.py b/setup.py index c35236d9..5f7ffa9e 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ install_requires=[ "attrs>=21.3.0", "itemadapter>=0.2.0", - "web-poet>=0.5.0", + "web-poet @ git+https://git@github.com/scrapinghub/web-poet@feat-unset#egg=web-poet", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_adapter.py b/tests/test_adapter.py index c251603a..37573850 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -9,6 +9,7 @@ import attrs import pytest from itemadapter import ItemAdapter +from web_poet import Unset from zyte_common_items import Item, Product, ZyteItemAdapter @@ -202,7 +203,7 @@ def test_known_field_get_missing(): product = Product(url=url) with configured_adapter(): adapter = ItemAdapter(product) - assert adapter["canonicalUrl"] is None + assert adapter["canonicalUrl"] is Unset def test_known_field_set(): diff --git a/tests/test_components.py b/tests/test_components.py index 62582f36..5d4fce24 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -1,3 +1,5 @@ +from web_poet import Unset + from zyte_common_items import AggregateRating, Breadcrumb, Link, Metadata @@ -19,5 +21,5 @@ def test_link_optional_fields(): def test_metadata_default_values(): metadata = Metadata() - assert metadata.dateDownloaded is None - assert metadata.probability is None + assert metadata.dateDownloaded is Unset + assert metadata.probability is Unset diff --git a/tests/test_items.py b/tests/test_items.py index fe00ecf3..7262e7b1 100644 --- a/tests/test_items.py +++ b/tests/test_items.py @@ -1,6 +1,7 @@ from copy import copy import pytest +from web_poet import Unset from zyte_common_items import ( AdditionalProperty, @@ -119,7 +120,7 @@ def test_product_min_fields(): for field in list(_PRODUCT_ALL_KWARGS): if 
field in _PRODUCT_MIN_KWARGS: continue - assert getattr(product, field) is None + assert getattr(product, field) is Unset def test_product_missing_fields(): @@ -141,7 +142,7 @@ def test_product_list_min_fields(): for field in list(_PRODUCT_LIST_MIN_KWARGS): if field in _PRODUCT_LIST_MIN_KWARGS: continue - assert getattr(product_list, field) is None + assert getattr(product_list, field) is Unset def test_product_list_missing_fields(): @@ -161,7 +162,7 @@ def test_product_from_list_all_fields(): def test_product_from_list_min_fields(): product_from_list = ProductFromList() for field in list(_PRODUCT_FROM_LIST_ALL_KWARGS): - assert getattr(product_from_list, field) is None + assert getattr(product_from_list, field) is Unset def test_product_variant_all_fields(): @@ -173,4 +174,4 @@ def test_product_variant_all_fields(): def test_product_variant_min_fields(): product_variant = ProductVariant() for field in list(_PRODUCT_VARIANT_ALL_KWARGS): - assert getattr(product_variant, field) is None + assert getattr(product_variant, field) is Unset diff --git a/tests/test_mypy.py b/tests/test_mypy.py index 8e572bdd..af03bc7e 100644 --- a/tests/test_mypy.py +++ b/tests/test_mypy.py @@ -187,28 +187,28 @@ def test_assignment_product_variant(): @pytest.mark.mypy_testing def test_instantiation_image(): with pytest.raises(ValueError): - Image(url=123) # E: Argument "url" to "Image" has incompatible type "int"; expected "Union[str, _Url]" + Image(url=123) # E: Argument "url" to "Image" has incompatible type "int"; expected "Union[str, _Url, UnsetType]" @pytest.mark.mypy_testing def test_instantiation_breadcrumb(): with pytest.raises(ValueError): - Breadcrumb(url=123) # E: Argument "url" to "Breadcrumb" has incompatible type "int"; expected "Union[str, _Url, None]" + Breadcrumb(url=123) # E: Argument "url" to "Breadcrumb" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" @pytest.mark.mypy_testing def test_instantiation_link(): with pytest.raises(ValueError): - 
Link(url=123) # E: Argument "url" to "Link" has incompatible type "int"; expected "Union[str, _Url, None]" + Link(url=123) # E: Argument "url" to "Link" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" @pytest.mark.mypy_testing def test_instantiation_product_list(): with pytest.raises(ValueError): - ProductList(url=123) # E: Argument "url" to "ProductList" has incompatible type "int"; expected "Union[str, _Url]" + ProductList(url=123) # E: Argument "url" to "ProductList" has incompatible type "int"; expected "Union[str, _Url, UnsetType]" with pytest.raises(ValueError): ProductList( - url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductList" has incompatible type "int"; expected "Union[str, _Url, None]" + url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductList" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) @@ -216,7 +216,7 @@ def test_instantiation_product_list(): def test_instantiation_product_from_list(): with pytest.raises(ValueError): ProductFromList( - url=123 # E: Argument "url" to "ProductFromList" has incompatible type "int"; expected "Union[str, _Url, None]" + url=123 # E: Argument "url" to "ProductFromList" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) @@ -224,9 +224,9 @@ def test_instantiation_product_from_list(): def test_instantiation_product_variant(): with pytest.raises(ValueError): ProductVariant( - url=123 # E: Argument "url" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, None]" + url=123 # E: Argument "url" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) with pytest.raises(ValueError): ProductVariant( - url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, None]" + url="https://www.example.com", canonicalUrl=123 # E: 
Argument "canonicalUrl" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) diff --git a/tests/test_pages.py b/tests/test_pages.py new file mode 100644 index 00000000..d9163ae9 --- /dev/null +++ b/tests/test_pages.py @@ -0,0 +1,103 @@ +from datetime import datetime + +import attrs +import pytest +from web_poet import HttpResponse, ResponseUrl, field + +from zyte_common_items import ( + BaseProductListPage, + BaseProductPage, + ProductListPage, + ProductPage, +) + + +@pytest.mark.parametrize( + "page_class", + ( + BaseProductPage, + BaseProductListPage, + ), +) +def test_base_pages_default(page_class): + datetime_before = datetime.utcnow().replace(microsecond=0) + + page = page_class(url=ResponseUrl("https://example.com")) + + assert page.metadata.probability == 1.0 + assert page.url == "https://example.com" + assert isinstance(page.url, str) + + page_datetime_string = page.metadata.dateDownloaded + assert page_datetime_string.endswith("Z") + page_datetime = datetime.fromisoformat(page_datetime_string[:-1]) + datetime_after = datetime.utcnow().replace(microsecond=0) + assert datetime_before <= page_datetime <= datetime_after + + +@pytest.mark.parametrize( + "page_class", + ( + ProductPage, + ProductListPage, + ), +) +def test_pages_default(page_class): + datetime_before = datetime.utcnow().replace(microsecond=0) + + url = ResponseUrl("https://example.com") + html = b""" + + + +

Foo

+ + + """ + response = HttpResponse(url=url, body=html) + + page = page_class(response=response) + + assert page.metadata.probability == 1.0 + assert page.url == "https://example.com" + assert isinstance(page.url, str) + + page_datetime_string = page.metadata.dateDownloaded + assert page_datetime_string.endswith("Z") + page_datetime = datetime.fromisoformat(page_datetime_string[:-1]) + datetime_after = datetime.utcnow().replace(microsecond=0) + assert datetime_before <= page_datetime <= datetime_after + + +@pytest.mark.asyncio +async def test_example(): + datetime_before = datetime.utcnow().replace(microsecond=0) + + @attrs.define + class BookPage(ProductPage): + @field + def name(self): + return self.css("h1::text").get() + + url = ResponseUrl("https://example.com/books/1") + html = b""" + + + +

Foo

+ + + """ + response = HttpResponse(url=url, body=html) + + item = await BookPage(response=response).to_item() + + assert item.url == str(url) + assert item.name == "Foo" + assert item.metadata.probability == 1.0 + + item_datetime_string = item.metadata.dateDownloaded + assert item_datetime_string.endswith("Z") + item_datetime = datetime.fromisoformat(item_datetime_string[:-1]) + datetime_after = datetime.utcnow().replace(microsecond=0) + assert datetime_before <= item_datetime <= datetime_after diff --git a/tox.ini b/tox.ini index 0b92512d..aa62e981 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,14 @@ [tox] -envlist = py37,py38,py39,py310,mypy +envlist = py37,py38,py39,py310,mypy,docs [testenv] deps = pytest + pytest-asyncio pytest-cov - pytest-mypy-testing==0.0.11 + # https://github.com/davidfritzsche/pytest-mypy-testing/issues/35 + git+https://github.com/davidfritzsche/pytest-mypy-testing.git@031514ff6ecd5bdf4d11ff238c14d4801b5e47f3 + mypy==0.971 setenv = PY_IGNORE_IMPORTMISMATCH=1 commands = diff --git a/zyte_common_items/__init__.py b/zyte_common_items/__init__.py index c958cc8c..c389a534 100644 --- a/zyte_common_items/__init__.py +++ b/zyte_common_items/__init__.py @@ -12,3 +12,11 @@ Metadata, ) from .items import Product, ProductFromList, ProductList, ProductVariant +from .pages import ( + BasePage, + BaseProductListPage, + BaseProductPage, + Page, + ProductListPage, + ProductPage, +) diff --git a/zyte_common_items/base.py b/zyte_common_items/base.py index 313019a8..ef74b6d7 100644 --- a/zyte_common_items/base.py +++ b/zyte_common_items/base.py @@ -6,19 +6,20 @@ from typing import get_args except ImportError: # Compliance with python 3.7 - from zyte_common_items.util import get_args + from .util import get_args try: from typing import get_origin except ImportError: # Compliance with python 3.7 - from zyte_common_items.util import get_origin + from .util import get_origin from typing import Dict, List, Optional, Union import attrs +from web_poet import 
UnsetType -from zyte_common_items.util import split_in_unknown_and_known_fields +from .util import split_in_unknown_and_known_fields def is_data_container(cls_or_obj): @@ -88,9 +89,12 @@ def _apply_field_types_to_sub_fields(cls, item: Dict): origin = get_origin(type_annotation) if origin == Union: field_classes = get_args(type_annotation) - if len(field_classes) != 2 or not isinstance(None, field_classes[1]): + if len(set(field_classes) - {UnsetType, type(None)}) != 1 or ( + field_classes[0] in (UnsetType, type(None)) + ): raise ValueError( - "Field should only be annotated with one type (or optional)." + "Field should only be annotated with one type " + "(or either None or Unset)." ) type_annotation = field_classes[0] origin = get_origin(type_annotation) diff --git a/zyte_common_items/components.py b/zyte_common_items/components.py index 1ec481f2..b4c5af55 100644 --- a/zyte_common_items/components.py +++ b/zyte_common_items/components.py @@ -1,8 +1,9 @@ """Classes for data nested within items.""" -from typing import Optional +from typing import Union import attrs +from web_poet import Unset, UnsetType from zyte_common_items.base import Item from zyte_common_items.util import url_to_str @@ -34,13 +35,13 @@ class AggregateRating(Item): """ #: Maximum value of the rating system. - bestRating: Optional[float] = None + bestRating: Union[float, None, UnsetType] = Unset #: Average value of all ratings. - ratingValue: Optional[float] = None + ratingValue: Union[float, None, UnsetType] = Unset #: Review count. - reviewCount: Optional[int] = None + reviewCount: Union[int, None, UnsetType] = Unset @attrs.define @@ -65,11 +66,11 @@ class Breadcrumb(Item): """ #: Displayed name. - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Target URL. 
- url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -119,11 +120,11 @@ class Link(Item): """A link from a webpage to another webpage.""" #: Displayed text. - text: Optional[str] = None + text: Union[str, None, UnsetType] = Unset #: Target URL. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -136,7 +137,7 @@ class Metadata(Item): #: Date and time when the product data was downloaded, in UTC timezone and #: the following format: ``YYYY-MM-DDThh:mm:ssZ``. - dateDownloaded: Optional[str] = None + dateDownloaded: Union[str, None, UnsetType] = Unset #: The probability (0 for 0%, 1 for 100%) that the webpage features the #: requested data type. @@ -147,4 +148,4 @@ class Metadata(Item): #: webpage features a job listing instead of a product, the value should be #: `0`. When there is no complete certainty, the value could be anything in #: between (e.g. `0.96`). - probability: Optional[float] = None + probability: Union[float, None, UnsetType] = Unset diff --git a/zyte_common_items/items.py b/zyte_common_items/items.py index 37a8ad9e..f7f45f9a 100644 --- a/zyte_common_items/items.py +++ b/zyte_common_items/items.py @@ -1,6 +1,7 @@ -from typing import List, Optional +from typing import List, Union import attrs +from web_poet import Unset, UnsetType from zyte_common_items.base import Item from zyte_common_items.components import ( @@ -33,18 +34,18 @@ class ProductVariant(Item): #: extracted. #: #: See also ``features``. - additionalProperties: Optional[List[AdditionalProperty]] = None + additionalProperties: Union[List[AdditionalProperty], None, UnsetType] = Unset #: Availability status. 
#: #: The value is expected to be one of: ``"InStock"``, ``"OutOfStock"``. - availability: Optional[str] = None + availability: Union[str, None, UnsetType] = Unset #: Canonical form of the URL, as indicated by the website. #: #: See also ``url``. - canonicalUrl: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + canonicalUrl: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) #: Color. @@ -52,20 +53,20 @@ class ProductVariant(Item): #: It is extracted as displayed (e.g. ``"white"``). #: #: See also ``size``, ``style``. - color: Optional[str] = None + color: Union[str, None, UnsetType] = Unset #: Price currency `ISO 4217`_ alphabetic code (e.g. ``"USD"``). #: #: See also ``currencyRaw``. #: #: .. _ISO 4217: https://en.wikipedia.org/wiki/ISO_4217 - currency: Optional[str] = None + currency: Union[str, None, UnsetType] = Unset #: Price currency as it appears on the webpage (no post-processing), e.g. #: ``"$"``. #: #: See also ``currency``. - currencyRaw: Optional[str] = None + currencyRaw: Union[str, None, UnsetType] = Unset #: List of standardized GTIN_ product identifiers associated with the #: product, which are unique for the product across different sellers. @@ -73,17 +74,17 @@ class ProductVariant(Item): #: See also: ``mpn``, ``productId``, ``sku``. #: #: .. _GTIN: https://en.wikipedia.org/wiki/Global_Trade_Item_Number - gtin: Optional[List[Gtin]] = None + gtin: Union[List[Gtin], None, UnsetType] = Unset #: All product images. #: #: The main image (see ``mainImage``) should be first in the list. #: #: Images only displayed as part of the product description are excluded. - images: Optional[List[Image]] = None + images: Union[List[Image], None, UnsetType] = Unset #: Main product image. - mainImage: Optional[Image] = None + mainImage: Union[Image, None, UnsetType] = Unset #: `Manufacturer part number (MPN)`_. 
#: @@ -92,10 +93,10 @@ class ProductVariant(Item): #: See also: ``gtin``, ``productId``, ``sku``. #: #: .. _Manufacturer part number (MPN): https://en.wikipedia.org/wiki/Part_number - mpn: Optional[str] = None + mpn: Union[str, None, UnsetType] = Unset #: Name as it appears on the webpage (no post-processing). - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Price at which the product is being offered. #: @@ -105,7 +106,7 @@ class ProductVariant(Item): #: #: If ``regularPrice`` is not ``None``, ``price`` should always be lower #: than ``regularPrice``. - price: Optional[str] = None + price: Union[str, None, UnsetType] = Unset #: Product identifier, unique within an e-commerce website. #: @@ -113,7 +114,7 @@ class ProductVariant(Item): #: even a URL. #: #: See also: ``gtin``, ``mpn``, ``sku``. - productId: Optional[str] = None + productId: Union[str, None, UnsetType] = Unset #: Price at which the product was being offered in the past, and which is #: presented as a reference next to the current price. @@ -125,7 +126,7 @@ class ProductVariant(Item): #: #: If ``regularPrice`` is not ``None``, it should always be higher than #: ``price``. - regularPrice: Optional[str] = None + regularPrice: Union[str, None, UnsetType] = Unset #: Size or dimensions. #: @@ -134,7 +135,7 @@ class ProductVariant(Item): #: It is extracted as displayed (e.g. ``"XL"``). #: #: See also ``color``, ``style``. - size: Optional[str] = None + size: Union[str, None, UnsetType] = Unset #: `Stock keeping unit (SKU)`_ identifier, i.e. a merchant-specific product #: identifier. @@ -142,7 +143,7 @@ class ProductVariant(Item): #: See also: ``gtin``, ``mpn``, ``productId``. #: #: .. _Stock keeping unit (SKU): https://en.wikipedia.org/wiki/Stock_keeping_unit - sku: Optional[str] = None + sku: Union[str, None, UnsetType] = Unset #: Style. #: @@ -151,13 +152,13 @@ class ProductVariant(Item): #: It is extracted as displayed (e.g. ``"polka dots"``). #: #: See also ``color``, ``size``. 
- style: Optional[str] = None + style: Union[str, None, UnsetType] = Unset #: Main URL from which the product variant data could be extracted. #: #: See also ``canonicalUrl``. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -179,29 +180,29 @@ class Product(Item): #: extracted. #: #: See also ``features``. - additionalProperties: Optional[List[AdditionalProperty]] = None + additionalProperties: Union[List[AdditionalProperty], None, UnsetType] = Unset #: Aggregate data about reviews and ratings. - aggregateRating: Optional[AggregateRating] = None + aggregateRating: Union[AggregateRating, None, UnsetType] = Unset #: Availability status. #: #: The value is expected to be one of: ``"InStock"``, ``"OutOfStock"``. - availability: Optional[str] = None + availability: Union[str, None, UnsetType] = Unset #: Brand. - brand: Optional[Brand] = None + brand: Union[Brand, None, UnsetType] = Unset #: Webpage `breadcrumb trail`_. #: #: .. _Breadcrumb trail: https://en.wikipedia.org/wiki/Breadcrumb_navigation - breadcrumbs: Optional[List[Breadcrumb]] = None + breadcrumbs: Union[List[Breadcrumb], None, UnsetType] = Unset #: Canonical form of the URL, as indicated by the website. #: #: See also ``url``. - canonicalUrl: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + canonicalUrl: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) #: Color. @@ -209,20 +210,20 @@ class Product(Item): #: It is extracted as displayed (e.g. ``"white"``). #: #: See also ``size``, ``style``. - color: Optional[str] = None + color: Union[str, None, UnsetType] = Unset #: Price currency `ISO 4217`_ alphabetic code (e.g. ``"USD"``). #: #: See also ``currencyRaw``. #: #: .. 
_ISO 4217: https://en.wikipedia.org/wiki/ISO_4217 - currency: Optional[str] = None + currency: Union[str, None, UnsetType] = Unset #: Price currency as it appears on the webpage (no post-processing), e.g. #: ``"$"``. #: #: See also ``currency``. - currencyRaw: Optional[str] = None + currencyRaw: Union[str, None, UnsetType] = Unset #: Plain-text description. #: @@ -243,7 +244,7 @@ class Product(Item): #: - There should be no whitespace at the beginning or end. #: #: See also ``descriptionHtml``. - description: Optional[str] = None + description: Union[str, None, UnsetType] = Unset #: HTML description. #: @@ -253,14 +254,14 @@ class Product(Item): #: normalization specification`_ for details. #: #: .. _HTML normalization specification: https://docs.zyte.com/automatic-extraction/article.html#format-of-articlebodyhtml-field - descriptionHtml: Optional[str] = None + descriptionHtml: Union[str, None, UnsetType] = Unset #: List of features. #: #: They are usually listed as bullet points in product webpages. #: #: See also ``additionalProperties``. - features: Optional[List[str]] = None + features: Union[List[str], None, UnsetType] = Unset #: List of standardized GTIN_ product identifiers associated with the #: product, which are unique for the product across different sellers. @@ -268,20 +269,20 @@ class Product(Item): #: See also: ``mpn``, ``productId``, ``sku``. #: #: .. _GTIN: https://en.wikipedia.org/wiki/Global_Trade_Item_Number - gtin: Optional[List[Gtin]] = None + gtin: Union[List[Gtin], None, UnsetType] = Unset #: All product images. #: #: The main image (see ``mainImage``) should be first in the list. #: #: Images only displayed as part of the product description are excluded. - images: Optional[List[Image]] = None + images: Union[List[Image], None, UnsetType] = Unset #: Main product image. - mainImage: Optional[Image] = None + mainImage: Union[Image, None, UnsetType] = Unset #: Data extraction process metadata. 
- metadata: Optional[Metadata] = None + metadata: Union[Metadata, None, UnsetType] = Unset #: `Manufacturer part number (MPN)`_. #: @@ -290,10 +291,10 @@ class Product(Item): #: See also: ``gtin``, ``productId``, ``sku``. #: #: .. _Manufacturer part number (MPN): https://en.wikipedia.org/wiki/Part_number - mpn: Optional[str] = None + mpn: Union[str, None, UnsetType] = Unset #: Name as it appears on the webpage (no post-processing). - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Price at which the product is being offered. #: @@ -303,7 +304,7 @@ class Product(Item): #: #: If ``regularPrice`` is not ``None``, ``price`` should always be lower #: than ``regularPrice``. - price: Optional[str] = None + price: Union[str, None, UnsetType] = Unset # Redefined to extend the documentation. #: Product identifier, unique within an e-commerce website. @@ -312,7 +313,7 @@ class Product(Item): #: even a URL. #: #: See also: ``gtin``, ``mpn``, ``sku``. - productId: Optional[str] = None + productId: Union[str, None, UnsetType] = Unset #: Price at which the product was being offered in the past, and which is #: presented as a reference next to the current price. @@ -324,7 +325,7 @@ class Product(Item): #: #: If ``regularPrice`` is not ``None``, it should always be higher than #: ``price``. - regularPrice: Optional[str] = None + regularPrice: Union[str, None, UnsetType] = Unset #: Size or dimensions. #: @@ -333,7 +334,7 @@ class Product(Item): #: It is extracted as displayed (e.g. ``"XL"``). #: #: See also ``color``, ``style``. - size: Optional[str] = None + size: Union[str, None, UnsetType] = Unset #: `Stock keeping unit (SKU)`_ identifier, i.e. a merchant-specific product #: identifier. @@ -341,7 +342,7 @@ class Product(Item): #: See also: ``gtin``, ``mpn``, ``productId``. #: #: .. _Stock keeping unit (SKU): https://en.wikipedia.org/wiki/Stock_keeping_unit - sku: Optional[str] = None + sku: Union[str, None, UnsetType] = Unset #: Style. 
#: @@ -350,7 +351,7 @@ class Product(Item): #: It is extracted as displayed (e.g. ``"polka dots"``). #: #: See also ``color``, ``size``. - style: Optional[str] = None + style: Union[str, None, UnsetType] = Unset #: Main URL from which the data has been extracted. #: @@ -382,7 +383,7 @@ class Product(Item): #: #: Product variant details may not include those that require multiple #: additional requests (e.g. 1 or more requests per variant). - variants: Optional[List[ProductVariant]] = None + variants: Union[List[ProductVariant], None, UnsetType] = Unset @attrs.define(slots=True, kw_only=True) @@ -398,22 +399,22 @@ class ProductFromList(Item): #: See also ``currencyRaw``. #: #: .. _ISO 4217: https://en.wikipedia.org/wiki/ISO_4217 - currency: Optional[str] = None + currency: Union[str, None, UnsetType] = Unset #: Price currency as it appears on the webpage (no post-processing), e.g. #: ``"$"``. #: #: See also ``currency``. - currencyRaw: Optional[str] = None + currencyRaw: Union[str, None, UnsetType] = Unset #: Main product image. - mainImage: Optional[Image] = None + mainImage: Union[Image, None, UnsetType] = Unset #: Data extraction process metadata. - metadata: Optional[Metadata] = None + metadata: Union[Metadata, None, UnsetType] = Unset #: Name as it appears on the webpage (no post-processing). - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Price at which the product is being offered. #: @@ -423,13 +424,13 @@ class ProductFromList(Item): #: #: If ``regularPrice`` is not ``None``, ``price`` should always be lower #: than ``regularPrice``. - price: Optional[str] = None + price: Union[str, None, UnsetType] = Unset #: Product identifier, unique within an e-commerce website. #: #: It may come in the form of an SKU or any other identifier, a hash, or #: even a URL. 
- productId: Optional[str] = None + productId: Union[str, None, UnsetType] = Unset #: Price at which the product was being offered in the past, and which is #: presented as a reference next to the current price. @@ -441,11 +442,11 @@ class ProductFromList(Item): #: #: If ``regularPrice`` is not ``None``, it should always be higher than #: ``price``. - regularPrice: Optional[str] = None + regularPrice: Union[str, None, UnsetType] = Unset #: Main URL from which the product data could be extracted. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -462,13 +463,13 @@ class ProductList(Item): #: Webpage `breadcrumb trail`_. #: #: .. _Breadcrumb trail: https://en.wikipedia.org/wiki/Breadcrumb_navigation - breadcrumbs: Optional[List[Breadcrumb]] = None + breadcrumbs: Union[List[Breadcrumb], None, UnsetType] = Unset #: Canonical form of the URL, as indicated by the website. #: #: See also ``url``. - canonicalUrl: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + canonicalUrl: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) #: Name of the product listing as it appears on the webpage @@ -476,10 +477,10 @@ class ProductList(Item): #: #: For example, if the webpage is one of the pages of the Robots category, #: ``categoryName`` is ``'Robots'``. - categoryName: Optional[str] = None + categoryName: Union[str, None, UnsetType] = Unset #: Data extraction process metadata. - metadata: Optional[Metadata] = None + metadata: Union[Metadata, None, UnsetType] = Unset #: Number of the current page. #: @@ -487,10 +488,10 @@ class ProductList(Item): #: #: It must be 1-based. 
For example, if the first page of a listing is #: numbered as 0 on the website, it should be extracted as `1` nonetheless. - pageNumber: Optional[int] = None + pageNumber: Union[int, None, UnsetType] = Unset #: Link to the next page. - paginationNext: Optional[Link] = None + paginationNext: Union[Link, None, UnsetType] = Unset #: List of products. #: @@ -501,7 +502,7 @@ class ProductList(Item): #: The order of the products reflects their position on the rendered page. #: Product order is top-to-bottom, and left-to-right or right-to-left #: depending on the webpage locale. - products: Optional[List[ProductFromList]] = None + products: Union[List[ProductFromList], None, UnsetType] = Unset #: Main URL from which the data has been extracted. #: diff --git a/zyte_common_items/pages.py b/zyte_common_items/pages.py new file mode 100644 index 00000000..6db8b9c2 --- /dev/null +++ b/zyte_common_items/pages.py @@ -0,0 +1,53 @@ +from datetime import datetime + +import attrs +from web_poet import ItemPage, ResponseUrl, Returns, WebPage, field + +from .components import Metadata +from .items import Product, ProductList + + +class _BaseMixin: + def _get_response_url(self): + raise NotImplementedError + + @field + def metadata(self) -> Metadata: + return Metadata( + dateDownloaded=f"{datetime.utcnow().isoformat(timespec='seconds')}Z", + probability=1.0, + ) + + @field + def url(self) -> str: + return str(self._get_response_url()) + + +@attrs.define +class BasePage(_BaseMixin, ItemPage): + _url: ResponseUrl + + def _get_response_url(self): + return self._url + + +class BaseProductPage(BasePage, Returns[Product]): + pass + + +class BaseProductListPage(BasePage, Returns[ProductList]): + pass + + +@attrs.define +class Page(_BaseMixin, WebPage): + def _get_response_url(self): + return self.response.url + + +class ProductPage(Page, Returns[Product]): + pass + + +class ProductListPage(Page, Returns[ProductList]): + pass diff --git a/zyte_common_items/util.py b/zyte_common_items/util.py 
index 6b088a96..743fd8d1 100644 --- a/zyte_common_items/util.py +++ b/zyte_common_items/util.py @@ -2,6 +2,7 @@ from weakref import WeakKeyDictionary import attrs +from web_poet import UnsetType from web_poet.page_inputs.url import _Url # Caches the attribute names for attr.s classes @@ -59,7 +60,9 @@ def get_origin(tp) -> Tuple: return getattr(tp, "__origin__", ()) -def url_to_str(url: Union[str, _Url]) -> str: +def url_to_str(url: Union[str, _Url, UnsetType]) -> Union[str, UnsetType]: + if isinstance(url, UnsetType): + return url if not isinstance(url, (str, _Url)): raise ValueError( f"{url!r} is neither a string nor an instance of RequestURL or ResponseURL."