Skip to content
110 changes: 100 additions & 10 deletions tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,22 @@
from zyte_parsers import Gtin as zp_Gtin
from zyte_parsers import extract_breadcrumbs

from zyte_common_items import AggregateRating, BasePage, Breadcrumb, Gtin, ProductPage
from zyte_common_items import (
AggregateRating,
BasePage,
Brand,
Breadcrumb,
Gtin,
Image,
ProductPage,
)
from zyte_common_items.processors import (
_format_price,
brand_processor,
breadcrumbs_processor,
gtin_processor,
images_processor,
price_processor,
rating_processor,
)

Expand Down Expand Up @@ -125,16 +135,16 @@ def breadcrumbs(self):
"input_value,expected_value",
[
(None, None),
("", ""),
("foo", "foo"),
("", None),
("foo", Brand(name="foo")),
(Selector(text="<html></html>"), None),
(SelectorList([]), None),
(fromstring("<p>foo</p>"), "foo"),
(fromstring("<img alt='foo'>"), "foo"),
(fromstring("<p><img alt='foo'></p>"), "foo"),
(fromstring("<p><p><img alt='foo'></p></p>"), "foo"),
(Selector(text="<p>foo</p>"), "foo"),
(SelectorList([Selector(text="<p>foo</p>")]), "foo"),
(fromstring("<p>foo</p>"), Brand(name="foo")),
(fromstring("<img alt='foo'>"), Brand(name="foo")),
(fromstring("<p><img alt='foo'></p>"), Brand(name="foo")),
(fromstring("<p><p><img alt='foo'></p></p>"), Brand(name="foo")),
(Selector(text="<p>foo</p>"), Brand(name="foo")),
(SelectorList([Selector(text="<p>foo</p>")]), Brand(name="foo")),
],
)
def test_brand(input_value, expected_value):
Expand All @@ -158,7 +168,7 @@ def brand(self):
body="<html><body><img alt='foo'></body></html>".encode(),
)
page = MyProductPage(response=response)
assert page.brand == "foo"
assert page.brand == Brand(name="foo")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -321,3 +331,83 @@ def aggregateRating(self):
assert page.aggregateRating == AggregateRating(
ratingValue=3.8, bestRating=10, reviewCount=5
)


_SINGLE_IMAGE_URL = "https://www.url.com/img.jpg"
_IMAGE_URLS = ["https://www.url.com/img1.jpg", "https://www.url.com/img2.jpg"]


def _images_for_urls():
    """Return a fresh list with one ``Image`` per entry of ``_IMAGE_URLS``."""
    return [Image(url) for url in _IMAGE_URLS]


@pytest.mark.parametrize(
    "input_value,expected_value",
    [
        # Values that pass through unchanged.
        (None, None),
        ([], []),
        # A single URL string becomes a one-element list.
        (_SINGLE_IMAGE_URL, [Image(url=_SINGLE_IMAGE_URL)]),
        # Lists of Image objects, of URL strings, and of {"url": ...} mappings.
        (_images_for_urls(), _images_for_urls()),
        (list(_IMAGE_URLS), _images_for_urls()),
        ([{"url": url} for url in _IMAGE_URLS], _images_for_urls()),
    ],
)
def test_images(input_value, expected_value):
    """Check the value of an ``images`` field post-processed by ``images_processor``."""

    class _PageUnderTest(BasePage):
        @field(out=[images_processor])
        def images(self):
            return input_value

    page = _PageUnderTest(base_url)  # type: ignore[arg-type]
    actual = page.images
    assert actual == expected_value


def test_images_page():
    """Check that a plain ``images`` field on a ``ProductPage`` subclass that
    returns URL strings is exposed as a list of ``Image`` objects."""

    class _ImagesProductPage(ProductPage):
        @field
        def images(self):
            matches = self.css("img::attr(href)")
            return matches.getall()

    html = "<html><body><img href='https://www.url.com/img.jpg'></body></html>"
    response = HttpResponse(url="http://www.example.com/", body=html.encode())
    expected = [Image(url="https://www.url.com/img.jpg")]

    page = _ImagesProductPage(response=response)
    assert page.images == expected


# (raw field value, expected value after ``price_processor``)
_PRICE_CASES = [
    (100, "100.00"),
    (None, None),
    ([], []),
    ({}, {}),
    (22.9, "22.90"),
    (22.0, "22.00"),
]


@pytest.mark.parametrize("input_value,expected_value", _PRICE_CASES)
def test_prices(input_value, expected_value):
    """Check the value of a ``price`` field post-processed by ``price_processor``."""

    class _PageUnderTest(BasePage):
        @field(out=[price_processor])
        def price(self):
            return input_value

    page = _PageUnderTest(base_url)  # type: ignore[arg-type]
    actual = page.price
    assert actual == expected_value
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ commands = mypy zyte_common_items tests
[testenv:twinecheck]
basepython = python3
deps =
twine==4.0.2
twine==5.1.1
build==0.10.0
commands =
python -m build --sdist
Expand Down
3 changes: 3 additions & 0 deletions zyte_common_items/pages/product.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
description_html_processor,
description_processor,
gtin_processor,
images_processor,
price_processor,
rating_processor,
simple_price_processor,
Expand Down Expand Up @@ -46,6 +47,7 @@ class Processors(BasePage.Processors):
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]


class ProductPage(
Expand All @@ -62,6 +64,7 @@ class Processors(Page.Processors):
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]


@attrs.define
Expand Down
105 changes: 87 additions & 18 deletions zyte_common_items/processors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections.abc import Iterable
from collections.abc import Iterable, Mapping
from functools import wraps
from numbers import Real
from typing import Any, Callable, List, Optional, Union

from clear_html import clean_node, cleaned_node_to_html, cleaned_node_to_text
Expand All @@ -21,8 +22,10 @@
from .components import (
AggregateRating,
BaseMetadata,
Brand,
Breadcrumb,
Gtin,
Image,
ProbabilityRequest,
Request,
)
Expand Down Expand Up @@ -104,50 +107,78 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb:
return results


@only_handle_nodes
def brand_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
def brand_processor(value: Any, page: Any) -> Any:
"""Convert the data into a brand name if possible.

Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
Other inputs are returned as is.
If inputs are either :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts
to extract brand data from it.

If value is a string, uses it to create a :class:`~zyte_common_items.Brand` instance.

Other inputs are returned unchanged.
"""
return extract_brand_name(value, search_depth=2)
value = _handle_selectorlist(value)

if isinstance(value, str):
return Brand(name=value) if value else None

@only_handle_nodes
def price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
if isinstance(value, (Selector, SelectorList, HtmlElement)):
if brand_name := extract_brand_name(value, search_depth=2):
return Brand(name=brand_name)
else:
return None

return value


def price_processor(value: Any, page: Any) -> Any:
"""Convert the data into a price string if possible.

Uses the price-parser_ library.

Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
:class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values.

Other inputs are returned as is.

Puts the parsed Price object into ``page._parsed_price``.

.. _price-parser: https://github.com/scrapinghub/price-parser
"""
price = extract_price(value)
page._parsed_price = price
return _format_price(price)
value = _handle_selectorlist(value)

if isinstance(value, Real):
return f"{value:.2f}"
elif isinstance(value, (Selector, HtmlElement)):
price = extract_price(value)
page._parsed_price = price
return _format_price(price)
else:
return value

@only_handle_nodes
def simple_price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:

def simple_price_processor(value: Any, page: Any) -> Any:
"""Convert the data into a price string if possible.

Uses the price-parser_ library.

Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
:class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values.

Other inputs are returned as is.

.. _price-parser: https://github.com/scrapinghub/price-parser
"""
price = extract_price(value)
return _format_price(price)
value = _handle_selectorlist(value)

if isinstance(value, Real):
return f"{value:.2f}"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems this would allow us to remove the duplication between price_processor and simple_price_processor.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'd like to migrate our stuff quickly before certain deadlines, which is why I want to have this working for now and avoid coordinating PRs across yet another dependency repository (which is also why I put the proposed parsing of images into a TODO). I think we can improve it later and migrate it to a more fitting place.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good!

elif isinstance(value, (Selector, HtmlElement)):
price = extract_price(value)
return _format_price(price)
else:
return value


@only_handle_nodes
Expand Down Expand Up @@ -330,6 +361,44 @@ def aggregateRating(self):
return value


def images_processor(value: Any, page: Any) -> Any:
    """Convert the data into a list of :class:`~zyte_common_items.Image`
    objects if possible.

    If the input is a string, it is used as the URL of a single image.

    If the input is a mapping, it is treated as the description of a single
    image: a one-element list is returned when it has a non-empty ``"url"``
    value, and an empty list otherwise.

    If the input is any other iterable, its items are converted one by one:
    :class:`~zyte_common_items.Image` objects are kept as they are, strings
    are used as URLs, and mappings contribute their ``"url"`` value. Mappings
    without a non-empty ``"url"`` and items of any other type are skipped.

    Other inputs are returned unchanged.

    :param value: Raw value returned by the field.
    :param page: Page object the field belongs to; not used by this processor.
    :return: A list of images, or ``value`` itself when it is not a string,
        a mapping or an iterable.
    """

    # TODO: add generic-purpose extract_images utility to zyte-parsers
    #
    # value = _handle_selectorlist(value)
    # if isinstance(value, (Selector, HtmlElement)):
    #     images = extract_images(value)
    #     return [Image(url=url) for url in images]

    if isinstance(value, str):
        return [Image(url=value)]

    if isinstance(value, Mapping):
        # A mapping is itself iterable (over its keys). Without this branch a
        # bare {"url": ...} would fall into the loop below and be turned into
        # Image(url="url").
        url = value.get("url")
        return [Image(url=url)] if url else []

    if not isinstance(value, Iterable):
        return value

    results: List[Any] = []
    for item in value:
        if isinstance(item, Image):
            results.append(item)
        elif isinstance(item, Mapping):
            # Mappings with a missing or empty "url" are silently dropped.
            if url := item.get("url"):
                results.append(Image(url=url))
        elif isinstance(item, str):
            results.append(Image(url=item))

    return results


def probability_request_list_processor(
request_list: List[Request],
) -> List[ProbabilityRequest]:
Expand Down