diff --git a/tests/test_processors.py b/tests/test_processors.py index 87c6d263..dbda616e 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -7,12 +7,22 @@ from zyte_parsers import Gtin as zp_Gtin from zyte_parsers import extract_breadcrumbs -from zyte_common_items import AggregateRating, BasePage, Breadcrumb, Gtin, ProductPage +from zyte_common_items import ( + AggregateRating, + BasePage, + Brand, + Breadcrumb, + Gtin, + Image, + ProductPage, +) from zyte_common_items.processors import ( _format_price, brand_processor, breadcrumbs_processor, gtin_processor, + images_processor, + price_processor, rating_processor, ) @@ -125,16 +135,18 @@ def breadcrumbs(self): "input_value,expected_value", [ (None, None), - ("", ""), - ("foo", "foo"), + ("", None), + (" ", None), + ("foo", Brand(name="foo")), + (" foo ", Brand(name="foo")), (Selector(text=""), None), (SelectorList([]), None), - (fromstring("

foo

"), "foo"), - (fromstring("foo"), "foo"), - (fromstring("

foo

"), "foo"), - (fromstring("

foo

"), "foo"), - (Selector(text="

foo

"), "foo"), - (SelectorList([Selector(text="

foo

")]), "foo"), + (fromstring("

foo

"), Brand(name="foo")), + (fromstring("foo"), Brand(name="foo")), + (fromstring("

foo

"), Brand(name="foo")), + (fromstring("

foo

"), Brand(name="foo")), + (Selector(text="

foo

"), Brand(name="foo")), + (SelectorList([Selector(text="

foo

")]), Brand(name="foo")), ], ) def test_brand(input_value, expected_value): @@ -158,7 +170,7 @@ def brand(self): body="foo".encode(), ) page = MyProductPage(response=response) - assert page.brand == "foo" + assert page.brand == Brand(name="foo") @pytest.mark.parametrize( @@ -321,3 +333,85 @@ def aggregateRating(self): assert page.aggregateRating == AggregateRating( ratingValue=3.8, bestRating=10, reviewCount=5 ) + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + (None, None), + ([], []), + ("https://www.url.com/img.jpg", [Image(url="https://www.url.com/img.jpg")]), + ( + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + ), + ( + ["https://www.url.com/img1.jpg", "https://www.url.com/img2.jpg"], + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + ), + ( + [ + {"url": "https://www.url.com/img1.jpg"}, + {"url": "https://www.url.com/img2.jpg"}, + ], + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + ), + ], +) +def test_images(input_value, expected_value): + class ImagesPage(BasePage): + @field(out=[images_processor]) + def images(self): + return input_value + + page = ImagesPage(base_url) # type: ignore[arg-type] + assert page.images == expected_value + + +def test_images_page(): + class MyProductPage(ProductPage): + @field + def images(self): + return self.css("img::attr(href)").getall() + + response = HttpResponse( + url="http://www.example.com/", + body="".encode(), + ) + page = MyProductPage(response=response) + assert page.images == [Image(url="https://www.url.com/img.jpg")] + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + (100, "100.00"), + (None, None), + ([], []), + ({}, {}), + (22.9, "22.90"), + (22.0, "22.00"), + ("22.9", "22.9"), + ("Do not apply to strings...", "Do not apply to strings..."), + ], +) +def test_prices(input_value, expected_value): + class PricePage(BasePage): + @field(out=[price_processor]) + def price(self): + return input_value + + page = PricePage(base_url) # type: ignore[arg-type] + assert page.price == expected_value diff --git a/tox.ini b/tox.ini index 27bf6d3a..50f412cc 100644 --- a/tox.ini +++ b/tox.ini @@ -73,7 +73,7 @@ commands = mypy zyte_common_items tests [testenv:twinecheck] basepython = python3 deps = - twine==4.0.2 + twine==5.1.1 build==0.10.0 commands = python -m build --sdist diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py index fc297629..45c1500a 100644 --- a/zyte_common_items/pages/product.py +++ b/zyte_common_items/pages/product.py @@ -19,6 +19,7 @@ description_html_processor, description_processor, gtin_processor, + images_processor, price_processor, rating_processor, simple_price_processor, @@ -46,6 +47,7 @@ class Processors(BasePage.Processors): gtin = [gtin_processor] price = [price_processor] regularPrice = [simple_price_processor] + images = [images_processor] class ProductPage( @@ -62,6 +64,7 @@ class Processors(Page.Processors): gtin = [gtin_processor] price = [price_processor] regularPrice = [simple_price_processor] + images = [images_processor] @attrs.define diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index e4306c11..d803f5e1 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -1,5 +1,6 @@ -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from functools import wraps +from numbers import Real from typing import Any, Callable, List, Optional, Union from clear_html import clean_node, cleaned_node_to_html, cleaned_node_to_text @@ -21,8 +22,10 @@ from .components import ( AggregateRating, BaseMetadata, + Brand, Breadcrumb, Gtin, + Image, ProbabilityRequest, Request, ) @@ -104,50 +107,79 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb: return results -@only_handle_nodes -def brand_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: +def brand_processor(value: Any, page: Any) -> Any: """Convert the data into a brand name if possible. - Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. - Other inputs are returned as is. + If inputs are either :class:`~parsel.selector.Selector`, + :class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts + to extract brand data from it. + + If value is a string, uses it to create a :class:`~zyte_common_items.Brand` instance. + + Other inputs are returned unchanged. """ - return extract_brand_name(value, search_depth=2) + value = _handle_selectorlist(value) + if isinstance(value, str): + value = value.strip() + return Brand(name=value) if value else None -@only_handle_nodes -def price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: + if isinstance(value, (Selector, SelectorList, HtmlElement)): + if brand_name := extract_brand_name(value, search_depth=2): + return Brand(name=brand_name) + else: + return None + + return value + + +def price_processor(value: Any, page: Any) -> Any: """Convert the data into a price string if possible. Uses the price-parser_ library. Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. + :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values. + Other inputs are returned as is. Puts the parsed Price object into ``page._parsed_price``. .. _price-parser: https://github.com/scrapinghub/price-parser """ - price = extract_price(value) - page._parsed_price = price - return _format_price(price) + value = _handle_selectorlist(value) + if isinstance(value, Real): + return f"{value:.2f}" + elif isinstance(value, (Selector, HtmlElement)): + price = extract_price(value) + page._parsed_price = price + return _format_price(price) + else: + return value -@only_handle_nodes -def simple_price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: + +def simple_price_processor(value: Any, page: Any) -> Any: """Convert the data into a price string if possible. Uses the price-parser_ library. Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. + :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values. + Other inputs are returned as is. .. _price-parser: https://github.com/scrapinghub/price-parser """ - price = extract_price(value) - return _format_price(price) + value = _handle_selectorlist(value) + + if isinstance(value, Real): + return f"{value:.2f}" + elif isinstance(value, (Selector, HtmlElement)): + price = extract_price(value) + return _format_price(price) + else: + return value @only_handle_nodes @@ -330,6 +362,37 @@ def aggregateRating(self): return value +def images_processor(value: Any, page: Any) -> Any: + """Convert the data into a list of :class:`~zyte_common_items.Image` + objects if possible. + + If the input is a string, it's used as a url for returning image object. + + If input is either an iterable of strings or mappings with "url" key, they are + used to populate image objects. + + Other inputs are returned unchanged. + """ + + if isinstance(value, str): + return [Image(url=value)] + + if isinstance(value, Iterable): + results: List[Any] = [] + for item in value: + if isinstance(item, Image): + results.append(item) + elif isinstance(item, Mapping): + if url := item.get("url"): + results.append(Image(url=url)) + elif isinstance(item, str): + results.append(Image(url=item)) + + return results + + return value + + def probability_request_list_processor( request_list: List[Request], ) -> List[ProbabilityRequest]: