From 78f0655b33c783c181be6f169adaa8c60cade784 Mon Sep 17 00:00:00 2001 From: Nykakin Date: Mon, 12 Aug 2024 06:53:53 +0200 Subject: [PATCH 01/14] Update processors --- tests/test_processors.py | 105 ++++++++++++++++++++++++++--- zyte_common_items/pages/product.py | 3 + zyte_common_items/processors.py | 104 +++++++++++++++++++++++----- 3 files changed, 183 insertions(+), 29 deletions(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index 87c6d263..c525b45b 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -7,12 +7,22 @@ from zyte_parsers import Gtin as zp_Gtin from zyte_parsers import extract_breadcrumbs -from zyte_common_items import AggregateRating, BasePage, Breadcrumb, Gtin, ProductPage +from zyte_common_items import ( + AggregateRating, + BasePage, + Brand, + Breadcrumb, + Gtin, + Image, + ProductPage, +) from zyte_common_items.processors import ( _format_price, brand_processor, breadcrumbs_processor, gtin_processor, + images_processor, + price_processor, rating_processor, ) @@ -125,16 +135,16 @@ def breadcrumbs(self): "input_value,expected_value", [ (None, None), - ("", ""), - ("foo", "foo"), + ("", None), + ("foo", Brand(name="foo")), (Selector(text=""), None), (SelectorList([]), None), - (fromstring("

foo

"), "foo"), - (fromstring("foo"), "foo"), - (fromstring("

foo

"), "foo"), - (fromstring("

foo

"), "foo"), - (Selector(text="

foo

"), "foo"), - (SelectorList([Selector(text="

foo

")]), "foo"), + (fromstring("

foo

"), Brand(name="foo")), + (fromstring("foo"), Brand(name="foo")), + (fromstring("

foo

"), Brand(name="foo")), + (fromstring("

foo

"), Brand(name="foo")), + (Selector(text="

foo

"), Brand(name="foo")), + (SelectorList([Selector(text="

foo

")]), Brand(name="foo")), ], ) def test_brand(input_value, expected_value): @@ -149,7 +159,7 @@ def brand(self): def test_brand_page(): class MyProductPage(ProductPage): - @field + @field(out=[brand_processor]) def brand(self): return self.css("body") @@ -158,7 +168,7 @@ def brand(self): body="foo".encode(), ) page = MyProductPage(response=response) - assert page.brand == "foo" + assert page.brand == Brand(name="foo") @pytest.mark.parametrize( @@ -321,3 +331,76 @@ def aggregateRating(self): assert page.aggregateRating == AggregateRating( ratingValue=3.8, bestRating=10, reviewCount=5 ) + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + (None, None), + ([], []), + ("https://www.url.com/img.jpg", [Image(url="https://www.url.com/img.jpg")]), + ( + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + ), + ( + ["https://www.url.com/img1.jpg", "https://www.url.com/img2.jpg"], + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + ), + ( + [ + {"url": "https://www.url.com/img1.jpg"}, + {"url": "https://www.url.com/img2.jpg"}, + ], + [ + Image("https://www.url.com/img1.jpg"), + Image("https://www.url.com/img2.jpg"), + ], + ), + ], +) +def test_images(input_value, expected_value): + class ImagesPage(BasePage): + @field(out=[images_processor]) + def images(self): + return input_value + + page = ImagesPage(base_url) # type: ignore[arg-type] + assert page.images == expected_value + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + ("$10", "10.00"), + ("100 ", "100.00"), + ("100rub", "100.00"), + (100, "100.00"), + (None, None), + ([], []), + ({}, {}), + ("", None), + ("buy 10 ab ab", "10.00"), + ("1,000.17", "1000.17"), + ("1,000", "1000.00"), + (22.9, "22.90"), + (22.0, "22.00"), + ] +) +def test_prices(input_value, expected_value): + class PricePage(BasePage): + @field(out=[price_processor]) + def price(self): + return input_value + + page = PricePage(base_url) # type: ignore[arg-type] + assert page.price == expected_value diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py index fc297629..45c1500a 100644 --- a/zyte_common_items/pages/product.py +++ b/zyte_common_items/pages/product.py @@ -19,6 +19,7 @@ description_html_processor, description_processor, gtin_processor, + images_processor, price_processor, rating_processor, simple_price_processor, @@ -46,6 +47,7 @@ class Processors(BasePage.Processors): gtin = [gtin_processor] price = [price_processor] regularPrice = [simple_price_processor] + images = [images_processor] class ProductPage( @@ -62,6 +64,7 @@ class Processors(Page.Processors): gtin = [gtin_processor] price = [price_processor] regularPrice = [simple_price_processor] + images = [images_processor] @attrs.define diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index e4306c11..0d070c18 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -1,5 +1,6 @@ -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from functools import wraps +from numbers import Real from typing import Any, Callable, List, Optional, Union from clear_html import clean_node, cleaned_node_to_html, cleaned_node_to_text @@ -21,8 +22,10 @@ from .components import ( AggregateRating, BaseMetadata, + Brand, Breadcrumb, Gtin, + Image, ProbabilityRequest, Request, ) @@ -104,50 +107,76 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb: return results -@only_handle_nodes -def brand_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: +def brand_processor(value: Any, page: Any) -> Union[Brand, None]: """Convert the data into a brand name if possible. - Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. - Other inputs are returned as is. + If inputs are either :class:`~parsel.selector.Selector`, + :class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts + to extract brand data from it. + + If value is a string, use it to create brand object instance + + Other inputs are returned unchanged """ - return extract_brand_name(value, search_depth=2) + value = _handle_selectorlist(value) + if isinstance(value, str): + return Brand(name=value) if value else None -@only_handle_nodes -def price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: + if isinstance(value, (Selector, SelectorList, HtmlElement)): + if brand_name := extract_brand_name(value, search_depth=2): + return Brand(name=brand_name) + else: + return None + + return value + + +def price_processor(value: Any, page: Any) -> Any: """Convert the data into a price string if possible. Uses the price-parser_ library. Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. + :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string + instances and numberic values. + Other inputs are returned as is. Puts the parsed Price object into ``page._parsed_price``. .. _price-parser: https://github.com/scrapinghub/price-parser """ - price = extract_price(value) - page._parsed_price = price - return _format_price(price) + if isinstance(value, Real): + return f"{value:.2f}" + elif isinstance(value, (Selector, HtmlElement, str)): + price = extract_price(value) + page._parsed_price = price + return _format_price(price) + else: + return value -@only_handle_nodes -def simple_price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: +def simple_price_processor(value: Any, page: Any) -> Any: """Convert the data into a price string if possible. Uses the price-parser_ library. Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. + :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string + instances and numberic values. + Other inputs are returned as is. .. _price-parser: https://github.com/scrapinghub/price-parser """ - price = extract_price(value) - return _format_price(price) + if isinstance(value, Real): + return f"{value:.2f}" + elif isinstance(value, (Selector, HtmlElement, str)): + price = extract_price(value) + return _format_price(price) + else: + return value @only_handle_nodes @@ -330,6 +359,45 @@ def aggregateRating(self): return value +def images_processor(value: Any, page: Any) -> List[Image]: + """Convert the data into a list of :class:`~zyte_common_items.Image` + objects if possible. + + If the input is a string, it's used as a url for returning image object. + + If input is either an iterable of strings or mappings with "url" key, they are + used to populate image objects. + + Other inputs are returned unchanged. + """ + + value = _handle_selectorlist(value) + + # TODO: add generic-purpose extract_images utility to zyte-parsers + # + # if isinstance(value, (Selector, HtmlElement)): + # images = extract_images(value) + # return [Image(url=url) for url in images] + + if isinstance(value, str): + return [Image(url=value)] + + if isinstance(value, Iterable): + results: List[Any] = [] + for item in value: + if isinstance(item, Image): + results.append(item) + elif isinstance(item, Mapping): + if url := item.get("url"): + results.append(Image(url=url)) + elif isinstance(item, str): + results.append(Image(url=item)) + + return results + + return value + + def probability_request_list_processor( request_list: List[Request], ) -> List[ProbabilityRequest]: From 0e0e733ddfb0ccd190b7e32f8fb54971ccc98820 Mon Sep 17 00:00:00 2001 From: Nykakin Date: Mon, 12 Aug 2024 07:05:48 +0200 Subject: [PATCH 02/14] Clean-up --- tests/test_pages_price.py | 14 +++++++------- zyte_common_items/processors.py | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_pages_price.py b/tests/test_pages_price.py index e367c375..23fdd0e4 100644 --- a/tests/test_pages_price.py +++ b/tests/test_pages_price.py @@ -62,21 +62,21 @@ def price(self): url = "https://example.com" page = CustomProductPage(response=HttpResponse(url=url, body=html)) assert page.call_count == 0 - assert page.price == "$13.2" + assert page.price == "13.20" assert page.call_count == 1 assert page.currency is None - assert await page.currencyRaw is None - assert page.call_count == 2 # we want this to be 1 - assert await page.currencyRaw is None - assert page.call_count == 2 # we want this to be 1 + assert await page.currencyRaw == "$" + assert page.call_count == 1 # we want this to be 1 + assert await page.currencyRaw == "$" + assert page.call_count == 1 # we want this to be 1 # access currency fields before the price field page = CustomProductPage(response=HttpResponse(url=url, body=html)) assert page.call_count == 0 assert page.currency is None - assert await page.currencyRaw is None + assert await page.currencyRaw == "$" assert page.call_count == 1 - assert page.price == "$13.2" + assert page.price == "13.20" assert page.call_count == 2 # we want this to be 1 diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index 0d070c18..03fbadc8 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -147,6 +147,8 @@ def price_processor(value: Any, page: Any) -> Any: .. _price-parser: https://github.com/scrapinghub/price-parser """ + value = _handle_selectorlist(value) + if isinstance(value, Real): return f"{value:.2f}" elif isinstance(value, (Selector, HtmlElement, str)): @@ -170,6 +172,8 @@ def simple_price_processor(value: Any, page: Any) -> Any: .. _price-parser: https://github.com/scrapinghub/price-parser """ + value = _handle_selectorlist(value) + if isinstance(value, Real): return f"{value:.2f}" elif isinstance(value, (Selector, HtmlElement, str)): From 308db2eb58bbfbfc155c65c05eb1a12ded0ebbdd Mon Sep 17 00:00:00 2001 From: Nykakin Date: Mon, 12 Aug 2024 07:11:36 +0200 Subject: [PATCH 03/14] Fix typos --- zyte_common_items/processors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index 03fbadc8..494b0897 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -139,7 +139,7 @@ def price_processor(value: Any, page: Any) -> Any: Supported inputs are :class:`~parsel.selector.Selector`, :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string - instances and numberic values. + instances and numeric values. Other inputs are returned as is. @@ -166,7 +166,7 @@ def simple_price_processor(value: Any, page: Any) -> Any: Supported inputs are :class:`~parsel.selector.Selector`, :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string - instances and numberic values. + instances and numeric values. Other inputs are returned as is. From 3fc6bb2b0f82e2850c77293ba85e7ca40e654f1e Mon Sep 17 00:00:00 2001 From: Nykakin Date: Mon, 12 Aug 2024 07:23:23 +0200 Subject: [PATCH 04/14] Fix twinecheck exception in CI --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 27bf6d3a..50f412cc 100644 --- a/tox.ini +++ b/tox.ini @@ -73,7 +73,7 @@ commands = mypy zyte_common_items tests [testenv:twinecheck] basepython = python3 deps = - twine==4.0.2 + twine==5.1.1 build==0.10.0 commands = python -m build --sdist From 6620a1538ee3b558d78705800ad4ea19f058f7dc Mon Sep 17 00:00:00 2001 From: Nykakin Date: Mon, 12 Aug 2024 07:27:38 +0200 Subject: [PATCH 05/14] Add trailing "," --- tests/test_processors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index c525b45b..2094972b 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -394,7 +394,7 @@ def images(self): ("1,000", "1000.00"), (22.9, "22.90"), (22.0, "22.00"), - ] + ], ) def test_prices(input_value, expected_value): class PricePage(BasePage): From 67cf862517dca1febe80d942ca908fdd6efd6d8b Mon Sep 17 00:00:00 2001 From: Nykakin Date: Fri, 16 Aug 2024 05:28:56 +0200 Subject: [PATCH 06/14] Variant processor --- tests/test_processors.py | 21 +++++++++++++++++++++ zyte_common_items/pages/product.py | 3 +++ zyte_common_items/processors.py | 19 +++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/tests/test_processors.py b/tests/test_processors.py index 2094972b..1e7bec02 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -15,6 +15,7 @@ Gtin, Image, ProductPage, + ProductVariant, ) from zyte_common_items.processors import ( _format_price, @@ -24,6 +25,7 @@ images_processor, price_processor, rating_processor, + variants_processor ) base_url = "http://www.example.com/blog/" @@ -404,3 +406,22 @@ def price(self): page = PricePage(base_url) # type: ignore[arg-type] assert page.price == expected_value + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + ( + [{"price": "12,12$"}, {"price": 100}], + [ProductVariant(price="12.12"), ProductVariant(price="100.00")] + ), + ], +) +def test_variants(input_value, expected_value): + class VariantPage(BasePage): + @field(out=[variants_processor]) + def variants(self): + return input_value + + page = VariantPage(base_url) # type: ignore[arg-type] + assert page.variants == expected_value diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py index 45c1500a..cad470f1 100644 --- a/zyte_common_items/pages/product.py +++ b/zyte_common_items/pages/product.py @@ -23,6 +23,7 @@ price_processor, rating_processor, simple_price_processor, + variants_processor, ) from .base import BasePage, Page @@ -48,6 +49,7 @@ class Processors(BasePage.Processors): price = [price_processor] regularPrice = [simple_price_processor] images = [images_processor] + variants = [variants_processor] class ProductPage( @@ -65,6 +67,7 @@ class Processors(Page.Processors): price = [price_processor] regularPrice = [simple_price_processor] images = [images_processor] + variants = [variants_processor] @attrs.define diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index 494b0897..41f66ced 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -29,6 +29,7 @@ ProbabilityRequest, Request, ) +from .items import ProductVariant def _get_base_url(page: Any) -> Optional[str]: @@ -417,3 +418,21 @@ def metadata_processor(metadata: BaseMetadata, page): if page.metadata_cls is None: return None return metadata.cast(page.metadata_cls) + + +def variants_processor(variants: list[Any], page: Any) -> list[ProductVariant]: + ret = [] + for variant in variants: + if isinstance(variant, Mapping): + processors = { + "price": simple_price_processor, + "regularPrice": simple_price_processor, + } + for field, processor in processors.items(): + if field in variant: + variant[field] = processor(variant[field], page) + + ret.append(ProductVariant(**variant)) + else: + ret.append(variant) + return ret From ba1a5e93749c65eac05e4879588f87695aca818e Mon Sep 17 00:00:00 2001 From: Nykakin Date: Tue, 20 Aug 2024 17:21:53 +0200 Subject: [PATCH 07/14] Apply changes from code review --- zyte_common_items/processors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index 41f66ced..3a1cb4d4 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -108,16 +108,16 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb: return results -def brand_processor(value: Any, page: Any) -> Union[Brand, None]: +def brand_processor(value: Any, page: Any) -> Any: """Convert the data into a brand name if possible. If inputs are either :class:`~parsel.selector.Selector`, :class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts to extract brand data from it. - If value is a string, use it to create brand object instance + If value is a string, uses it to create a :class:`~zyte_common_items.Brand` instance. - Other inputs are returned unchanged + Other inputs are returned unchanged. """ value = _handle_selectorlist(value) @@ -364,7 +364,7 @@ def aggregateRating(self): return value -def images_processor(value: Any, page: Any) -> List[Image]: +def images_processor(value: Any, page: Any) -> Any: """Convert the data into a list of :class:`~zyte_common_items.Image` objects if possible. From 71eb060a98618c579b985d19614d06e18f8bb8ac Mon Sep 17 00:00:00 2001 From: Nykakin Date: Tue, 20 Aug 2024 17:24:19 +0200 Subject: [PATCH 08/14] Revert "Variant processor" This reverts commit 67cf862517dca1febe80d942ca908fdd6efd6d8b. --- tests/test_processors.py | 21 --------------------- zyte_common_items/pages/product.py | 3 --- zyte_common_items/processors.py | 19 ------------------- 3 files changed, 43 deletions(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index 1e7bec02..2094972b 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -15,7 +15,6 @@ Gtin, Image, ProductPage, - ProductVariant, ) from zyte_common_items.processors import ( _format_price, @@ -25,7 +24,6 @@ images_processor, price_processor, rating_processor, - variants_processor ) base_url = "http://www.example.com/blog/" @@ -406,22 +404,3 @@ def price(self): page = PricePage(base_url) # type: ignore[arg-type] assert page.price == expected_value - - -@pytest.mark.parametrize( - "input_value,expected_value", - [ - ( - [{"price": "12,12$"}, {"price": 100}], - [ProductVariant(price="12.12"), ProductVariant(price="100.00")] - ), - ], -) -def test_variants(input_value, expected_value): - class VariantPage(BasePage): - @field(out=[variants_processor]) - def variants(self): - return input_value - - page = VariantPage(base_url) # type: ignore[arg-type] - assert page.variants == expected_value diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py index cad470f1..45c1500a 100644 --- a/zyte_common_items/pages/product.py +++ b/zyte_common_items/pages/product.py @@ -23,7 +23,6 @@ price_processor, rating_processor, simple_price_processor, - variants_processor, ) from .base import BasePage, Page @@ -49,7 +48,6 @@ class Processors(BasePage.Processors): price = [price_processor] regularPrice = [simple_price_processor] images = [images_processor] - variants = [variants_processor] class ProductPage( @@ -67,7 +65,6 @@ class Processors(Page.Processors): price = [price_processor] regularPrice = [simple_price_processor] images = [images_processor] - variants = [variants_processor] @attrs.define diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index 3a1cb4d4..a423eaae 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -29,7 +29,6 @@ ProbabilityRequest, Request, ) -from .items import ProductVariant def _get_base_url(page: Any) -> Optional[str]: @@ -418,21 +417,3 @@ def metadata_processor(metadata: BaseMetadata, page): if page.metadata_cls is None: return None return metadata.cast(page.metadata_cls) - - -def variants_processor(variants: list[Any], page: Any) -> list[ProductVariant]: - ret = [] - for variant in variants: - if isinstance(variant, Mapping): - processors = { - "price": simple_price_processor, - "regularPrice": simple_price_processor, - } - for field, processor in processors.items(): - if field in variant: - variant[field] = processor(variant[field], page) - - ret.append(ProductVariant(**variant)) - else: - ret.append(variant) - return ret From d2ab672afffdc9aad5a064a8345e663d4af8dd33 Mon Sep 17 00:00:00 2001 From: Nykakin Date: Thu, 22 Aug 2024 00:32:13 +0200 Subject: [PATCH 09/14] Apply changes from code review --- tests/test_processors.py | 2 +- zyte_common_items/processors.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index 2094972b..427d8002 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -158,7 +158,7 @@ def brand(self): def test_brand_page(): - class MyProductPage(ProductPage): + class MyProductPage(BasePage): @field(out=[brand_processor]) def brand(self): return self.css("body") diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index a423eaae..cc686d30 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -375,10 +375,9 @@ def images_processor(value: Any, page: Any) -> Any: Other inputs are returned unchanged. """ - value = _handle_selectorlist(value) - # TODO: add generic-purpose extract_images utility to zyte-parsers # + # value = _handle_selectorlist(value) # if isinstance(value, (Selector, HtmlElement)): # images = extract_images(value) # return [Image(url=url) for url in images] From 30baf8a0ca371d93af1118a7829d8f4e607378a8 Mon Sep 17 00:00:00 2001 From: Nykakin Date: Thu, 22 Aug 2024 01:10:27 +0200 Subject: [PATCH 10/14] Fix tox --- tests/test_processors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index 427d8002..2d37b0b7 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -14,6 +14,7 @@ Breadcrumb, Gtin, Image, + Page, ProductPage, ) from zyte_common_items.processors import ( @@ -158,7 +159,7 @@ def brand(self): def test_brand_page(): - class MyProductPage(BasePage): + class MyProductPage(Page): @field(out=[brand_processor]) def brand(self): return self.css("body") From aa42e22b7db52fc5ab1a711df44f6180069d6d37 Mon Sep 17 00:00:00 2001 From: Nykakin Date: Thu, 22 Aug 2024 15:36:39 +0200 Subject: [PATCH 11/14] Revert handling strings by price_processor --- tests/test_pages_price.py | 14 +++++++------- tests/test_processors.py | 7 ------- zyte_common_items/processors.py | 5 ++--- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/tests/test_pages_price.py b/tests/test_pages_price.py index 23fdd0e4..e367c375 100644 --- a/tests/test_pages_price.py +++ b/tests/test_pages_price.py @@ -62,21 +62,21 @@ def price(self): url = "https://example.com" page = CustomProductPage(response=HttpResponse(url=url, body=html)) assert page.call_count == 0 - assert page.price == "13.20" + assert page.price == "$13.2" assert page.call_count == 1 assert page.currency is None - assert await page.currencyRaw == "$" - assert page.call_count == 1 # we want this to be 1 - assert await page.currencyRaw == "$" - assert page.call_count == 1 # we want this to be 1 + assert await page.currencyRaw is None + assert page.call_count == 2 # we want this to be 1 + assert await page.currencyRaw is None + assert page.call_count == 2 # we want this to be 1 # access currency fields before the price field page = CustomProductPage(response=HttpResponse(url=url, body=html)) assert page.call_count == 0 assert page.currency is None - assert await page.currencyRaw == "$" + assert await page.currencyRaw is None assert page.call_count == 1 - assert page.price == "13.20" + assert page.price == "$13.2" assert page.call_count == 2 # we want this to be 1 diff --git a/tests/test_processors.py b/tests/test_processors.py index 2d37b0b7..c9e9e0ef 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -382,17 +382,10 @@ def images(self): @pytest.mark.parametrize( "input_value,expected_value", [ - ("$10", "10.00"), - ("100 ", "100.00"), - ("100rub", "100.00"), (100, "100.00"), (None, None), ([], []), ({}, {}), - ("", None), - ("buy 10 ab ab", "10.00"), - ("1,000.17", "1000.17"), - ("1,000", "1000.00"), (22.9, "22.90"), (22.0, "22.00"), ], diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index cc686d30..9da38bed 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -138,8 +138,7 @@ def price_processor(value: Any, page: Any) -> Any: Uses the price-parser_ library. Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string - instances and numeric values. + :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values. Other inputs are returned as is. @@ -151,7 +150,7 @@ def price_processor(value: Any, page: Any) -> Any: if isinstance(value, Real): return f"{value:.2f}" - elif isinstance(value, (Selector, HtmlElement, str)): + elif isinstance(value, (Selector, HtmlElement)): price = extract_price(value) page._parsed_price = price return _format_price(price) From 3fce43df5831e1a3654d46606aff5f0e260447ec Mon Sep 17 00:00:00 2001 From: Nykakin Date: Thu, 22 Aug 2024 15:45:27 +0200 Subject: [PATCH 12/14] Do proper testing --- tests/test_processors.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index c9e9e0ef..7181fcdb 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -14,7 +14,6 @@ Breadcrumb, Gtin, Image, - Page, ProductPage, ) from zyte_common_items.processors import ( @@ -159,8 +158,8 @@ def brand(self): def test_brand_page(): - class MyProductPage(Page): - @field(out=[brand_processor]) + class MyProductPage(ProductPage): + @field def brand(self): return self.css("body") @@ -379,6 +378,20 @@ def images(self): assert page.images == expected_value +def test_images_page(): + class MyProductPage(ProductPage): + @field + def images(self): + return self.css("img::attr(href)").getall() + + response = HttpResponse( + url="http://www.example.com/", + body="".encode(), + ) + page = MyProductPage(response=response) + assert page.images == [Image(url="https://www.url.com/img.jpg")] + + @pytest.mark.parametrize( "input_value,expected_value", [ From d62570564deedf53a96cf14993b751502e4c9bab Mon Sep 17 00:00:00 2001 From: Nykakin Date: Thu, 22 Aug 2024 16:12:36 +0200 Subject: [PATCH 13/14] Update simple_price_processor --- zyte_common_items/processors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index 9da38bed..bc401268 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -164,8 +164,7 @@ def simple_price_processor(value: Any, page: Any) -> Any: Uses the price-parser_ library. Supported inputs are :class:`~parsel.selector.Selector`, - :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string - instances and numeric values. + :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values. Other inputs are returned as is. @@ -175,7 +174,7 @@ def simple_price_processor(value: Any, page: Any) -> Any: if isinstance(value, Real): return f"{value:.2f}" - elif isinstance(value, (Selector, HtmlElement, str)): + elif isinstance(value, (Selector, HtmlElement)): price = extract_price(value) return _format_price(price) else: From 12cb8efadd59d399b41bf59de6b0d721d4d10efb Mon Sep 17 00:00:00 2001 From: Nykakin Date: Mon, 26 Aug 2024 04:52:13 +0200 Subject: [PATCH 14/14] Changes from code review --- tests/test_processors.py | 4 ++++ zyte_common_items/processors.py | 8 +------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_processors.py b/tests/test_processors.py index 7181fcdb..dbda616e 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -136,7 +136,9 @@ def breadcrumbs(self): [ (None, None), ("", None), + (" ", None), ("foo", Brand(name="foo")), + (" foo ", Brand(name="foo")), (Selector(text=""), None), (SelectorList([]), None), (fromstring("

foo

"), Brand(name="foo")), @@ -401,6 +403,8 @@ def images(self): ({}, {}), (22.9, "22.90"), (22.0, "22.00"), + ("22.9", "22.9"), + ("Do not apply to strings...", "Do not apply to strings..."), ], ) def test_prices(input_value, expected_value): diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index bc401268..d803f5e1 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -121,6 +121,7 @@ def brand_processor(value: Any, page: Any) -> Any: value = _handle_selectorlist(value) if isinstance(value, str): + value = value.strip() return Brand(name=value) if value else None if isinstance(value, (Selector, SelectorList, HtmlElement)): @@ -373,13 +374,6 @@ def images_processor(value: Any, page: Any) -> Any: Other inputs are returned unchanged. """ - # TODO: add generic-purpose extract_images utility to zyte-parsers - # - # value = _handle_selectorlist(value) - # if isinstance(value, (Selector, HtmlElement)): - # images = extract_images(value) - # return [Image(url=url) for url in images] - if isinstance(value, str): return [Image(url=value)]