From 78f0655b33c783c181be6f169adaa8c60cade784 Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Mon, 12 Aug 2024 06:53:53 +0200
Subject: [PATCH 01/14] Update processors
---
tests/test_processors.py | 105 ++++++++++++++++++++++++++---
zyte_common_items/pages/product.py | 3 +
zyte_common_items/processors.py | 104 +++++++++++++++++++++++-----
3 files changed, 183 insertions(+), 29 deletions(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 87c6d263..c525b45b 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -7,12 +7,22 @@
from zyte_parsers import Gtin as zp_Gtin
from zyte_parsers import extract_breadcrumbs
-from zyte_common_items import AggregateRating, BasePage, Breadcrumb, Gtin, ProductPage
+from zyte_common_items import (
+ AggregateRating,
+ BasePage,
+ Brand,
+ Breadcrumb,
+ Gtin,
+ Image,
+ ProductPage,
+)
from zyte_common_items.processors import (
_format_price,
brand_processor,
breadcrumbs_processor,
gtin_processor,
+ images_processor,
+ price_processor,
rating_processor,
)
@@ -125,16 +135,16 @@ def breadcrumbs(self):
"input_value,expected_value",
[
(None, None),
- ("", ""),
- ("foo", "foo"),
+ ("", None),
+ ("foo", Brand(name="foo")),
(Selector(text=""), None),
(SelectorList([]), None),
- (fromstring("foo
"), "foo"),
- (fromstring("
"), "foo"),
- (fromstring("![foo]()
"), "foo"),
- (fromstring("![foo]()
"), "foo"),
- (Selector(text="foo
"), "foo"),
- (SelectorList([Selector(text="foo
")]), "foo"),
+ (fromstring("foo
"), Brand(name="foo")),
+ (fromstring("
"), Brand(name="foo")),
+ (fromstring("![foo]()
"), Brand(name="foo")),
+ (fromstring("![foo]()
"), Brand(name="foo")),
+ (Selector(text="foo
"), Brand(name="foo")),
+ (SelectorList([Selector(text="foo
")]), Brand(name="foo")),
],
)
def test_brand(input_value, expected_value):
@@ -149,7 +159,7 @@ def brand(self):
def test_brand_page():
class MyProductPage(ProductPage):
- @field
+ @field(out=[brand_processor])
def brand(self):
return self.css("body")
@@ -158,7 +168,7 @@ def brand(self):
body="
".encode(),
)
page = MyProductPage(response=response)
- assert page.brand == "foo"
+ assert page.brand == Brand(name="foo")
@pytest.mark.parametrize(
@@ -321,3 +331,76 @@ def aggregateRating(self):
assert page.aggregateRating == AggregateRating(
ratingValue=3.8, bestRating=10, reviewCount=5
)
+
+
+@pytest.mark.parametrize(
+ "input_value,expected_value",
+ [
+ (None, None),
+ ([], []),
+ ("https://www.url.com/img.jpg", [Image(url="https://www.url.com/img.jpg")]),
+ (
+ [
+ Image("https://www.url.com/img1.jpg"),
+ Image("https://www.url.com/img2.jpg"),
+ ],
+ [
+ Image("https://www.url.com/img1.jpg"),
+ Image("https://www.url.com/img2.jpg"),
+ ],
+ ),
+ (
+ ["https://www.url.com/img1.jpg", "https://www.url.com/img2.jpg"],
+ [
+ Image("https://www.url.com/img1.jpg"),
+ Image("https://www.url.com/img2.jpg"),
+ ],
+ ),
+ (
+ [
+ {"url": "https://www.url.com/img1.jpg"},
+ {"url": "https://www.url.com/img2.jpg"},
+ ],
+ [
+ Image("https://www.url.com/img1.jpg"),
+ Image("https://www.url.com/img2.jpg"),
+ ],
+ ),
+ ],
+)
+def test_images(input_value, expected_value):
+ class ImagesPage(BasePage):
+ @field(out=[images_processor])
+ def images(self):
+ return input_value
+
+ page = ImagesPage(base_url) # type: ignore[arg-type]
+ assert page.images == expected_value
+
+
+@pytest.mark.parametrize(
+ "input_value,expected_value",
+ [
+ ("$10", "10.00"),
+ ("100 ", "100.00"),
+ ("100rub", "100.00"),
+ (100, "100.00"),
+ (None, None),
+ ([], []),
+ ({}, {}),
+ ("", None),
+ ("buy 10 ab ab", "10.00"),
+ ("1,000.17", "1000.17"),
+ ("1,000", "1000.00"),
+ (22.9, "22.90"),
+ (22.0, "22.00"),
+ ]
+)
+def test_prices(input_value, expected_value):
+ class PricePage(BasePage):
+ @field(out=[price_processor])
+ def price(self):
+ return input_value
+
+ page = PricePage(base_url) # type: ignore[arg-type]
+ assert page.price == expected_value
diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py
index fc297629..45c1500a 100644
--- a/zyte_common_items/pages/product.py
+++ b/zyte_common_items/pages/product.py
@@ -19,6 +19,7 @@
description_html_processor,
description_processor,
gtin_processor,
+ images_processor,
price_processor,
rating_processor,
simple_price_processor,
@@ -46,6 +47,7 @@ class Processors(BasePage.Processors):
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]
+ images = [images_processor]
class ProductPage(
@@ -62,6 +64,7 @@ class Processors(Page.Processors):
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]
+ images = [images_processor]
@attrs.define
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index e4306c11..0d070c18 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -1,5 +1,6 @@
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
from functools import wraps
+from numbers import Real
from typing import Any, Callable, List, Optional, Union
from clear_html import clean_node, cleaned_node_to_html, cleaned_node_to_text
@@ -21,8 +22,10 @@
from .components import (
AggregateRating,
BaseMetadata,
+ Brand,
Breadcrumb,
Gtin,
+ Image,
ProbabilityRequest,
Request,
)
@@ -104,50 +107,76 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb:
return results
-@only_handle_nodes
-def brand_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
+def brand_processor(value: Any, page: Any) -> Union[Brand, None]:
"""Convert the data into a brand name if possible.
- Supported inputs are :class:`~parsel.selector.Selector`,
- :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
- Other inputs are returned as is.
+ If inputs are either :class:`~parsel.selector.Selector`,
+ :class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts
+ to extract brand data from it.
+
+ If value is a string, use it to create brand object instance
+
+ Other inputs are returned unchanged
"""
- return extract_brand_name(value, search_depth=2)
+ value = _handle_selectorlist(value)
+ if isinstance(value, str):
+ return Brand(name=value) if value else None
-@only_handle_nodes
-def price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
+ if isinstance(value, (Selector, SelectorList, HtmlElement)):
+ if brand_name := extract_brand_name(value, search_depth=2):
+ return Brand(name=brand_name)
+ else:
+ return None
+
+ return value
+
+
+def price_processor(value: Any, page: Any) -> Any:
"""Convert the data into a price string if possible.
Uses the price-parser_ library.
Supported inputs are :class:`~parsel.selector.Selector`,
- :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
+ :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string
+ instances and numberic values.
+
Other inputs are returned as is.
Puts the parsed Price object into ``page._parsed_price``.
.. _price-parser: https://github.com/scrapinghub/price-parser
"""
- price = extract_price(value)
- page._parsed_price = price
- return _format_price(price)
+ if isinstance(value, Real):
+ return f"{value:.2f}"
+ elif isinstance(value, (Selector, HtmlElement, str)):
+ price = extract_price(value)
+ page._parsed_price = price
+ return _format_price(price)
+ else:
+ return value
-@only_handle_nodes
-def simple_price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
+def simple_price_processor(value: Any, page: Any) -> Any:
"""Convert the data into a price string if possible.
Uses the price-parser_ library.
Supported inputs are :class:`~parsel.selector.Selector`,
- :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
+ :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string
+ instances and numberic values.
+
Other inputs are returned as is.
.. _price-parser: https://github.com/scrapinghub/price-parser
"""
- price = extract_price(value)
- return _format_price(price)
+ if isinstance(value, Real):
+ return f"{value:.2f}"
+ elif isinstance(value, (Selector, HtmlElement, str)):
+ price = extract_price(value)
+ return _format_price(price)
+ else:
+ return value
@only_handle_nodes
@@ -330,6 +359,45 @@ def aggregateRating(self):
return value
+def images_processor(value: Any, page: Any) -> List[Image]:
+ """Convert the data into a list of :class:`~zyte_common_items.Image`
+ objects if possible.
+
+ If the input is a string, it's used as a url for returning image object.
+
+ If input is either an iterable of strings or mappings with "url" key, they are
+ used to populate image objects.
+
+ Other inputs are returned unchanged.
+ """
+
+ value = _handle_selectorlist(value)
+
+ # TODO: add generic-purpose extract_images utility to zyte-parsers
+ #
+ # if isinstance(value, (Selector, HtmlElement)):
+ # images = extract_images(value)
+ # return [Image(url=url) for url in images]
+
+ if isinstance(value, str):
+ return [Image(url=value)]
+
+ if isinstance(value, Iterable):
+ results: List[Any] = []
+ for item in value:
+ if isinstance(item, Image):
+ results.append(item)
+ elif isinstance(item, Mapping):
+ if url := item.get("url"):
+ results.append(Image(url=url))
+ elif isinstance(item, str):
+ results.append(Image(url=item))
+
+ return results
+
+ return value
+
+
def probability_request_list_processor(
request_list: List[Request],
) -> List[ProbabilityRequest]:
From 0e0e733ddfb0ccd190b7e32f8fb54971ccc98820 Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Mon, 12 Aug 2024 07:05:48 +0200
Subject: [PATCH 02/14] Clean-up
---
tests/test_pages_price.py | 14 +++++++-------
zyte_common_items/processors.py | 4 ++++
2 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/tests/test_pages_price.py b/tests/test_pages_price.py
index e367c375..23fdd0e4 100644
--- a/tests/test_pages_price.py
+++ b/tests/test_pages_price.py
@@ -62,21 +62,21 @@ def price(self):
url = "https://example.com"
page = CustomProductPage(response=HttpResponse(url=url, body=html))
assert page.call_count == 0
- assert page.price == "$13.2"
+ assert page.price == "13.20"
assert page.call_count == 1
assert page.currency is None
- assert await page.currencyRaw is None
- assert page.call_count == 2 # we want this to be 1
- assert await page.currencyRaw is None
- assert page.call_count == 2 # we want this to be 1
+ assert await page.currencyRaw == "$"
+ assert page.call_count == 1 # we want this to be 1
+ assert await page.currencyRaw == "$"
+ assert page.call_count == 1 # we want this to be 1
# access currency fields before the price field
page = CustomProductPage(response=HttpResponse(url=url, body=html))
assert page.call_count == 0
assert page.currency is None
- assert await page.currencyRaw is None
+ assert await page.currencyRaw == "$"
assert page.call_count == 1
- assert page.price == "$13.2"
+ assert page.price == "13.20"
assert page.call_count == 2 # we want this to be 1
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index 0d070c18..03fbadc8 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -147,6 +147,8 @@ def price_processor(value: Any, page: Any) -> Any:
.. _price-parser: https://github.com/scrapinghub/price-parser
"""
+ value = _handle_selectorlist(value)
+
if isinstance(value, Real):
return f"{value:.2f}"
elif isinstance(value, (Selector, HtmlElement, str)):
@@ -170,6 +172,8 @@ def simple_price_processor(value: Any, page: Any) -> Any:
.. _price-parser: https://github.com/scrapinghub/price-parser
"""
+ value = _handle_selectorlist(value)
+
if isinstance(value, Real):
return f"{value:.2f}"
elif isinstance(value, (Selector, HtmlElement, str)):
From 308db2eb58bbfbfc155c65c05eb1a12ded0ebbdd Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Mon, 12 Aug 2024 07:11:36 +0200
Subject: [PATCH 03/14] Fix typos
---
zyte_common_items/processors.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index 03fbadc8..494b0897 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -139,7 +139,7 @@ def price_processor(value: Any, page: Any) -> Any:
Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string
- instances and numberic values.
+ instances and numeric values.
Other inputs are returned as is.
@@ -166,7 +166,7 @@ def simple_price_processor(value: Any, page: Any) -> Any:
Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string
- instances and numberic values.
+ instances and numeric values.
Other inputs are returned as is.
From 3fc6bb2b0f82e2850c77293ba85e7ca40e654f1e Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Mon, 12 Aug 2024 07:23:23 +0200
Subject: [PATCH 04/14] Fix twinecheck exception in CI
---
tox.ini | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tox.ini b/tox.ini
index 27bf6d3a..50f412cc 100644
--- a/tox.ini
+++ b/tox.ini
@@ -73,7 +73,7 @@ commands = mypy zyte_common_items tests
[testenv:twinecheck]
basepython = python3
deps =
- twine==4.0.2
+ twine==5.1.1
build==0.10.0
commands =
python -m build --sdist
From 6620a1538ee3b558d78705800ad4ea19f058f7dc Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Mon, 12 Aug 2024 07:27:38 +0200
Subject: [PATCH 05/14] Add trailing ","
---
tests/test_processors.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index c525b45b..2094972b 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -394,7 +394,7 @@ def images(self):
("1,000", "1000.00"),
(22.9, "22.90"),
(22.0, "22.00"),
- ]
+ ],
)
def test_prices(input_value, expected_value):
class PricePage(BasePage):
From 67cf862517dca1febe80d942ca908fdd6efd6d8b Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Fri, 16 Aug 2024 05:28:56 +0200
Subject: [PATCH 06/14] Variant processor
---
tests/test_processors.py | 21 +++++++++++++++++++++
zyte_common_items/pages/product.py | 3 +++
zyte_common_items/processors.py | 19 +++++++++++++++++++
3 files changed, 43 insertions(+)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 2094972b..1e7bec02 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -15,6 +15,7 @@
Gtin,
Image,
ProductPage,
+ ProductVariant,
)
from zyte_common_items.processors import (
_format_price,
@@ -24,6 +25,7 @@
images_processor,
price_processor,
rating_processor,
+ variants_processor
)
base_url = "http://www.example.com/blog/"
@@ -404,3 +406,22 @@ def price(self):
page = PricePage(base_url) # type: ignore[arg-type]
assert page.price == expected_value
+
+
+@pytest.mark.parametrize(
+ "input_value,expected_value",
+ [
+ (
+ [{"price": "12,12$"}, {"price": 100}],
+ [ProductVariant(price="12.12"), ProductVariant(price="100.00")]
+ ),
+ ],
+)
+def test_variants(input_value, expected_value):
+ class VariantPage(BasePage):
+ @field(out=[variants_processor])
+ def variants(self):
+ return input_value
+
+ page = VariantPage(base_url) # type: ignore[arg-type]
+ assert page.variants == expected_value
diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py
index 45c1500a..cad470f1 100644
--- a/zyte_common_items/pages/product.py
+++ b/zyte_common_items/pages/product.py
@@ -23,6 +23,7 @@
price_processor,
rating_processor,
simple_price_processor,
+ variants_processor,
)
from .base import BasePage, Page
@@ -48,6 +49,7 @@ class Processors(BasePage.Processors):
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]
+ variants = [variants_processor]
class ProductPage(
@@ -65,6 +67,7 @@ class Processors(Page.Processors):
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]
+ variants = [variants_processor]
@attrs.define
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index 494b0897..41f66ced 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -29,6 +29,7 @@
ProbabilityRequest,
Request,
)
+from .items import ProductVariant
def _get_base_url(page: Any) -> Optional[str]:
@@ -417,3 +418,21 @@ def metadata_processor(metadata: BaseMetadata, page):
if page.metadata_cls is None:
return None
return metadata.cast(page.metadata_cls)
+
+
+def variants_processor(variants: list[Any], page: Any) -> list[ProductVariant]:
+ ret = []
+ for variant in variants:
+ if isinstance(variant, Mapping):
+ processors = {
+ "price": simple_price_processor,
+ "regularPrice": simple_price_processor,
+ }
+ for field, processor in processors.items():
+ if field in variant:
+ variant[field] = processor(variant[field], page)
+
+ ret.append(ProductVariant(**variant))
+ else:
+ ret.append(variant)
+ return ret
From ba1a5e93749c65eac05e4879588f87695aca818e Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Tue, 20 Aug 2024 17:21:53 +0200
Subject: [PATCH 07/14] Apply changes from code review
---
zyte_common_items/processors.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index 41f66ced..3a1cb4d4 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -108,16 +108,16 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb:
return results
-def brand_processor(value: Any, page: Any) -> Union[Brand, None]:
+def brand_processor(value: Any, page: Any) -> Any:
"""Convert the data into a brand name if possible.
If inputs are either :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts
to extract brand data from it.
- If value is a string, use it to create brand object instance
+ If value is a string, uses it to create a :class:`~zyte_common_items.Brand` instance.
- Other inputs are returned unchanged
+ Other inputs are returned unchanged.
"""
value = _handle_selectorlist(value)
@@ -364,7 +364,7 @@ def aggregateRating(self):
return value
-def images_processor(value: Any, page: Any) -> List[Image]:
+def images_processor(value: Any, page: Any) -> Any:
"""Convert the data into a list of :class:`~zyte_common_items.Image`
objects if possible.
From 71eb060a98618c579b985d19614d06e18f8bb8ac Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Tue, 20 Aug 2024 17:24:19 +0200
Subject: [PATCH 08/14] Revert "Variant processor"
This reverts commit 67cf862517dca1febe80d942ca908fdd6efd6d8b.
---
tests/test_processors.py | 21 ---------------------
zyte_common_items/pages/product.py | 3 ---
zyte_common_items/processors.py | 19 -------------------
3 files changed, 43 deletions(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 1e7bec02..2094972b 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -15,7 +15,6 @@
Gtin,
Image,
ProductPage,
- ProductVariant,
)
from zyte_common_items.processors import (
_format_price,
@@ -25,7 +24,6 @@
images_processor,
price_processor,
rating_processor,
- variants_processor
)
base_url = "http://www.example.com/blog/"
@@ -406,22 +404,3 @@ def price(self):
page = PricePage(base_url) # type: ignore[arg-type]
assert page.price == expected_value
-
-
-@pytest.mark.parametrize(
- "input_value,expected_value",
- [
- (
- [{"price": "12,12$"}, {"price": 100}],
- [ProductVariant(price="12.12"), ProductVariant(price="100.00")]
- ),
- ],
-)
-def test_variants(input_value, expected_value):
- class VariantPage(BasePage):
- @field(out=[variants_processor])
- def variants(self):
- return input_value
-
- page = VariantPage(base_url) # type: ignore[arg-type]
- assert page.variants == expected_value
diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py
index cad470f1..45c1500a 100644
--- a/zyte_common_items/pages/product.py
+++ b/zyte_common_items/pages/product.py
@@ -23,7 +23,6 @@
price_processor,
rating_processor,
simple_price_processor,
- variants_processor,
)
from .base import BasePage, Page
@@ -49,7 +48,6 @@ class Processors(BasePage.Processors):
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]
- variants = [variants_processor]
class ProductPage(
@@ -67,7 +65,6 @@ class Processors(Page.Processors):
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]
- variants = [variants_processor]
@attrs.define
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index 3a1cb4d4..a423eaae 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -29,7 +29,6 @@
ProbabilityRequest,
Request,
)
-from .items import ProductVariant
def _get_base_url(page: Any) -> Optional[str]:
@@ -418,21 +417,3 @@ def metadata_processor(metadata: BaseMetadata, page):
if page.metadata_cls is None:
return None
return metadata.cast(page.metadata_cls)
-
-
-def variants_processor(variants: list[Any], page: Any) -> list[ProductVariant]:
- ret = []
- for variant in variants:
- if isinstance(variant, Mapping):
- processors = {
- "price": simple_price_processor,
- "regularPrice": simple_price_processor,
- }
- for field, processor in processors.items():
- if field in variant:
- variant[field] = processor(variant[field], page)
-
- ret.append(ProductVariant(**variant))
- else:
- ret.append(variant)
- return ret
From d2ab672afffdc9aad5a064a8345e663d4af8dd33 Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Thu, 22 Aug 2024 00:32:13 +0200
Subject: [PATCH 09/14] Apply changes from code review
---
tests/test_processors.py | 2 +-
zyte_common_items/processors.py | 3 +--
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 2094972b..427d8002 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -158,7 +158,7 @@ def brand(self):
def test_brand_page():
- class MyProductPage(ProductPage):
+ class MyProductPage(BasePage):
@field(out=[brand_processor])
def brand(self):
return self.css("body")
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index a423eaae..cc686d30 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -375,10 +375,9 @@ def images_processor(value: Any, page: Any) -> Any:
Other inputs are returned unchanged.
"""
- value = _handle_selectorlist(value)
-
# TODO: add generic-purpose extract_images utility to zyte-parsers
#
+ # value = _handle_selectorlist(value)
# if isinstance(value, (Selector, HtmlElement)):
# images = extract_images(value)
# return [Image(url=url) for url in images]
From 30baf8a0ca371d93af1118a7829d8f4e607378a8 Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Thu, 22 Aug 2024 01:10:27 +0200
Subject: [PATCH 10/14] Fix tox
---
tests/test_processors.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 427d8002..2d37b0b7 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -14,6 +14,7 @@
Breadcrumb,
Gtin,
Image,
+ Page,
ProductPage,
)
from zyte_common_items.processors import (
@@ -158,7 +159,7 @@ def brand(self):
def test_brand_page():
- class MyProductPage(BasePage):
+ class MyProductPage(Page):
@field(out=[brand_processor])
def brand(self):
return self.css("body")
From aa42e22b7db52fc5ab1a711df44f6180069d6d37 Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Thu, 22 Aug 2024 15:36:39 +0200
Subject: [PATCH 11/14] Revert handling strings by price_processor
---
tests/test_pages_price.py | 14 +++++++-------
tests/test_processors.py | 7 -------
zyte_common_items/processors.py | 5 ++---
3 files changed, 9 insertions(+), 17 deletions(-)
diff --git a/tests/test_pages_price.py b/tests/test_pages_price.py
index 23fdd0e4..e367c375 100644
--- a/tests/test_pages_price.py
+++ b/tests/test_pages_price.py
@@ -62,21 +62,21 @@ def price(self):
url = "https://example.com"
page = CustomProductPage(response=HttpResponse(url=url, body=html))
assert page.call_count == 0
- assert page.price == "13.20"
+ assert page.price == "$13.2"
assert page.call_count == 1
assert page.currency is None
- assert await page.currencyRaw == "$"
- assert page.call_count == 1 # we want this to be 1
- assert await page.currencyRaw == "$"
- assert page.call_count == 1 # we want this to be 1
+ assert await page.currencyRaw is None
+ assert page.call_count == 2 # we want this to be 1
+ assert await page.currencyRaw is None
+ assert page.call_count == 2 # we want this to be 1
# access currency fields before the price field
page = CustomProductPage(response=HttpResponse(url=url, body=html))
assert page.call_count == 0
assert page.currency is None
- assert await page.currencyRaw == "$"
+ assert await page.currencyRaw is None
assert page.call_count == 1
- assert page.price == "13.20"
+ assert page.price == "$13.2"
assert page.call_count == 2 # we want this to be 1
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 2d37b0b7..c9e9e0ef 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -382,17 +382,10 @@ def images(self):
@pytest.mark.parametrize(
"input_value,expected_value",
[
- ("$10", "10.00"),
- ("100 ", "100.00"),
- ("100rub", "100.00"),
(100, "100.00"),
(None, None),
([], []),
({}, {}),
- ("", None),
- ("buy 10 ab ab", "10.00"),
- ("1,000.17", "1000.17"),
- ("1,000", "1000.00"),
(22.9, "22.90"),
(22.0, "22.00"),
],
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index cc686d30..9da38bed 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -138,8 +138,7 @@ def price_processor(value: Any, page: Any) -> Any:
Uses the price-parser_ library.
Supported inputs are :class:`~parsel.selector.Selector`,
- :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string
- instances and numeric values.
+ :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values.
Other inputs are returned as is.
@@ -151,7 +150,7 @@ def price_processor(value: Any, page: Any) -> Any:
if isinstance(value, Real):
return f"{value:.2f}"
- elif isinstance(value, (Selector, HtmlElement, str)):
+ elif isinstance(value, (Selector, HtmlElement)):
price = extract_price(value)
page._parsed_price = price
return _format_price(price)
From 3fce43df5831e1a3654d46606aff5f0e260447ec Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Thu, 22 Aug 2024 15:45:27 +0200
Subject: [PATCH 12/14] Do proper testing
---
tests/test_processors.py | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index c9e9e0ef..7181fcdb 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -14,7 +14,6 @@
Breadcrumb,
Gtin,
Image,
- Page,
ProductPage,
)
from zyte_common_items.processors import (
@@ -159,8 +158,8 @@ def brand(self):
def test_brand_page():
- class MyProductPage(Page):
- @field(out=[brand_processor])
+ class MyProductPage(ProductPage):
+ @field
def brand(self):
return self.css("body")
@@ -379,6 +378,20 @@ def images(self):
assert page.images == expected_value
+def test_images_page():
+ class MyProductPage(ProductPage):
+ @field
+ def images(self):
+ return self.css("img::attr(href)").getall()
+
+ response = HttpResponse(
+ url="http://www.example.com/",
+ body="
".encode(),
+ )
+ page = MyProductPage(response=response)
+ assert page.images == [Image(url="https://www.url.com/img.jpg")]
+
+
@pytest.mark.parametrize(
"input_value,expected_value",
[
From d62570564deedf53a96cf14993b751502e4c9bab Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Thu, 22 Aug 2024 16:12:36 +0200
Subject: [PATCH 13/14] Update simple_price_processor
---
zyte_common_items/processors.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index 9da38bed..bc401268 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -164,8 +164,7 @@ def simple_price_processor(value: Any, page: Any) -> Any:
Uses the price-parser_ library.
Supported inputs are :class:`~parsel.selector.Selector`,
- :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, string
- instances and numeric values.
+ :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values.
Other inputs are returned as is.
@@ -175,7 +174,7 @@ def simple_price_processor(value: Any, page: Any) -> Any:
if isinstance(value, Real):
return f"{value:.2f}"
- elif isinstance(value, (Selector, HtmlElement, str)):
+ elif isinstance(value, (Selector, HtmlElement)):
price = extract_price(value)
return _format_price(price)
else:
From 12cb8efadd59d399b41bf59de6b0d721d4d10efb Mon Sep 17 00:00:00 2001
From: Nykakin
Date: Mon, 26 Aug 2024 04:52:13 +0200
Subject: [PATCH 14/14] Changes from code review
---
tests/test_processors.py | 4 ++++
zyte_common_items/processors.py | 8 +-------
2 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/tests/test_processors.py b/tests/test_processors.py
index 7181fcdb..dbda616e 100644
--- a/tests/test_processors.py
+++ b/tests/test_processors.py
@@ -136,7 +136,9 @@ def breadcrumbs(self):
[
(None, None),
("", None),
+ (" ", None),
("foo", Brand(name="foo")),
+ (" foo ", Brand(name="foo")),
(Selector(text=""), None),
(SelectorList([]), None),
(fromstring("foo
"), Brand(name="foo")),
@@ -401,6 +403,8 @@ def images(self):
({}, {}),
(22.9, "22.90"),
(22.0, "22.00"),
+ ("22.9", "22.9"),
+ ("Do not apply to strings...", "Do not apply to strings..."),
],
)
def test_prices(input_value, expected_value):
diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py
index bc401268..d803f5e1 100644
--- a/zyte_common_items/processors.py
+++ b/zyte_common_items/processors.py
@@ -121,6 +121,7 @@ def brand_processor(value: Any, page: Any) -> Any:
value = _handle_selectorlist(value)
if isinstance(value, str):
+ value = value.strip()
return Brand(name=value) if value else None
if isinstance(value, (Selector, SelectorList, HtmlElement)):
@@ -373,13 +374,6 @@ def images_processor(value: Any, page: Any) -> Any:
Other inputs are returned unchanged.
"""
- # TODO: add generic-purpose extract_images utility to zyte-parsers
- #
- # value = _handle_selectorlist(value)
- # if isinstance(value, (Selector, HtmlElement)):
- # images = extract_images(value)
- # return [Image(url=url) for url in images]
-
if isinstance(value, str):
return [Image(url=value)]