diff --git a/tests/test_processors.py b/tests/test_processors.py index 87c6d263..adec74a2 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -13,7 +13,9 @@ brand_processor, breadcrumbs_processor, gtin_processor, + list_processor, rating_processor, + string_processor, ) base_url = "http://www.example.com/blog/" @@ -321,3 +323,46 @@ def aggregateRating(self): assert page.aggregateRating == AggregateRating( ratingValue=3.8, bestRating=10, reviewCount=5 ) + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + (None, None), + ("", ""), + ("Value ", "Value"), + (" Value", "Value"), + (" Value ", "Value"), + ("Multiword value ", "Multiword value"), + (" Multiword value", "Multiword value"), + (" Multiword value ", "Multiword value"), + ], +) +def test_string_processor(input_value, expected_value): + class RatingPage(BasePage): + @field(out=[string_processor]) + def name(self): + return input_value + + page = RatingPage(base_url) # type: ignore[arg-type] + assert page.name == expected_value + + +@pytest.mark.parametrize( + "input_value,expected_value", + [ + (None, None), + ([], []), + (["a", "b"], ["a", "b"]), + ([" a", "b "], ["a", "b"]), + ([" a ", " b "], ["a", "b"]), + ], +) +def test_list_processor(input_value, expected_value): + class RatingPage(BasePage): + @field(out=[list_processor(string_processor)]) + def name(self): + return input_value + + page = RatingPage(base_url) # type: ignore[arg-type] + assert page.name == expected_value diff --git a/tox.ini b/tox.ini index 27bf6d3a..50f412cc 100644 --- a/tox.ini +++ b/tox.ini @@ -73,7 +73,7 @@ commands = mypy zyte_common_items tests [testenv:twinecheck] basepython = python3 deps = - twine==4.0.2 + twine==5.1.1 build==0.10.0 commands = python -m build --sdist diff --git a/zyte_common_items/pages/article.py b/zyte_common_items/pages/article.py index 55add465..bd8d30c8 100644 --- a/zyte_common_items/pages/article.py +++ b/zyte_common_items/pages/article.py @@ -6,24 +6,38 @@ from zyte_common_items.components import Audio, Author, Breadcrumb, Image, Video from zyte_common_items.fields import auto_field from zyte_common_items.items import Article, ArticleMetadata -from zyte_common_items.processors import breadcrumbs_processor +from zyte_common_items.processors import breadcrumbs_processor, string_processor from .base import BasePage, Page from .mixins import HasMetadata +class _ArticleProcessors(BasePage.Processors): + breadcrumbs = [breadcrumbs_processor] + headline = [string_processor] + datePublished = [string_processor] + datePublishedRaw = [string_processor] + dateModified = [string_processor] + dateModifiedRaw = [string_processor] + inLanguage = [string_processor] + description = [string_processor] + articleBody = [string_processor] + articleBodyHtml = [string_processor] + canonicalUrl = [string_processor] + + class BaseArticlePage(BasePage, Returns[Article], HasMetadata[ArticleMetadata]): """:class:`BasePage` subclass for :class:`Article`.""" - class Processors(BasePage.Processors): - breadcrumbs = [breadcrumbs_processor] + class Processors(_ArticleProcessors): + pass class ArticlePage(Page, Returns[Article], HasMetadata[ArticleMetadata]): """:class:`Page` subclass for :class:`Article`.""" - class Processors(Page.Processors): - breadcrumbs = [breadcrumbs_processor] + class Processors(_ArticleProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/article_list.py b/zyte_common_items/pages/article_list.py index 40b15c56..d17cfd01 100644 --- a/zyte_common_items/pages/article_list.py +++ b/zyte_common_items/pages/article_list.py @@ -6,26 +6,31 @@ from zyte_common_items.components import Breadcrumb from zyte_common_items.fields import auto_field from zyte_common_items.items import ArticleFromList, ArticleList, ArticleListMetadata -from zyte_common_items.processors import breadcrumbs_processor +from zyte_common_items.processors import breadcrumbs_processor, string_processor from .base import BasePage, Page from .mixins import HasMetadata +class _ArticleListProcessors(BasePage.Processors): + breadcrumbs = [breadcrumbs_processor] + canonicalUrl = [string_processor] + + class BaseArticleListPage( BasePage, Returns[ArticleList], HasMetadata[ArticleListMetadata] ): """:class:`BasePage` subclass for :class:`ArticleList`.""" - class Processors(BasePage.Processors): - breadcrumbs = [breadcrumbs_processor] + class Processors(_ArticleListProcessors): + pass class ArticleListPage(Page, Returns[ArticleList], HasMetadata[ArticleListMetadata]): """:class:`Page` subclass for :class:`ArticleList`.""" - class Processors(Page.Processors): - breadcrumbs = [breadcrumbs_processor] + class Processors(_ArticleListProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/article_navigation.py b/zyte_common_items/pages/article_navigation.py index 7a84d39d..c51e3267 100644 --- a/zyte_common_items/pages/article_navigation.py +++ b/zyte_common_items/pages/article_navigation.py @@ -6,22 +6,33 @@ from zyte_common_items.components import ProbabilityRequest, Request from zyte_common_items.fields import auto_field from zyte_common_items.items import ArticleNavigation, ArticleNavigationMetadata +from zyte_common_items.processors import string_processor from .base import BasePage, Page from .mixins import HasMetadata +class _ArticleNavigationProcessors(BasePage.Processors): + categoryName = [string_processor] + + class BaseArticleNavigationPage( BasePage, Returns[ArticleNavigation], HasMetadata[ArticleNavigationMetadata] ): """:class:`BasePage` subclass for :class:`ArticleNavigation`.""" + class Processors(_ArticleNavigationProcessors): + pass + class ArticleNavigationPage( Page, Returns[ArticleNavigation], HasMetadata[ArticleNavigationMetadata] ): """:class:`Page` subclass for :class:`ArticleNavigation`.""" + class Processors(_ArticleNavigationProcessors): + pass + @attrs.define class AutoArticleNavigationPage(BaseArticleNavigationPage): diff --git a/zyte_common_items/pages/base.py b/zyte_common_items/pages/base.py index fc36e5d2..45c1a7f9 100644 --- a/zyte_common_items/pages/base.py +++ b/zyte_common_items/pages/base.py @@ -3,7 +3,7 @@ from web_poet.pages import ItemT from .._dateutils import utcnow_formatted -from ..processors import metadata_processor +from ..processors import metadata_processor, string_processor from .mixins import HasMetadata, MetadataT @@ -43,12 +43,16 @@ def no_item_found(self) -> ItemT: ) +class _BaseProcessors(_BasePage.Processors): + url = [string_processor] + + @attrs.define class BasePage(_BasePage): """Base class for page object classes that has :class:`~web_poet.page_inputs.http.RequestUrl` as a dependency.""" - class Processors(_BasePage.Processors): + class Processors(_BaseProcessors): pass request_url: RequestUrl @@ -63,7 +67,7 @@ class Page(_BasePage, WebPage): """Base class for page object classes that has :class:`~web_poet.page_inputs.http.HttpResponse` as a dependency.""" - class Processors(_BasePage.Processors): + class Processors(_BaseProcessors): pass @field diff --git a/zyte_common_items/pages/business_place.py b/zyte_common_items/pages/business_place.py index c0ec6ab6..cee11e6d 100644 --- a/zyte_common_items/pages/business_place.py +++ b/zyte_common_items/pages/business_place.py @@ -16,20 +16,39 @@ ) from zyte_common_items.fields import auto_field from zyte_common_items.items import BusinessPlace, BusinessPlaceMetadata -from zyte_common_items.processors import description_processor, rating_processor +from zyte_common_items.processors import ( + description_processor, + list_processor, + rating_processor, + string_processor, +) from .base import BasePage, Page from .mixins import HasMetadata +class _BusinessPlaceProcessors(BasePage.Processors): + aggregateRating = [rating_processor] + description = [description_processor] + categories = [list_processor(string_processor)] + features = [list_processor(string_processor)] + map = [string_processor] + name = [string_processor] + placeId = [string_processor] + priceRange = [string_processor] + tags = [list_processor(string_processor)] + telephone = [string_processor] + timezone = [string_processor] + website = [string_processor] + + class BaseBusinessPlacePage( BasePage, Returns[BusinessPlace], HasMetadata[BusinessPlaceMetadata] ): """:class:`BasePage` subclass for :class:`BusinessPlace`.""" - class Processors(BasePage.Processors): - aggregateRating = [rating_processor] - description = [description_processor] + class Processors(_BusinessPlaceProcessors): + pass class BusinessPlacePage( @@ -37,9 +56,8 @@ class BusinessPlacePage( ): """:class:`Page` subclass for :class:`BusinessPlace`.""" - class Processors(Page.Processors): - aggregateRating = [rating_processor] - description = [description_processor] + class Processors(_BusinessPlaceProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/job_posting.py b/zyte_common_items/pages/job_posting.py index a48a95be..119136e0 100644 --- a/zyte_common_items/pages/job_posting.py +++ b/zyte_common_items/pages/job_posting.py @@ -9,20 +9,40 @@ from zyte_common_items.processors import ( description_html_processor, description_processor, + list_processor, + string_processor, ) from .base import BasePage, Page from .mixins import DescriptionMixin, HasMetadata +class _JobPostingProcessors(BasePage.Processors): + description = [description_processor] + descriptionHtml = [description_html_processor] + jobPostingId = [string_processor] + datePublished = [string_processor] + datePublishedRaw = [string_processor] + dateModified = [string_processor] + dateModifiedRaw = [string_processor] + validThrough = [string_processor] + validThroughRaw = [string_processor] + jobTitle = [string_processor] + headline = [string_processor] + employmentType = [string_processor] + requirements = [list_processor(string_processor)] + jobStartDate = [string_processor] + jobStartDateRaw = [string_processor] + remoteStatus = [string_processor] + + class BaseJobPostingPage( BasePage, DescriptionMixin, Returns[JobPosting], HasMetadata[JobPostingMetadata] ): """:class:`BasePage` subclass for :class:`JobPosting`.""" - class Processors(BasePage.Processors): - description = [description_processor] - descriptionHtml = [description_html_processor] + class Processors(_JobPostingProcessors): + pass class JobPostingPage( @@ -30,9 +50,8 @@ class JobPostingPage( ): """:class:`Page` subclass for :class:`JobPosting`.""" - class Processors(Page.Processors): - description = [description_processor] - descriptionHtml = [description_html_processor] + class Processors(_JobPostingProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/product.py b/zyte_common_items/pages/product.py index fc297629..1973d46d 100644 --- a/zyte_common_items/pages/product.py +++ b/zyte_common_items/pages/product.py @@ -19,15 +19,40 @@ description_html_processor, description_processor, gtin_processor, + list_processor, price_processor, rating_processor, simple_price_processor, + string_processor, ) from .base import BasePage, Page from .mixins import DescriptionMixin, HasMetadata, PriceMixin +class _ProductProcessors(BasePage.Processors): + aggregateRating = [rating_processor] + brand = [brand_processor] + breadcrumbs = [breadcrumbs_processor] + description = [description_processor] + descriptionHtml = [description_html_processor] + gtin = [gtin_processor] + price = [price_processor] + regularPrice = [simple_price_processor] + availability = [string_processor] + canonicalUrl = [string_processor] + color = [string_processor] + currency = [string_processor] + currencyRaw = [string_processor] + features = [list_processor(string_processor)] + mpn = [string_processor] + name = [string_processor] + productId = [string_processor] + size = [string_processor] + sku = [string_processor] + style = [string_processor] + + class BaseProductPage( BasePage, DescriptionMixin, @@ -37,15 +62,8 @@ class BaseProductPage( ): """:class:`BasePage` subclass for :class:`Product`.""" - class Processors(BasePage.Processors): - aggregateRating = [rating_processor] - brand = [brand_processor] - breadcrumbs = [breadcrumbs_processor] - description = [description_processor] - descriptionHtml = [description_html_processor] - gtin = [gtin_processor] - price = [price_processor] - regularPrice = [simple_price_processor] + class Processors(_ProductProcessors): + pass class ProductPage( @@ -53,15 +71,8 @@ class ProductPage( ): """:class:`Page` subclass for :class:`Product`.""" - class Processors(Page.Processors): - aggregateRating = [rating_processor] - brand = [brand_processor] - breadcrumbs = [breadcrumbs_processor] - description = [description_processor] - descriptionHtml = [description_html_processor] - gtin = [gtin_processor] - price = [price_processor] - regularPrice = [simple_price_processor] + class Processors(_ProductProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/product_list.py b/zyte_common_items/pages/product_list.py index e115b4c4..d2553f3a 100644 --- a/zyte_common_items/pages/product_list.py +++ b/zyte_common_items/pages/product_list.py @@ -6,26 +6,32 @@ from zyte_common_items.components import Breadcrumb, Link from zyte_common_items.fields import auto_field from zyte_common_items.items import ProductFromList, ProductList, ProductListMetadata -from zyte_common_items.processors import breadcrumbs_processor +from zyte_common_items.processors import breadcrumbs_processor, string_processor from .base import BasePage, Page from .mixins import HasMetadata +class _ProductListProcessors(BasePage.Processors): + breadcrumbs = [breadcrumbs_processor] + canonicalUrl = [string_processor] + categoryName = [string_processor] + + class BaseProductListPage( BasePage, Returns[ProductList], HasMetadata[ProductListMetadata] ): """:class:`BasePage` subclass for :class:`ProductList`.""" - class Processors(BasePage.Processors): - breadcrumbs = [breadcrumbs_processor] + class Processors(_ProductListProcessors): + pass class ProductListPage(Page, Returns[ProductList], HasMetadata[ProductListMetadata]): """:class:`Page` subclass for :class:`ProductList`.""" - class Processors(Page.Processors): - breadcrumbs = [breadcrumbs_processor] + class Processors(_ProductListProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/product_navigation.py b/zyte_common_items/pages/product_navigation.py index 7ed00d9b..cd6bee6a 100644 --- a/zyte_common_items/pages/product_navigation.py +++ b/zyte_common_items/pages/product_navigation.py @@ -6,20 +6,28 @@ from zyte_common_items.components import ProbabilityRequest, Request from zyte_common_items.fields import auto_field from zyte_common_items.items import ProductNavigation, ProductNavigationMetadata -from zyte_common_items.processors import probability_request_list_processor +from zyte_common_items.processors import ( + probability_request_list_processor, + string_processor, +) from .base import BasePage, Page from .mixins import HasMetadata +class _ProductNavigationProcessors(BasePage.Processors): + subCategories = [probability_request_list_processor] + items = [probability_request_list_processor] + categoryName = [string_processor] + + class BaseProductNavigationPage( BasePage, Returns[ProductNavigation], HasMetadata[ProductNavigationMetadata] ): """:class:`BasePage` subclass for :class:`ProductNavigation`.""" - class Processors(BasePage.Processors): - subCategories = [probability_request_list_processor] - items = [probability_request_list_processor] + class Processors(_ProductNavigationProcessors): + pass class ProductNavigationPage( @@ -27,6 +35,9 @@ class ProductNavigationPage( ): """:class:`Page` subclass for :class:`ProductNavigation`.""" + class Processors(_ProductNavigationProcessors): + pass + @attrs.define class AutoProductNavigationPage(BaseProductNavigationPage): diff --git a/zyte_common_items/pages/real_estate.py b/zyte_common_items/pages/real_estate.py index e869e703..71f809c6 100644 --- a/zyte_common_items/pages/real_estate.py +++ b/zyte_common_items/pages/real_estate.py @@ -13,27 +13,41 @@ from zyte_common_items.fields import auto_field from zyte_common_items.items import RealEstate, RealEstateMetadata -from ..processors import breadcrumbs_processor, description_processor +from ..processors import breadcrumbs_processor, description_processor, string_processor from .base import BasePage, Page from .mixins import HasMetadata +class _RealEstateProcessors(BasePage.Processors): + breadcrumbs = [breadcrumbs_processor] + description = [description_processor] + currency = [string_processor] + currencyRaw = [string_processor] + datePublished = [string_processor] + datePublishedRaw = [string_processor] + name = [string_processor] + price = [string_processor] + propertyType = [string_processor] + realEstateId = [string_processor] + rentalPeriod = [string_processor] + tradeType = [string_processor] + virtualTourUrl = [string_processor] + + class BaseRealEstatePage( BasePage, Returns[RealEstate], HasMetadata[RealEstateMetadata] ): """:class:`BasePage` subclass for :class:`RealEstate`.""" - class Processors(BasePage.Processors): - breadcrumbs = [breadcrumbs_processor] - description = [description_processor] + class Processors(_RealEstateProcessors): + pass class RealEstatePage(Page, Returns[RealEstate], HasMetadata[RealEstateMetadata]): """:class:`Page` subclass for :class:`RealEstate`.""" - class Processors(Page.Processors): - breadcrumbs = [breadcrumbs_processor] - description = [description_processor] + class Processors(_RealEstateProcessors): + pass @attrs.define diff --git a/zyte_common_items/pages/social_media_post.py b/zyte_common_items/pages/social_media_post.py index a4f7823c..0dee9319 100644 --- a/zyte_common_items/pages/social_media_post.py +++ b/zyte_common_items/pages/social_media_post.py @@ -6,21 +6,31 @@ from zyte_common_items.components import Reactions, SocialMediaPostAuthor, Url from zyte_common_items.fields import auto_field from zyte_common_items.items import SocialMediaPost, SocialMediaPostMetadata +from zyte_common_items.processors import list_processor, string_processor from .base import BasePage, Page from .mixins import HasMetadata +class _SocialMediaPostProcessors(BasePage.Processors): + postId = [string_processor] + text = [string_processor] + datePublished = [string_processor] + hashtags = [list_processor(string_processor)] + + class BaseSocialMediaPostPage( BasePage, Returns[SocialMediaPost], HasMetadata[SocialMediaPostMetadata] ): - pass + class Processors(_SocialMediaPostProcessors): + pass class SocialMediaPostPage( Page, Returns[SocialMediaPost], HasMetadata[SocialMediaPostMetadata] ): - pass + class Processors(_SocialMediaPostProcessors): + pass @attrs.define diff --git a/zyte_common_items/processors.py b/zyte_common_items/processors.py index e4306c11..4fc86bd0 100644 --- a/zyte_common_items/processors.py +++ b/zyte_common_items/processors.py @@ -345,3 +345,21 @@ def metadata_processor(metadata: BaseMetadata, page): if page.metadata_cls is None: return None return metadata.cast(page.metadata_cls) + + +def string_processor(value: str) -> Union[str, None]: + """Processor for string values""" + if isinstance(value, str): + return value.strip() + return value + + +def list_processor(processor: Callable) -> Any: + """Apply processor to a list of items""" + + def loop(values): + if not isinstance(values, Iterable): + return values + return [processor(value) for value in values] + + return loop