-
Notifications
You must be signed in to change notification settings - Fork 10
Processor changes #99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
78f0655
0e0e733
308db2e
3fc6bb2
6620a15
67cf862
ba1a5e9
71eb060
d2ab672
30baf8a
aa42e22
3fce43d
d625705
12cb8ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| from collections.abc import Iterable | ||
| from collections.abc import Iterable, Mapping | ||
| from functools import wraps | ||
| from numbers import Real | ||
| from typing import Any, Callable, List, Optional, Union | ||
|
|
||
| from clear_html import clean_node, cleaned_node_to_html, cleaned_node_to_text | ||
|
|
@@ -21,8 +22,10 @@ | |
| from .components import ( | ||
| AggregateRating, | ||
| BaseMetadata, | ||
| Brand, | ||
| Breadcrumb, | ||
| Gtin, | ||
| Image, | ||
| ProbabilityRequest, | ||
| Request, | ||
| ) | ||
|
|
@@ -104,50 +107,78 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb: | |
| return results | ||
|
|
||
|
|
||
| @only_handle_nodes | ||
| def brand_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: | ||
| def brand_processor(value: Any, page: Any) -> Any: | ||
| """Convert the data into a brand name if possible. | ||
|
|
||
| Supported inputs are :class:`~parsel.selector.Selector`, | ||
| :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. | ||
| Other inputs are returned as is. | ||
| If inputs are either :class:`~parsel.selector.Selector`, | ||
| :class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts | ||
| to extract brand data from it. | ||
|
|
||
| If value is a string, uses it to create a :class:`~zyte_common_items.Brand` instance. | ||
|
|
||
| Other inputs are returned unchanged. | ||
| """ | ||
| return extract_brand_name(value, search_depth=2) | ||
| value = _handle_selectorlist(value) | ||
|
|
||
| if isinstance(value, str): | ||
| return Brand(name=value) if value else None | ||
Gallaecio marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| @only_handle_nodes | ||
| def price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: | ||
| if isinstance(value, (Selector, SelectorList, HtmlElement)): | ||
| if brand_name := extract_brand_name(value, search_depth=2): | ||
| return Brand(name=brand_name) | ||
| else: | ||
| return None | ||
|
|
||
| return value | ||
|
|
||
|
|
||
| def price_processor(value: Any, page: Any) -> Any: | ||
| """Convert the data into a price string if possible. | ||
|
|
||
| Uses the price-parser_ library. | ||
|
|
||
| Supported inputs are :class:`~parsel.selector.Selector`, | ||
| :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. | ||
| :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values. | ||
|
|
||
| Other inputs are returned as is. | ||
|
|
||
| Puts the parsed Price object into ``page._parsed_price``. | ||
|
|
||
| .. _price-parser: https://github.com/scrapinghub/price-parser | ||
| """ | ||
| price = extract_price(value) | ||
| page._parsed_price = price | ||
| return _format_price(price) | ||
| value = _handle_selectorlist(value) | ||
|
|
||
| if isinstance(value, Real): | ||
| return f"{value:.2f}" | ||
| elif isinstance(value, (Selector, HtmlElement)): | ||
| price = extract_price(value) | ||
| page._parsed_price = price | ||
| return _format_price(price) | ||
| else: | ||
| return value | ||
|
|
||
| @only_handle_nodes | ||
| def simple_price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any: | ||
|
|
||
| def simple_price_processor(value: Any, page: Any) -> Any: | ||
| """Convert the data into a price string if possible. | ||
|
|
||
| Uses the price-parser_ library. | ||
|
|
||
| Supported inputs are :class:`~parsel.selector.Selector`, | ||
| :class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`. | ||
| :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values. | ||
|
|
||
| Other inputs are returned as is. | ||
|
|
||
| .. _price-parser: https://github.com/scrapinghub/price-parser | ||
| """ | ||
| price = extract_price(value) | ||
| return _format_price(price) | ||
| value = _handle_selectorlist(value) | ||
|
|
||
| if isinstance(value, Real): | ||
| return f"{value:.2f}" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What do you think about moving it to https://github.com/zytedata/zyte-parsers/blob/main/zyte_parsers/price.py?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems this would allow removing the duplication in price_processor vs simple_price_processor.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We'd like to migrate our stuff quickly before certain deadlines, which is why I want to have this working for now and avoid coordinating between PRs in yet another dependency repository (which is also why I put the proposed parsing of images into a TODO). I think we can improve it later and migrate it to a more fitting place.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sounds good! |
||
| elif isinstance(value, (Selector, HtmlElement)): | ||
| price = extract_price(value) | ||
| return _format_price(price) | ||
| else: | ||
| return value | ||
|
|
||
|
|
||
| @only_handle_nodes | ||
|
|
@@ -330,6 +361,44 @@ def aggregateRating(self): | |
| return value | ||
|
|
||
|
|
||
| def images_processor(value: Any, page: Any) -> Any: | ||
kmike marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """Convert the data into a list of :class:`~zyte_common_items.Image` | ||
| objects if possible. | ||
|
|
||
| If the input is a string, it's used as a url for returning image object. | ||
|
|
||
| If input is either an iterable of strings or mappings with "url" key, they are | ||
| used to populate image objects. | ||
|
|
||
| Other inputs are returned unchanged. | ||
| """ | ||
|
|
||
| # TODO: add generic-purpose extract_images utility to zyte-parsers | ||
| # | ||
| # value = _handle_selectorlist(value) | ||
| # if isinstance(value, (Selector, HtmlElement)): | ||
| # images = extract_images(value) | ||
| # return [Image(url=url) for url in images] | ||
Gallaecio marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| if isinstance(value, str): | ||
| return [Image(url=value)] | ||
|
|
||
| if isinstance(value, Iterable): | ||
| results: List[Any] = [] | ||
| for item in value: | ||
| if isinstance(item, Image): | ||
| results.append(item) | ||
| elif isinstance(item, Mapping): | ||
| if url := item.get("url"): | ||
| results.append(Image(url=url)) | ||
| elif isinstance(item, str): | ||
| results.append(Image(url=item)) | ||
|
|
||
| return results | ||
|
|
||
| return value | ||
|
|
||
|
|
||
| def probability_request_list_processor( | ||
| request_list: List[Request], | ||
| ) -> List[ProbabilityRequest]: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.