Merge pull request #69 from zytedata/custom-attrs-ecommerce
Custom attributes extraction
kmike authored Oct 1, 2024
2 parents 72240e4 + faa382a commit 9fe01af
Showing 6 changed files with 166 additions and 29 deletions.
10 changes: 5 additions & 5 deletions setup.py
@@ -12,13 +12,13 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        "pydantic>=2",
+        "pydantic>=2.1",
         "requests>=0.10.1",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.21.0",
-        "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.16.0",
-        "zyte-common-items>=0.22.0",
+        "scrapy-poet>=0.23.0",
+        "scrapy-spider-metadata>=0.2.0",
+        "scrapy-zyte-api[provider]>=0.23.0",
+        "zyte-common-items>=0.23.0",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
42 changes: 39 additions & 3 deletions tests/test_ecommerce.py
@@ -5,7 +5,7 @@
 import requests
 import scrapy
 from pydantic import ValidationError
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
@@ -243,7 +243,7 @@ def test_parse_product(probability, has_item, item_drop, caplog):
     mock_crawler = MagicMock()
     spider.crawler = mock_crawler
     logging.getLogger().setLevel(logging.INFO)
-    items = list(spider.parse_product(response, product))
+    items = list(spider.parse_product(response, product, DynamicDeps()))
     if item_drop:
         assert mock_crawler.method_calls == [
             call.stats.inc_value("drop_item/product/low_probability")
@@ -463,7 +463,7 @@ def test_metadata():
                     "title": "Pagination Only",
                 },
             },
-            "title": "Crawl Strategy",
+            "title": "Crawl strategy",
             "enum": [
                 "automatic",
                 "full",
@@ -528,6 +528,42 @@ def test_metadata():
                 "title": "Extraction source",
                 "enum": ["httpResponseBody", "browserHtml"],
             },
+            "custom_attrs_input": {
+                "anyOf": [
+                    {
+                        "contentMediaType": "application/json",
+                        "contentSchema": {"type": "object"},
+                        "type": "string",
+                    },
+                    {"type": "null"},
+                ],
+                "default": None,
+                "description": "Custom attributes to extract.",
+                "title": "Custom attributes schema",
+                "widget": "custom-attrs",
+            },
+            "custom_attrs_method": {
+                "default": "generate",
+                "description": "Which model to use for custom attribute extraction.",
+                "enum": ["generate", "extract"],
+                "enumMeta": {
+                    "extract": {
+                        "description": "Use an extractive model (BERT). Supports only a "
+                        "subset of the schema (string, integer and "
+                        "number), suited for extraction of short and clear "
+                        "fields, with a fixed per-request cost.",
+                        "title": "extract",
+                    },
+                    "generate": {
+                        "description": "Use a generative model (LLM). The most powerful "
+                        "and versatile, but more expensive, with variable "
+                        "per-request cost.",
+                        "title": "generate",
+                    },
+                },
+                "title": "Custom attributes extraction method",
+                "type": "string",
+            },
         },
         "title": "EcommerceSpiderParams",
         "type": "object",
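
Note on the two new parameters above (illustrative, not part of the diff): custom_attrs_input is a JSON string whose content must be a JSON object, per the contentSchema shown, and custom_attrs_method is one of "generate" or "extract". A hypothetical pair of spider arguments could look like the sketch below; the attribute names and layout are made up, and the exact schema format is defined by Zyte API's custom attributes feature.

import json

# Hypothetical schema; attribute names and types are illustrative only.
custom_attrs_input = json.dumps(
    {
        "brand": {"type": "string", "description": "Brand of the product."},
        "warranty_months": {"type": "integer"},
    }
)
custom_attrs_method = "extract"  # or the default, "generate"
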
10 changes: 5 additions & 5 deletions tox.ini
@@ -20,13 +20,13 @@ commands =
 basepython = python3.9
 deps =
     {[testenv]deps}
-    pydantic==2
+    pydantic==2.1
     requests==0.10.1
     scrapy==2.11.0
-    scrapy-poet==0.21.0
-    scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.16.0
-    zyte-common-items==0.22.0
+    scrapy-poet==0.23.0
+    scrapy-spider-metadata==0.2.0
+    scrapy-zyte-api[provider]==0.23.0
+    zyte-common-items==0.23.0
 
 [testenv:mypy]
 deps =
59 changes: 57 additions & 2 deletions zyte_spider_templates/params.py
@@ -2,10 +2,17 @@
 import re
 from enum import Enum
 from logging import getLogger
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import requests
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    Json,
+    field_validator,
+    model_validator,
+)
 
 try:
     from pydantic.config import JsonDict
@@ -34,6 +41,18 @@ class ExtractFrom(str, Enum):
     """Use browser rendering. Often provides the best quality."""
 
 
+@document_enum
+class CustomAttrsMethod(str, Enum):
+    generate: str = "generate"
+    """Use a generative model (LLM). The most powerful and versatile, but more
+    expensive, with variable per-request cost."""
+
+    extract: str = "extract"
+    """Use an extractive model (BERT). Supports only a subset of the schema (string,
+    integer and number), suited for extraction of short and clear fields, with a fixed
+    per-request cost."""
+
+
 class ExtractFromParam(BaseModel):
     extract_from: Optional[ExtractFrom] = Field(
         title="Extraction source",
@@ -304,3 +323,39 @@ def validate_location(
             return PostalAddress(**value)
 
         raise ValueError(f"{value!r} type {type(value)} is not a supported type")
+
+
+class CustomAttrsInputParam(BaseModel):
+    custom_attrs_input: Optional[Json[Dict[str, Any]]] = Field(
+        title="Custom attributes schema",
+        description="Custom attributes to extract.",
+        default=None,
+        json_schema_extra={
+            "widget": "custom-attrs",
+        },
+    )
+
+
+class CustomAttrsMethodParam(BaseModel):
+    custom_attrs_method: CustomAttrsMethod = Field(
+        title="Custom attributes extraction method",
+        description="Which model to use for custom attribute extraction.",
+        default=CustomAttrsMethod.generate,
+        json_schema_extra={
+            "enumMeta": {
+                CustomAttrsMethod.generate: {
+                    "title": "generate",
+                    "description": "Use a generative model (LLM). The most powerful "
+                    "and versatile, but more expensive, with variable "
+                    "per-request cost.",
+                },
+                CustomAttrsMethod.extract: {
+                    "title": "extract",
+                    "description": "Use an extractive model (BERT). Supports only a "
+                    "subset of the schema (string, integer and "
+                    "number), suited for extraction of short and clear "
+                    "fields, with a fixed per-request cost.",
+                },
+            },
+        },
+    )
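
Note: as a quick sketch of how these two models behave (using only names that appear in the diff; the schema value is illustrative), pydantic's Json type parses an incoming JSON string into the annotated Dict[str, Any], and the enum field coerces plain strings to CustomAttrsMethod:

from zyte_spider_templates.params import (
    CustomAttrsInputParam,
    CustomAttrsMethod,
    CustomAttrsMethodParam,
)

# Json[Dict[str, Any]] accepts a JSON string and yields the parsed dict.
params = CustomAttrsInputParam(custom_attrs_input='{"brand": {"type": "string"}}')
assert params.custom_attrs_input == {"brand": {"type": "string"}}

# Plain strings are coerced to the enum; the default is CustomAttrsMethod.generate.
method_params = CustomAttrsMethodParam(custom_attrs_method="extract")
assert method_params.custom_attrs_method is CustomAttrsMethod.extract
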
23 changes: 22 additions & 1 deletion zyte_spider_templates/spiders/base.py
@@ -1,10 +1,12 @@
 from importlib.metadata import version
-from typing import Any, Dict
+from typing import Annotated, Any, Dict
 from warnings import warn
 
 import scrapy
 from pydantic import BaseModel, ConfigDict, model_validator
 from scrapy.crawler import Crawler
+from scrapy_zyte_api import custom_attrs
+from zyte_common_items import CustomAttributes
 
 from ..params import (
     INPUT_GROUP,
@@ -63,6 +65,8 @@ class BaseSpider(scrapy.Spider):
 
     _NEXT_PAGE_PRIORITY: int = 100
 
+    _custom_attrs_dep = None
+
     @classmethod
     def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
         spider = super().from_crawler(crawler, *args, **kwargs)
@@ -86,4 +90,21 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
             spider.args.max_requests,
             priority=ARG_SETTING_PRIORITY,
         )
+
+        if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None):
+            custom_attrs_options = {
+                "method": spider.args.custom_attrs_method,
+            }
+            if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"):
+                custom_attrs_options["maxInputTokens"] = max_input_tokens
+            if max_output_tokens := crawler.settings.getint(
+                "ZYTE_API_MAX_OUTPUT_TOKENS"
+            ):
+                custom_attrs_options["maxOutputTokens"] = max_output_tokens
+
+            spider._custom_attrs_dep = Annotated[
+                CustomAttributes,
+                custom_attrs(custom_attrs_input, custom_attrs_options),
+            ]
+
         return spider
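
Note: the effect of the block above, sketched with inline stand-in values (at runtime the schema and options come from spider.args.custom_attrs_input and spider.args.custom_attrs_method):

from typing import Annotated

from scrapy_zyte_api import custom_attrs
from zyte_common_items import CustomAttributes

# Illustrative stand-ins for the parsed spider arguments.
schema = {"brand": {"type": "string"}}
options = {"method": "generate"}

# from_crawler stores this annotated type on the spider; requests that
# carry it in meta["inject"] get a CustomAttributes dependency built by
# the scrapy-zyte-api provider (see get_parse_product_request below).
dep = Annotated[CustomAttributes, custom_attrs(schema, options)]
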
51 changes: 38 additions & 13 deletions zyte_spider_templates/spiders/ecommerce.py
@@ -2,12 +2,18 @@
 from typing import Any, Callable, Dict, Iterable, Optional, Union
 
 import scrapy
+from andi.typeutils import strip_annotated
 from pydantic import BaseModel, ConfigDict, Field
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
-from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
+from zyte_common_items import (
+    CustomAttributes,
+    ProbabilityRequest,
+    Product,
+    ProductNavigation,
+)
 
 from zyte_spider_templates.heuristics import is_homepage
 from zyte_spider_templates.params import parse_input_params
@@ -20,6 +26,8 @@
 
 from ..documentation import document_enum
 from ..params import (
+    CustomAttrsInputParam,
+    CustomAttrsMethodParam,
     ExtractFromParam,
     GeolocationParam,
     MaxRequestsParam,
@@ -61,7 +69,7 @@ class EcommerceCrawlStrategy(str, Enum):
 
 class EcommerceCrawlStrategyParam(BaseModel):
     crawl_strategy: EcommerceCrawlStrategy = Field(
-        title="Crawl Strategy",
+        title="Crawl strategy",
         description="Determines how the start URL and follow-up URLs are crawled.",
         default=EcommerceCrawlStrategy.automatic,
         json_schema_extra={
@@ -110,6 +118,8 @@ class EcommerceCrawlStrategyParam(BaseModel):
 
 
 class EcommerceSpiderParams(
+    CustomAttrsMethodParam,
+    CustomAttrsInputParam,
     ExtractFromParam,
     MaxRequestsParam,
     GeolocationParam,
@@ -227,13 +237,23 @@ def parse_navigation(
             yield self.get_subcategory_request(request, page_params=page_params)
 
     def parse_product(
-        self, response: DummyResponse, product: Product
-    ) -> Iterable[Product]:
+        self, response: DummyResponse, product: Product, dynamic: DynamicDeps
+    ) -> Iterable[
+        Union[Product, Dict[str, Union[Product, Optional[CustomAttributes]]]]
+    ]:
         probability = product.get_probability()
 
         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
-            yield product
+            if self.args.custom_attrs_input:
+                custom_attrs = None
+                for cls, value in dynamic.items():
+                    if strip_annotated(cls) is CustomAttributes:
+                        custom_attrs = value
+                        break
+                yield {"product": product, "customAttributes": custom_attrs}
+            else:
+                yield product
         else:
             self.crawler.stats.inc_value("drop_item/product/low_probability")
             self.logger.info(
@@ -319,17 +339,22 @@ def get_parse_product_request(
         priority = self.get_parse_product_request_priority(request)
 
         probability = request.get_probability()
+        meta = {
+            "crawling_logs": {
+                "name": request.name,
+                "probability": probability,
+                "page_type": "product",
+            },
+        }
+        if self._custom_attrs_dep:
+            meta["inject"] = [
+                self._custom_attrs_dep,
+            ]
 
         scrapy_request = request.to_scrapy(
             callback=callback,
             priority=priority,
-            meta={
-                "crawling_logs": {
-                    "name": request.name,
-                    "probability": probability,
-                    "page_type": "product",
-                }
-            },
+            meta=meta,
         )
         scrapy_request.meta["allow_offsite"] = True
         return scrapy_request
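
Note: to unpack the DynamicDeps lookup in parse_product above (a sketch using only names from the diff), DynamicDeps acts as a mapping from requested dependency types to built instances, and because the custom attributes dependency is registered under an Annotated alias, keys must be stripped of annotations before comparison:

from typing import Optional

from andi.typeutils import strip_annotated
from scrapy_poet import DynamicDeps
from zyte_common_items import CustomAttributes

def find_custom_attrs(dynamic: DynamicDeps) -> Optional[CustomAttributes]:
    # Keys may be Annotated[CustomAttributes, ...] aliases, so compare
    # the stripped type rather than the raw key.
    for cls, value in dynamic.items():
        if strip_annotated(cls) is CustomAttributes:
            return value
    return None
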
