Commit 9fe01af

Merge pull request #69 from zytedata/custom-attrs-ecommerce
Custom attributes extraction
2 parents 72240e4 + faa382a

6 files changed (+166, -29 lines)

setup.py
Lines changed: 5 additions & 5 deletions

@@ -12,13 +12,13 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        "pydantic>=2",
+        "pydantic>=2.1",
         "requests>=0.10.1",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.21.0",
-        "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.16.0",
-        "zyte-common-items>=0.22.0",
+        "scrapy-poet>=0.23.0",
+        "scrapy-spider-metadata>=0.2.0",
+        "scrapy-zyte-api[provider]>=0.23.0",
+        "zyte-common-items>=0.23.0",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",

tests/test_ecommerce.py
Lines changed: 39 additions & 3 deletions

@@ -5,7 +5,7 @@
 import requests
 import scrapy
 from pydantic import ValidationError
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request

@@ -243,7 +243,7 @@ def test_parse_product(probability, has_item, item_drop, caplog):
     mock_crawler = MagicMock()
     spider.crawler = mock_crawler
     logging.getLogger().setLevel(logging.INFO)
-    items = list(spider.parse_product(response, product))
+    items = list(spider.parse_product(response, product, DynamicDeps()))
     if item_drop:
         assert mock_crawler.method_calls == [
             call.stats.inc_value("drop_item/product/low_probability")

@@ -463,7 +463,7 @@ def test_metadata():
                         "title": "Pagination Only",
                     },
                 },
-                "title": "Crawl Strategy",
+                "title": "Crawl strategy",
                 "enum": [
                     "automatic",
                     "full",

@@ -528,6 +528,42 @@ def test_metadata():
                 "title": "Extraction source",
                 "enum": ["httpResponseBody", "browserHtml"],
             },
+            "custom_attrs_input": {
+                "anyOf": [
+                    {
+                        "contentMediaType": "application/json",
+                        "contentSchema": {"type": "object"},
+                        "type": "string",
+                    },
+                    {"type": "null"},
+                ],
+                "default": None,
+                "description": "Custom attributes to extract.",
+                "title": "Custom attributes schema",
+                "widget": "custom-attrs",
+            },
+            "custom_attrs_method": {
+                "default": "generate",
+                "description": "Which model to use for custom attribute extraction.",
+                "enum": ["generate", "extract"],
+                "enumMeta": {
+                    "extract": {
+                        "description": "Use an extractive model (BERT). Supports only a "
+                        "subset of the schema (string, integer and "
+                        "number), suited for extraction of short and clear "
+                        "fields, with a fixed per-request cost.",
+                        "title": "extract",
+                    },
+                    "generate": {
+                        "description": "Use a generative model (LLM). The most powerful "
+                        "and versatile, but more expensive, with variable "
+                        "per-request cost.",
+                        "title": "generate",
+                    },
+                },
+                "title": "Custom attributes extraction method",
+                "type": "string",
+            },
         },
         "title": "EcommerceSpiderParams",
         "type": "object",

tox.ini
Lines changed: 5 additions & 5 deletions

@@ -20,13 +20,13 @@ commands =
 basepython = python3.9
 deps =
     {[testenv]deps}
-    pydantic==2
+    pydantic==2.1
     requests==0.10.1
     scrapy==2.11.0
-    scrapy-poet==0.21.0
-    scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.16.0
-    zyte-common-items==0.22.0
+    scrapy-poet==0.23.0
+    scrapy-spider-metadata==0.2.0
+    scrapy-zyte-api[provider]==0.23.0
+    zyte-common-items==0.23.0

 [testenv:mypy]
 deps =

zyte_spider_templates/params.py
Lines changed: 57 additions & 2 deletions

@@ -2,10 +2,17 @@
 import re
 from enum import Enum
 from logging import getLogger
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import requests
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    Json,
+    field_validator,
+    model_validator,
+)

 try:
     from pydantic.config import JsonDict

@@ -34,6 +41,18 @@ class ExtractFrom(str, Enum):
     """Use browser rendering. Often provides the best quality."""


+@document_enum
+class CustomAttrsMethod(str, Enum):
+    generate: str = "generate"
+    """Use a generative model (LLM). The most powerful and versatile, but more
+    expensive, with variable per-request cost."""
+
+    extract: str = "extract"
+    """Use an extractive model (BERT). Supports only a subset of the schema (string,
+    integer and number), suited for extraction of short and clear fields, with a fixed
+    per-request cost."""
+
+
 class ExtractFromParam(BaseModel):
     extract_from: Optional[ExtractFrom] = Field(
         title="Extraction source",

@@ -304,3 +323,39 @@ def validate_location(
         return PostalAddress(**value)

     raise ValueError(f"{value!r} type {type(value)} is not a supported type")
+
+
+class CustomAttrsInputParam(BaseModel):
+    custom_attrs_input: Optional[Json[Dict[str, Any]]] = Field(
+        title="Custom attributes schema",
+        description="Custom attributes to extract.",
+        default=None,
+        json_schema_extra={
+            "widget": "custom-attrs",
+        },
+    )
+
+
+class CustomAttrsMethodParam(BaseModel):
+    custom_attrs_method: CustomAttrsMethod = Field(
+        title="Custom attributes extraction method",
+        description="Which model to use for custom attribute extraction.",
+        default=CustomAttrsMethod.generate,
+        json_schema_extra={
+            "enumMeta": {
+                CustomAttrsMethod.generate: {
+                    "title": "generate",
+                    "description": "Use a generative model (LLM). The most powerful "
+                    "and versatile, but more expensive, with variable "
+                    "per-request cost.",
+                },
+                CustomAttrsMethod.extract: {
+                    "title": "extract",
+                    "description": "Use an extractive model (BERT). Supports only a "
+                    "subset of the schema (string, integer and "
+                    "number), suited for extraction of short and clear "
+                    "fields, with a fixed per-request cost.",
+                },
+            },
+        },
+    )

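The new CustomAttrsInputParam field relies on pydantic's Json type, which parses the incoming JSON string into a dict before validation. A minimal sketch of that behaviour, using a stripped-down stand-in for the model above (the widget metadata and the other spider parameters are omitted):

from typing import Any, Dict, Optional

from pydantic import BaseModel, Field, Json


class CustomAttrsInputSketch(BaseModel):
    # Stand-in for CustomAttrsInputParam: Json[...] parses the JSON string
    # passed as a spider argument into a plain dict.
    custom_attrs_input: Optional[Json[Dict[str, Any]]] = Field(default=None)


params = CustomAttrsInputSketch(
    custom_attrs_input='{"brand": {"type": "string"}, "rating": {"type": "number"}}'
)
print(params.custom_attrs_input)
# {'brand': {'type': 'string'}, 'rating': {'type': 'number'}}
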
zyte_spider_templates/spiders/base.py
Lines changed: 22 additions & 1 deletion

@@ -1,10 +1,12 @@
 from importlib.metadata import version
-from typing import Any, Dict
+from typing import Annotated, Any, Dict
 from warnings import warn

 import scrapy
 from pydantic import BaseModel, ConfigDict, model_validator
 from scrapy.crawler import Crawler
+from scrapy_zyte_api import custom_attrs
+from zyte_common_items import CustomAttributes

 from ..params import (
     INPUT_GROUP,

@@ -63,6 +65,8 @@ class BaseSpider(scrapy.Spider):

     _NEXT_PAGE_PRIORITY: int = 100

+    _custom_attrs_dep = None
+
     @classmethod
     def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
         spider = super().from_crawler(crawler, *args, **kwargs)

@@ -86,4 +90,21 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
                 spider.args.max_requests,
                 priority=ARG_SETTING_PRIORITY,
             )
+
+        if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None):
+            custom_attrs_options = {
+                "method": spider.args.custom_attrs_method,
+            }
+            if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"):
+                custom_attrs_options["maxInputTokens"] = max_input_tokens
+            if max_output_tokens := crawler.settings.getint(
+                "ZYTE_API_MAX_OUTPUT_TOKENS"
+            ):
+                custom_attrs_options["maxOutputTokens"] = max_output_tokens
+
+            spider._custom_attrs_dep = Annotated[
+                CustomAttributes,
+                custom_attrs(custom_attrs_input, custom_attrs_options),
+            ]
+
         return spider

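The options dictionary assembled in from_crawler above only forwards the token limits when the corresponding Zyte API settings are set. Restated as a standalone helper for illustration (build_custom_attrs_options is a hypothetical name introduced only for this sketch; the real code reads the values from spider.args and crawler.settings):

from typing import Any, Dict


def build_custom_attrs_options(
    method: str,
    max_input_tokens: int = 0,  # crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS")
    max_output_tokens: int = 0,  # crawler.settings.getint("ZYTE_API_MAX_OUTPUT_TOKENS")
) -> Dict[str, Any]:
    # Mirrors the from_crawler logic: token limits are included only when the
    # corresponding settings resolve to a non-zero value.
    options: Dict[str, Any] = {"method": method}
    if max_input_tokens:
        options["maxInputTokens"] = max_input_tokens
    if max_output_tokens:
        options["maxOutputTokens"] = max_output_tokens
    return options


print(build_custom_attrs_options("generate", max_output_tokens=256))
# {'method': 'generate', 'maxOutputTokens': 256}
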
zyte_spider_templates/spiders/ecommerce.py
Lines changed: 38 additions & 13 deletions

@@ -2,12 +2,18 @@
 from typing import Any, Callable, Dict, Iterable, Optional, Union

 import scrapy
+from andi.typeutils import strip_annotated
 from pydantic import BaseModel, ConfigDict, Field
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
-from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
+from zyte_common_items import (
+    CustomAttributes,
+    ProbabilityRequest,
+    Product,
+    ProductNavigation,
+)

 from zyte_spider_templates.heuristics import is_homepage
 from zyte_spider_templates.params import parse_input_params

@@ -20,6 +26,8 @@

 from ..documentation import document_enum
 from ..params import (
+    CustomAttrsInputParam,
+    CustomAttrsMethodParam,
     ExtractFromParam,
     GeolocationParam,
     MaxRequestsParam,

@@ -61,7 +69,7 @@ class EcommerceCrawlStrategy(str, Enum):

 class EcommerceCrawlStrategyParam(BaseModel):
     crawl_strategy: EcommerceCrawlStrategy = Field(
-        title="Crawl Strategy",
+        title="Crawl strategy",
         description="Determines how the start URL and follow-up URLs are crawled.",
         default=EcommerceCrawlStrategy.automatic,
         json_schema_extra={

@@ -110,6 +118,8 @@ class EcommerceCrawlStrategyParam(BaseModel):


 class EcommerceSpiderParams(
+    CustomAttrsMethodParam,
+    CustomAttrsInputParam,
     ExtractFromParam,
     MaxRequestsParam,
     GeolocationParam,

@@ -227,13 +237,23 @@ def parse_navigation(
             yield self.get_subcategory_request(request, page_params=page_params)

     def parse_product(
-        self, response: DummyResponse, product: Product
-    ) -> Iterable[Product]:
+        self, response: DummyResponse, product: Product, dynamic: DynamicDeps
+    ) -> Iterable[
+        Union[Product, Dict[str, Union[Product, Optional[CustomAttributes]]]]
+    ]:
         probability = product.get_probability()

         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
-            yield product
+            if self.args.custom_attrs_input:
+                custom_attrs = None
+                for cls, value in dynamic.items():
+                    if strip_annotated(cls) is CustomAttributes:
+                        custom_attrs = value
+                        break
+                yield {"product": product, "customAttributes": custom_attrs}
+            else:
+                yield product
         else:
             self.crawler.stats.inc_value("drop_item/product/low_probability")
             self.logger.info(

@@ -319,17 +339,22 @@ def get_parse_product_request(
         priority = self.get_parse_product_request_priority(request)

         probability = request.get_probability()
+        meta = {
+            "crawling_logs": {
+                "name": request.name,
+                "probability": probability,
+                "page_type": "product",
+            },
+        }
+        if self._custom_attrs_dep:
+            meta["inject"] = [
+                self._custom_attrs_dep,
+            ]

         scrapy_request = request.to_scrapy(
             callback=callback,
             priority=priority,
-            meta={
-                "crawling_logs": {
-                    "name": request.name,
-                    "probability": probability,
-                    "page_type": "product",
-                }
-            },
+            meta=meta,
         )
         scrapy_request.meta["allow_offsite"] = True
         return scrapy_request

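With the parameters wired up, custom attribute extraction can be requested like any other spider argument. A rough usage sketch, assuming a project already configured for Zyte API (credentials and the scrapy-zyte-api/scrapy-poet integration); the URL and attribute schema below are illustrative placeholders:

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

# Assumes Zyte API credentials and the scrapy-zyte-api integration are
# configured in the project settings; only spider arguments are shown here.
process = CrawlerProcess()
process.crawl(
    EcommerceSpider,
    url="https://books.toscrape.com",
    custom_attrs_input='{"publisher": {"type": "string"}}',
    custom_attrs_method="extract",
)
process.start()

When custom_attrs_input is set, parse_product yields {"product": ..., "customAttributes": ...} dicts instead of bare Product items, as shown in the diff above.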