From 4f8b2c8f6cd857b8759c26326e8527ecdfc44813 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Thu, 19 Sep 2024 20:55:29 +0500
Subject: [PATCH 1/7] Custom attributes extraction support.

---
 setup.py                                   |  7 +--
 tests/test_ecommerce.py                    | 35 ++++++++++--
 tox.ini                                    |  4 +-
 zyte_spider_templates/params.py            | 48 +++++++++++++++++
 zyte_spider_templates/spiders/ecommerce.py | 62 +++++++++++++++++-----
 5 files changed, 136 insertions(+), 20 deletions(-)

diff --git a/setup.py b/setup.py
index 61845a1..5b2fce8 100644
--- a/setup.py
+++ b/setup.py
@@ -15,10 +15,11 @@
         "pydantic>=2",
         "requests>=0.10.1",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.21.0",
+        "scrapy-poet>=0.23.0",
         "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.16.0",
-        "zyte-common-items>=0.22.0",
+        # https://github.com/scrapy-plugins/scrapy-zyte-api/pull/213
+        "scrapy-zyte-api[provider] @ git+https://github.com/scrapy-plugins/scrapy-zyte-api.git@custom-attrs",
+        "zyte-common-items>=0.23.0",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index ae77049..9bdd645 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -5,7 +5,7 @@
 import requests
 import scrapy
 from pydantic import ValidationError
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
@@ -243,7 +243,7 @@ def test_parse_product(probability, has_item, item_drop, caplog):
     mock_crawler = MagicMock()
     spider.crawler = mock_crawler
     logging.getLogger().setLevel(logging.INFO)
-    items = list(spider.parse_product(response, product))
+    items = list(spider.parse_product(response, product, DynamicDeps()))
     if item_drop:
         assert mock_crawler.method_calls == [
             call.stats.inc_value("drop_item/product/low_probability")
@@ -251,7 +251,7 @@ def test_parse_product(probability, has_item, item_drop, caplog):
         ]
 
     if has_item:
         assert len(items) == 1
-        assert items[0] == product
+        assert items[0] == {"product": product}
         assert caplog.text == ""
     else:
         assert len(items) == 0
@@ -528,6 +528,35 @@ def test_metadata():
                 "title": "Extraction source",
                 "enum": ["httpResponseBody", "browserHtml"],
             },
+            "custom_attrs_input": {
+                "anyOf": [{"type": "string"}, {"type": "null"}],
+                "default": None,
+                "description": "Custom attributes to extract.",
+                "title": "Custom attributes schema",
+                "widget": "custom-attrs",
+            },
+            "custom_attrs_method": {
+                "default": "generate",
+                "description": "Which model to use for custom attribute extraction.",
+                "enum": ["generate", "extract"],
+                "enumMeta": {
+                    "extract": {
+                        "description": "Use an extractive model (BERT). Supports only a "
+                        "subset of the schema (string, integer and "
+                        "number), suited for extraction of short and clear "
+                        "fields, with a fixed per-request cost.",
+                        "title": "extract",
+                    },
+                    "generate": {
+                        "description": "Use a generative model (LLM). The most powerful "
+                        "and versatile, but more expensive, with variable "
+                        "per-request cost.",
+                        "title": "generate",
+                    },
+                },
+                "title": "Custom Attrs Method",
+                "type": "string",
+            },
         },
         "title": "EcommerceSpiderParams",
         "type": "object",
diff --git a/tox.ini b/tox.ini
index a88f936..d7982c7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -23,10 +23,10 @@ deps =
     pydantic==2
     requests==0.10.1
     scrapy==2.11.0
-    scrapy-poet==0.21.0
+    scrapy-poet==0.23.0
     scrapy-spider-metadata==0.1.2
     scrapy-zyte-api[provider]==0.16.0
-    zyte-common-items==0.22.0
+    zyte-common-items==0.23.0
 
 [testenv:mypy]
 deps =
diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
index f3190ab..7e44950 100644
--- a/zyte_spider_templates/params.py
+++ b/zyte_spider_templates/params.py
@@ -34,6 +34,18 @@ class ExtractFrom(str, Enum):
     """Use browser rendering. Often provides the best quality."""
 
 
+@document_enum
+class CustomAttrsMethod(str, Enum):
+    generate: str = "generate"
+    """Use a generative model (LLM). The most powerful and versatile, but more
+    expensive, with variable per-request cost."""
+
+    extract: str = "extract"
+    """Use an extractive model (BERT). Supports only a subset of the schema (string,
+    integer and number), suited for extraction of short and clear fields, with a fixed
+    per-request cost."""
+
+
 class ExtractFromParam(BaseModel):
     extract_from: Optional[ExtractFrom] = Field(
         title="Extraction source",
@@ -304,3 +316,39 @@ def validate_location(
         return PostalAddress(**value)
 
     raise ValueError(f"{value!r} type {type(value)} is not a supported type")
+
+
+class CustomAttrsInputParam(BaseModel):
+    custom_attrs_input: Optional[str] = Field(
+        title="Custom attributes schema",
+        description="Custom attributes to extract.",
+        default=None,
+        json_schema_extra={
+            "widget": "custom-attrs",
+        },
+    )
+
+
+class CustomAttrsMethodParam(BaseModel):
+    custom_attrs_method: CustomAttrsMethod = Field(
+        title="Custom attributes extraction method",
+        description="Which model to use for custom attribute extraction.",
+        default=CustomAttrsMethod.generate,
+        json_schema_extra={
+            "enumMeta": {
+                CustomAttrsMethod.generate: {
+                    "title": "generate",
+                    "description": "Use a generative model (LLM). The most powerful "
+                    "and versatile, but more expensive, with variable "
+                    "per-request cost.",
+                },
+                CustomAttrsMethod.extract: {
+                    "title": "extract",
+                    "description": "Use an extractive model (BERT). Supports only a "
+                    "subset of the schema (string, integer and "
+                    "number), suited for extraction of short and clear "
+                    "fields, with a fixed per-request cost.",
+                },
+            },
+        },
+    )
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index 3868649..a10ea0e 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -1,13 +1,22 @@
+import json
 from enum import Enum
-from typing import Any, Callable, Dict, Iterable, Optional, Union
+from json import JSONDecodeError
+from typing import Annotated, Any, Callable, Dict, Iterable, Optional, Union
 
 import scrapy
+from andi.typeutils import strip_annotated
 from pydantic import BaseModel, ConfigDict, Field
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
-from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
+from scrapy_zyte_api import custom_attrs
+from zyte_common_items import (
+    CustomAttributesValues,
+    ProbabilityRequest,
+    Product,
+    ProductNavigation,
+)
 
 from zyte_spider_templates.heuristics import is_homepage
 from zyte_spider_templates.params import parse_input_params
@@ -20,6 +29,8 @@
 
 from ..documentation import document_enum
 from ..params import (
+    CustomAttrsInputParam,
+    CustomAttrsMethodParam,
     ExtractFromParam,
     GeolocationParam,
     MaxRequestsParam,
@@ -110,6 +121,8 @@ class EcommerceCrawlStrategyParam(BaseModel):
 
 
 class EcommerceSpiderParams(
+    CustomAttrsMethodParam,
+    CustomAttrsInputParam,
     ExtractFromParam,
     MaxRequestsParam,
     GeolocationParam,
@@ -227,13 +240,19 @@ def parse_navigation(
         yield self.get_subcategory_request(request, page_params=page_params)
 
     def parse_product(
-        self, response: DummyResponse, product: Product
+        self, response: DummyResponse, product: Product, dynamic: DynamicDeps
     ) -> Iterable[Product]:
         probability = product.get_probability()
 
         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
-            yield product
+            for cls, value in dynamic.items():
+                cls = strip_annotated(cls)
+                if cls is CustomAttributesValues:
+                    yield {"product": product, "custom_attrs": value}
+                    break
+            else:
+                yield {"product": product}
         else:
             self.crawler.stats.inc_value("drop_item/product/low_probability")
             self.logger.info(
@@ -319,17 +338,36 @@ def get_parse_product_request(
         priority = self.get_parse_product_request_priority(request)
         probability = request.get_probability()
 
+        meta = {
+            "crawling_logs": {
+                "name": request.name,
+                "probability": probability,
+                "page_type": "product",
+            },
+        }
+        if self.args.custom_attrs_input:
+            custom_attrs_options = {
+                "method": self.args.custom_attrs_method,
+            }
+            if max_input_tokens := self.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"):
+                custom_attrs_options["maxInputTokens"] = max_input_tokens
+            if max_output_tokens := self.settings.getint("ZYTE_API_MAX_OUTPUT_TOKENS"):
+                custom_attrs_options["maxOutputTokens"] = max_output_tokens
+
+            try:
+                custom_attrs_input = json.loads(self.args.custom_attrs_input)
+            except JSONDecodeError as e:
+                self.logger.error(f"Invalid JSON passed in custom_attrs_input: {e}")
+            else:
+                annotation = custom_attrs(custom_attrs_input, custom_attrs_options)
+                meta["inject"] = [
+                    Annotated[CustomAttributesValues, annotation],  # FIXME Python < 3.9
+                ]
+
         scrapy_request = request.to_scrapy(
             callback=callback,
             priority=priority,
-            meta={
-                "crawling_logs": {
-                    "name": request.name,
-                    "probability": probability,
-                    "page_type": "product",
-                }
-            },
+            meta=meta,
         )
         scrapy_request.meta["allow_offsite"] = True
         return scrapy_request

From fc40279dcb8fe07f6980fa2587533e58b885adc0 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Mon, 23 Sep 2024 18:36:16 +0500
Subject: [PATCH 2/7] Address review feedback.

---
 setup.py                                   |  2 +-
 tests/test_ecommerce.py                    |  2 +-
 zyte_spider_templates/spiders/base.py      | 30 +++++++++++++++-
 zyte_spider_templates/spiders/ecommerce.py | 41 +++++++---------------
 4 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/setup.py b/setup.py
index 5b2fce8..252c4e1 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
         "scrapy-poet>=0.23.0",
         "scrapy-spider-metadata>=0.1.2",
         # https://github.com/scrapy-plugins/scrapy-zyte-api/pull/213
-        "scrapy-zyte-api[provider] @ git+https://github.com/scrapy-plugins/scrapy-zyte-api.git@custom-attrs",
+        "scrapy-zyte-api[provider] @ git+https://github.com/scrapy-plugins/scrapy-zyte-api.git@47d3e1f",
         "zyte-common-items>=0.23.0",
     ],
     classifiers=[
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index 9bdd645..387402a 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -251,7 +251,7 @@ def test_parse_product(probability, has_item, item_drop, caplog):
 
     if has_item:
         assert len(items) == 1
-        assert items[0] == {"product": product}
+        assert items[0] == product
         assert caplog.text == ""
     else:
         assert len(items) == 0
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index deb00ee..a29cf73 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,10 +1,14 @@
+import json
 from importlib.metadata import version
-from typing import Any, Dict
+from json import JSONDecodeError
+from typing import Annotated, Any, Dict
 from warnings import warn
 
 import scrapy
 from pydantic import BaseModel, ConfigDict, model_validator
 from scrapy.crawler import Crawler
+from scrapy_zyte_api import custom_attrs
+from zyte_common_items import CustomAttributesValues
 
 from ..params import (
     INPUT_GROUP,
@@ -63,6 +67,8 @@ class BaseSpider(scrapy.Spider):
 
     _NEXT_PAGE_PRIORITY: int = 100
 
+    _custom_attrs_dep = None
+
     @classmethod
     def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
         spider = super().from_crawler(crawler, *args, **kwargs)
@@ -86,4 +92,26 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
                 spider.args.max_requests,
                 priority=ARG_SETTING_PRIORITY,
             )
+
+        if custom_attrs_input_arg := getattr(spider.args, "custom_attrs_input", None):
+            custom_attrs_options = {
+                "method": spider.args.custom_attrs_method,
+            }
+            if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"):
+                custom_attrs_options["maxInputTokens"] = max_input_tokens
+            if max_output_tokens := crawler.settings.getint(
+                "ZYTE_API_MAX_OUTPUT_TOKENS"
+            ):
+                custom_attrs_options["maxOutputTokens"] = max_output_tokens
+
+            try:
+                custom_attrs_input = json.loads(custom_attrs_input_arg)
+            except JSONDecodeError as e:
+                spider.logger.error(f"Invalid JSON passed in custom_attrs_input: {e}")
+            else:
+                spider._custom_attrs_dep = Annotated[
+                    CustomAttributesValues,
+                    custom_attrs(custom_attrs_input, custom_attrs_options),
+                ]
+
         return spider
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index a10ea0e..d59de31 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -1,7 +1,5 @@
-import json
 from enum import Enum
-from json import JSONDecodeError
-from typing import Annotated, Any, Callable, Dict, Iterable, Optional, Union
+from typing import Any, Callable, Dict, Iterable, Optional, Union
 
 import scrapy
 from andi.typeutils import strip_annotated
@@ -10,7 +8,6 @@
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
-from scrapy_zyte_api import custom_attrs
 from zyte_common_items import (
     CustomAttributesValues,
     ProbabilityRequest,
@@ -246,13 +243,15 @@ def parse_product(
         probability = product.get_probability()
 
         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
-            for cls, value in dynamic.items():
-                cls = strip_annotated(cls)
-                if cls is CustomAttributesValues:
-                    yield {"product": product, "custom_attrs": value}
-                    break
+            if self.args.custom_attrs_input:
+                custom_attr_values = {}
+                for cls, value in dynamic.items():
+                    if strip_annotated(cls) is CustomAttributesValues:
+                        custom_attr_values = value
+                        break
+                yield {"product": product, "customAttributeValues": custom_attr_values}
             else:
-                yield {"product": product}
+                yield product
         else:
             self.crawler.stats.inc_value("drop_item/product/low_probability")
             self.logger.info(
@@ -345,24 +344,10 @@ def get_parse_product_request(
                 "page_type": "product",
             },
         }
-        if self.args.custom_attrs_input:
-            custom_attrs_options = {
-                "method": self.args.custom_attrs_method,
-            }
-            if max_input_tokens := self.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"):
-                custom_attrs_options["maxInputTokens"] = max_input_tokens
-            if max_output_tokens := self.settings.getint("ZYTE_API_MAX_OUTPUT_TOKENS"):
-                custom_attrs_options["maxOutputTokens"] = max_output_tokens
-
-            try:
-                custom_attrs_input = json.loads(self.args.custom_attrs_input)
-            except JSONDecodeError as e:
-                self.logger.error(f"Invalid JSON passed in custom_attrs_input: {e}")
-            else:
-                annotation = custom_attrs(custom_attrs_input, custom_attrs_options)
-                meta["inject"] = [
-                    Annotated[CustomAttributesValues, annotation],  # FIXME Python < 3.9
-                ]
+        if self._custom_attrs_dep:
+            meta["inject"] = [
+                self._custom_attrs_dep,
+            ]
 
         scrapy_request = request.to_scrapy(
             callback=callback,

From a0e2dec5ffd9ed062e482b2c86c1513a61a9a9fa Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Tue, 24 Sep 2024 17:47:36 +0500
Subject: [PATCH 3/7] Emit full CustomAttributes.

---
 zyte_spider_templates/spiders/base.py      |  4 ++--
 zyte_spider_templates/spiders/ecommerce.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index a29cf73..48777f0 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -8,7 +8,7 @@
 from pydantic import BaseModel, ConfigDict, model_validator
 from scrapy.crawler import Crawler
 from scrapy_zyte_api import custom_attrs
-from zyte_common_items import CustomAttributesValues
+from zyte_common_items import CustomAttributes
 
 from ..params import (
     INPUT_GROUP,
@@ -110,7 +110,7 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
                 spider.logger.error(f"Invalid JSON passed in custom_attrs_input: {e}")
             else:
                 spider._custom_attrs_dep = Annotated[
-                    CustomAttributesValues,
+                    CustomAttributes,
                     custom_attrs(custom_attrs_input, custom_attrs_options),
                 ]
 
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index d59de31..5740493 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -9,7 +9,7 @@
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from zyte_common_items import (
-    CustomAttributesValues,
+    CustomAttributes,
     ProbabilityRequest,
     Product,
     ProductNavigation,
@@ -244,12 +244,12 @@ def parse_product(
 
         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
             if self.args.custom_attrs_input:
-                custom_attr_values = {}
+                custom_attrs = {}
                 for cls, value in dynamic.items():
-                    if strip_annotated(cls) is CustomAttributesValues:
-                        custom_attr_values = value
+                    if strip_annotated(cls) is CustomAttributes:
+                        custom_attrs = value
                         break
-                yield {"product": product, "customAttributeValues": custom_attr_values}
+                yield {"product": product, "customAttributes": custom_attrs}
             else:
                 yield product

From bbd984a7ff5a232f84d1d9361fada4f5f1490ae5 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Thu, 26 Sep 2024 18:02:14 +0500
Subject: [PATCH 4/7] Update scrapy-spider-metadata and scrapy-zyte-api versions.

---
 setup.py                                   | 5 ++---
 tests/test_ecommerce.py                    | 4 ++--
 tox.ini                                    | 4 ++--
 zyte_spider_templates/spiders/ecommerce.py | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index 8344c88..b5a165c 100644
--- a/setup.py
+++ b/setup.py
@@ -16,9 +16,8 @@
         "requests>=0.10.1",
         "scrapy>=2.11.0",
         "scrapy-poet>=0.23.0",
-        "scrapy-spider-metadata>=0.1.2",
-        # https://github.com/scrapy-plugins/scrapy-zyte-api/pull/213
-        "scrapy-zyte-api[provider] @ git+https://github.com/scrapy-plugins/scrapy-zyte-api.git@47d3e1f",
+        "scrapy-spider-metadata>=0.2.0",
+        "scrapy-zyte-api[provider]>=0.23.0",
         "zyte-common-items>=0.23.0",
     ],
     classifiers=[
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index 387402a..1cff254 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -463,7 +463,7 @@ def test_metadata():
                     "title": "Pagination Only",
                 },
             },
-            "title": "Crawl Strategy",
+            "title": "Crawl strategy",
             "enum": [
                 "automatic",
                 "full",
@@ -554,7 +554,7 @@ def test_metadata():
                         "title": "generate",
                     },
                 },
-                "title": "Custom Attrs Method",
+                "title": "Custom attributes extraction method",
                 "type": "string",
             },
         },
diff --git a/tox.ini b/tox.ini
index 7ee70bf..f10ac67 100644
--- a/tox.ini
+++ b/tox.ini
@@ -24,8 +24,8 @@ deps =
     requests==0.10.1
     scrapy==2.11.0
     scrapy-poet==0.23.0
-    scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.16.0
+    scrapy-spider-metadata==0.2.0
+    scrapy-zyte-api[provider]==0.23.0
     zyte-common-items==0.23.0
 
 [testenv:mypy]
 deps =
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index 5740493..a1d5530 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -69,7 +69,7 @@ class EcommerceCrawlStrategy(str, Enum):
 
 class EcommerceCrawlStrategyParam(BaseModel):
     crawl_strategy: EcommerceCrawlStrategy = Field(
-        title="Crawl Strategy",
+        title="Crawl strategy",
         description="Determines how the start URL and follow-up URLs are crawled.",
         default=EcommerceCrawlStrategy.automatic,
         json_schema_extra={

From a42c45b5e00d358661d36bd22c7fdc69bdeeea88 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Thu, 26 Sep 2024 19:58:49 +0500
Subject: [PATCH 5/7] Update parse_product() typing.

---
 zyte_spider_templates/spiders/ecommerce.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index a1d5530..7f12148 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -238,13 +238,15 @@ def parse_navigation(
 
     def parse_product(
         self, response: DummyResponse, product: Product, dynamic: DynamicDeps
-    ) -> Iterable[Product]:
+    ) -> Iterable[
+        Union[Product, Dict[str, Union[Product, Optional[CustomAttributes]]]]
+    ]:
         probability = product.get_probability()
 
         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
             if self.args.custom_attrs_input:
-                custom_attrs = {}
+                custom_attrs = None
                 for cls, value in dynamic.items():
                     if strip_annotated(cls) is CustomAttributes:
                         custom_attrs = value

From 3aea5696beff85cb75591d59d5281b0eba253d2a Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Thu, 26 Sep 2024 20:54:16 +0500
Subject: [PATCH 6/7] Use Pydantic JSON validation for custom_attrs_input.

---
 tests/test_ecommerce.py               |  9 ++++++++-
 zyte_spider_templates/params.py       | 13 ++++++++++---
 zyte_spider_templates/spiders/base.py | 17 +++++------------
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index 1cff254..23bb15a 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -529,7 +529,14 @@ def test_metadata():
                 "enum": ["httpResponseBody", "browserHtml"],
             },
             "custom_attrs_input": {
-                "anyOf": [{"type": "string"}, {"type": "null"}],
+                "anyOf": [
+                    {
+                        "contentMediaType": "application/json",
+                        "contentSchema": {"type": "object"},
+                        "type": "string",
+                    },
+                    {"type": "null"},
+                ],
                 "default": None,
                 "description": "Custom attributes to extract.",
                 "title": "Custom attributes schema",
diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
index 7e44950..688d7e8 100644
--- a/zyte_spider_templates/params.py
+++ b/zyte_spider_templates/params.py
@@ -2,10 +2,17 @@
 import re
 from enum import Enum
 from logging import getLogger
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import requests
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    Json,
+    field_validator,
+    model_validator,
+)
 
 try:
     from pydantic.config import JsonDict
@@ -319,7 +326,7 @@ def validate_location(
 
 
 class CustomAttrsInputParam(BaseModel):
-    custom_attrs_input: Optional[str] = Field(
+    custom_attrs_input: Optional[Json[Dict[str, Any]]] = Field(
         title="Custom attributes schema",
         description="Custom attributes to extract.",
         default=None,
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index 48777f0..e6e78f4 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,6 +1,4 @@
-import json
 from importlib.metadata import version
-from json import JSONDecodeError
 from typing import Annotated, Any, Dict
 from warnings import warn
 
@@ -91,7 +89,7 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
                 priority=ARG_SETTING_PRIORITY,
             )
 
-        if custom_attrs_input_arg := getattr(spider.args, "custom_attrs_input", None):
+        if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None):
             custom_attrs_options = {
                 "method": spider.args.custom_attrs_method,
             }
@@ -102,14 +100,9 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
             ):
                 custom_attrs_options["maxOutputTokens"] = max_output_tokens
 
-            try:
-                custom_attrs_input = json.loads(custom_attrs_input_arg)
-            except JSONDecodeError as e:
-                spider.logger.error(f"Invalid JSON passed in custom_attrs_input: {e}")
-            else:
-                spider._custom_attrs_dep = Annotated[
-                    CustomAttributes,
-                    custom_attrs(custom_attrs_input, custom_attrs_options),
-                ]
+            spider._custom_attrs_dep = Annotated[
+                CustomAttributes,
+                custom_attrs(custom_attrs_input, custom_attrs_options),
+            ]
 
         return spider

From faa382a4e88f06dd8ed64627395fb6ae43be4680 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Mon, 30 Sep 2024 15:16:59 +0500
Subject: [PATCH 7/7] Bump Pydantic to 2.1.

---
 setup.py | 2 +-
 tox.ini  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index b5a165c..a358315 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     packages=find_packages(),
    include_package_data=True,
     install_requires=[
-        "pydantic>=2",
+        "pydantic>=2.1",
         "requests>=0.10.1",
         "scrapy>=2.11.0",
         "scrapy-poet>=0.23.0",
diff --git a/tox.ini b/tox.ini
index f10ac67..19b8859 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,7 +20,7 @@ commands =
 basepython = python3.9
 deps =
     {[testenv]deps}
-    pydantic==2
+    pydantic==2.1
     requests==0.10.1
     scrapy==2.11.0
     scrapy-poet==0.23.0
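
Usage sketch (not one of the patches above): the series adds two spider arguments
(custom_attrs_input, custom_attrs_method) and reads two optional settings
(ZYTE_API_MAX_INPUT_TOKENS, ZYTE_API_MAX_OUTPUT_TOKENS). The snippet below shows
how they might be wired together. It assumes a project already configured for
scrapy-poet and the scrapy-zyte-api provider (that setup is omitted here), and
the URL and attribute schema are illustrative only — the patches validate merely
that custom_attrs_input is a JSON object.

    # Hypothetical example; URL, schema contents and token caps are assumptions.
    from scrapy.crawler import CrawlerProcess

    from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

    process = CrawlerProcess(
        settings={
            # Optional caps read in BaseSpider.from_crawler() (PATCH 2):
            "ZYTE_API_MAX_INPUT_TOKENS": 2000,
            "ZYTE_API_MAX_OUTPUT_TOKENS": 100,
        }
    )
    process.crawl(
        EcommerceSpider,
        url="https://example.com",
        # Must parse as a JSON object; enforced by Json[Dict[str, Any]] (PATCH 6).
        custom_attrs_input='{"brand": {"type": "string"}}',
        # "generate" (LLM, default) or "extract" (BERT), per CustomAttrsMethod.
        custom_attrs_method="extract",
    )
    process.start()

With custom_attrs_input set, parse_product() yields {"product": ..., "customAttributes": ...}
dicts instead of bare Product items (PATCH 3/5), and invalid JSON fails at
argument-validation time after PATCH 6 rather than being logged per request as in PATCH 1.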