From 19065d4cc50ab986b96c35342b02a72bd8687eaa Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Tue, 24 Dec 2024 20:26:30 +0500
Subject: [PATCH] Add custom attribute support to the article spider.

---
 tests/test_article.py                    | 83 ++++++++++++++++------
 zyte_spider_templates/spiders/article.py | 55 ++++++++++++----
 2 files changed, 103 insertions(+), 35 deletions(-)

diff --git a/tests/test_article.py b/tests/test_article.py
index 9a492c3..0d7d9c9 100644
--- a/tests/test_article.py
+++ b/tests/test_article.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Type, cast
+from typing import Iterable, Tuple, Type, cast
 from unittest.mock import patch
 
 import pytest
@@ -106,7 +106,7 @@ def test_crawl_strategy_direct_item():
     )
     start_requests = list(spider.start_requests())
     assert len(start_requests) == 1
-    assert start_requests[0].callback == cast(ArticleSpider, spider).parse_dynamic
+    assert start_requests[0].callback == spider.parse_dynamic
     assert start_requests[0].url == "https://example.com"
     assert start_requests[0].meta["request_type"] == RequestType.ARTICLE
     assert start_requests[0].meta["crawling_logs"]["name"] == "[article]"
@@ -235,7 +235,7 @@ def test_init_input_without_urls_file():
     crawler = get_crawler()
     base_kwargs = {"url": "https://example.com"}
     spider = ArticleSpider.from_crawler(crawler, **base_kwargs)
-    cast(ArticleSpider, spider)._init_input()
+    spider._init_input()
     assert spider.start_urls == ["https://example.com"]
 
 
@@ -413,6 +413,42 @@ def test_metadata():
                     "title": "Extraction source",
                     "enum": ["httpResponseBody", "browserHtml"],
                 },
+                "custom_attrs_input": {
+                    "anyOf": [
+                        {
+                            "contentMediaType": "application/json",
+                            "contentSchema": {"type": "object"},
+                            "type": "string",
+                        },
+                        {"type": "null"},
+                    ],
+                    "default": None,
+                    "description": "Custom attributes to extract.",
+                    "title": "Custom attributes schema",
+                    "widget": "custom-attrs",
+                },
+                "custom_attrs_method": {
+                    "default": "generate",
+                    "description": "Which model to use for custom attribute extraction.",
+                    "enum": ["generate", "extract"],
+                    "enumMeta": {
+                        "extract": {
+                            "description": "Use an extractive model (BERT). Supports only a "
+                            "subset of the schema (string, integer and "
+                            "number), suited for extraction of short and clear "
+                            "fields, with a fixed per-request cost.",
+                            "title": "extract",
+                        },
+                        "generate": {
+                            "description": "Use a generative model (LLM). The most powerful "
+                            "and versatile, but more expensive, with variable "
+                            "per-request cost.",
+                            "title": "generate",
+                        },
+                    },
+                    "title": "Custom attributes extraction method",
+                    "type": "string",
+                },
             },
             "title": "ArticleSpiderParams",
             "type": "object",
@@ -482,8 +518,9 @@ def test_crawl():
         subCategories=article_navigation_items["subCategories"],
     )
     requests = list(
-        cast(ArticleSpider, spider).parse_dynamic(
-            response, {ArticleNavigation: navigation}
+        cast(
+            Iterable[scrapy.Request],
+            spider.parse_dynamic(response, {ArticleNavigation: navigation}),
         )
     )
     assert requests[2].url == "https://example.com/link_4"
@@ -494,7 +531,7 @@ def test_crawl():
     )
     assert requests[2].meta["crawling_logs"]["page_type"] == "article"
     assert requests[2].meta["crawling_logs"]["probability"] == 0.99
-    assert requests[2].callback == cast(ArticleSpider, spider).parse_dynamic
+    assert requests[2].callback == spider.parse_dynamic
 
     assert requests[5].url == "https://example.com/link_3"
     assert requests[5].meta["request_type"] == RequestType.NAVIGATION
@@ -504,7 +541,7 @@ def test_crawl():
     )
     assert requests[5].meta["crawling_logs"]["page_type"] == "subCategories"
     assert requests[5].meta["crawling_logs"]["probability"] == 1.0
-    assert requests[5].callback == cast(ArticleSpider, spider).parse_dynamic
+    assert requests[5].callback == spider.parse_dynamic
 
     assert requests[0].url == "https://example.com/link_1"
     assert requests[0].meta["request_type"] == RequestType.ARTICLE_AND_NAVIGATION
@@ -515,7 +552,7 @@ def test_crawl():
         requests[0].meta["crawling_logs"]["page_type"] == "articleNavigation-heuristics"
     )
     assert requests[0].meta["crawling_logs"]["probability"] == 0.5
-    assert requests[0].callback == cast(ArticleSpider, spider).parse_dynamic
+    assert requests[0].callback == spider.parse_dynamic
 
     assert requests[1].url == "https://example.com/link_2"
     assert requests[1].meta["request_type"] == RequestType.ARTICLE_AND_NAVIGATION
@@ -526,18 +563,13 @@ def test_crawl():
         requests[1].meta["crawling_logs"]["page_type"] == "articleNavigation-heuristics"
     )
     assert requests[1].meta["crawling_logs"]["probability"] == 0.5
-    assert requests[1].callback == cast(ArticleSpider, spider).parse_dynamic
+    assert requests[1].callback == spider.parse_dynamic
 
     # parse_article
     request = scrapy.Request(url=url)
     response = DummyResponse(url=request.url, request=request)
     article = Article(url=article_url)
-    assert (
-        article
-        == list(
-            cast(ArticleSpider, spider).parse_dynamic(response, {Article: article})
-        )[0]
-    )
+    assert article == list(spider.parse_dynamic(response, {Article: article}))[0]
 
     # parse article_and_navigation
     request = scrapy.Request(url=url)
@@ -549,8 +581,11 @@ def test_crawl():
         subCategories=article_navigation_items["subCategories"],
     )
     requests = list(
-        cast(ArticleSpider, spider).parse_dynamic(
-            response, {Article: article, ArticleNavigation: navigation}
+        cast(
+            Iterable[scrapy.Request],
+            spider.parse_dynamic(
+                response, {Article: article, ArticleNavigation: navigation}
+            ),
         )
     )
 
@@ -573,8 +608,11 @@ def test_crawl():
         nextPage=Request(url="https://example.com/next_page", name="nextPage"),
     )
     requests = list(
-        cast(ArticleSpider, spider).parse_dynamic(
-            response, {Article: article, ArticleNavigation: navigation}
+        cast(
+            Iterable[scrapy.Request],
+            spider.parse_dynamic(
+                response, {Article: article, ArticleNavigation: navigation}
+            ),
         )
     )
 
@@ -599,8 +637,11 @@ def test_crawl():
         nextPage=Request(url="https://example.com/next_page", name="nextPage"),
     )
     requests = list(
-        cast(ArticleSpider, spider).parse_dynamic(
-            response, {Article: article, ArticleNavigation: navigation}
+        cast(
+            Iterable[scrapy.Request],
+            spider.parse_dynamic(
+                response, {Article: article, ArticleNavigation: navigation}
+            ),
         )
     )
 
diff --git a/zyte_spider_templates/spiders/article.py b/zyte_spider_templates/spiders/article.py
index ef9907a..cb32ee8 100644
--- a/zyte_spider_templates/spiders/article.py
+++ b/zyte_spider_templates/spiders/article.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional
+from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Union
 
 import attrs
 import requests
@@ -16,6 +16,7 @@
 from zyte_common_items import (
     Article,
     ArticleNavigation,
+    CustomAttributes,
     ProbabilityMetadata,
     ProbabilityRequest,
 )
@@ -25,6 +26,8 @@
 from zyte_spider_templates.pages.article_heuristics import is_feed_request
 from zyte_spider_templates.params import (
     INPUT_GROUP,
+    CustomAttrsInputParam,
+    CustomAttrsMethodParam,
     ExtractFrom,
     ExtractFromParam,
     GeolocationParam,
@@ -141,6 +144,8 @@ class ArticleCrawlStrategyParam(BaseModel):
 
 
 class ArticleSpiderParams(
+    CustomAttrsMethodParam,
+    CustomAttrsInputParam,
     ExtractFromParam,
     MaxRequestsPerSeedParam,
     MaxRequestsParam,
@@ -248,7 +253,9 @@ def _init_incremental(self):
             f"INCREMENTAL_CRAWL_COLLECTION_NAME={self.args.incremental_collection_name} "
         )
 
-    def _update_inject_meta(self, meta: Dict[str, Any], is_feed: bool) -> None:
+    def _update_inject_meta(
+        self, meta: Dict[str, Any], is_feed: bool, request_type: RequestType
+    ) -> None:
         """
         The issue: `HeuristicsArticleNavigationPage` has only `AnyResponse` as a dependency,
         so the current implementation of `ScrapyZyteApiProvider` always uses `HttpResponse`
@@ -257,19 +264,24 @@ def _update_inject_meta(self, meta: Dict[str, Any], is_feed: bool) -> None:
         This function forces `browserHtml` extraction when `extract_from=browserHtml` for Article
         and ArticleNavigation pages, while continuing to use `HttpResponse` for feeds.
+
+        It also adds the `CustomAttributes` dep when needed.
         """
+        inject = meta["inject"].copy()
+
         if is_feed:
-            inject = meta["inject"].copy()
             inject.append(HttpResponse)
-            meta["inject"] = inject
-            return None
-
-        if self.args.extract_from == ExtractFrom.browserHtml:
-            inject = meta["inject"].copy()
+        elif self.args.extract_from == ExtractFrom.browserHtml:
             inject.append(BrowserResponse)
-            meta["inject"] = inject
-            return None
+
+        if self._custom_attrs_dep and request_type in (
+            RequestType.ARTICLE.value,
+            RequestType.ARTICLE_AND_NAVIGATION.value,
+        ):
+            inject.append(self._custom_attrs_dep)
+
+        meta["inject"] = inject
 
     def _update_request_name(self, req: ProbabilityRequest) -> None:
         replacements = {
@@ -310,7 +322,13 @@ def parse_dynamic(
         self,
         response: DummyResponse,
         dynamic: DynamicDeps,
-    ) -> Iterable[scrapy.Request]:
+    ) -> Iterable[
+        Union[
+            scrapy.Request,
+            Article,
+            Dict[str, Union[Article, Optional[CustomAttributes]]],
+        ]
+    ]:
         if Article in dynamic:
             yield from self._parse_as_article(response, dynamic)
 
@@ -319,8 +337,17 @@ def parse_dynamic(
 
     def _parse_as_article(
         self, response: DummyResponse, dynamic: DynamicDeps
-    ) -> Iterable[scrapy.Request]:
-        yield dynamic[Article]
+    ) -> Iterable[
+        Union[Article, Dict[str, Union[Article, Optional[CustomAttributes]]]]
+    ]:
+        article = dynamic[Article]
+        if self.args.custom_attrs_input:
+            yield {
+                "article": article,
+                "customAttributes": dynamic.get(CustomAttributes),
+            }
+        else:
+            yield article
 
     def _parse_as_navigation(
         self, response: DummyResponse, dynamic: DynamicDeps
@@ -408,7 +435,7 @@ def get_parse_request(
                 "inject": request_type.inject,
             },
         )
-        self._update_inject_meta(meta, is_feed)
+        self._update_inject_meta(meta, is_feed, request_type)
 
         return request.to_scrapy(
             callback=self.parse_dynamic,