Add custom attribute support to the article spider. #109

Open · wants to merge 1 commit into base: main
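This change plugs the shared custom_attrs_input and custom_attrs_method parameters into the article spider, so article requests can also ask Zyte API for custom attributes and yield them alongside each Article. A minimal, hypothetical sketch of how the new arguments might be passed when running the spider programmatically; the schema, URL, and the assumption that Zyte API is configured in the project settings are illustrative, not part of this PR:

import json

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.article import ArticleSpider

# Hypothetical custom-attribute schema; any JSON object accepted by Zyte API
# custom attribute extraction could be passed here instead.
custom_attrs = {
    "type": "object",
    "properties": {
        "author": {"type": "string"},
        "summary": {"type": "string"},
    },
}

process = CrawlerProcess()  # assumes Zyte API settings come from the project
process.crawl(
    ArticleSpider,
    url="https://example.com",
    # The params schema added below declares this field as a JSON string
    # ("type": "string" with "contentMediaType": "application/json").
    custom_attrs_input=json.dumps(custom_attrs),
    custom_attrs_method="generate",  # or "extract", per the enum in test_metadata
)
process.start()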
83 changes: 62 additions & 21 deletions tests/test_article.py
@@ -1,4 +1,4 @@
from typing import Tuple, Type, cast
from typing import Iterable, Tuple, Type, cast
from unittest.mock import patch

import pytest
@@ -106,7 +106,7 @@ def test_crawl_strategy_direct_item():
)
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].callback == cast(ArticleSpider, spider).parse_dynamic
assert start_requests[0].callback == spider.parse_dynamic
assert start_requests[0].url == "https://example.com"
assert start_requests[0].meta["request_type"] == RequestType.ARTICLE
assert start_requests[0].meta["crawling_logs"]["name"] == "[article]"
@@ -235,7 +235,7 @@ def test_init_input_without_urls_file():
crawler = get_crawler()
base_kwargs = {"url": "https://example.com"}
spider = ArticleSpider.from_crawler(crawler, **base_kwargs)
cast(ArticleSpider, spider)._init_input()
spider._init_input()

assert spider.start_urls == ["https://example.com"]

@@ -413,6 +413,42 @@ def test_metadata():
"title": "Extraction source",
"enum": ["httpResponseBody", "browserHtml"],
},
"custom_attrs_input": {
"anyOf": [
{
"contentMediaType": "application/json",
"contentSchema": {"type": "object"},
"type": "string",
},
{"type": "null"},
],
"default": None,
"description": "Custom attributes to extract.",
"title": "Custom attributes schema",
"widget": "custom-attrs",
},
"custom_attrs_method": {
"default": "generate",
"description": "Which model to use for custom attribute extraction.",
"enum": ["generate", "extract"],
"enumMeta": {
"extract": {
"description": "Use an extractive model (BERT). Supports only a "
"subset of the schema (string, integer and "
"number), suited for extraction of short and clear "
"fields, with a fixed per-request cost.",
"title": "extract",
},
"generate": {
"description": "Use a generative model (LLM). The most powerful "
"and versatile, but more expensive, with variable "
"per-request cost.",
"title": "generate",
},
},
"title": "Custom attributes extraction method",
"type": "string",
},
},
"title": "ArticleSpiderParams",
"type": "object",
@@ -482,8 +518,9 @@ def test_crawl():
subCategories=article_navigation_items["subCategories"],
)
requests = list(
cast(ArticleSpider, spider).parse_dynamic(
response, {ArticleNavigation: navigation}
cast(
Iterable[scrapy.Request],
spider.parse_dynamic(response, {ArticleNavigation: navigation}),
)
)
assert requests[2].url == "https://example.com/link_4"
@@ -494,7 +531,7 @@
)
assert requests[2].meta["crawling_logs"]["page_type"] == "article"
assert requests[2].meta["crawling_logs"]["probability"] == 0.99
assert requests[2].callback == cast(ArticleSpider, spider).parse_dynamic
assert requests[2].callback == spider.parse_dynamic

assert requests[5].url == "https://example.com/link_3"
assert requests[5].meta["request_type"] == RequestType.NAVIGATION
Expand All @@ -504,7 +541,7 @@ def test_crawl():
)
assert requests[5].meta["crawling_logs"]["page_type"] == "subCategories"
assert requests[5].meta["crawling_logs"]["probability"] == 1.0
assert requests[5].callback == cast(ArticleSpider, spider).parse_dynamic
assert requests[5].callback == spider.parse_dynamic

assert requests[0].url == "https://example.com/link_1"
assert requests[0].meta["request_type"] == RequestType.ARTICLE_AND_NAVIGATION
Expand All @@ -515,7 +552,7 @@ def test_crawl():
requests[0].meta["crawling_logs"]["page_type"] == "articleNavigation-heuristics"
)
assert requests[0].meta["crawling_logs"]["probability"] == 0.5
assert requests[0].callback == cast(ArticleSpider, spider).parse_dynamic
assert requests[0].callback == spider.parse_dynamic

assert requests[1].url == "https://example.com/link_2"
assert requests[1].meta["request_type"] == RequestType.ARTICLE_AND_NAVIGATION
Expand All @@ -526,18 +563,13 @@ def test_crawl():
requests[1].meta["crawling_logs"]["page_type"] == "articleNavigation-heuristics"
)
assert requests[1].meta["crawling_logs"]["probability"] == 0.5
assert requests[1].callback == cast(ArticleSpider, spider).parse_dynamic
assert requests[1].callback == spider.parse_dynamic

# parse_article
request = scrapy.Request(url=url)
response = DummyResponse(url=request.url, request=request)
article = Article(url=article_url)
assert (
article
== list(
cast(ArticleSpider, spider).parse_dynamic(response, {Article: article})
)[0]
)
assert article == list(spider.parse_dynamic(response, {Article: article}))[0]

# parse article_and_navigation
request = scrapy.Request(url=url)
@@ -549,8 +581,11 @@
subCategories=article_navigation_items["subCategories"],
)
requests = list(
cast(ArticleSpider, spider).parse_dynamic(
response, {Article: article, ArticleNavigation: navigation}
cast(
Iterable[scrapy.Request],
spider.parse_dynamic(
response, {Article: article, ArticleNavigation: navigation}
),
)
)

@@ -573,8 +608,11 @@
nextPage=Request(url="https://example.com/next_page", name="nextPage"),
)
requests = list(
cast(ArticleSpider, spider).parse_dynamic(
response, {Article: article, ArticleNavigation: navigation}
cast(
Iterable[scrapy.Request],
spider.parse_dynamic(
response, {Article: article, ArticleNavigation: navigation}
),
)
)

@@ -599,8 +637,11 @@
nextPage=Request(url="https://example.com/next_page", name="nextPage"),
)
requests = list(
cast(ArticleSpider, spider).parse_dynamic(
response, {Article: article, ArticleNavigation: navigation}
cast(
Iterable[scrapy.Request],
spider.parse_dynamic(
response, {Article: article, ArticleNavigation: navigation}
),
)
)

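Most of the churn in the test file above comes from the widened signature of parse_dynamic(): it can now yield Article items (or article/custom-attribute dicts) as well as follow-up requests, so the tests cast the returned iterable rather than the spider object. A small sketch of that narrowing pattern, written as a standalone helper purely for illustration (the function name is hypothetical):

from typing import Iterable, List, cast

import scrapy
from zyte_common_items import ArticleNavigation

from zyte_spider_templates.spiders.article import ArticleSpider


def navigation_requests(
    spider: ArticleSpider, response, navigation: ArticleNavigation
) -> List[scrapy.Request]:
    # For a navigation-only response every yielded value is a Request, so the
    # cast only narrows the declared union type for the type checker; it does
    # not change runtime behaviour.
    return list(
        cast(
            Iterable[scrapy.Request],
            spider.parse_dynamic(response, {ArticleNavigation: navigation}),
        )
    )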
55 changes: 41 additions & 14 deletions zyte_spider_templates/spiders/article.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional
from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Union

import attrs
import requests
@@ -16,6 +16,7 @@
from zyte_common_items import (
Article,
ArticleNavigation,
CustomAttributes,
ProbabilityMetadata,
ProbabilityRequest,
)
@@ -25,6 +26,8 @@
from zyte_spider_templates.pages.article_heuristics import is_feed_request
from zyte_spider_templates.params import (
INPUT_GROUP,
CustomAttrsInputParam,
CustomAttrsMethodParam,
ExtractFrom,
ExtractFromParam,
GeolocationParam,
@@ -141,6 +144,8 @@


class ArticleSpiderParams(
CustomAttrsMethodParam,
CustomAttrsInputParam,
ExtractFromParam,
MaxRequestsPerSeedParam,
MaxRequestsParam,
@@ -248,7 +253,9 @@
f"INCREMENTAL_CRAWL_COLLECTION_NAME={self.args.incremental_collection_name} "
)

def _update_inject_meta(self, meta: Dict[str, Any], is_feed: bool) -> None:
def _update_inject_meta(
self, meta: Dict[str, Any], is_feed: bool, request_type: RequestType
) -> None:
"""
The issue: `HeuristicsArticleNavigationPage` has only `AnyResponse` as a dependency, so
the current implementation of `ScrapyZyteApiProvider` always uses `HttpResponse`
@@ -257,19 +264,24 @@
This function forces `browserHtml` extraction when `extract_from=browserHtml`
for Article and ArticleNavigation pages, while continuing to use
`HttpResponse` for feeds.

It also adds the `CustomAttributes` dep when needed.
"""

inject = meta["inject"].copy()

if is_feed:
inject = meta["inject"].copy()
inject.append(HttpResponse)
meta["inject"] = inject
return None

if self.args.extract_from == ExtractFrom.browserHtml:
inject = meta["inject"].copy()
elif self.args.extract_from == ExtractFrom.browserHtml:
inject.append(BrowserResponse)
meta["inject"] = inject
return None

if self._custom_attrs_dep and request_type in (
RequestType.ARTICLE.value,
RequestType.ARTICLE_AND_NAVIGATION.value,
):
inject.append(self._custom_attrs_dep)

Codecov warning (codecov/patch): added line zyte_spider_templates/spiders/article.py#L282 was not covered by tests.

meta["inject"] = inject

def _update_request_name(self, req: ProbabilityRequest) -> None:
replacements = {
@@ -310,7 +322,13 @@
self,
response: DummyResponse,
dynamic: DynamicDeps,
) -> Iterable[scrapy.Request]:
) -> Iterable[
Union[
scrapy.Request,
Article,
Dict[str, Union[Article, Optional[CustomAttributes]]],
]
]:
if Article in dynamic:
yield from self._parse_as_article(response, dynamic)

@@ -319,8 +337,17 @@

def _parse_as_article(
self, response: DummyResponse, dynamic: DynamicDeps
) -> Iterable[scrapy.Request]:
yield dynamic[Article]
) -> Iterable[
Union[Article, Dict[str, Union[Article, Optional[CustomAttributes]]]]
]:
article = dynamic[Article]
if self.args.custom_attrs_input:
yield {

Codecov warning (codecov/patch): added line zyte_spider_templates/spiders/article.py#L345 was not covered by tests.
"article": article,
"customAttributes": dynamic.get(CustomAttributes),
}
else:
yield article

def _parse_as_navigation(
self, response: DummyResponse, dynamic: DynamicDeps
@@ -408,7 +435,7 @@
"inject": request_type.inject,
},
)
self._update_inject_meta(meta, is_feed)
self._update_inject_meta(meta, is_feed, request_type)

return request.to_scrapy(
callback=self.parse_dynamic,
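Taken together, the spider changes mean two things for consumers: article and article-and-navigation requests get the CustomAttributes dependency injected when custom_attrs_input is set, and _parse_as_article() then yields a dict pairing the article with those attributes instead of a bare Article. A hypothetical downstream pipeline sketch (not part of this PR) that handles both item shapes:

from typing import Any, Dict, Optional, Union

from zyte_common_items import Article, CustomAttributes


class ArticleCustomAttrsPipeline:
    """Illustrative only: normalize the two item shapes the spider can yield."""

    def process_item(
        self, item: Union[Article, Dict[str, Any]], spider
    ) -> Dict[str, Any]:
        if isinstance(item, Article):
            # custom_attrs_input not set: plain Article items, as before this PR.
            return {"article": item, "customAttributes": None}
        # custom_attrs_input set: {"article": Article, "customAttributes": CustomAttributes or None}
        attrs: Optional[CustomAttributes] = item.get("customAttributes")
        return {"article": item["article"], "customAttributes": attrs}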