From bf0c0c2b9c9ec0879dd6cc184bf794877e784b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 11 Jun 2024 21:16:44 +0200 Subject: [PATCH 01/10] Track in stats which fields from Zyte API automatic extraction are used --- scrapy_zyte_api/providers.py | 105 +++++++-- tests/test_providers.py | 443 ++++++++++++++++++++++++++++++++++- 2 files changed, 532 insertions(+), 16 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 3a7979fc..ac6f8884 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,10 +1,11 @@ -from typing import Any, Callable, Dict, List, Optional, Sequence, Set +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type +import attrs from andi.typeutils import is_typing_annotated, strip_annotated from scrapy import Request from scrapy.crawler import Crawler from scrapy.utils.defer import maybe_deferred_to_future -from scrapy_poet import PageObjectInputProvider +from scrapy_poet import InjectionMiddleware, PageObjectInputProvider from web_poet import ( AnyResponse, BrowserHtml, @@ -13,10 +14,19 @@ HttpResponseHeaders, ) from web_poet.annotated import AnnotatedInstance +from web_poet.fields import get_fields_dict +from web_poet.utils import get_fq_class_name from zyte_common_items import ( Article, ArticleList, ArticleNavigation, + AutoArticleListPage, + AutoArticleNavigationPage, + AutoArticlePage, + AutoJobPostingPage, + AutoProductListPage, + AutoProductNavigationPage, + AutoProductPage, Item, JobPosting, Product, @@ -35,6 +45,35 @@ NO_CALLBACK = None +_ITEM_KEYWORDS: Dict[type, str] = { + Product: "product", + ProductList: "productList", + ProductNavigation: "productNavigation", + Article: "article", + ArticleList: "articleList", + ArticleNavigation: "articleNavigation", + JobPosting: "jobPosting", +} +_AUTO_PAGES: Set[Type] = { + AutoArticlePage, + AutoArticleListPage, + AutoArticleNavigationPage, + AutoJobPostingPage, + AutoProductPage, + AutoProductListPage, + AutoProductNavigationPage, +} + + +# https://stackoverflow.com/a/25959545 +def _field_cls(page_cls, field_name): + for cls in page_cls.__mro__: + if field_name in cls.__dict__: + return cls + # Only used with fields known to exist + assert False # noqa: B011 + + class ZyteApiProvider(PageObjectInputProvider): name = "zyte_api" @@ -54,9 +93,55 @@ class ZyteApiProvider(PageObjectInputProvider): Screenshot, } + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._injection_mw = None + self._tracked_auto_fields = set() + def is_provided(self, type_: Callable) -> bool: return super().is_provided(strip_annotated(type_)) + def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): + if cls not in _ITEM_KEYWORDS: + return + if self._injection_mw is None: + try: + self._injection_mw = crawler.get_downloader_middleware( + InjectionMiddleware + ) + except AttributeError: + for component in crawler.engine.downloader.middleware.middlewares: + if isinstance(component, InjectionMiddleware): + self._injection_mw = component + break + if self._injection_mw is None: + raise RuntimeError( + "Could not find the InjectionMiddleware among enabled " + "downloader middlewares. Please, ensure you have properly " + "configured scrapy-poet." + ) + cls = self._injection_mw.registry.page_cls_for_item(request.url, cls) or cls + if cls in self._tracked_auto_fields: + return + self._tracked_auto_fields.add(cls) + if cls in _ITEM_KEYWORDS: + auto_fields = set(attrs.fields_dict(cls)) + else: + auto_cls = None + for ancestor in cls.__mro__: + if ancestor in _AUTO_PAGES: + auto_cls = ancestor + break + auto_fields = set() + if auto_cls: + for field_name in get_fields_dict(cls): + field_cls = _field_cls(cls, field_name) + if field_cls is auto_cls: + auto_fields.add(field_name) + cls_fqn = get_fq_class_name(cls) + field_list = " ".join(sorted(auto_fields)) + crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) + async def __call__( # noqa: C901 self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: @@ -66,6 +151,7 @@ async def __call__( # noqa: C901 http_response = None screenshot_requested = Screenshot in to_provide for cls in list(to_provide): + self._track_auto_fields(crawler, request, cls) item = self.injector.weak_cache.get(request, {}).get(cls) if item: results.append(item) @@ -89,15 +175,6 @@ async def __call__( # noqa: C901 return results html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide - item_keywords: Dict[type, str] = { - Product: "product", - ProductList: "productList", - ProductNavigation: "productNavigation", - Article: "article", - ArticleList: "articleList", - ArticleNavigation: "articleNavigation", - JobPosting: "jobPosting", - } zyte_api_meta = { **crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"), @@ -135,7 +212,7 @@ async def __call__( # noqa: C901 } ) continue - kw = item_keywords.get(cls_stripped) + kw = _ITEM_KEYWORDS.get(cls_stripped) if not kw: continue item_requested = True @@ -165,7 +242,7 @@ async def __call__( # noqa: C901 ) extract_from = None # type: ignore[assignment] - for item_type, kw in item_keywords.items(): + for item_type, kw in _ITEM_KEYWORDS.items(): options_name = f"{kw}Options" if item_type not in to_provide_stripped and options_name in zyte_api_meta: del zyte_api_meta[options_name] @@ -271,7 +348,7 @@ async def __call__( # noqa: C901 result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined] results.append(result) continue - kw = item_keywords.get(cls_stripped) + kw = _ITEM_KEYWORDS.get(cls_stripped) if not kw: continue assert issubclass(cls_stripped, Item) diff --git a/tests/test_providers.py b/tests/test_providers.py index 7b4abb05..687b0fca 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -18,14 +18,16 @@ BrowserResponse, HttpResponse, ItemPage, + default_registry, field, handle_urls, ) -from zyte_common_items import BasePage, Product +from web_poet.pages import get_item_cls +from zyte_common_items import AutoProductPage, BasePage, BaseProductPage, Product from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot, actions from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler -from scrapy_zyte_api.providers import ZyteApiProvider +from scrapy_zyte_api.providers import _AUTO_PAGES, _ITEM_KEYWORDS, ZyteApiProvider from . import SETTINGS from .mockserver import get_ephemeral_port @@ -1015,3 +1017,440 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign }, ] ) + + +def test_auto_pages_set(): + assert set(_ITEM_KEYWORDS) == {get_item_cls(cls) for cls in _AUTO_PAGES} + + +@ensureDeferred +async def test_auto_field_stat_no_override(mockserver): + """When requesting an item directly from Zyte API, without an override to + change fields, stats reflect the entire list of item fields.""" + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( + "additionalProperties aggregateRating availability brand breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn name price productId " + "regularPrice size sku style url variants" + ), + } + + +@ensureDeferred +async def test_auto_field_stat_partial_override(mockserver): + """When requesting an item and having an Auto…Page subclass to change + fields, stats reflect the list of item fields not defined in the + subclass. Defined field method are not listed, even if they return the + original item field, directly or as a fallback.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_partial_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + default_registry.__init__() # Reset rules + + +@ensureDeferred +async def test_auto_field_stat_full_override(mockserver): + """When requesting an item and having an Auto…Page subclass to change + all fields, stats reflect the list of non-overriden item fields as an empty + string.""" + + # Copy-paste of fields from the AutoProductPage implementation, with type + # hints removed. + class MyProductPage(AutoProductPage): + + @field + def additionalProperties(self): + return self.product.additionalProperties + + @field + def aggregateRating(self): + return self.product.aggregateRating + + @field + def availability(self): + return self.product.availability + + @field + def brand(self): + return self.product.brand + + @field + def breadcrumbs(self): + return self.product.breadcrumbs + + @field + def canonicalUrl(self): + return self.product.canonicalUrl + + @field + def color(self): + return self.product.color + + @field + def currency(self): + return self.product.currency + + @field + def currencyRaw(self): + return self.product.currencyRaw + + @field + def description(self): + return self.product.description + + @field + def descriptionHtml(self): + return self.product.descriptionHtml + + @field + def features(self): + return self.product.features + + @field + def gtin(self): + return self.product.gtin + + @field + def images(self): + return self.product.images + + @field + def mainImage(self): + return self.product.mainImage + + @field + def metadata(self): + return self.product.metadata + + @field + def mpn(self): + return self.product.mpn + + @field + def name(self): + return self.product.name + + @field + def price(self): + return self.product.price + + @field + def productId(self): + return self.product.productId + + @field + def regularPrice(self): + return self.product.regularPrice + + @field + def size(self): + return self.product.size + + @field + def sku(self): + return self.product.sku + + @field + def style(self): + return self.product.style + + @field + def url(self): + return self.product.url + + @field + def variants(self): + return self.product.variants + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_full_override..MyProductPage": "", + } + + default_registry.__init__() # Reset rules + + +@ensureDeferred +async def test_auto_field_stat_callback_override(mockserver): + """Fields overridden in callbacks, instead of using a page object, are not + taken into account.""" + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + product["name"] = "foo" + yield product + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( + "additionalProperties aggregateRating availability brand breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn name price productId " + "regularPrice size sku style url variants" + ), + } + + default_registry.__init__() # Reset rules + + +@ensureDeferred +async def test_auto_field_stat_item_page_override(mockserver): + """The stat accounts for the configured page for a given item, so if you + request that page directly, things work the same as if you request the item + itself.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, page: MyProductPage): + pass + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_item_page_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + default_registry.__init__() # Reset rules + + +@ensureDeferred +async def test_auto_field_stat_alt_page_override(mockserver): + """The stat does not account for alternatives pages, so if you request a + page that provides an item, the page that counts for stats is the + configured page for that item, not the actual page requested.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class AltProductPage(AutoProductPage): + + @field + def sku(self): + return "foo" + + @field + def currencyRaw(self): + return self.product.currencyRaw + + handle_urls(f"{mockserver.host}:{mockserver.port}", priority=0)(AltProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, page: AltProductPage): + pass + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_alt_page_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + default_registry.__init__() # Reset rules + + +@ensureDeferred +async def test_auto_field_stat_non_auto_override(mockserver): + """If instead of using an Auto…Page class you use a custom class, all + fields are assumed to be overridden.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @field + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_non_auto_override..MyProductPage": "", + } + + default_registry.__init__() # Reset rules From 3d79ac354a5ce065719ece18c19cd2ea0e7d4361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 11 Jun 2024 21:56:46 +0200 Subject: [PATCH 02/10] Solve issues reported by CI --- scrapy_zyte_api/providers.py | 6 +++--- setup.py | 2 +- tests/test_providers.py | 22 ++++++++++++++-------- tox.ini | 2 +- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index ac6f8884..efcc86a9 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast import attrs from andi.typeutils import is_typing_annotated, strip_annotated @@ -54,7 +54,7 @@ ArticleNavigation: "articleNavigation", JobPosting: "jobPosting", } -_AUTO_PAGES: Set[Type] = { +_AUTO_PAGES: Set[type] = { AutoArticlePage, AutoArticleListPage, AutoArticleNavigationPage, @@ -151,7 +151,7 @@ async def __call__( # noqa: C901 http_response = None screenshot_requested = Screenshot in to_provide for cls in list(to_provide): - self._track_auto_fields(crawler, request, cls) + self._track_auto_fields(crawler, request, cast(type, cls)) item = self.injector.weak_cache.get(request, {}).get(cls) if item: results.append(item) diff --git a/setup.py b/setup.py index 05c9c532..40d4d858 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version(): "andi>=0.6.0", "scrapy-poet>=0.22.3", "web-poet>=0.17.0", - "zyte-common-items>=0.8.0", + "zyte-common-items>=0.16.0", ] }, classifiers=[ diff --git a/tests/test_providers.py b/tests/test_providers.py index 687b0fca..9a365e8b 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1020,7 +1020,7 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign def test_auto_pages_set(): - assert set(_ITEM_KEYWORDS) == {get_item_cls(cls) for cls in _AUTO_PAGES} + assert set(_ITEM_KEYWORDS) == {get_item_cls(cls) for cls in _AUTO_PAGES} # type: ignore[call-overload] @ensureDeferred @@ -1108,7 +1108,8 @@ def parse(self, response: DummyResponse, product: Product): ), } - default_registry.__init__() # Reset rules + # Reset rules + default_registry.__init__() # type: ignore[misc] @ensureDeferred @@ -1253,7 +1254,8 @@ def parse(self, response: DummyResponse, product: Product): "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_full_override..MyProductPage": "", } - default_registry.__init__() # Reset rules + # Reset rules + default_registry.__init__() # type: ignore[misc] @ensureDeferred @@ -1269,7 +1271,7 @@ def start_requests(self): yield Request(self.url, callback=self.parse) def parse(self, response: DummyResponse, product: Product): - product["name"] = "foo" + product.name = "foo" yield product settings = create_scrapy_settings() @@ -1293,7 +1295,8 @@ def parse(self, response: DummyResponse, product: Product): ), } - default_registry.__init__() # Reset rules + # Reset rules + default_registry.__init__() # type: ignore[misc] @ensureDeferred @@ -1345,7 +1348,8 @@ def parse(self, response: DummyResponse, page: MyProductPage): ), } - default_registry.__init__() # Reset rules + # Reset rules + default_registry.__init__() # type: ignore[misc] @ensureDeferred @@ -1409,7 +1413,8 @@ def parse(self, response: DummyResponse, page: AltProductPage): ), } - default_registry.__init__() # Reset rules + # Reset rules + default_registry.__init__() # type: ignore[misc] @ensureDeferred @@ -1453,4 +1458,5 @@ def parse(self, response: DummyResponse, product: Product): "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_non_auto_override..MyProductPage": "", } - default_registry.__init__() # Reset rules + # Reset rules + default_registry.__init__() # type: ignore[misc] diff --git a/tox.ini b/tox.ini index 073909e5..79933e23 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,7 @@ deps = andi==0.6.0 scrapy-poet==0.22.3 web-poet==0.17.0 - zyte-common-items==0.8.0 + zyte-common-items==0.16.0 [testenv:pinned-extra] basepython=python3.8 From 806a0fe93d96a9a49e069aea555375ecbc97b762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Jun 2024 07:34:31 +0200 Subject: [PATCH 03/10] =?UTF-8?q?Min=20zyte-common-items:=200.16.0=20?= =?UTF-8?q?=E2=86=92=200.19.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 40d4d858..240e83cf 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version(): "andi>=0.6.0", "scrapy-poet>=0.22.3", "web-poet>=0.17.0", - "zyte-common-items>=0.16.0", + "zyte-common-items>=0.19.0", ] }, classifiers=[ diff --git a/tox.ini b/tox.ini index 79933e23..17cfa713 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,7 @@ deps = andi==0.6.0 scrapy-poet==0.22.3 web-poet==0.17.0 - zyte-common-items==0.16.0 + zyte-common-items==0.19.0 [testenv:pinned-extra] basepython=python3.8 From e9dc36dcd06cc5d81a5ad695f5f909e87c1b4da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Jun 2024 09:05:24 +0200 Subject: [PATCH 04/10] Make auto field stats opt-in and provide documentation on the setting reference --- docs/reference/settings.rst | 32 +++++++++++++++++ scrapy_zyte_api/providers.py | 7 ++++ tests/test_providers.py | 70 ++++++++++++++++++++++++++---------- 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst index 3b0dc4ba..827236e7 100644 --- a/docs/reference/settings.rst +++ b/docs/reference/settings.rst @@ -6,6 +6,38 @@ Settings :ref:`Settings ` for scrapy-zyte-api. +.. setting:: ZYTE_API_AUTO_FIELD_STATS + +ZYTE_API_AUTO_FIELD_STATS +========================= + +Default: ``False`` + +Enables stats that indicate which requested fields come directly from +:ref:`zyte-api-extract`. + +If for any request no page object class is used to override +:ref:`zyte-api-extract` fields for a given item type, the following stat is +set: + +.. code-block:: python + + "scrapy-zyte-api/auto_fields/": "" + +If for any request a custom page object class is used to override some +:ref:`zyte-api-extract` fields, the following stat is set: + +.. code-block:: python + + "scrapy-zyte-api/auto_fields/": ( + "" + ) + +.. note:: If that page object class is not a subclass of an ``Auto``-prefixed + class from :doc:`zyte-common-items `, all fields + are assumed to have been overridden, i.e. the stat value is always an empty + string. + .. setting:: ZYTE_API_AUTOMAP_PARAMS ZYTE_API_AUTOMAP_PARAMS diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index efcc86a9..0e50b447 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -96,6 +96,7 @@ class ZyteApiProvider(PageObjectInputProvider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._injection_mw = None + self._should_track_auto_fields = None self._tracked_auto_fields = set() def is_provided(self, type_: Callable) -> bool: @@ -104,6 +105,12 @@ def is_provided(self, type_: Callable) -> bool: def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): if cls not in _ITEM_KEYWORDS: return + if self._should_track_auto_fields is None: + self._should_track_auto_fields = crawler.settings.getbool( + "ZYTE_API_AUTO_FIELD_STATS", False + ) + if self._should_track_auto_fields is False: + return if self._injection_mw is None: try: self._injection_mw = crawler.get_downloader_middleware( diff --git a/tests/test_providers.py b/tests/test_providers.py index 9a365e8b..c4218207 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1024,7 +1024,32 @@ def test_auto_pages_set(): @ensureDeferred -async def test_auto_field_stat_no_override(mockserver): +async def test_auto_field_stats_not_enabled(mockserver): + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == {} + + +@ensureDeferred +async def test_auto_field_stats_no_override(mockserver): """When requesting an item directly from Zyte API, without an override to change fields, stats reflect the entire list of item fields.""" @@ -1039,8 +1064,9 @@ def parse(self, response: DummyResponse, product: Product): pass settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) auto_field_stats = { @@ -1059,7 +1085,7 @@ def parse(self, response: DummyResponse, product: Product): @ensureDeferred -async def test_auto_field_stat_partial_override(mockserver): +async def test_auto_field_stats_partial_override(mockserver): """When requesting an item and having an Auto…Page subclass to change fields, stats reflect the list of item fields not defined in the subclass. Defined field method are not listed, even if they return the @@ -1088,8 +1114,9 @@ def parse(self, response: DummyResponse, product: Product): pass settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( TestSpider, HtmlResource, settings, port=mockserver.port ) @@ -1100,7 +1127,7 @@ def parse(self, response: DummyResponse, product: Product): if k.startswith("scrapy-zyte-api/auto_fields") } assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_partial_override..MyProductPage": ( + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_partial_override..MyProductPage": ( "additionalProperties aggregateRating availability breadcrumbs " "canonicalUrl color currency currencyRaw description descriptionHtml " "features gtin images mainImage metadata mpn price productId " @@ -1113,7 +1140,7 @@ def parse(self, response: DummyResponse, product: Product): @ensureDeferred -async def test_auto_field_stat_full_override(mockserver): +async def test_auto_field_stats_full_override(mockserver): """When requesting an item and having an Auto…Page subclass to change all fields, stats reflect the list of non-overriden item fields as an empty string.""" @@ -1239,8 +1266,9 @@ def parse(self, response: DummyResponse, product: Product): pass settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( TestSpider, HtmlResource, settings, port=mockserver.port ) @@ -1251,7 +1279,7 @@ def parse(self, response: DummyResponse, product: Product): if k.startswith("scrapy-zyte-api/auto_fields") } assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_full_override..MyProductPage": "", + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_full_override..MyProductPage": "", } # Reset rules @@ -1259,7 +1287,7 @@ def parse(self, response: DummyResponse, product: Product): @ensureDeferred -async def test_auto_field_stat_callback_override(mockserver): +async def test_auto_field_stats_callback_override(mockserver): """Fields overridden in callbacks, instead of using a page object, are not taken into account.""" @@ -1275,8 +1303,9 @@ def parse(self, response: DummyResponse, product: Product): yield product settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( TestSpider, HtmlResource, settings, port=mockserver.port ) @@ -1300,7 +1329,7 @@ def parse(self, response: DummyResponse, product: Product): @ensureDeferred -async def test_auto_field_stat_item_page_override(mockserver): +async def test_auto_field_stats_item_page_override(mockserver): """The stat accounts for the configured page for a given item, so if you request that page directly, things work the same as if you request the item itself.""" @@ -1328,8 +1357,9 @@ def parse(self, response: DummyResponse, page: MyProductPage): pass settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( TestSpider, HtmlResource, settings, port=mockserver.port ) @@ -1340,7 +1370,7 @@ def parse(self, response: DummyResponse, page: MyProductPage): if k.startswith("scrapy-zyte-api/auto_fields") } assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_item_page_override..MyProductPage": ( + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_item_page_override..MyProductPage": ( "additionalProperties aggregateRating availability breadcrumbs " "canonicalUrl color currency currencyRaw description descriptionHtml " "features gtin images mainImage metadata mpn price productId " @@ -1353,7 +1383,7 @@ def parse(self, response: DummyResponse, page: MyProductPage): @ensureDeferred -async def test_auto_field_stat_alt_page_override(mockserver): +async def test_auto_field_stats_alt_page_override(mockserver): """The stat does not account for alternatives pages, so if you request a page that provides an item, the page that counts for stats is the configured page for that item, not the actual page requested.""" @@ -1393,8 +1423,9 @@ def parse(self, response: DummyResponse, page: AltProductPage): pass settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( TestSpider, HtmlResource, settings, port=mockserver.port ) @@ -1405,7 +1436,7 @@ def parse(self, response: DummyResponse, page: AltProductPage): if k.startswith("scrapy-zyte-api/auto_fields") } assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_alt_page_override..MyProductPage": ( + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_alt_page_override..MyProductPage": ( "additionalProperties aggregateRating availability breadcrumbs " "canonicalUrl color currency currencyRaw description descriptionHtml " "features gtin images mainImage metadata mpn price productId " @@ -1418,7 +1449,7 @@ def parse(self, response: DummyResponse, page: AltProductPage): @ensureDeferred -async def test_auto_field_stat_non_auto_override(mockserver): +async def test_auto_field_stats_non_auto_override(mockserver): """If instead of using an Auto…Page class you use a custom class, all fields are assumed to be overridden.""" @@ -1443,8 +1474,9 @@ def parse(self, response: DummyResponse, product: Product): pass settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( TestSpider, HtmlResource, settings, port=mockserver.port ) @@ -1455,7 +1487,7 @@ def parse(self, response: DummyResponse, product: Product): if k.startswith("scrapy-zyte-api/auto_fields") } assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stat_non_auto_override..MyProductPage": "", + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_non_auto_override..MyProductPage": "", } # Reset rules From a8d9c81a4d7954e50befcd2e96eb0b2d30ba7651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 19 Jun 2024 12:10:30 +0200 Subject: [PATCH 05/10] Use zyte_common_items.fields.is_auto_field --- CHANGES.rst | 7 +++ docs/reference/settings.rst | 9 ++-- scrapy_zyte_api/providers.py | 23 ++------- setup.py | 2 +- tests/test_providers.py | 92 ++++++++++++++++++++++++++++++++++++ tox.ini | 2 +- 6 files changed, 109 insertions(+), 26 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index f8361a38..78c7aa27 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,13 @@ Changes ======= +N.N.N (YYYY-MM-DD) +------------------ + +* ``scrapy-zyte-api[provider]`` now requires zyte-common-items >= 0.20.0. + +* Added the :setting:`ZYTE_API_AUTO_FIELD_STATS` setting. + 0.18.4 (2024-06-10) ------------------- diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst index 827236e7..1194aba9 100644 --- a/docs/reference/settings.rst +++ b/docs/reference/settings.rst @@ -13,7 +13,8 @@ ZYTE_API_AUTO_FIELD_STATS Default: ``False`` -Enables stats that indicate which requested fields come directly from +Enables stats that indicate which requested fields :ref:`obtained through +scrapy-poet integration ` come directly from :ref:`zyte-api-extract`. If for any request no page object class is used to override @@ -33,10 +34,8 @@ If for any request a custom page object class is used to override some "" ) -.. note:: If that page object class is not a subclass of an ``Auto``-prefixed - class from :doc:`zyte-common-items `, all fields - are assumed to have been overridden, i.e. the stat value is always an empty - string. +.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine + whether a field has been overridden or not. .. setting:: ZYTE_API_AUTOMAP_PARAMS diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 0e50b447..b045dae7 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -33,6 +33,7 @@ ProductList, ProductNavigation, ) +from zyte_common_items.fields import is_auto_field from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot from scrapy_zyte_api._annotations import _ActionResult @@ -65,15 +66,6 @@ } -# https://stackoverflow.com/a/25959545 -def _field_cls(page_cls, field_name): - for cls in page_cls.__mro__: - if field_name in cls.__dict__: - return cls - # Only used with fields known to exist - assert False # noqa: B011 - - class ZyteApiProvider(PageObjectInputProvider): name = "zyte_api" @@ -134,17 +126,10 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): if cls in _ITEM_KEYWORDS: auto_fields = set(attrs.fields_dict(cls)) else: - auto_cls = None - for ancestor in cls.__mro__: - if ancestor in _AUTO_PAGES: - auto_cls = ancestor - break auto_fields = set() - if auto_cls: - for field_name in get_fields_dict(cls): - field_cls = _field_cls(cls, field_name) - if field_cls is auto_cls: - auto_fields.add(field_name) + for field_name in get_fields_dict(cls): + if is_auto_field(cls, field_name): + auto_fields.add(field_name) cls_fqn = get_fq_class_name(cls) field_list = " ".join(sorted(auto_fields)) crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) diff --git a/setup.py b/setup.py index 240e83cf..ac2de981 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version(): "andi>=0.6.0", "scrapy-poet>=0.22.3", "web-poet>=0.17.0", - "zyte-common-items>=0.19.0", + "zyte-common-items>=0.20.0", ] }, classifiers=[ diff --git a/tests/test_providers.py b/tests/test_providers.py index c4218207..c81bde3d 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -24,6 +24,7 @@ ) from web_poet.pages import get_item_cls from zyte_common_items import AutoProductPage, BasePage, BaseProductPage, Product +from zyte_common_items.fields import auto_field from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot, actions from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler @@ -1492,3 +1493,94 @@ def parse(self, response: DummyResponse, product: Product): # Reset rules default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_auto_field_decorator(mockserver): + """Using @auto_field forces a field to not be considered overridden.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @auto_field + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_decorator..MyProductPage": "additionalProperties", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_auto_field_meta(mockserver): + """Using @field(meta={"auto_field": True}) has the same effect as using + @auto_field.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @field(meta={"auto_field": True}) + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_meta..MyProductPage": "additionalProperties", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] diff --git a/tox.ini b/tox.ini index 17cfa713..086d7971 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,7 @@ deps = andi==0.6.0 scrapy-poet==0.22.3 web-poet==0.17.0 - zyte-common-items==0.19.0 + zyte-common-items==0.20.0 [testenv:pinned-extra] basepython=python3.8 From 916af0d05770425355896d802042358dbe6f74af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 11 Jul 2024 17:43:03 +0200 Subject: [PATCH 06/10] Do not list all item fields if there is no override --- docs/reference/settings.rst | 5 ++++- scrapy_zyte_api/providers.py | 5 ++--- tests/test_providers.py | 7 ++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst index 1194aba9..ca9ca5d7 100644 --- a/docs/reference/settings.rst +++ b/docs/reference/settings.rst @@ -23,7 +23,10 @@ set: .. code-block:: python - "scrapy-zyte-api/auto_fields/": "" + "scrapy-zyte-api/auto_fields/": "(all fields)" + +.. note:: A literal ``(all fields)`` string is used as value, not a list with + all fields. If for any request a custom page object class is used to override some :ref:`zyte-api-extract` fields, the following stat is set: diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index b045dae7..310189ce 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,6 +1,5 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast -import attrs from andi.typeutils import is_typing_annotated, strip_annotated from scrapy import Request from scrapy.crawler import Crawler @@ -124,14 +123,14 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): return self._tracked_auto_fields.add(cls) if cls in _ITEM_KEYWORDS: - auto_fields = set(attrs.fields_dict(cls)) + field_list = "(all fields)" else: auto_fields = set() for field_name in get_fields_dict(cls): if is_auto_field(cls, field_name): auto_fields.add(field_name) + field_list = " ".join(sorted(auto_fields)) cls_fqn = get_fq_class_name(cls) - field_list = " ".join(sorted(auto_fields)) crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) async def __call__( # noqa: C901 diff --git a/tests/test_providers.py b/tests/test_providers.py index c81bde3d..03dbc347 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1077,10 +1077,7 @@ def parse(self, response: DummyResponse, product: Product): } assert auto_field_stats == { "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( - "additionalProperties aggregateRating availability brand breadcrumbs " - "canonicalUrl color currency currencyRaw description descriptionHtml " - "features gtin images mainImage metadata mpn name price productId " - "regularPrice size sku style url variants" + "(all fields)" ), } @@ -1089,7 +1086,7 @@ def parse(self, response: DummyResponse, product: Product): async def test_auto_field_stats_partial_override(mockserver): """When requesting an item and having an Auto…Page subclass to change fields, stats reflect the list of item fields not defined in the - subclass. Defined field method are not listed, even if they return the + subclass. Defined field methods are not listed, even if they return the original item field, directly or as a fallback.""" class MyProductPage(AutoProductPage): From e6a3bfb479d8f21cd5f4e40f3875c95e786c2a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 11 Jul 2024 17:48:27 +0200 Subject: [PATCH 07/10] Silence a mypy error --- scrapy_zyte_api/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 310189ce..34f5159b 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -127,7 +127,7 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): else: auto_fields = set() for field_name in get_fields_dict(cls): - if is_auto_field(cls, field_name): + if is_auto_field(cls, field_name): # type: ignore[arg-type] auto_fields.add(field_name) field_list = " ".join(sorted(auto_fields)) cls_fqn = get_fq_class_name(cls) From ab702fb1786ccaa3fd2e26ce30adbcb62fc24cc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Jul 2024 08:42:00 +0200 Subject: [PATCH 08/10] Update test expectations --- tests/test_providers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index 03dbc347..774cb292 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1315,10 +1315,7 @@ def parse(self, response: DummyResponse, product: Product): } assert auto_field_stats == { "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( - "additionalProperties aggregateRating availability brand breadcrumbs " - "canonicalUrl color currency currencyRaw description descriptionHtml " - "features gtin images mainImage metadata mpn name price productId " - "regularPrice size sku style url variants" + "(all fields)" ), } From 0795cf39553f8af06b13df21f90ca151508108e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Jul 2024 10:59:14 +0200 Subject: [PATCH 09/10] Complete test coverage with a significant cleanup --- scrapy_zyte_api/providers.py | 21 ++------------------- tests/test_providers.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 34f5159b..4042775c 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -4,7 +4,7 @@ from scrapy import Request from scrapy.crawler import Crawler from scrapy.utils.defer import maybe_deferred_to_future -from scrapy_poet import InjectionMiddleware, PageObjectInputProvider +from scrapy_poet import PageObjectInputProvider from web_poet import ( AnyResponse, BrowserHtml, @@ -86,7 +86,6 @@ class ZyteApiProvider(PageObjectInputProvider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._injection_mw = None self._should_track_auto_fields = None self._tracked_auto_fields = set() @@ -102,23 +101,7 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): ) if self._should_track_auto_fields is False: return - if self._injection_mw is None: - try: - self._injection_mw = crawler.get_downloader_middleware( - InjectionMiddleware - ) - except AttributeError: - for component in crawler.engine.downloader.middleware.middlewares: - if isinstance(component, InjectionMiddleware): - self._injection_mw = component - break - if self._injection_mw is None: - raise RuntimeError( - "Could not find the InjectionMiddleware among enabled " - "downloader middlewares. Please, ensure you have properly " - "configured scrapy-poet." - ) - cls = self._injection_mw.registry.page_cls_for_item(request.url, cls) or cls + cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls if cls in self._tracked_auto_fields: return self._tracked_auto_fields.add(cls) diff --git a/tests/test_providers.py b/tests/test_providers.py index 774cb292..1f8dce8a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,4 +1,5 @@ import sys +from collections import defaultdict import pytest @@ -1054,17 +1055,45 @@ async def test_auto_field_stats_no_override(mockserver): """When requesting an item directly from Zyte API, without an override to change fields, stats reflect the entire list of item fields.""" + from scrapy.statscollectors import MemoryStatsCollector + + duplicate_stat_calls = defaultdict(int) + + class OnlyOnceStatsCollector(MemoryStatsCollector): + + def track_duplicate_stat_calls(self, key): + if key.startswith("scrapy-zyte-api/auto_fields/") and key in self._stats: + duplicate_stat_calls[key] += 1 + + def set_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().set_value(key, value, spider) + + def inc_value(self, key, count=1, start=1, spider=None): + self.track_duplicate_stat_calls(key) + super().inc_value(key, count, start, spider) + + def max_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().max_value(key, value, spider) + + def min_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().min_value(key, value, spider) + class TestSpider(Spider): name = "test_spider" url: str def start_requests(self): - yield Request(self.url, callback=self.parse) + for url in ("data:,a", "data:,b"): + yield Request(url, callback=self.parse) def parse(self, response: DummyResponse, product: Product): pass settings = create_scrapy_settings() + settings["STATS_CLASS"] = OnlyOnceStatsCollector settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") @@ -1080,6 +1109,7 @@ def parse(self, response: DummyResponse, product: Product): "(all fields)" ), } + assert all(value == 0 for value in duplicate_stat_calls.values()) @ensureDeferred From c64d781023c978480c6e6f83a576b7d640620ca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Jul 2024 11:05:41 +0200 Subject: [PATCH 10/10] Keep mypy happy --- tests/test_providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index 1f8dce8a..74dd17cc 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1057,7 +1057,7 @@ async def test_auto_field_stats_no_override(mockserver): from scrapy.statscollectors import MemoryStatsCollector - duplicate_stat_calls = defaultdict(int) + duplicate_stat_calls: defaultdict[str, int] = defaultdict(int) class OnlyOnceStatsCollector(MemoryStatsCollector):