Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track in stats which fields from Zyte API automatic extraction are not overridden #202

Merged
merged 11 commits into from
Jul 25, 2024
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changes
=======

N.N.N (YYYY-MM-DD)
------------------

* ``scrapy-zyte-api[provider]`` now requires zyte-common-items >= 0.20.0.
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

* Added the :setting:`ZYTE_API_AUTO_FIELD_STATS` setting.

0.18.4 (2024-06-10)
-------------------

Expand Down
9 changes: 4 additions & 5 deletions docs/reference/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ ZYTE_API_AUTO_FIELD_STATS

Default: ``False``

Enables stats that indicate which requested fields come directly from
Enables stats that indicate which requested fields :ref:`obtained through
scrapy-poet integration <scrapy-poet>` come directly from
:ref:`zyte-api-extract`.

If for any request no page object class is used to override
Expand All @@ -33,10 +34,8 @@ If for any request a custom page object class is used to override some
"<space-separated list of fields not overridden>"
)

.. note:: If that page object class is not a subclass of an ``Auto``-prefixed
class from :doc:`zyte-common-items <zyte-common-items:index>`, all fields
are assumed to have been overridden, i.e. the stat value is always an empty
string.
.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
whether a field has been overridden or not.

.. setting:: ZYTE_API_AUTOMAP_PARAMS

Expand Down
23 changes: 4 additions & 19 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
ProductList,
ProductNavigation,
)
from zyte_common_items.fields import is_auto_field

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
from scrapy_zyte_api._annotations import _ActionResult
Expand Down Expand Up @@ -65,15 +66,6 @@
}


# https://stackoverflow.com/a/25959545
def _field_cls(page_cls, field_name):
for cls in page_cls.__mro__:
if field_name in cls.__dict__:
return cls
# Only used with fields known to exist
assert False # noqa: B011


class ZyteApiProvider(PageObjectInputProvider):
name = "zyte_api"

Expand Down Expand Up @@ -134,17 +126,10 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
if cls in _ITEM_KEYWORDS:
auto_fields = set(attrs.fields_dict(cls))
else:
auto_cls = None
for ancestor in cls.__mro__:
if ancestor in _AUTO_PAGES:
auto_cls = ancestor
break
auto_fields = set()
if auto_cls:
for field_name in get_fields_dict(cls):
field_cls = _field_cls(cls, field_name)
if field_cls is auto_cls:
auto_fields.add(field_name)
for field_name in get_fields_dict(cls):
if is_auto_field(cls, field_name):
auto_fields.add(field_name)
cls_fqn = get_fq_class_name(cls)
field_list = " ".join(sorted(auto_fields))
crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_version():
"andi>=0.6.0",
"scrapy-poet>=0.22.3",
"web-poet>=0.17.0",
"zyte-common-items>=0.19.0",
"zyte-common-items>=0.20.0",
]
},
classifiers=[
Expand Down
92 changes: 92 additions & 0 deletions tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
)
from web_poet.pages import get_item_cls
from zyte_common_items import AutoProductPage, BasePage, BaseProductPage, Product
from zyte_common_items.fields import auto_field

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot, actions
from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler
Expand Down Expand Up @@ -1492,3 +1493,94 @@ def parse(self, response: DummyResponse, product: Product):

# Reset rules
default_registry.__init__() # type: ignore[misc]


@ensureDeferred
async def test_auto_field_stats_auto_field_decorator(mockserver):
    """Using @auto_field forces a field to not be considered overridden."""

    @attrs.define
    class MyProductPage(BaseProductPage):
        product: Product

        @auto_field
        def additionalProperties(self):
            return self.product.additionalProperties

    handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage)

    # Everything after the rule registration runs inside try/finally so that
    # the registry is reset even when the crawl or the assertion fails;
    # otherwise the rule registered above would leak into later tests.
    try:

        class TestSpider(Spider):
            name = "test_spider"
            url: str

            def start_requests(self):
                yield Request(self.url, callback=self.parse)

            def parse(self, response: DummyResponse, product: Product):
                pass

        settings = create_scrapy_settings()
        settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}
        settings["ZYTE_API_AUTO_FIELD_STATS"] = True
        settings["ZYTE_API_URL"] = mockserver.urljoin("/")
        _, _, crawler = await crawl_single_item(
            TestSpider, HtmlResource, settings, port=mockserver.port
        )

        # Keep only the stats written by the auto-field tracking.
        auto_field_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/auto_fields")
        }
        assert auto_field_stats == {
            "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_decorator.<locals>.MyProductPage": "additionalProperties",
        }
    finally:
        # Reset rules
        default_registry.__init__()  # type: ignore[misc]


@ensureDeferred
async def test_auto_field_stats_auto_field_meta(mockserver):
    """Using @field(meta={"auto_field": True}) has the same effect as using
    @auto_field."""

    @attrs.define
    class MyProductPage(BaseProductPage):
        product: Product

        @field(meta={"auto_field": True})
        def additionalProperties(self):
            return self.product.additionalProperties

    handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage)

    # Everything after the rule registration runs inside try/finally so that
    # the registry is reset even when the crawl or the assertion fails;
    # otherwise the rule registered above would leak into later tests.
    try:

        class TestSpider(Spider):
            name = "test_spider"
            url: str

            def start_requests(self):
                yield Request(self.url, callback=self.parse)

            def parse(self, response: DummyResponse, product: Product):
                pass

        settings = create_scrapy_settings()
        settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}
        settings["ZYTE_API_AUTO_FIELD_STATS"] = True
        settings["ZYTE_API_URL"] = mockserver.urljoin("/")
        _, _, crawler = await crawl_single_item(
            TestSpider, HtmlResource, settings, port=mockserver.port
        )

        # Keep only the stats written by the auto-field tracking.
        auto_field_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/auto_fields")
        }
        assert auto_field_stats == {
            "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_meta.<locals>.MyProductPage": "additionalProperties",
        }
    finally:
        # Reset rules
        default_registry.__init__()  # type: ignore[misc]
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ deps =
andi==0.6.0
scrapy-poet==0.22.3
web-poet==0.17.0
zyte-common-items==0.19.0
zyte-common-items==0.20.0

[testenv:pinned-extra]
basepython=python3.8
Expand Down