Skip to content

Commit

Permalink
Merge pull request #95 from Gallaecio/auto-fields
Browse files Browse the repository at this point in the history
Implement auto_field field metadata
  • Loading branch information
kmike authored Jun 18, 2024
2 parents a3aa8e4 + 31bff17 commit 4a6ceac
Show file tree
Hide file tree
Showing 15 changed files with 278 additions and 189 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Changelog
with the ``probability`` value lower than a set threshold.

* Added the :class:`~.BaseMetadata`, :class:`~.ListMetadata`, and
:class:`~.DetailMetadata` classes (they were previously private).
:class:`~.DetailsMetadata` classes (they were previously private).

* Added the :attr:`.ListMetadata.validationMessages` attribute.

Expand Down
56 changes: 41 additions & 15 deletions docs/usage/pages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,32 @@ whose ``to_item`` method returns an instance of
def name(self):
return self.css("h1::text").get()
.. _extractors:

Extractors
==========

For some nested fields (:class:`~.ProductFromList`, :class:`~.ProductVariant`),
:ref:`base extractors <default-processors-nested>` exist that you can subclass
to write your own extractors.

They provide the following baseline:

- They declare the :ref:`item class <items>` that they return, allowing for
their ``to_item`` method to automatically build an instance of it from
``@field``-decorated methods. See :ref:`fields`.

- They also provide default :ref:`processors <processors>` for some
item-specific fields.

See :ref:`extractor-api`.


.. _auto:

Auto page object classes
========================

Page object classes with the ``Auto`` prefix can be used to easily define page
object classes that get an :ref:`item <items>` as a dependency from another
page object class, can generate an identical item by default, and can also
Expand Down Expand Up @@ -72,23 +98,23 @@ extra fields. For example:
def foo(self):
return "bar"
.. _extractors:

Extractors
==========
Fields of these classes have ``auto_field`` set to ``True`` in their field
metadata, so that you can check if a page object subclass is overriding a field
using :func:`~zyte_common_items.fields.is_auto_field`:

For some nested fields (:class:`~.ProductFromList`, :class:`~.ProductVariant`),
:ref:`base extractors <default-processors-nested>` exist that you can subclass
to write your own extractors.
.. autofunction:: zyte_common_items.fields.is_auto_field

They provide the following base line:

- They declare the :ref:`item class <items>` that they return, allowing for
their ``to_item`` method to automatically build an instance of it from
``@field``-decorated methods. See :ref:`fields`.
.. code-block:: python
- They also provide default :ref:`processors <processors>` for some
item-specific fields.
print(is_auto_field(ExtendedProductPage, "name")) # Returns False
print(is_auto_field(ExtendedProductPage, "foo")) # Returns False
print(is_auto_field(ExtendedProductPage, "brand")) # Returns True
print(is_auto_field(ExtendedProductPage, "bar")) # Raises KeyError
See :ref:`extractor-api`.
If you are overriding a field method but the method continues to return the
value straight from the ``Auto``-prefixed class, you should also set
``auto_field`` to ``True``. Instead of setting it manually in the field meta,
you can replace the :func:`~web_poet.fields.field` decorator with
:func:`~zyte_common_items.fields.auto_field`:

.. autofunction:: zyte_common_items.fields.auto_field
15 changes: 6 additions & 9 deletions tests/test_adapter.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from collections import deque
from collections.abc import Collection
from contextlib import contextmanager
from copy import copy

# In Python ≤ 3.8 you cannot annotate with “collections.abc.Collection[Item]”,
# so we need to import typing.Collection for annotation instead.
from typing import Collection as CollectionType
from typing import Deque, Optional, Type, cast
from typing import Optional

import attrs
import pytest
from itemadapter import ItemAdapter
from itemadapter.adapter import AdapterInterface

from zyte_common_items import Item, Product, ZyteItemAdapter
from zyte_common_items.adapter import ZyteItemKeepEmptyAdapter
Expand All @@ -20,11 +19,12 @@

@contextmanager
def configured_adapter(adapter=ZyteItemAdapter):
ItemAdapter.ADAPTER_CLASSES.appendleft(adapter)
original_value = copy(ItemAdapter.ADAPTER_CLASSES)
ItemAdapter.ADAPTER_CLASSES = (adapter, *ItemAdapter.ADAPTER_CLASSES)
try:
yield
finally:
ItemAdapter.ADAPTER_CLASSES.popleft()
ItemAdapter.ADAPTER_CLASSES = original_value


def test_asdict_all_fields():
Expand Down Expand Up @@ -391,10 +391,7 @@ class _Item(Item):
children: CollectionType[Item]

class TestAdapter(ItemAdapter):
ADAPTER_CLASSES = (
cast(Deque[Type[AdapterInterface]], deque([ZyteItemKeepEmptyAdapter]))
+ ItemAdapter.ADAPTER_CLASSES
)
ADAPTER_CLASSES = [ZyteItemKeepEmptyAdapter] + list(ItemAdapter.ADAPTER_CLASSES)

item = _Item([])
adapter = TestAdapter(item)
Expand Down
18 changes: 16 additions & 2 deletions tests/test_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import attrs
import pytest
from web_poet import HttpResponse, RequestUrl, ResponseUrl, Returns, field
from web_poet.fields import get_fields_dict

import zyte_common_items
from zyte_common_items import (
Expand All @@ -25,6 +26,7 @@
Request,
)
from zyte_common_items._dateutils import utcnow
from zyte_common_items.fields import is_auto_field


@pytest.mark.parametrize(
Expand Down Expand Up @@ -111,8 +113,6 @@ class MyProductPage(ProductPage):
def brand(self):
return "baz"

from web_poet.fields import get_fields_dict

assert set(get_fields_dict(MyProductListPage)) == {"metadata", "products", "url"}
assert set(get_fields_dict(MyProductPage)) == {
"brand",
Expand Down Expand Up @@ -469,3 +469,17 @@ def nextPage(self):
assert request.metadata is not None
assert request.metadata.probability == 1.0
assert type(page.nextPage) is Request


def test_auto_fields():
    """Every field of an Auto-prefixed class should have ``auto_field`` set to
    ``True`` in its field metadata."""
    auto_pages = {
        name: obj
        for name, obj in vars(zyte_common_items).items()
        if name.startswith("Auto") and name.endswith("Page")
    }
    # Guard against the test passing vacuously if the naming convention or the
    # package exports change and no class is discovered.
    assert auto_pages, "no Auto*Page classes found in zyte_common_items"
    # Sorted iteration makes the first reported failure deterministic.
    for page_name, page_cls in sorted(auto_pages.items()):
        for field_name in get_fields_dict(page_cls):
            assert is_auto_field(
                page_cls, field_name
            ), f"{page_name}.{field_name} does not have auto_field set to True"
42 changes: 42 additions & 0 deletions zyte_common_items/fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Callable, List, Optional, Type

from web_poet import ItemPage, field
from web_poet.fields import get_fields_dict


def auto_field(
    method=None,
    *,
    cached: bool = False,
    meta: Optional[dict] = None,
    out: Optional[List[Callable]] = None,
):
    """Decorator that works like :func:`web_poet.fields.field` but sets
    ``auto_field`` to ``True`` by default in *meta*.

    An ``auto_field`` key already present in *meta* takes precedence over the
    default. The *meta* dictionary passed by the caller is never modified.

    .. code-block:: python

        from zyte_common_items import AutoProductPage
        from zyte_common_items.fields import auto_field


        class ProductPage(AutoProductPage):

            @auto_field
            def name(self):
                return super().name
    """
    # Build a new dict rather than calling setdefault() on the caller's one, so
    # that a meta dict reused elsewhere does not silently gain "auto_field".
    meta = {"auto_field": True, **(meta or {})}
    return field(method, cached=cached, meta=meta, out=out)


def is_auto_field(cls: Type[ItemPage], field: str) -> bool:
    """Return ``True`` if the field named *field* of the *cls* page object
    class has ``auto_field`` set to ``True`` in its field metadata.

    All fields defined in :ref:`auto page object classes <auto>` meet this
    condition.

    :param cls: page object class (not an instance) to inspect.
    :param field: name of the field to look up in *cls*.
    :raises KeyError: if *cls* has no field named *field*.
    """
    # NOTE: the *field* parameter shadows the ``field`` decorator imported at
    # module level; the name is kept for backward compatibility with callers
    # that pass it as a keyword argument.
    field_info = get_fields_dict(cls)[field]
    # Field metadata may be unset, in which case the field is not an auto one.
    field_meta = field_info.meta or {}
    return bool(field_meta.get("auto_field", False))
39 changes: 20 additions & 19 deletions zyte_common_items/pages/article.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from typing import List, Optional

import attrs
from web_poet import Returns, field
from web_poet import Returns

from zyte_common_items.components import Audio, Author, Breadcrumb, Image, Video
from zyte_common_items.fields import auto_field
from zyte_common_items.items import Article, ArticleMetadata
from zyte_common_items.processors import breadcrumbs_processor

Expand All @@ -29,74 +30,74 @@ class Processors(Page.Processors):
class AutoArticlePage(BaseArticlePage):
article: Article

@field
@auto_field
def headline(self) -> Optional[str]:
return self.article.headline

@field
@auto_field
def datePublished(self) -> Optional[str]:
return self.article.datePublished

@field
@auto_field
def datePublishedRaw(self) -> Optional[str]:
return self.article.datePublishedRaw

@field
@auto_field
def dateModified(self) -> Optional[str]:
return self.article.dateModified

@field
@auto_field
def dateModifiedRaw(self) -> Optional[str]:
return self.article.dateModifiedRaw

@field
@auto_field
def authors(self) -> Optional[List[Author]]:
return self.article.authors

@field
@auto_field
def breadcrumbs(self) -> Optional[List[Breadcrumb]]:
return self.article.breadcrumbs

@field
@auto_field
def inLanguage(self) -> Optional[str]:
return self.article.inLanguage

@field
@auto_field
def mainImage(self) -> Optional[Image]:
return self.article.mainImage

@field
@auto_field
def images(self) -> Optional[List[Image]]:
return self.article.images

@field
@auto_field
def description(self) -> Optional[str]:
return self.article.description

@field
@auto_field
def articleBody(self) -> Optional[str]:
return self.article.articleBody

@field
@auto_field
def articleBodyHtml(self) -> Optional[str]:
return self.article.articleBodyHtml

@field
@auto_field
def videos(self) -> Optional[List[Video]]:
return self.article.videos

@field
@auto_field
def audios(self) -> Optional[List[Audio]]:
return self.article.audios

@field
@auto_field
def canonicalUrl(self) -> Optional[str]:
return self.article.canonicalUrl

@field
@auto_field
def url(self) -> Optional[str]:
return self.article.url

@field
@auto_field
def metadata(self) -> Optional[ArticleMetadata]:
return self.article.metadata
13 changes: 7 additions & 6 deletions zyte_common_items/pages/article_list.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from typing import List, Optional

import attrs
from web_poet import Returns, field
from web_poet import Returns

from zyte_common_items.components import Breadcrumb
from zyte_common_items.fields import auto_field
from zyte_common_items.items import ArticleFromList, ArticleList, ArticleListMetadata
from zyte_common_items.processors import breadcrumbs_processor

Expand Down Expand Up @@ -31,22 +32,22 @@ class Processors(Page.Processors):
class AutoArticleListPage(BaseArticleListPage):
article_list: ArticleList

@field
@auto_field
def articles(self) -> Optional[List[ArticleFromList]]:
return self.article_list.articles

@field
@auto_field
def breadcrumbs(self) -> Optional[List[Breadcrumb]]:
return self.article_list.breadcrumbs

@field
@auto_field
def canonicalUrl(self) -> Optional[str]:
return self.article_list.canonicalUrl

@field
@auto_field
def metadata(self) -> Optional[ArticleListMetadata]:
return self.article_list.metadata

@field
@auto_field
def url(self) -> Optional[str]:
return self.article_list.url
Loading

0 comments on commit 4a6ceac

Please sign in to comment.