From 7587243ecf997d9f7c25038f5f745cbe7a0711c7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 28 Feb 2022 15:40:14 +0800 Subject: [PATCH 01/13] introduce PageObjectRegistry that has @hande_urls annotation --- CHANGELOG.rst | 3 +- docs/api_reference.rst | 14 + docs/conf.py | 1 + docs/index.rst | 1 + setup.py | 1 + tests/po_lib/__init__.py | 39 +++ tests/po_lib/a_module.py | 16 + tests/po_lib/an_empty_module.py | 0 tests/po_lib/an_empty_package/__init__.py | 0 tests/po_lib/nested_package/__init__.py | 15 + .../po_lib/nested_package/a_nested_module.py | 15 + tests/po_lib_sub/__init__.py | 25 ++ tests/test_overrides.py | 92 ++++++ tests_extra/__init__.py | 5 + .../po_lib_sub_not_imported/__init__.py | 28 ++ web_poet/__init__.py | 12 +- web_poet/overrides.py | 280 ++++++++++++++++++ 17 files changed, 545 insertions(+), 2 deletions(-) create mode 100644 tests/po_lib/__init__.py create mode 100644 tests/po_lib/a_module.py create mode 100644 tests/po_lib/an_empty_module.py create mode 100644 tests/po_lib/an_empty_package/__init__.py create mode 100644 tests/po_lib/nested_package/__init__.py create mode 100644 tests/po_lib/nested_package/a_nested_module.py create mode 100644 tests/po_lib_sub/__init__.py create mode 100644 tests/test_overrides.py create mode 100644 tests_extra/__init__.py create mode 100644 tests_extra/po_lib_sub_not_imported/__init__.py create mode 100644 web_poet/overrides.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 28734261..0e8068e2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,10 +5,11 @@ Changelog TBR ------------------ +* added a ``PageObjectRegistry`` class which has the ``handle_urls`` decorator + to write override rules. * removed support for Python 3.6 * added support for Python 3.10 - 0.1.1 (2021-06-02) ------------------ diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 011f878e..e4d06484 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -1,3 +1,5 @@ +.. _`api-reference`: + ============= API Reference ============= @@ -45,3 +47,15 @@ Mixins .. autoclass:: web_poet.mixins.ResponseShortcutsMixin :members: :no-special-members: + + +.. _`api-overrides`: + +Overrides +========= + +.. autofunction:: web_poet.handle_urls + +.. automodule:: web_poet.overrides + :members: + :exclude-members: handle_urls diff --git a/docs/conf.py b/docs/conf.py index 353e5968..09dfab08 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -192,4 +192,5 @@ intersphinx_mapping = { 'python': ('https://docs.python.org/3', None, ), 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), + 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None, ), } diff --git a/docs/index.rst b/docs/index.rst index d6d2e269..db4d852d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,6 +33,7 @@ and the motivation behind ``web-poet``, start with :ref:`from-ground-up`. intro/tutorial intro/from-ground-up + intro/overrides .. toctree:: :caption: Reference diff --git a/setup.py b/setup.py index 9553a73c..9ad91a3c 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ install_requires=( 'attrs', 'parsel', + 'url-matcher', ), classifiers=( 'Development Status :: 2 - Pre-Alpha', diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py new file mode 100644 index 00000000..4f661310 --- /dev/null +++ b/tests/po_lib/__init__.py @@ -0,0 +1,39 @@ +""" +This package is just for overrides testing purposes. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from .. import po_lib_sub # NOTE: this module contains a PO with @handle_rules +from web_poet import handle_urls, PageObjectRegistry + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POTopLevelOverriden1: + ... + + +class POTopLevelOverriden2: + ... + + +# This first annotation is ignored. A single annotation per registry is allowed +@handle_urls("example.com", POTopLevelOverriden1) +@handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300) +class POTopLevel1(POBase): + expected_overrides = POTopLevelOverriden1 + expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) + expected_meta = {} # type: ignore + + +@handle_urls("example.com", POTopLevelOverriden2) +class POTopLevel2(POBase): + expected_overrides = POTopLevelOverriden2 + expected_patterns = Patterns(["example.com"]) + expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py new file mode 100644 index 00000000..0dcf04c6 --- /dev/null +++ b/tests/po_lib/a_module.py @@ -0,0 +1,16 @@ +from url_matcher import Patterns + +from tests.po_lib import POBase +from web_poet import handle_urls + + +class POModuleOverriden: + ... + + +@handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo") +class POModule(POBase): + expected_overrides = POModuleOverriden + expected_patterns = Patterns(["example.com"]) + expected_meta = {"extra_arg": "foo"} # type: ignore + diff --git a/tests/po_lib/an_empty_module.py b/tests/po_lib/an_empty_module.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/po_lib/an_empty_package/__init__.py b/tests/po_lib/an_empty_package/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py new file mode 100644 index 00000000..537a995d --- /dev/null +++ b/tests/po_lib/nested_package/__init__.py @@ -0,0 +1,15 @@ +from url_matcher import Patterns + +from tests.po_lib import POBase +from web_poet import handle_urls + + +class PONestedPkgOverriden: + ... + + +@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedPkgOverriden) +class PONestedPkg(POBase): + expected_overrides = PONestedPkgOverriden + expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) + expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py new file mode 100644 index 00000000..e9de9416 --- /dev/null +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -0,0 +1,15 @@ +from url_matcher import Patterns + +from tests.po_lib import POBase +from web_poet import handle_urls + + +class PONestedModuleOverriden: + ... + + +@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden) +class PONestedModule(POBase): + expected_overrides = PONestedModuleOverriden + expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"]) + expected_meta = {} # type: ignore diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py new file mode 100644 index 00000000..33850f5c --- /dev/null +++ b/tests/po_lib_sub/__init__.py @@ -0,0 +1,25 @@ +"""This package is being used by tests/po_lib to validate some behaviors on +external depedencies. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from web_poet import handle_urls + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POLibSubOverriden: + ... + + +@handle_urls("sub_example.com", POLibSubOverriden) +class POLibSub(POBase): + expected_overrides = POLibSubOverriden + expected_patterns = Patterns(["sub_example.com"]) + expected_meta = {} # type: ignore diff --git a/tests/test_overrides.py b/tests/test_overrides.py new file mode 100644 index 00000000..ab68af6f --- /dev/null +++ b/tests/test_overrides.py @@ -0,0 +1,92 @@ +import pytest +from url_matcher import Patterns + +from tests.po_lib_sub import POLibSub +from tests.po_lib import ( + POTopLevel1, + POTopLevel2, + POTopLevelOverriden2, +) +from tests.po_lib.a_module import POModule, POModuleOverriden +from tests.po_lib.nested_package import PONestedPkg +from tests.po_lib.nested_package.a_nested_module import PONestedModule +from web_poet import default_registry, consume_modules +from web_poet.overrides import OverrideRule + + +POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} + + +def test_override_rule_uniqueness(): + """The same instance of an OverrideRule with the same attribute values should + have the same hash identity. + """ + + patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) + + rule1 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + meta={"key_1": 1}, + ) + rule2 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + meta={"key_2": 2}, + ) + + assert hash(rule1) == hash(rule2) + + +def test_list_page_objects_all(): + rules = default_registry.get_overrides() + page_objects = {po.use for po in rules} + + # Note that the 'tests_extra.po_lib_sub_not_imported.POLibSubNotImported' + # Page Object is not included here since it was never imported anywhere in + # our test package. It would only be included if we run any of the following + # below. (Note that they should run before `get_overrides` is called.) + # - from tests_extra import po_lib_sub_not_imported + # - import tests_extra.po_lib_sub_not_imported + # - web_poet.consume_modules("tests_extra") + # Merely having `import tests_extra` won't work since the subpackages and + # modules needs to be traversed and imported as well. + assert all(["po_lib_sub_not_imported" not in po.__module__ for po in page_objects]) + + # Ensure that ALL Override Rules are returned as long as the given + # registry's @handle_urls annotation was used. + assert page_objects == POS.union({POLibSub}) + for rule in rules: + assert rule.instead_of == rule.use.expected_overrides, rule.use + assert rule.for_patterns == rule.use.expected_patterns, rule.use + assert rule.meta == rule.use.expected_meta, rule.use + + +def test_consume_module_not_existing(): + with pytest.raises(ImportError): + consume_modules("this_does_not_exist") + + +def test_list_page_objects_all_consume(): + """A test similar to the one above but calls ``consume_modules()`` to properly + load the @handle_urls annotations from other modules/packages. + """ + rules = default_registry.get_overrides(consume="tests_extra") + page_objects = {po.use for po in rules} + assert any(["po_lib_sub_not_imported" in po.__module__ for po in page_objects]) + + +def test_registry_search_overrides(): + rules = default_registry.search_overrides(use=POTopLevel2) + assert len(rules) == 1 + assert rules[0].use == POTopLevel2 + + rules = default_registry.search_overrides(instead_of=POTopLevelOverriden2) + assert len(rules) == 1 + assert rules[0].instead_of == POTopLevelOverriden2 + + # These rules doesn't exist + rules = default_registry.search_overrides(use=POModuleOverriden) + assert len(rules) == 0 diff --git a/tests_extra/__init__.py b/tests_extra/__init__.py new file mode 100644 index 00000000..62c40098 --- /dev/null +++ b/tests_extra/__init__.py @@ -0,0 +1,5 @@ +""" +This test package was created separately to see the behavior of retrieving the +Override rules declared on a registry where @handle_urls is defined on another +package. +""" diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py new file mode 100644 index 00000000..8f68f79b --- /dev/null +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -0,0 +1,28 @@ +""" +This package quite is similar to tests/po_lib_sub in terms of code contents. + +What we're ultimately trying to test here is to see if the `default_registry` +captures the rules annotated in this module if it was not imported. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from web_poet import handle_urls + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POLibSubOverridenNotImported: + ... + + +@handle_urls("sub_example_not_imported.com", POLibSubOverridenNotImported) +class POLibSubNotImported(POBase): + expected_overrides = POLibSubOverridenNotImported + expected_patterns = Patterns(["sub_example_not_imported.com"]) + expected_meta = {} # type: ignore diff --git a/web_poet/__init__.py b/web_poet/__init__.py index e5bf1f54..3e401755 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,2 +1,12 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable -from .page_inputs import ResponseData \ No newline at end of file +from .page_inputs import ResponseData +from .overrides import ( + PageObjectRegistry, + consume_modules, +) + + +# For ease of use, we'll create a default registry so that users can simply +# use its `handle_urls()` method directly by `from web_poet import handle_urls` +default_registry = PageObjectRegistry() +handle_urls = default_registry.handle_urls diff --git a/web_poet/overrides.py b/web_poet/overrides.py new file mode 100644 index 00000000..afdd8f9a --- /dev/null +++ b/web_poet/overrides.py @@ -0,0 +1,280 @@ +from __future__ import annotations # https://www.python.org/dev/peps/pep-0563/ + +import importlib +import importlib.util +import warnings +import pkgutil +from collections import deque +from dataclasses import dataclass, field +from operator import attrgetter +from typing import Iterable, Optional, Union, List, Callable, Dict, Any + +from url_matcher import Patterns + +Strings = Union[str, Iterable[str]] + + +@dataclass(frozen=True) +class OverrideRule: + """A single override rule that specifies when a Page Object should be used + instead of another. + + This is instantiated when using the :func:`web_poet.handle_urls` decorator. + It's also being returned as a ``List[OverrideRule]`` when calling + :meth:`~.PageObjectRegistry.get_overrides`. + + You can access any of its attributes: + + * ``for_patterns: Patterns`` - contains the URL patterns associated + with this rule. You can read the API documentation of the + `url-matcher `_ package for more + information. + * ``use: Callable`` - the Page Object that will be used. + * ``instead_of: Callable`` - the Page Object that will be **replaced**. + * ``meta: Dict[str, Any] = field(default_factory=dict)`` - Any other + information you many want to store. This doesn't do anything for now + but may be useful for future API updates. + """ + + for_patterns: Patterns + use: Callable + instead_of: Callable + meta: Dict[str, Any] = field(default_factory=dict) + + def __hash__(self): + return hash((self.for_patterns, self.use, self.instead_of)) + + +def _as_list(value: Optional[Strings]) -> List[str]: + """ + >>> _as_list(None) + [] + >>> _as_list("foo") + ['foo'] + >>> _as_list(["foo", "bar"]) + ['foo', 'bar'] + """ + if value is None: + return [] + if isinstance(value, str): + return [value] + return list(value) + + +class PageObjectRegistry: + """This contains the mapping rules that associates the Page Objects available + for a given URL matching rule. + + ``web-poet`` already provides a default Registry name ``default_registry`` + for convenience. It can be directly accessed via: + + .. code-block:: python + + from web_poet import handle_urls, default_registry + + @handle_urls("example.com", overrides=ProductPageObject) + class ExampleComProductPage(ItemPage): + ... + + override_rules = default_registry.get_overrides() + + Notice that the ``handle_urls`` that we've imported is a part of + ``default_registry``. This provides a shorter and quicker way to interact + with the built-in default Registry. + + .. note:: + + It is encouraged to simply use and import the already existing registry + via ``from web_poet import default_registry`` instead of creating your + own :class:`~.PageObjectRegistry` instance. Using multiple registries + would be unwieldy in general cases. However, it could be applicable in + certain scenarios. + """ + + def __init__(self): + self._data: Dict[Callable, OverrideRule] = {} + + def handle_urls( + self, + include: Strings, + overrides: Callable, + *, + exclude: Optional[Strings] = None, + priority: int = 500, + **kwargs, + ): + """ + Class decorator that indicates that the decorated Page Object should be + used instead of the overridden one for a particular set the URLs. + + Which Page Object is overridden is determined by the ``overrides`` + parameter. + + Over which URLs the override happens is determined by the ``include``, + ``exclude`` and ``priority`` parameters. See the documentation of the + `url-matcher `_ package for more + information about them. + + Any extra parameters are stored as meta information that can be later used. + + :param include: Defines the URLs that should be handled by the overridden Page Object. + :param overrides: The Page Object that should be replaced by the annotated one. + :param exclude: Defines URLs over which the override should not happen. + :param priority: The resolution priority in case of conflicting annotations. + """ + + def wrapper(cls): + rule = OverrideRule( + for_patterns=Patterns( + include=_as_list(include), + exclude=_as_list(exclude), + priority=priority, + ), + use=cls, + instead_of=overrides, + meta=kwargs, + ) + # If it was already defined, we don't want to override it + if cls not in self._data: + self._data[cls] = rule + else: + warnings.warn( + f"Multiple @handle_urls annotations with the same 'overrides' " + f"are ignored in the same Registry. Ignoring duplicate " + f"annotation on '{include}' for {cls}." + ) + + return cls + + return wrapper + + def get_overrides(self, consume: Optional[Strings] = None) -> List[OverrideRule]: + """Returns a ``List`` of :class:`~.OverrideRule` that were declared using + ``@handle_urls``. + + :param consume: packages/modules that need to be imported so that it can + properly load the :meth:`~.PageObjectRegistry.handle_urls` annotations. + + .. warning:: + + Remember to consider using the ``consume`` parameter to properly load + the :meth:`~.PageObjectRegistry.handle_urls` from external Page + Objects + + The ``consume`` parameter provides a convenient shortcut for calling + :func:`~.web_poet.overrides.consume_modules`. + """ + if consume: + consume_modules(*_as_list(consume)) + + return list(self._data.values()) + + def search_overrides(self, **kwargs) -> List[OverrideRule]: + """Returns a list of :class:`OverrideRule` if any of the attributes + matches the rules inside the registry. + + Sample usage: + + .. code-block:: python + + rules = registry.search_overrides(use=ProductPO, instead_of=GenericPO) + print(len(rules)) # 1 + + """ + + # Short-circuit operation if "use" is the only search param used, since + # we know that it's being used as the dict key. + if set(["use"]) == kwargs.keys(): + rule = self._data.get(kwargs["use"]) + if rule: + return [rule] + return [] + + getter = attrgetter(*kwargs.keys()) + + def matcher(rule: OverrideRule): + attribs = getter(rule) + if not isinstance(attribs, tuple): + attribs = tuple([attribs]) + if list(attribs) == list(kwargs.values()): + return True + return False + + results = [] + for rule in self.get_overrides(): + if matcher(rule): + results.append(rule) + return results + + +def walk_module(module: str) -> Iterable: + """Return all modules from a module recursively. + + Note that this will import all the modules and submodules. It returns the + provided module as well. + """ + + def onerror(err): + raise err # pragma: no cover + + spec = importlib.util.find_spec(module) + if not spec: + raise ImportError(f"Module {module} not found") + mod = importlib.import_module(spec.name) + yield mod + if spec.submodule_search_locations: + for info in pkgutil.walk_packages( + spec.submodule_search_locations, f"{spec.name}.", onerror + ): + mod = importlib.import_module(info.name) + yield mod + + +def consume_modules(*modules: str) -> None: + """A quick wrapper for :func:`~.walk_module` to efficiently consume the + generator and recursively load all packages/modules. + + This function is essential to be run before attempting to retrieve all + :meth:`~.PageObjectRegistry.handle_urls` annotations from :class:`~.PageObjectRegistry` + to ensure that they are properly acknowledged by importing them in runtime. + + Let's take a look at an example: + + .. code-block:: python + + # my_page_obj_project/load_rules.py + + from web_poet import default_registry, consume_modules + + consume_modules("other_external_pkg.po", "another_pkg.lib") + rules = default_registry.get_overrides() + + For this case, the ``List`` of :class:`~.OverrideRule` are coming from: + + - ``my_page_obj_project`` `(since it's the same module as the file above)` + - ``other_external_pkg.po`` + - ``another_pkg.lib`` + + So if the ``default_registry`` had other ``@handle_urls`` annotations outside + of the packages/modules listed above, then the :class:`~.OverrideRule` won't + be returned. + + .. note:: + + :meth:`~.PageObjectRegistry.get_overrides` provides a shortcut for this + using its ``consume`` parameter. Thus, the code example above could be + shortened even further by: + + .. code-block:: python + + from web_poet import default_registry + + rules = default_registry.get_overrides(consume=["other_external_pkg.po", "another_pkg.lib"]) + """ + + for module in modules: + gen = walk_module(module) + + # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html + # Using a deque() results in a tiny bit performance improvement that list(). + deque(gen, maxlen=0) From 63bf390f2d289fdbfd986f96bf2ac7706d9e37cb Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 1 Mar 2022 15:15:29 +0800 Subject: [PATCH 02/13] add docs for overrides --- docs/intro/overrides.rst | 317 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 docs/intro/overrides.rst diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst new file mode 100644 index 00000000..07a23e33 --- /dev/null +++ b/docs/intro/overrides.rst @@ -0,0 +1,317 @@ +.. _`intro-overrides`: + +Overrides +========= + +Overrides contains mapping rules to associate which URLs a particular +Page Object would be used. The URL matching rules is handled by another library +called `url-matcher `_. + +Using such rules establishes the core concept of Overrides wherein a developer +could declare that a specific Page Object must be used *(instead of another)* +for a given set of URL patterns. + +This enables **web-poet** to be used effectively by other frameworks like +`scrapy-poet `_. + +Example Use Case +---------------- + +Let's explore an example use case for the Overrides concept. + +Suppose we're using Page Objects for our broadcrawl project which explores +eCommerce websites to discover product pages. It wouldn't be entirely possible +for us to create parsers for all websites since we don't know which sites we're +going to crawl beforehand. + +However, we could at least create a generic Page Object to support parsing of +some fields in well-known locations of product information like ````. +This enables our broadcrawler to at least parse some useful information. Let's +call such Page Object to be ``GenericProductPage``. + +Assuming that one of our project requirements is to fully support parsing of the +`top 3 eCommerce websites`, then we'd need to create a Page Object for each one +to parse more specific fields. + +Here's where the Overrides concept comes in: + + 1. The ``GenericProductPage`` is used to parse all eCommerce product pages + `by default`. + 2. Whenever one of our declared URL rules matches with a given page URL, + then the Page Object associated with that rule `overrides (or replaces)` + the default ``GenericProductPage``. + +This enables us to conveniently declare which Page Object would be used for a +given webpage `(based on a page's URL pattern)`. + +Let's see this in action by declaring the Overrides in the Page Objects below. + + +Creating Overrides +------------------ + +Let's take a look at how the following code is structured: + +.. code-block:: python + + from web_poet import handle_urls + from web_poet.pages import ItemWebPage + + + class GenericProductPage(ItemWebPage): + def to_item(self): + return {"product title": self.css("title::text").get()} + + + @handle_urls("example.com", overrides=GenericProductPage) + class ExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + + @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") + class AnotherExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) + class DualExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + +The code above declares that: + + - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` + would be used instead of ``GenericProductPage``. + - The same is true for ``DualExampleProductPage`` where it is used + instead of ``GenericProductPage`` for two URL patterns which works as: + + - :sub:`(match) https://www.dualexample.com/shop/electronics/?product=123` + - :sub:`(match) https://www.dualexample.com/shop/books/paperback/?product=849` + - :sub:`(NO match) https://www.dualexample.com/on-sale/books/?product=923` + - :sub:`(match) https://www.dualexample.net/store/kitchen/?pid=776` + - :sub:`(match) https://www.dualexample.net/store/?pid=892` + - :sub:`(NO match) https://www.dualexample.net/new-offers/fitness/?pid=892` + + - On the other hand, ``AnotherExampleProductPage`` is only used instead of + ``GenericProductPage`` when we're parsing pages from ``anotherexample.com`` + that doesn't contain ``/digital-goods/`` in its URL path. + +.. tip:: + + The URL patterns declared in the :func:`web_poet.handle_urls` can still be + further customized. You can read some of the specific parameters and + alternative ways in the :ref:`API section <api-overrides>` of + :func:`web_poet.handle_urls`. + + +Retrieving all available Overrides +---------------------------------- + +The :meth:`~.PageObjectRegistry.get_overrides` method from the ``web_poet.default_registry`` +allows discovery and retrieval of all :class:`~.OverrideRule` from your project. +Following from our example above, using it would be: + +.. code-block:: python + + from web_poet import default_registry + + # Retrieves all OverrideRules that were registered in the registry + rules = default_registry.get_overrides() + + print(len(rules)) # 3 + print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) + +Remember that using :func:`web_poet.handle_urls` to annotate the Page Objects +would result in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. + + +.. warning:: + + :meth:`~.PageObjectRegistry.get_overrides` relies on the fact that all essential + packages/modules which contains the :func:`web_poet.handle_urls` + annotations are properly loaded. + + Thus, for cases like importing Page Objects from another external package, you'd + need to properly load all :meth:`web_poet.handle_urls` annotations + from the external module. This ensures that the external Page Objects have + their annotations properly loaded. + + This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. + Here's an example: + + .. code-block:: python + + from web_poet import default_registry, consume_modules + + consume_modules("external_package_A.po", "another_ext_package.lib") + rules = default_registry.get_overrides() + + # Fortunately, `get_overrides()` provides a shortcut for the lines above: + rules = default_registry.get_overrides(consume=["external_package_A.po", "another_ext_package.lib"]) + + The next section explores this caveat further. + + +Using Overrides from External Packages +-------------------------------------- + +Developers have the option to import existing Page Objects alongside the +:class:`~.OverrideRule` attached to them. This section aims to showcase different +scenarios that come up when using multiple Page Object Projects. + +Using all available OverrideRules from multiple Page Object Projects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's suppose we have the following use case before us: + + - An **external** Python package named ``ecommerce_page_objects`` is available + which contains Page Objects for common websites. + - Another similar **external** package named ``gadget_sites_page_objects`` is + available for even more specific websites. + - Your project's objectives is to handle as much eCommerce websites as you + can. Thus, you'd want to use the already available packages above and + perhaps improve on them or create new Page Objects for new websites. + +Remember that all of the :class:`~.OverrideRule` are declared by annotating +Page Objects using the :func:`web_poet.handle_urls`. Thus, they can easily be +accessed using the :meth:`~.PageObjectRegistry.get_overrides` of ``web_poet.default_registry``. + +This can be done something like: + +.. code-block:: python + + from web_poet import default_registry, consume_modules + + # ❌ Remember that this wouldn't retrieve any rules at all since the + # annotations are NOT properly loaded. + rules = default_registry.get_overrides() + print(rules) # [] + + # ✅ Instead, you need to run the following so that all of the Page + # Objects in the external packages are recursively loaded. + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + rules = default_registry.get_overrides() + + # Alternatively, this could be shortened using: + rules = default_registry.get_overrides(consume=["ecommerce_page_objects", "gadget_sites_page_objects"]) + + # The rules would then be as follows: + print(rules) + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +.. _`intro-rule-subset`: + +Using only a subset of the available OverrideRules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose that the use case from the previous section has changed wherein a +subset of :class:`~.OverrideRule` would be used. This could be achieved by +using the :meth:`~.PageObjectRegistry.search_overrides` method allows for +convenient selection of a subset of rules from the ``default_registry``. + +Here's an example of how you could manually select the rules using the +:meth:`~.PageObjectRegistry.search_overrides` method instead: + +.. code-block:: python + + from web_poet import default_registry, consume_modules + import ecommerce_page_objects, gadget_sites_page_objects + + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + + ecom_rules = default_registry.search_overrides(instead_of=ecommerce_page_objects.EcomGenericPage) + print(ecom_rules) + # OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + + gadget_rules = default_registry.search_overrides(use=gadget_sites_page_objects.site_3.GadgetSite3) + print(gadget_rules) + # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + + rules = ecom_rules + gadget_rules + +As you can see, using the :meth:`~.PageObjectRegistry.search_overrides` method allows you to +conveniently select for :class:`~.OverrideRule` which conform to a specific criteria. This +allows you to conveniently drill down to which :class:`~.OverrideRule` you're interested in +using. + +Handling conflicts from using Multiple External Packages +-------------------------------------------------------- + +You might've observed from the previous section that retrieving the list of all +:class:`~.OverrideRule` from two different external packages may result in a +conflict. + +We can take a look at the rules for **#2** and **#3**: + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +However, it's technically **NOT** a conflict, **yet**, since: + + - ``ecommerce_page_objects.site_2.EcomSite2`` would only be used in **site_2.com** + if ``ecommerce_page_objects.EcomGenericPage`` is to be replaced. + - The same case with ``gadget_sites_page_objects.site_2.GadgetSite2`` wherein + it's only going to be utilized for **site_2.com** if the following is to be + replaced: ``gadget_sites_page_objects.GadgetGenericPage``. + +It would be only become a conflict if the **#2** and **#3** :class:`~.OverrideRule` +for **site_2.com** both `intend to replace the` **same** `Page Object`. In fact, +none of the :class:`~.OverrideRule` above would ever be used if your project never +intends to use the following Page Objects *(since there's nothing to override)*. + +However, let's suppose that there are :class:`~.OverrideRule` which actually result +in a conflict. To give an example, let's suppose that rules **#2** and **#3** `intends +to replace the` **same** `Page Object`. It would look something like: + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + +There are two main ways we recommend in solving this. + +**1. Priority Resolution** + +If you notice, the ``for_patterns`` attribute of :class:`~.OverrideRule` is an +instance of `url_matcher.Patterns +<https://url-matcher.readthedocs.io/en/stable/api_reference.html#module-url-matcher>`_. +This instance also has a ``priority`` param where a higher value will be chosen +in times of conflict. + +.. note:: + + The `url-matcher`_ library is the one responsible breaking such ``priority`` conflicts + `(amongst others)`. It's specifically discussed in this section: `rules-conflict-resolution + <https://url-matcher.readthedocs.io/en/stable/intro.html#rules-conflict-resolution>`_. + +Thus, it is recommended that you always update the ``priority`` value when using +an external package containing Page Objects before repackaging it again. This ensures +that no conflicts would be made when other developers use your new package. + +Unfortunately, updating the ``priority`` value directly isn't possible as +:class:`url_matcher.Patterns` is a **frozen** `dataclass`. The same is true for +:class:`~.OverrideRule`. This is made by design so that they are hashable and could +be deduplicated immediately without consequences of them changing in value. + +Thus, if the conflict cannot be resolved by the ``priority`` param, then +the next approach could be used. + +**2. Specifically Selecting the Rules** + +When the last resort of ``priority``-resolution doesn't work, then you could always +specifically select the list of :class:`~.OverrideRule` you want to use. + +We recommend in creating an **inclusion**-list rather than an **exclusion**-list +since the latter is quite brittle. For instance, an external package you're using +has updated its rules where an exlusion strategy misses out on a few rules. +This could lead to a `silent-error` of receiving a different set of rules than +expected. + From 8eb91036ec9beda710f923ecacd816b8047d063d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 3 Mar 2022 18:34:07 +0800 Subject: [PATCH 03/13] update @handle_urls to require keyword in 'override' --- tests/po_lib/__init__.py | 6 +++--- tests/po_lib_sub/__init__.py | 2 +- tests_extra/po_lib_sub_not_imported/__init__.py | 2 +- web_poet/overrides.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 4f661310..0c78c914 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -24,15 +24,15 @@ class POTopLevelOverriden2: # This first annotation is ignored. A single annotation per registry is allowed -@handle_urls("example.com", POTopLevelOverriden1) -@handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300) +@handle_urls("example.com", overrides=POTopLevelOverriden1) +@handle_urls("example.com", overrides=POTopLevelOverriden1, exclude="/*.jpg|", priority=300) class POTopLevel1(POBase): expected_overrides = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) expected_meta = {} # type: ignore -@handle_urls("example.com", POTopLevelOverriden2) +@handle_urls("example.com", overrides=POTopLevelOverriden2) class POTopLevel2(POBase): expected_overrides = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 33850f5c..d64378d0 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -18,7 +18,7 @@ class POLibSubOverriden: ... -@handle_urls("sub_example.com", POLibSubOverriden) +@handle_urls("sub_example.com", overrides=POLibSubOverriden) class POLibSub(POBase): expected_overrides = POLibSubOverriden expected_patterns = Patterns(["sub_example.com"]) diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py index 8f68f79b..3d31f2f3 100644 --- a/tests_extra/po_lib_sub_not_imported/__init__.py +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -21,7 +21,7 @@ class POLibSubOverridenNotImported: ... -@handle_urls("sub_example_not_imported.com", POLibSubOverridenNotImported) +@handle_urls("sub_example_not_imported.com", overrides=POLibSubOverridenNotImported) class POLibSubNotImported(POBase): expected_overrides = POLibSubOverridenNotImported expected_patterns = Patterns(["sub_example_not_imported.com"]) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index afdd8f9a..ba88ed0e 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -97,8 +97,8 @@ def __init__(self): def handle_urls( self, include: Strings, - overrides: Callable, *, + overrides: Callable, exclude: Optional[Strings] = None, priority: int = 500, **kwargs, From 006c63c5066f5f0eabb14a8aa19ab58485df4ab1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 23 Mar 2022 15:50:22 +0800 Subject: [PATCH 04/13] slight improvements in the docs --- CHANGELOG.rst | 2 +- docs/api_reference.rst | 3 ++ docs/intro/overrides.rst | 104 +++++++++++++++++++++++---------------- web_poet/overrides.py | 94 ++++++++++++++++++----------------- 4 files changed, 114 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0e8068e2..0abd7a66 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ TBR ------------------ * added a ``PageObjectRegistry`` class which has the ``handle_urls`` decorator - to write override rules. + to conveniently declare and collect ``OverrideRule``. * removed support for Python 3.6 * added support for Python 3.10 diff --git a/docs/api_reference.rst b/docs/api_reference.rst index e4d06484..f0da5884 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -54,6 +54,9 @@ Mixins Overrides ========= +See the tutorial section on :ref:`intro-overrides` for more context about its +use cases and some examples. + .. autofunction:: web_poet.handle_urls .. automodule:: web_poet.overrides diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 07a23e33..d2c6fb88 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -54,13 +54,12 @@ Let's take a look at how the following code is structured: .. code-block:: python - from web_poet import handle_urls - from web_poet.pages import ItemWebPage + from web_poet import handle_urls, ItemWebPage class GenericProductPage(ItemWebPage): def to_item(self): - return {"product title": self.css("title::text").get()} + return {"product-title": self.css("title::text").get()} @handle_urls("example.com", overrides=GenericProductPage) @@ -85,7 +84,8 @@ The code above declares that: - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` would be used instead of ``GenericProductPage``. - The same is true for ``DualExampleProductPage`` where it is used - instead of ``GenericProductPage`` for two URL patterns which works as: + instead of ``GenericProductPage`` for two URL patterns which works as + something like: - :sub:`(match) https://www.dualexample.com/shop/electronics/?product=123` - :sub:`(match) https://www.dualexample.com/shop/books/paperback/?product=849` @@ -95,15 +95,14 @@ The code above declares that: - :sub:`(NO match) https://www.dualexample.net/new-offers/fitness/?pid=892` - On the other hand, ``AnotherExampleProductPage`` is only used instead of - ``GenericProductPage`` when we're parsing pages from ``anotherexample.com`` + ``GenericProductPage`` when we're handling pages from ``anotherexample.com`` that doesn't contain ``/digital-goods/`` in its URL path. .. tip:: - The URL patterns declared in the :func:`web_poet.handle_urls` can still be - further customized. You can read some of the specific parameters and - alternative ways in the :ref:`API section <api-overrides>` of - :func:`web_poet.handle_urls`. + The URL patterns declared in the ``@handle_urls`` annotation can still be + further customized. You can read some of the specific parameters in the + :ref:`API section <api-overrides>` of :func:`web_poet.handle_urls`. Retrieving all available Overrides @@ -123,8 +122,8 @@ Following from our example above, using it would be: print(len(rules)) # 3 print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) -Remember that using :func:`web_poet.handle_urls` to annotate the Page Objects -would result in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. +Remember that using ``@handle_urls`` to annotate the Page Objects would result +in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. .. warning:: @@ -133,10 +132,10 @@ would result in the :class:`~.OverrideRule` to be written into ``web_poet.defaul packages/modules which contains the :func:`web_poet.handle_urls` annotations are properly loaded. - Thus, for cases like importing Page Objects from another external package, you'd - need to properly load all :meth:`web_poet.handle_urls` annotations - from the external module. This ensures that the external Page Objects have - their annotations properly loaded. + Thus, for cases like importing Page Objects from another external package, + you'd need to properly load all ``@handle_urls`` annotations from the external + source. This ensures that the external Page Objects have their annotations + properly loaded. This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. Here's an example: @@ -170,13 +169,16 @@ Let's suppose we have the following use case before us: which contains Page Objects for common websites. - Another similar **external** package named ``gadget_sites_page_objects`` is available for even more specific websites. - - Your project's objectives is to handle as much eCommerce websites as you - can. Thus, you'd want to use the already available packages above and - perhaps improve on them or create new Page Objects for new websites. + - Your project's objective is to handle as much eCommerce websites as you + can. + + - Thus, you'd want to use the already available packages above and + perhaps improve on them or create new Page Objects for new websites. Remember that all of the :class:`~.OverrideRule` are declared by annotating -Page Objects using the :func:`web_poet.handle_urls`. Thus, they can easily be -accessed using the :meth:`~.PageObjectRegistry.get_overrides` of ``web_poet.default_registry``. +Page Objects using the :func:`web_poet.handle_urls` via ``@handle_urls``. Thus, +they can easily be accessed using the :meth:`~.PageObjectRegistry.get_overrides` +of ``web_poet.default_registry``. This can be done something like: @@ -197,13 +199,20 @@ This can be done something like: # Alternatively, this could be shortened using: rules = default_registry.get_overrides(consume=["ecommerce_page_objects", "gadget_sites_page_objects"]) - # The rules would then be as follows: + # The collected rules would then be as follows: print(rules) # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) +.. note:: + + Once :func:`~.web_poet.overrides.consume_modules` is called, then all + external Page Objects are loaded and available for the entire runtime + duration. Calling :func:`~.web_poet.overrides.consume_modules` again makes + no difference unless a new set of modules are provided. + .. _`intro-rule-subset`: Using only a subset of the available OverrideRules @@ -211,7 +220,7 @@ Using only a subset of the available OverrideRules Suppose that the use case from the previous section has changed wherein a subset of :class:`~.OverrideRule` would be used. This could be achieved by -using the :meth:`~.PageObjectRegistry.search_overrides` method allows for +using the :meth:`~.PageObjectRegistry.search_overrides` method which allows for convenient selection of a subset of rules from the ``default_registry``. Here's an example of how you could manually select the rules using the @@ -232,8 +241,12 @@ Here's an example of how you could manually select the rules using the gadget_rules = default_registry.search_overrides(use=gadget_sites_page_objects.site_3.GadgetSite3) print(gadget_rules) # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) - - rules = ecom_rules + gadget_rules + + rules = ecom_rules + gadget_rules + print(rules) + # OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) As you can see, using the :meth:`~.PageObjectRegistry.search_overrides` method allows you to conveniently select for :class:`~.OverrideRule` which conform to a specific criteria. This @@ -247,14 +260,15 @@ You might've observed from the previous section that retrieving the list of all :class:`~.OverrideRule` from two different external packages may result in a conflict. -We can take a look at the rules for **#2** and **#3**: +We can take a look at the rules for **#2** and **#3** when we were loading all +available rules: .. code-block:: python # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) -However, it's technically **NOT** a conflict, **yet**, since: +However, it's technically **NOT** a `conflict`, **yet**, since: - ``ecommerce_page_objects.site_2.EcomSite2`` would only be used in **site_2.com** if ``ecommerce_page_objects.EcomGenericPage`` is to be replaced. @@ -262,20 +276,21 @@ However, it's technically **NOT** a conflict, **yet**, since: it's only going to be utilized for **site_2.com** if the following is to be replaced: ``gadget_sites_page_objects.GadgetGenericPage``. -It would be only become a conflict if the **#2** and **#3** :class:`~.OverrideRule` -for **site_2.com** both `intend to replace the` **same** `Page Object`. In fact, -none of the :class:`~.OverrideRule` above would ever be used if your project never -intends to use the following Page Objects *(since there's nothing to override)*. +It would be only become a conflict if both rules for **site_2.com** `intend to +replace the` **same** `Page Object`. -However, let's suppose that there are :class:`~.OverrideRule` which actually result -in a conflict. To give an example, let's suppose that rules **#2** and **#3** `intends -to replace the` **same** `Page Object`. It would look something like: +However, let's suppose that there are some :class:`~.OverrideRule` which actually +result in a conflict. To give an example, let's suppose that rules **#2** and **#3** +`intends to replace the` **same** `Page Object`. It would look something like: .. code-block:: python # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) +Notice that the ``instead_of`` param are the same and only the ``use`` param +remained different. + There are two main ways we recommend in solving this. **1. Priority Resolution** @@ -292,16 +307,17 @@ in times of conflict. `(amongst others)`. It's specifically discussed in this section: `rules-conflict-resolution <https://url-matcher.readthedocs.io/en/stable/intro.html#rules-conflict-resolution>`_. -Thus, it is recommended that you always update the ``priority`` value when using -an external package containing Page Objects before repackaging it again. This ensures -that no conflicts would be made when other developers use your new package. - -Unfortunately, updating the ``priority`` value directly isn't possible as +Unfortunately, updating the ``priority`` value directly isn't possible as the :class:`url_matcher.Patterns` is a **frozen** `dataclass`. The same is true for :class:`~.OverrideRule`. This is made by design so that they are hashable and could be deduplicated immediately without consequences of them changing in value. -Thus, if the conflict cannot be resolved by the ``priority`` param, then +The only way that the ``priority`` value can be changed is by creating a new +:class:`~.OverrideRule` with a different ``priority`` value (`higher if it needs +more priority`). You don't necessarily need to `delete` the **old** +:class:`~.OverrideRule` since they will be resolved via ``priority`` anyways. + +If the conflict cannot be resolved by the ``priority`` param, then the next approach could be used. **2. Specifically Selecting the Rules** @@ -309,9 +325,11 @@ the next approach could be used. When the last resort of ``priority``-resolution doesn't work, then you could always specifically select the list of :class:`~.OverrideRule` you want to use. -We recommend in creating an **inclusion**-list rather than an **exclusion**-list +We **recommend** in creating an **inclusion**-list rather than an **exclusion**-list since the latter is quite brittle. For instance, an external package you're using -has updated its rules where an exlusion strategy misses out on a few rules. -This could lead to a `silent-error` of receiving a different set of rules than -expected. +has updated its rules and the exlusion strategy misses out on a few rules that +were recently added. This could lead to a `silent-error` of receiving a different +set of rules than expected. +For this approach, you can use the :meth:`~.PageObjectRegistry.search_overrides` +functionality as described from this tutorial section: :ref:`intro-rule-subset`. diff --git a/web_poet/overrides.py b/web_poet/overrides.py index ba88ed0e..ba610b09 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -17,23 +17,28 @@ @dataclass(frozen=True) class OverrideRule: """A single override rule that specifies when a Page Object should be used - instead of another. + in lieu of another. This is instantiated when using the :func:`web_poet.handle_urls` decorator. - It's also being returned as a ``List[OverrideRule]`` when calling - :meth:`~.PageObjectRegistry.get_overrides`. + It's also being returned as a ``List[OverrideRule]`` when calling the + ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_overrides` + method. You can access any of its attributes: - * ``for_patterns: Patterns`` - contains the URL patterns associated - with this rule. You can read the API documentation of the - `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more - information. - * ``use: Callable`` - the Page Object that will be used. - * ``instead_of: Callable`` - the Page Object that will be **replaced**. - * ``meta: Dict[str, Any] = field(default_factory=dict)`` - Any other - information you many want to store. This doesn't do anything for now - but may be useful for future API updates. + * ``for_patterns`` - contains the list of URL patterns associated with + this rule. You can read the API documentation of the `url-matcher + <https://url-matcher.readthedocs.io/>`_ package for more information + about the patterns. + * ``use`` - The Page Object that will be **used**. + * ``instead_of`` - The Page Object that will be **replaced**. + * ``meta`` - Any other information you may want to store. This doesn't + do anything for now but may be useful for future API updates. + + .. tip:: + + The :class:`~.OverrideRule` is also hashable. This makes it easy to store + unique rules and identify any duplicates. """ for_patterns: Patterns @@ -65,29 +70,30 @@ class PageObjectRegistry: """This contains the mapping rules that associates the Page Objects available for a given URL matching rule. - ``web-poet`` already provides a default Registry name ``default_registry`` + ``web-poet`` already provides a default Registry named ``default_registry`` for convenience. It can be directly accessed via: .. code-block:: python - from web_poet import handle_urls, default_registry + from web_poet import handle_urls, default_registry, ItemWebPage @handle_urls("example.com", overrides=ProductPageObject) - class ExampleComProductPage(ItemPage): + class ExampleComProductPage(ItemWebPage): ... override_rules = default_registry.get_overrides() - Notice that the ``handle_urls`` that we've imported is a part of + Notice that the ``@handle_urls`` that we're using is a part of the ``default_registry``. This provides a shorter and quicker way to interact - with the built-in default Registry. + with the built-in default :class:`~.PageObjectRegistry` instead of writing + the longer ``@default_registry.handle_urls``. .. note:: It is encouraged to simply use and import the already existing registry via ``from web_poet import default_registry`` instead of creating your own :class:`~.PageObjectRegistry` instance. Using multiple registries - would be unwieldy in general cases. However, it could be applicable in + would be unwieldy in most cases. However, it might be applicable in certain scenarios. """ @@ -107,20 +113,23 @@ def handle_urls( Class decorator that indicates that the decorated Page Object should be used instead of the overridden one for a particular set the URLs. - Which Page Object is overridden is determined by the ``overrides`` + The Page Object that is **overridden** is declared using the ``overrides`` parameter. - Over which URLs the override happens is determined by the ``include``, - ``exclude`` and ``priority`` parameters. See the documentation of the + The **override** mechanism only works on certain URLs that match the + ``include`` and ``exclude`` parameters. See the documentation of the `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more information about them. Any extra parameters are stored as meta information that can be later used. - :param include: Defines the URLs that should be handled by the overridden Page Object. - :param overrides: The Page Object that should be replaced by the annotated one. - :param exclude: Defines URLs over which the override should not happen. - :param priority: The resolution priority in case of conflicting annotations. + :param include: The URLs that should be handled by the decorated Page Object. + :param overrides: The Page Object that should be `replaced`. + :param exclude: The URLs over which the override should **not** happen. + :param priority: The resolution priority in case of `conflicting` rules. + A conflict happens when the ``include``, ``override``, and ``exclude`` + parameters are the same. If so, the `highest priority` will be + chosen. """ def wrapper(cls): @@ -149,17 +158,16 @@ def wrapper(cls): return wrapper def get_overrides(self, consume: Optional[Strings] = None) -> List[OverrideRule]: - """Returns a ``List`` of :class:`~.OverrideRule` that were declared using - ``@handle_urls``. + """Returns all of the :class:`~.OverrideRule` that were declared using + the ``@handle_urls`` annotation. :param consume: packages/modules that need to be imported so that it can - properly load the :meth:`~.PageObjectRegistry.handle_urls` annotations. + properly load the :func:`web_poet.handle_urls` annotations. .. warning:: Remember to consider using the ``consume`` parameter to properly load - the :meth:`~.PageObjectRegistry.handle_urls` from external Page - Objects + the ``@handle_urls`` annotations from external Page Objects. The ``consume`` parameter provides a convenient shortcut for calling :func:`~.web_poet.overrides.consume_modules`. @@ -170,8 +178,8 @@ def get_overrides(self, consume: Optional[Strings] = None) -> List[OverrideRule] return list(self._data.values()) def search_overrides(self, **kwargs) -> List[OverrideRule]: - """Returns a list of :class:`OverrideRule` if any of the attributes - matches the rules inside the registry. + """Returns any :class:`OverrideRule` that has any of its attributes + match the rules inside the registry. Sample usage: @@ -207,7 +215,7 @@ def matcher(rule: OverrideRule): return results -def walk_module(module: str) -> Iterable: +def _walk_module(module: str) -> Iterable: """Return all modules from a module recursively. Note that this will import all the modules and submodules. It returns the @@ -231,33 +239,29 @@ def onerror(err): def consume_modules(*modules: str) -> None: - """A quick wrapper for :func:`~.walk_module` to efficiently consume the - generator and recursively load all packages/modules. - - This function is essential to be run before attempting to retrieve all - :meth:`~.PageObjectRegistry.handle_urls` annotations from :class:`~.PageObjectRegistry` - to ensure that they are properly acknowledged by importing them in runtime. + """This recursively imports all packages/modules so that the ``@handle_urls`` + annotation are properly discovered and loaded. Let's take a look at an example: .. code-block:: python - # my_page_obj_project/load_rules.py + # FILE: my_page_obj_project/load_rules.py from web_poet import default_registry, consume_modules consume_modules("other_external_pkg.po", "another_pkg.lib") rules = default_registry.get_overrides() - For this case, the ``List`` of :class:`~.OverrideRule` are coming from: + For this case, the :class:`~.OverrideRule` are coming from: - ``my_page_obj_project`` `(since it's the same module as the file above)` - ``other_external_pkg.po`` - ``another_pkg.lib`` - So if the ``default_registry`` had other ``@handle_urls`` annotations outside - of the packages/modules listed above, then the :class:`~.OverrideRule` won't - be returned. + If the ``default_registry`` had other ``@handle_urls`` annotations outside + of the packages/modules listed above, then the corresponding + :class:`~.OverrideRule` won't be returned. .. note:: @@ -273,7 +277,7 @@ def consume_modules(*modules: str) -> None: """ for module in modules: - gen = walk_module(module) + gen = _walk_module(module) # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html # Using a deque() results in a tiny bit performance improvement that list(). From 35e7b670fa51234f6c0d06acdbcd8dfe4b406d5a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 23 Mar 2022 17:12:05 +0800 Subject: [PATCH 05/13] make PageObjectRegistry be a dict subclass; create new alternative constructor: from_override_rules() --- docs/intro/overrides.rst | 14 ++++++++++++++ tests/test_overrides.py | 25 ++++++++++++++++++++++--- web_poet/__init__.py | 8 +------- web_poet/overrides.py | 35 +++++++++++++++++++++++++---------- 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index d2c6fb88..43819ee4 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -253,6 +253,20 @@ conveniently select for :class:`~.OverrideRule` which conform to a specific crit allows you to conveniently drill down to which :class:`~.OverrideRule` you're interested in using. +.. _`overrides-custom-registry`: + +After gathering all the pre-selected rules, we can then store it in a new instance +of :class:`~.PageObjectRegistry` in order to separate it from the ``default_registry`` +which contains all of the rules. We can use the :meth:`~.PageObjectRegistry.from_override_rules` +for this: + +.. code-block:: python + + from web_poet import PageObjectRegistry + + my_new_registry = PageObjectRegistry.from_override_rules(rules) + + Handling conflicts from using Multiple External Packages -------------------------------------------------------- diff --git a/tests/test_overrides.py b/tests/test_overrides.py index ab68af6f..c129d38f 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -10,8 +10,12 @@ from tests.po_lib.a_module import POModule, POModuleOverriden from tests.po_lib.nested_package import PONestedPkg from tests.po_lib.nested_package.a_nested_module import PONestedModule -from web_poet import default_registry, consume_modules -from web_poet.overrides import OverrideRule +from web_poet import ( + default_registry, + consume_modules, + OverrideRule, + PageObjectRegistry, +) POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} @@ -87,6 +91,21 @@ def test_registry_search_overrides(): assert len(rules) == 1 assert rules[0].instead_of == POTopLevelOverriden2 - # These rules doesn't exist + # Such rules doesn't exist rules = default_registry.search_overrides(use=POModuleOverriden) assert len(rules) == 0 + + +def test_from_override_rules(): + rules = [ + OverrideRule( + for_patterns=Patterns(include=["sample.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + ) + ] + + registry = PageObjectRegistry.from_override_rules(rules) + + assert registry.get_overrides() == rules + assert default_registry.get_overrides() != rules diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 3e401755..67d6244b 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,12 +1,6 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .overrides import ( - PageObjectRegistry, - consume_modules, -) +from .overrides import PageObjectRegistry, consume_modules, OverrideRule - -# For ease of use, we'll create a default registry so that users can simply -# use its `handle_urls()` method directly by `from web_poet import handle_urls` default_registry = PageObjectRegistry() handle_urls = default_registry.handle_urls diff --git a/web_poet/overrides.py b/web_poet/overrides.py index ba610b09..95e803eb 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -7,12 +7,14 @@ from collections import deque from dataclasses import dataclass, field from operator import attrgetter -from typing import Iterable, Optional, Union, List, Callable, Dict, Any +from typing import Iterable, Optional, Union, List, Callable, Dict, Any, TypeVar, Type from url_matcher import Patterns Strings = Union[str, Iterable[str]] +PageObjectRegistryTV = TypeVar("PageObjectRegistryTV", bound="PageObjectRegistry") + @dataclass(frozen=True) class OverrideRule: @@ -66,7 +68,7 @@ def _as_list(value: Optional[Strings]) -> List[str]: return list(value) -class PageObjectRegistry: +class PageObjectRegistry(dict): """This contains the mapping rules that associates the Page Objects available for a given URL matching rule. @@ -93,12 +95,25 @@ class ExampleComProductPage(ItemWebPage): It is encouraged to simply use and import the already existing registry via ``from web_poet import default_registry`` instead of creating your own :class:`~.PageObjectRegistry` instance. Using multiple registries - would be unwieldy in most cases. However, it might be applicable in - certain scenarios. + would be unwieldy in most cases. + + However, it might be applicable in certain scenarios like storing custom + rules to separate it from the ``default_registry``. This :ref:`example + <overrides-custom-registry>` from the tutorial section may provide some + context. """ - def __init__(self): - self._data: Dict[Callable, OverrideRule] = {} + @classmethod + def from_override_rules( + cls: Type[PageObjectRegistryTV], rules: List[OverrideRule] + ) -> PageObjectRegistryTV: + """An alternative constructor for creating a :class:`~.PageObjectRegistry` + instance byaccepting a list of :class:`~.OverrideRule`. + + This is useful in cases wherein you need to store some selected rules + from multiple external packages. + """ + return cls({rule.use: rule for rule in rules}) def handle_urls( self, @@ -144,8 +159,8 @@ def wrapper(cls): meta=kwargs, ) # If it was already defined, we don't want to override it - if cls not in self._data: - self._data[cls] = rule + if cls not in self: + self[cls] = rule else: warnings.warn( f"Multiple @handle_urls annotations with the same 'overrides' " @@ -175,7 +190,7 @@ def get_overrides(self, consume: Optional[Strings] = None) -> List[OverrideRule] if consume: consume_modules(*_as_list(consume)) - return list(self._data.values()) + return list(self.values()) def search_overrides(self, **kwargs) -> List[OverrideRule]: """Returns any :class:`OverrideRule` that has any of its attributes @@ -193,7 +208,7 @@ def search_overrides(self, **kwargs) -> List[OverrideRule]: # Short-circuit operation if "use" is the only search param used, since # we know that it's being used as the dict key. if set(["use"]) == kwargs.keys(): - rule = self._data.get(kwargs["use"]) + rule = self.get(kwargs["use"]) if rule: return [rule] return [] From 561f6afa5f4dc090e83a111c302c400fbc20bc5a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 23 Mar 2022 18:22:47 +0800 Subject: [PATCH 06/13] fix small typo in docstring --- web_poet/overrides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 95e803eb..7ed64972 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -108,7 +108,7 @@ def from_override_rules( cls: Type[PageObjectRegistryTV], rules: List[OverrideRule] ) -> PageObjectRegistryTV: """An alternative constructor for creating a :class:`~.PageObjectRegistry` - instance byaccepting a list of :class:`~.OverrideRule`. + instance by accepting a list of :class:`~.OverrideRule`. This is useful in cases wherein you need to store some selected rules from multiple external packages. From 1d95f35b7bba2ead6ca03d3ce40264209c00a5a1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 23 Mar 2022 21:11:19 +0800 Subject: [PATCH 07/13] update docs to emphasize that PageObjectRegistry is a dict subclass --- web_poet/overrides.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 7ed64972..62f6f543 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -72,6 +72,12 @@ class PageObjectRegistry(dict): """This contains the mapping rules that associates the Page Objects available for a given URL matching rule. + Note that it's simply a ``dict`` subclass with added functionalities on + storing, retrieving, and searching for the :class:`~.OverrideRule` instances. + The **value** represents the :class:`~.OverrideRule` instance from which the + Page Object in the **key** is allowed to be used. Since it's essentially a + ``dict``, you can use any ``dict`` operations with it. + ``web-poet`` already provides a default Registry named ``default_registry`` for convenience. It can be directly accessed via: From 37e5ba1a3696f629801478fe78ea1729164f39df Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 31 Mar 2022 11:07:29 +0800 Subject: [PATCH 08/13] remove 'consume' param in get_overrides(); more explicit docs about how rules are imported --- docs/intro/overrides.rst | 32 +++++++++++++------------------- tests/test_overrides.py | 3 ++- web_poet/overrides.py | 35 +++++++++-------------------------- 3 files changed, 24 insertions(+), 46 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 43819ee4..4ef7a751 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -109,7 +109,7 @@ Retrieving all available Overrides ---------------------------------- The :meth:`~.PageObjectRegistry.get_overrides` method from the ``web_poet.default_registry`` -allows discovery and retrieval of all :class:`~.OverrideRule` from your project. +allows retrieval of all :class:`~.OverrideRule` in the given registry. Following from our example above, using it would be: .. code-block:: python @@ -130,12 +130,12 @@ in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. :meth:`~.PageObjectRegistry.get_overrides` relies on the fact that all essential packages/modules which contains the :func:`web_poet.handle_urls` - annotations are properly loaded. + annotations are properly loaded `(i.e imported)`. - Thus, for cases like importing Page Objects from another external package, - you'd need to properly load all ``@handle_urls`` annotations from the external - source. This ensures that the external Page Objects have their annotations - properly loaded. + Thus, for cases like importing and using Page Objects from other external packages, + the ``@handle_urls`` annotations from these external sources must be read and + processed properly. This ensures that the external Page Objects have all of their + :class:`~.OverrideRule` present. This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. Here's an example: @@ -147,9 +147,6 @@ in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. consume_modules("external_package_A.po", "another_ext_package.lib") rules = default_registry.get_overrides() - # Fortunately, `get_overrides()` provides a shortcut for the lines above: - rules = default_registry.get_overrides(consume=["external_package_A.po", "another_ext_package.lib"]) - The next section explores this caveat further. @@ -187,18 +184,15 @@ This can be done something like: from web_poet import default_registry, consume_modules # ❌ Remember that this wouldn't retrieve any rules at all since the - # annotations are NOT properly loaded. + # annotations are NOT properly imported. rules = default_registry.get_overrides() print(rules) # [] # ✅ Instead, you need to run the following so that all of the Page - # Objects in the external packages are recursively loaded. + # Objects in the external packages are recursively imported. consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") rules = default_registry.get_overrides() - # Alternatively, this could be shortened using: - rules = default_registry.get_overrides(consume=["ecommerce_page_objects", "gadget_sites_page_objects"]) - # The collected rules would then be as follows: print(rules) # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) @@ -209,9 +203,9 @@ This can be done something like: .. note:: Once :func:`~.web_poet.overrides.consume_modules` is called, then all - external Page Objects are loaded and available for the entire runtime - duration. Calling :func:`~.web_poet.overrides.consume_modules` again makes - no difference unless a new set of modules are provided. + external Page Objects are recursively imported and available for the entire + runtime duration. Calling :func:`~.web_poet.overrides.consume_modules` again + makes no difference unless a new set of modules are provided. .. _`intro-rule-subset`: @@ -221,7 +215,7 @@ Using only a subset of the available OverrideRules Suppose that the use case from the previous section has changed wherein a subset of :class:`~.OverrideRule` would be used. This could be achieved by using the :meth:`~.PageObjectRegistry.search_overrides` method which allows for -convenient selection of a subset of rules from the ``default_registry``. +convenient selection of a subset of rules from a given registry. Here's an example of how you could manually select the rules using the :meth:`~.PageObjectRegistry.search_overrides` method instead: @@ -274,7 +268,7 @@ You might've observed from the previous section that retrieving the list of all :class:`~.OverrideRule` from two different external packages may result in a conflict. -We can take a look at the rules for **#2** and **#3** when we were loading all +We can take a look at the rules for **#2** and **#3** when we were importing all available rules: .. code-block:: python diff --git a/tests/test_overrides.py b/tests/test_overrides.py index c129d38f..88369334 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -77,7 +77,8 @@ def test_list_page_objects_all_consume(): """A test similar to the one above but calls ``consume_modules()`` to properly load the @handle_urls annotations from other modules/packages. """ - rules = default_registry.get_overrides(consume="tests_extra") + consume_modules("tests_extra") + rules = default_registry.get_overrides() page_objects = {po.use for po in rules} assert any(["po_lib_sub_not_imported" in po.__module__ for po in page_objects]) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 62f6f543..8148333d 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -178,24 +178,16 @@ def wrapper(cls): return wrapper - def get_overrides(self, consume: Optional[Strings] = None) -> List[OverrideRule]: + def get_overrides(self) -> List[OverrideRule]: """Returns all of the :class:`~.OverrideRule` that were declared using the ``@handle_urls`` annotation. - :param consume: packages/modules that need to be imported so that it can - properly load the :func:`web_poet.handle_urls` annotations. - .. warning:: - Remember to consider using the ``consume`` parameter to properly load - the ``@handle_urls`` annotations from external Page Objects. - - The ``consume`` parameter provides a convenient shortcut for calling - :func:`~.web_poet.overrides.consume_modules`. + Remember to consider calling :func:`~.web_poet.overrides.consume_modules` + beforehand to recursively import all submodules which contains the + ``@handle_urls`` annotations from external Page Objects. """ - if consume: - consume_modules(*_as_list(consume)) - return list(self.values()) def search_overrides(self, **kwargs) -> List[OverrideRule]: @@ -261,7 +253,7 @@ def onerror(err): def consume_modules(*modules: str) -> None: """This recursively imports all packages/modules so that the ``@handle_urls`` - annotation are properly discovered and loaded. + annotation are properly discovered and imported. Let's take a look at an example: @@ -279,22 +271,13 @@ def consume_modules(*modules: str) -> None: - ``my_page_obj_project`` `(since it's the same module as the file above)` - ``other_external_pkg.po`` - ``another_pkg.lib`` + - any other modules that was imported in the same process inside the + packages/modules above. If the ``default_registry`` had other ``@handle_urls`` annotations outside of the packages/modules listed above, then the corresponding - :class:`~.OverrideRule` won't be returned. - - .. note:: - - :meth:`~.PageObjectRegistry.get_overrides` provides a shortcut for this - using its ``consume`` parameter. Thus, the code example above could be - shortened even further by: - - .. code-block:: python - - from web_poet import default_registry - - rules = default_registry.get_overrides(consume=["other_external_pkg.po", "another_pkg.lib"]) + :class:`~.OverrideRule` won't be returned. Unless, they were recursively + imported in some way similar to :func:`~.web_poet.overrides.consume_modules`. """ for module in modules: From 50c0046daca9dae90ba0ed1223d2ea91e98768df Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 31 Mar 2022 11:18:37 +0800 Subject: [PATCH 09/13] move _as_list() into utils.py --- web_poet/overrides.py | 22 ++++------------------ web_poet/utils.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 8148333d..826ce8e7 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -11,6 +11,8 @@ from url_matcher import Patterns +from web_poet.utils import as_list + Strings = Union[str, Iterable[str]] PageObjectRegistryTV = TypeVar("PageObjectRegistryTV", bound="PageObjectRegistry") @@ -52,22 +54,6 @@ def __hash__(self): return hash((self.for_patterns, self.use, self.instead_of)) -def _as_list(value: Optional[Strings]) -> List[str]: - """ - >>> _as_list(None) - [] - >>> _as_list("foo") - ['foo'] - >>> _as_list(["foo", "bar"]) - ['foo', 'bar'] - """ - if value is None: - return [] - if isinstance(value, str): - return [value] - return list(value) - - class PageObjectRegistry(dict): """This contains the mapping rules that associates the Page Objects available for a given URL matching rule. @@ -156,8 +142,8 @@ def handle_urls( def wrapper(cls): rule = OverrideRule( for_patterns=Patterns( - include=_as_list(include), - exclude=_as_list(exclude), + include=as_list(include), + exclude=as_list(exclude), priority=priority, ), use=cls, diff --git a/web_poet/utils.py b/web_poet/utils.py index 63f3c9c4..abddfbfe 100644 --- a/web_poet/utils.py +++ b/web_poet/utils.py @@ -1,5 +1,6 @@ import weakref from functools import wraps +from typing import Any, Optional, List def memoizemethod_noargs(method): @@ -15,3 +16,22 @@ def new_method(self, *args, **kwargs): return cache[self] return new_method + + +def as_list(value: Optional[Any]) -> List[Any]: + """Normalizes the value input as a list. + + >>> as_list(None) + [] + >>> as_list("foo") + ['foo'] + >>> as_list(123) + [123] + >>> as_list(["foo", "bar", 123]) + ['foo', 'bar', 123] + """ + if value is None: + return [] + if not isinstance(value, list): + return [value] + return list(value) From a6060827d55b25417dba36993ea59a79a2cd765a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 31 Mar 2022 11:46:03 +0800 Subject: [PATCH 10/13] use `{}` set primitive instead of calling `set()` Co-authored-by: Mikhail Korobov <kmike84@gmail.com> --- web_poet/overrides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 826ce8e7..b54e8ab0 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -191,7 +191,7 @@ def search_overrides(self, **kwargs) -> List[OverrideRule]: # Short-circuit operation if "use" is the only search param used, since # we know that it's being used as the dict key. - if set(["use"]) == kwargs.keys(): + if {"use"} == kwargs.keys(): rule = self.get(kwargs["use"]) if rule: return [rule] From 7944956802e8c38d939c42b7866d2915f853b1fe Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 1 Apr 2022 15:38:06 +0800 Subject: [PATCH 11/13] update doc to have more examples in priority resolution --- docs/intro/overrides.rst | 93 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 4ef7a751..2ec0187c 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -157,6 +157,8 @@ Developers have the option to import existing Page Objects alongside the :class:`~.OverrideRule` attached to them. This section aims to showcase different scenarios that come up when using multiple Page Object Projects. +.. _`intro-rule-all`: + Using all available OverrideRules from multiple Page Object Projects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -261,6 +263,63 @@ for this: my_new_registry = PageObjectRegistry.from_override_rules(rules) +.. _`intro-improve-po`: + +Improving on external Page Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There would be cases wherein you're using Page Objects with :class:`~.OverrideRule` +from external packages only to find out that a few of them lacks some of the +fields or features that you need. + +Let's suppose that we wanted to use `all` of the :class:`~.OverrideRule` similar +to this section: :ref:`intro-rule-all`. However, the ``EcomSite1`` Page Object +needs to properly handle some edge cases where some fields are not being extracted +properly. One way to fix this is to subclass the said Page Object and improve its +``to_item()`` method, or even creating a new class entirely. For simplicity, let's +have the first approach as an example: + +.. code-block:: python + + from web_poet import default_registry, consume_modules, handle_urls + import ecommerce_page_objects, gadget_sites_page_objects + + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + rules = default_registry.get_overrides() + + # The collected rules would then be as follows: + print(rules) + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + + @handle_urls("site_1.com", overrides=ecommerce_page_objects.EcomGenericPage, priority=1000) + class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): + def to_item(self): + ... # call super().to_item() and improve on the item's shortcomings + + rules = default_registry.get_overrides() + print(rules) + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 5. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=1000), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + +Notice that we're adding a new :class:`~.OverrideRule` for the same URL pattern +for ``site_1.com``. + +When the time comes that a Page Object needs to be selected when parsing ``site_1.com`` +and it needs to replace ``ecommerce_page_objects.EcomGenericPage``, rules **#1** +and **#5** will be the choices. However, since we've assigned a much **higher priority** +for the new rule in **#5** than the default ``500`` value, rule **#5** will be +chosen because of its higher priority value. + +More details on this in the :ref:`Priority Resolution <priority-resolution>` +subsection. + + Handling conflicts from using Multiple External Packages -------------------------------------------------------- @@ -301,6 +360,8 @@ remained different. There are two main ways we recommend in solving this. +.. _`priority-resolution`: + **1. Priority Resolution** If you notice, the ``for_patterns`` attribute of :class:`~.OverrideRule` is an @@ -325,8 +386,36 @@ The only way that the ``priority`` value can be changed is by creating a new more priority`). You don't necessarily need to `delete` the **old** :class:`~.OverrideRule` since they will be resolved via ``priority`` anyways. -If the conflict cannot be resolved by the ``priority`` param, then -the next approach could be used. +Creating a new :class:`~.OverrideRule` with a higher priority could be as easy as: + + 1. Subclassing the Page Object in question. + 2. Create a new :func:`web_poet.handle_urls` annotation with the same URL + pattern and Page Object to override but with a much higher priority. + +Here's an example: + +.. code-block:: python + + from web_poet import default_registry, consume_modules, handle_urls + import ecommerce_page_objects, gadget_sites_page_objects, common_items + + @handle_urls("site_2.com", overrides=common_items.ProductGenericPage, priority=1000) + class EcomSite2Copy(ecommerce_page_objects.site_1.EcomSite1): + def to_item(self): + return super().to_item() + +Now, the conflicting **#2** and **#3** rules would never be selected because of +the new :class:`~.OverrideRule` having a much higher priority (see rule **#4**): + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + + # 4. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=1000), use=<class 'my_project.EcomSite2Copy'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + +A similar idea was also discussed in the :ref:`intro-improve-po` section. + **2. Specifically Selecting the Rules** From 74573319c7b495dbf16dc615be846af51721d508 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Mon, 4 Apr 2022 16:29:03 +0800 Subject: [PATCH 12/13] update docs to include more examples for search_overrides() --- docs/intro/overrides.rst | 54 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 2ec0187c..a43b0252 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -428,5 +428,57 @@ has updated its rules and the exlusion strategy misses out on a few rules that were recently added. This could lead to a `silent-error` of receiving a different set of rules than expected. -For this approach, you can use the :meth:`~.PageObjectRegistry.search_overrides` +This **inclusion**-list approach can be done by importing the Page Objects directly +and creating instances of :class:`~.OverrideRule` from it. You could also import +all of the available :class:`~.OverrideRule` using :meth:`~.PageObjectRegistry.get_overrides` +to sift through the list of available rules and manually selecting the rules you need. + +Most of the time, the needed rules are the ones which uses the Page Objects we're +interested in. Since :class:`~.PageObjectRegistry` is a ``dict`` subclass, you can +easily find the Page Object's rule using its `key`. Here's an example: + +.. code-block:: python + + from web_poet import default_registry, consume_modules + import package_A, package_B, package_C + + consume_modules("package_A", "package_B", "package_C") + + rules = [ + default_registry[package_A.PageObject1], # OverrideRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, meta={}) + default_registry[package_B.PageObject2], # OverrideRule(for_patterns=Patterns(include=['site_B.com'], exclude=[], priority=500), use=<class 'package_B.PageObject2'>, instead_of=<class 'GenericPage'>, meta={}) + default_registry[package_C.PageObject3], # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, meta={}) + ] + +Another approach would be using the :meth:`~.PageObjectRegistry.search_overrides` functionality as described from this tutorial section: :ref:`intro-rule-subset`. +The :meth:`~.PageObjectRegistry.search_overrides` is quite useful in cases wherein +the **POP** contains a lot of rules as it presents a utility for programmatically +searching for them. + +Here's an example: + +.. code-block:: python + + from url_matcher import Patterns + from web_poet import default_registry, consume_modules + import package_A, package_B, package_C + + consume_modules("package_A", "package_B", "package_C") + + rule_from_A = default_registry.search_overrides(use=package_A.PageObject1) + print(rule_from_A) + # [OverrideRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, meta={})] + + rule_from_B = default_registry.search_overrides(instead_of=GenericProductPage) + print(rule_from_B) + # [] + + rule_from_C = default_registry.search_overrides(for_patterns=Patterns(include=["site_C.com"])) + print(rule_from_C) + # [ + # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, meta={}), + # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=1000), use=<class 'package_C.PageObject3_improved'>, instead_of=<class 'GenericPage'>, meta={}) + # ] + + rules = rule_from_A + rule_from_B + rule_from_C From 67e72979bb7b56aebf7f0be07ea69131211eb8e9 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 6 Apr 2022 14:00:44 +0800 Subject: [PATCH 13/13] update as_list() utility to explicitly support more input types --- web_poet/utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/web_poet/utils.py b/web_poet/utils.py index abddfbfe..ae4ff5c2 100644 --- a/web_poet/utils.py +++ b/web_poet/utils.py @@ -1,4 +1,5 @@ import weakref +from collections.abc import Iterable from functools import wraps from typing import Any, Optional, List @@ -29,9 +30,20 @@ def as_list(value: Optional[Any]) -> List[Any]: [123] >>> as_list(["foo", "bar", 123]) ['foo', 'bar', 123] + >>> as_list(("foo", "bar", 123)) + ['foo', 'bar', 123] + >>> as_list(range(5)) + [0, 1, 2, 3, 4] + >>> def gen(): + ... yield 1 + ... yield 2 + >>> as_list(gen()) + [1, 2] """ if value is None: return [] - if not isinstance(value, list): + if isinstance(value, str): + return [value] + if not isinstance(value, Iterable): return [value] return list(value)