diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 939024b9..3e079162 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,8 @@ Changelog TBR ------------------ +* added a ``PageObjectRegistry`` class which has the ``handle_urls`` decorator + to conveniently declare and collect ``OverrideRule``. * removed support for Python 3.6 * added support for Python 3.10 * Backward Incompatible Change: @@ -13,7 +15,6 @@ TBR specific attribute types like ``HttpResponseBody`` and ``HttpResponseHeaders``. - 0.1.1 (2021-06-02) ------------------ diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 011f878e..f0da5884 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -1,3 +1,5 @@ +.. _`api-reference`: + ============= API Reference ============= @@ -45,3 +47,18 @@ Mixins .. autoclass:: web_poet.mixins.ResponseShortcutsMixin :members: :no-special-members: + + +.. _`api-overrides`: + +Overrides +========= + +See the tutorial section on :ref:`intro-overrides` for more context about its +use cases and some examples. + +.. autofunction:: web_poet.handle_urls + +.. automodule:: web_poet.overrides + :members: + :exclude-members: handle_urls diff --git a/docs/conf.py b/docs/conf.py index 5d692c9c..1c5fcc66 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -192,5 +192,6 @@ intersphinx_mapping = { 'python': ('https://docs.python.org/3', None, ), 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), + 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None, ), 'parsel': ('https://parsel.readthedocs.io/en/latest/', None, ), } diff --git a/docs/index.rst b/docs/index.rst index d6d2e269..db4d852d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,6 +33,7 @@ and the motivation behind ``web-poet``, start with :ref:`from-ground-up`. intro/tutorial intro/from-ground-up + intro/overrides .. toctree:: :caption: Reference diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst new file mode 100644 index 00000000..a43b0252 --- /dev/null +++ b/docs/intro/overrides.rst @@ -0,0 +1,484 @@ +.. _`intro-overrides`: + +Overrides +========= + +Overrides contains mapping rules to associate which URLs a particular +Page Object would be used. The URL matching rules is handled by another library +called `url-matcher `_. + +Using such rules establishes the core concept of Overrides wherein a developer +could declare that a specific Page Object must be used *(instead of another)* +for a given set of URL patterns. + +This enables **web-poet** to be used effectively by other frameworks like +`scrapy-poet `_. + +Example Use Case +---------------- + +Let's explore an example use case for the Overrides concept. + +Suppose we're using Page Objects for our broadcrawl project which explores +eCommerce websites to discover product pages. It wouldn't be entirely possible +for us to create parsers for all websites since we don't know which sites we're +going to crawl beforehand. + +However, we could at least create a generic Page Object to support parsing of +some fields in well-known locations of product information like ````. +This enables our broadcrawler to at least parse some useful information. Let's +call such Page Object to be ``GenericProductPage``. + +Assuming that one of our project requirements is to fully support parsing of the +`top 3 eCommerce websites`, then we'd need to create a Page Object for each one +to parse more specific fields. + +Here's where the Overrides concept comes in: + + 1. The ``GenericProductPage`` is used to parse all eCommerce product pages + `by default`. + 2. Whenever one of our declared URL rules matches with a given page URL, + then the Page Object associated with that rule `overrides (or replaces)` + the default ``GenericProductPage``. + +This enables us to conveniently declare which Page Object would be used for a +given webpage `(based on a page's URL pattern)`. + +Let's see this in action by declaring the Overrides in the Page Objects below. + + +Creating Overrides +------------------ + +Let's take a look at how the following code is structured: + +.. code-block:: python + + from web_poet import handle_urls, ItemWebPage + + + class GenericProductPage(ItemWebPage): + def to_item(self): + return {"product-title": self.css("title::text").get()} + + + @handle_urls("example.com", overrides=GenericProductPage) + class ExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + + @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") + class AnotherExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) + class DualExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + +The code above declares that: + + - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` + would be used instead of ``GenericProductPage``. + - The same is true for ``DualExampleProductPage`` where it is used + instead of ``GenericProductPage`` for two URL patterns which works as + something like: + + - :sub:`(match) https://www.dualexample.com/shop/electronics/?product=123` + - :sub:`(match) https://www.dualexample.com/shop/books/paperback/?product=849` + - :sub:`(NO match) https://www.dualexample.com/on-sale/books/?product=923` + - :sub:`(match) https://www.dualexample.net/store/kitchen/?pid=776` + - :sub:`(match) https://www.dualexample.net/store/?pid=892` + - :sub:`(NO match) https://www.dualexample.net/new-offers/fitness/?pid=892` + + - On the other hand, ``AnotherExampleProductPage`` is only used instead of + ``GenericProductPage`` when we're handling pages from ``anotherexample.com`` + that doesn't contain ``/digital-goods/`` in its URL path. + +.. tip:: + + The URL patterns declared in the ``@handle_urls`` annotation can still be + further customized. You can read some of the specific parameters in the + :ref:`API section <api-overrides>` of :func:`web_poet.handle_urls`. + + +Retrieving all available Overrides +---------------------------------- + +The :meth:`~.PageObjectRegistry.get_overrides` method from the ``web_poet.default_registry`` +allows retrieval of all :class:`~.OverrideRule` in the given registry. +Following from our example above, using it would be: + +.. code-block:: python + + from web_poet import default_registry + + # Retrieves all OverrideRules that were registered in the registry + rules = default_registry.get_overrides() + + print(len(rules)) # 3 + print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) + +Remember that using ``@handle_urls`` to annotate the Page Objects would result +in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. + + +.. warning:: + + :meth:`~.PageObjectRegistry.get_overrides` relies on the fact that all essential + packages/modules which contains the :func:`web_poet.handle_urls` + annotations are properly loaded `(i.e imported)`. + + Thus, for cases like importing and using Page Objects from other external packages, + the ``@handle_urls`` annotations from these external sources must be read and + processed properly. This ensures that the external Page Objects have all of their + :class:`~.OverrideRule` present. + + This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. + Here's an example: + + .. code-block:: python + + from web_poet import default_registry, consume_modules + + consume_modules("external_package_A.po", "another_ext_package.lib") + rules = default_registry.get_overrides() + + The next section explores this caveat further. + + +Using Overrides from External Packages +-------------------------------------- + +Developers have the option to import existing Page Objects alongside the +:class:`~.OverrideRule` attached to them. This section aims to showcase different +scenarios that come up when using multiple Page Object Projects. + +.. _`intro-rule-all`: + +Using all available OverrideRules from multiple Page Object Projects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's suppose we have the following use case before us: + + - An **external** Python package named ``ecommerce_page_objects`` is available + which contains Page Objects for common websites. + - Another similar **external** package named ``gadget_sites_page_objects`` is + available for even more specific websites. + - Your project's objective is to handle as much eCommerce websites as you + can. + + - Thus, you'd want to use the already available packages above and + perhaps improve on them or create new Page Objects for new websites. + +Remember that all of the :class:`~.OverrideRule` are declared by annotating +Page Objects using the :func:`web_poet.handle_urls` via ``@handle_urls``. Thus, +they can easily be accessed using the :meth:`~.PageObjectRegistry.get_overrides` +of ``web_poet.default_registry``. + +This can be done something like: + +.. code-block:: python + + from web_poet import default_registry, consume_modules + + # ❌ Remember that this wouldn't retrieve any rules at all since the + # annotations are NOT properly imported. + rules = default_registry.get_overrides() + print(rules) # [] + + # ✅ Instead, you need to run the following so that all of the Page + # Objects in the external packages are recursively imported. + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + rules = default_registry.get_overrides() + + # The collected rules would then be as follows: + print(rules) + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +.. note:: + + Once :func:`~.web_poet.overrides.consume_modules` is called, then all + external Page Objects are recursively imported and available for the entire + runtime duration. Calling :func:`~.web_poet.overrides.consume_modules` again + makes no difference unless a new set of modules are provided. + +.. _`intro-rule-subset`: + +Using only a subset of the available OverrideRules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose that the use case from the previous section has changed wherein a +subset of :class:`~.OverrideRule` would be used. This could be achieved by +using the :meth:`~.PageObjectRegistry.search_overrides` method which allows for +convenient selection of a subset of rules from a given registry. + +Here's an example of how you could manually select the rules using the +:meth:`~.PageObjectRegistry.search_overrides` method instead: + +.. code-block:: python + + from web_poet import default_registry, consume_modules + import ecommerce_page_objects, gadget_sites_page_objects + + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + + ecom_rules = default_registry.search_overrides(instead_of=ecommerce_page_objects.EcomGenericPage) + print(ecom_rules) + # OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + + gadget_rules = default_registry.search_overrides(use=gadget_sites_page_objects.site_3.GadgetSite3) + print(gadget_rules) + # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + + rules = ecom_rules + gadget_rules + print(rules) + # OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +As you can see, using the :meth:`~.PageObjectRegistry.search_overrides` method allows you to +conveniently select for :class:`~.OverrideRule` which conform to a specific criteria. This +allows you to conveniently drill down to which :class:`~.OverrideRule` you're interested in +using. + +.. _`overrides-custom-registry`: + +After gathering all the pre-selected rules, we can then store it in a new instance +of :class:`~.PageObjectRegistry` in order to separate it from the ``default_registry`` +which contains all of the rules. We can use the :meth:`~.PageObjectRegistry.from_override_rules` +for this: + +.. code-block:: python + + from web_poet import PageObjectRegistry + + my_new_registry = PageObjectRegistry.from_override_rules(rules) + + +.. _`intro-improve-po`: + +Improving on external Page Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There would be cases wherein you're using Page Objects with :class:`~.OverrideRule` +from external packages only to find out that a few of them lacks some of the +fields or features that you need. + +Let's suppose that we wanted to use `all` of the :class:`~.OverrideRule` similar +to this section: :ref:`intro-rule-all`. However, the ``EcomSite1`` Page Object +needs to properly handle some edge cases where some fields are not being extracted +properly. One way to fix this is to subclass the said Page Object and improve its +``to_item()`` method, or even creating a new class entirely. For simplicity, let's +have the first approach as an example: + +.. code-block:: python + + from web_poet import default_registry, consume_modules, handle_urls + import ecommerce_page_objects, gadget_sites_page_objects + + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + rules = default_registry.get_overrides() + + # The collected rules would then be as follows: + print(rules) + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + + @handle_urls("site_1.com", overrides=ecommerce_page_objects.EcomGenericPage, priority=1000) + class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): + def to_item(self): + ... # call super().to_item() and improve on the item's shortcomings + + rules = default_registry.get_overrides() + print(rules) + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 5. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=1000), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + +Notice that we're adding a new :class:`~.OverrideRule` for the same URL pattern +for ``site_1.com``. + +When the time comes that a Page Object needs to be selected when parsing ``site_1.com`` +and it needs to replace ``ecommerce_page_objects.EcomGenericPage``, rules **#1** +and **#5** will be the choices. However, since we've assigned a much **higher priority** +for the new rule in **#5** than the default ``500`` value, rule **#5** will be +chosen because of its higher priority value. + +More details on this in the :ref:`Priority Resolution <priority-resolution>` +subsection. + + +Handling conflicts from using Multiple External Packages +-------------------------------------------------------- + +You might've observed from the previous section that retrieving the list of all +:class:`~.OverrideRule` from two different external packages may result in a +conflict. + +We can take a look at the rules for **#2** and **#3** when we were importing all +available rules: + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +However, it's technically **NOT** a `conflict`, **yet**, since: + + - ``ecommerce_page_objects.site_2.EcomSite2`` would only be used in **site_2.com** + if ``ecommerce_page_objects.EcomGenericPage`` is to be replaced. + - The same case with ``gadget_sites_page_objects.site_2.GadgetSite2`` wherein + it's only going to be utilized for **site_2.com** if the following is to be + replaced: ``gadget_sites_page_objects.GadgetGenericPage``. + +It would be only become a conflict if both rules for **site_2.com** `intend to +replace the` **same** `Page Object`. + +However, let's suppose that there are some :class:`~.OverrideRule` which actually +result in a conflict. To give an example, let's suppose that rules **#2** and **#3** +`intends to replace the` **same** `Page Object`. It would look something like: + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + +Notice that the ``instead_of`` param are the same and only the ``use`` param +remained different. + +There are two main ways we recommend in solving this. + +.. _`priority-resolution`: + +**1. Priority Resolution** + +If you notice, the ``for_patterns`` attribute of :class:`~.OverrideRule` is an +instance of `url_matcher.Patterns +<https://url-matcher.readthedocs.io/en/stable/api_reference.html#module-url-matcher>`_. +This instance also has a ``priority`` param where a higher value will be chosen +in times of conflict. + +.. note:: + + The `url-matcher`_ library is the one responsible breaking such ``priority`` conflicts + `(amongst others)`. It's specifically discussed in this section: `rules-conflict-resolution + <https://url-matcher.readthedocs.io/en/stable/intro.html#rules-conflict-resolution>`_. + +Unfortunately, updating the ``priority`` value directly isn't possible as the +:class:`url_matcher.Patterns` is a **frozen** `dataclass`. The same is true for +:class:`~.OverrideRule`. This is made by design so that they are hashable and could +be deduplicated immediately without consequences of them changing in value. + +The only way that the ``priority`` value can be changed is by creating a new +:class:`~.OverrideRule` with a different ``priority`` value (`higher if it needs +more priority`). You don't necessarily need to `delete` the **old** +:class:`~.OverrideRule` since they will be resolved via ``priority`` anyways. + +Creating a new :class:`~.OverrideRule` with a higher priority could be as easy as: + + 1. Subclassing the Page Object in question. + 2. Create a new :func:`web_poet.handle_urls` annotation with the same URL + pattern and Page Object to override but with a much higher priority. + +Here's an example: + +.. code-block:: python + + from web_poet import default_registry, consume_modules, handle_urls + import ecommerce_page_objects, gadget_sites_page_objects, common_items + + @handle_urls("site_2.com", overrides=common_items.ProductGenericPage, priority=1000) + class EcomSite2Copy(ecommerce_page_objects.site_1.EcomSite1): + def to_item(self): + return super().to_item() + +Now, the conflicting **#2** and **#3** rules would never be selected because of +the new :class:`~.OverrideRule` having a much higher priority (see rule **#4**): + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + + # 4. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=1000), use=<class 'my_project.EcomSite2Copy'>, instead_of=<class 'common_items.ProductGenericPage'>, meta={}) + +A similar idea was also discussed in the :ref:`intro-improve-po` section. + + +**2. Specifically Selecting the Rules** + +When the last resort of ``priority``-resolution doesn't work, then you could always +specifically select the list of :class:`~.OverrideRule` you want to use. + +We **recommend** in creating an **inclusion**-list rather than an **exclusion**-list +since the latter is quite brittle. For instance, an external package you're using +has updated its rules and the exlusion strategy misses out on a few rules that +were recently added. This could lead to a `silent-error` of receiving a different +set of rules than expected. + +This **inclusion**-list approach can be done by importing the Page Objects directly +and creating instances of :class:`~.OverrideRule` from it. You could also import +all of the available :class:`~.OverrideRule` using :meth:`~.PageObjectRegistry.get_overrides` +to sift through the list of available rules and manually selecting the rules you need. + +Most of the time, the needed rules are the ones which uses the Page Objects we're +interested in. Since :class:`~.PageObjectRegistry` is a ``dict`` subclass, you can +easily find the Page Object's rule using its `key`. Here's an example: + +.. code-block:: python + + from web_poet import default_registry, consume_modules + import package_A, package_B, package_C + + consume_modules("package_A", "package_B", "package_C") + + rules = [ + default_registry[package_A.PageObject1], # OverrideRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, meta={}) + default_registry[package_B.PageObject2], # OverrideRule(for_patterns=Patterns(include=['site_B.com'], exclude=[], priority=500), use=<class 'package_B.PageObject2'>, instead_of=<class 'GenericPage'>, meta={}) + default_registry[package_C.PageObject3], # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, meta={}) + ] + +Another approach would be using the :meth:`~.PageObjectRegistry.search_overrides` +functionality as described from this tutorial section: :ref:`intro-rule-subset`. +The :meth:`~.PageObjectRegistry.search_overrides` is quite useful in cases wherein +the **POP** contains a lot of rules as it presents a utility for programmatically +searching for them. + +Here's an example: + +.. code-block:: python + + from url_matcher import Patterns + from web_poet import default_registry, consume_modules + import package_A, package_B, package_C + + consume_modules("package_A", "package_B", "package_C") + + rule_from_A = default_registry.search_overrides(use=package_A.PageObject1) + print(rule_from_A) + # [OverrideRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, meta={})] + + rule_from_B = default_registry.search_overrides(instead_of=GenericProductPage) + print(rule_from_B) + # [] + + rule_from_C = default_registry.search_overrides(for_patterns=Patterns(include=["site_C.com"])) + print(rule_from_C) + # [ + # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, meta={}), + # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=1000), use=<class 'package_C.PageObject3_improved'>, instead_of=<class 'GenericPage'>, meta={}) + # ] + + rules = rule_from_A + rule_from_B + rule_from_C diff --git a/setup.py b/setup.py index c45f7ec2..6b342aad 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ install_requires=[ 'attrs >= 21.3.0', 'parsel', + 'url-matcher', 'multidict', 'w3lib >= 1.22.0', ], diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py new file mode 100644 index 00000000..0c78c914 --- /dev/null +++ b/tests/po_lib/__init__.py @@ -0,0 +1,39 @@ +""" +This package is just for overrides testing purposes. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from .. import po_lib_sub # NOTE: this module contains a PO with @handle_rules +from web_poet import handle_urls, PageObjectRegistry + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POTopLevelOverriden1: + ... + + +class POTopLevelOverriden2: + ... + + +# This first annotation is ignored. A single annotation per registry is allowed +@handle_urls("example.com", overrides=POTopLevelOverriden1) +@handle_urls("example.com", overrides=POTopLevelOverriden1, exclude="/*.jpg|", priority=300) +class POTopLevel1(POBase): + expected_overrides = POTopLevelOverriden1 + expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) + expected_meta = {} # type: ignore + + +@handle_urls("example.com", overrides=POTopLevelOverriden2) +class POTopLevel2(POBase): + expected_overrides = POTopLevelOverriden2 + expected_patterns = Patterns(["example.com"]) + expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py new file mode 100644 index 00000000..0dcf04c6 --- /dev/null +++ b/tests/po_lib/a_module.py @@ -0,0 +1,16 @@ +from url_matcher import Patterns + +from tests.po_lib import POBase +from web_poet import handle_urls + + +class POModuleOverriden: + ... + + +@handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo") +class POModule(POBase): + expected_overrides = POModuleOverriden + expected_patterns = Patterns(["example.com"]) + expected_meta = {"extra_arg": "foo"} # type: ignore + diff --git a/tests/po_lib/an_empty_module.py b/tests/po_lib/an_empty_module.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/po_lib/an_empty_package/__init__.py b/tests/po_lib/an_empty_package/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py new file mode 100644 index 00000000..537a995d --- /dev/null +++ b/tests/po_lib/nested_package/__init__.py @@ -0,0 +1,15 @@ +from url_matcher import Patterns + +from tests.po_lib import POBase +from web_poet import handle_urls + + +class PONestedPkgOverriden: + ... + + +@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedPkgOverriden) +class PONestedPkg(POBase): + expected_overrides = PONestedPkgOverriden + expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) + expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py new file mode 100644 index 00000000..e9de9416 --- /dev/null +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -0,0 +1,15 @@ +from url_matcher import Patterns + +from tests.po_lib import POBase +from web_poet import handle_urls + + +class PONestedModuleOverriden: + ... + + +@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden) +class PONestedModule(POBase): + expected_overrides = PONestedModuleOverriden + expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"]) + expected_meta = {} # type: ignore diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py new file mode 100644 index 00000000..d64378d0 --- /dev/null +++ b/tests/po_lib_sub/__init__.py @@ -0,0 +1,25 @@ +"""This package is being used by tests/po_lib to validate some behaviors on +external depedencies. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from web_poet import handle_urls + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POLibSubOverriden: + ... + + +@handle_urls("sub_example.com", overrides=POLibSubOverriden) +class POLibSub(POBase): + expected_overrides = POLibSubOverriden + expected_patterns = Patterns(["sub_example.com"]) + expected_meta = {} # type: ignore diff --git a/tests/test_overrides.py b/tests/test_overrides.py new file mode 100644 index 00000000..88369334 --- /dev/null +++ b/tests/test_overrides.py @@ -0,0 +1,112 @@ +import pytest +from url_matcher import Patterns + +from tests.po_lib_sub import POLibSub +from tests.po_lib import ( + POTopLevel1, + POTopLevel2, + POTopLevelOverriden2, +) +from tests.po_lib.a_module import POModule, POModuleOverriden +from tests.po_lib.nested_package import PONestedPkg +from tests.po_lib.nested_package.a_nested_module import PONestedModule +from web_poet import ( + default_registry, + consume_modules, + OverrideRule, + PageObjectRegistry, +) + + +POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} + + +def test_override_rule_uniqueness(): + """The same instance of an OverrideRule with the same attribute values should + have the same hash identity. + """ + + patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) + + rule1 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + meta={"key_1": 1}, + ) + rule2 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + meta={"key_2": 2}, + ) + + assert hash(rule1) == hash(rule2) + + +def test_list_page_objects_all(): + rules = default_registry.get_overrides() + page_objects = {po.use for po in rules} + + # Note that the 'tests_extra.po_lib_sub_not_imported.POLibSubNotImported' + # Page Object is not included here since it was never imported anywhere in + # our test package. It would only be included if we run any of the following + # below. (Note that they should run before `get_overrides` is called.) + # - from tests_extra import po_lib_sub_not_imported + # - import tests_extra.po_lib_sub_not_imported + # - web_poet.consume_modules("tests_extra") + # Merely having `import tests_extra` won't work since the subpackages and + # modules needs to be traversed and imported as well. + assert all(["po_lib_sub_not_imported" not in po.__module__ for po in page_objects]) + + # Ensure that ALL Override Rules are returned as long as the given + # registry's @handle_urls annotation was used. + assert page_objects == POS.union({POLibSub}) + for rule in rules: + assert rule.instead_of == rule.use.expected_overrides, rule.use + assert rule.for_patterns == rule.use.expected_patterns, rule.use + assert rule.meta == rule.use.expected_meta, rule.use + + +def test_consume_module_not_existing(): + with pytest.raises(ImportError): + consume_modules("this_does_not_exist") + + +def test_list_page_objects_all_consume(): + """A test similar to the one above but calls ``consume_modules()`` to properly + load the @handle_urls annotations from other modules/packages. + """ + consume_modules("tests_extra") + rules = default_registry.get_overrides() + page_objects = {po.use for po in rules} + assert any(["po_lib_sub_not_imported" in po.__module__ for po in page_objects]) + + +def test_registry_search_overrides(): + rules = default_registry.search_overrides(use=POTopLevel2) + assert len(rules) == 1 + assert rules[0].use == POTopLevel2 + + rules = default_registry.search_overrides(instead_of=POTopLevelOverriden2) + assert len(rules) == 1 + assert rules[0].instead_of == POTopLevelOverriden2 + + # Such rules doesn't exist + rules = default_registry.search_overrides(use=POModuleOverriden) + assert len(rules) == 0 + + +def test_from_override_rules(): + rules = [ + OverrideRule( + for_patterns=Patterns(include=["sample.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + ) + ] + + registry = PageObjectRegistry.from_override_rules(rules) + + assert registry.get_overrides() == rules + assert default_registry.get_overrides() != rules diff --git a/tests_extra/__init__.py b/tests_extra/__init__.py new file mode 100644 index 00000000..62c40098 --- /dev/null +++ b/tests_extra/__init__.py @@ -0,0 +1,5 @@ +""" +This test package was created separately to see the behavior of retrieving the +Override rules declared on a registry where @handle_urls is defined on another +package. +""" diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py new file mode 100644 index 00000000..3d31f2f3 --- /dev/null +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -0,0 +1,28 @@ +""" +This package quite is similar to tests/po_lib_sub in terms of code contents. + +What we're ultimately trying to test here is to see if the `default_registry` +captures the rules annotated in this module if it was not imported. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from web_poet import handle_urls + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POLibSubOverridenNotImported: + ... + + +@handle_urls("sub_example_not_imported.com", overrides=POLibSubOverridenNotImported) +class POLibSubNotImported(POBase): + expected_overrides = POLibSubOverridenNotImported + expected_patterns = Patterns(["sub_example_not_imported.com"]) + expected_meta = {} # type: ignore diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 883c210f..76c6498f 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,2 +1,6 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import HttpResponse, HttpResponseBody, HttpResponseHeaders +from .overrides import PageObjectRegistry, consume_modules, OverrideRule + +default_registry = PageObjectRegistry() +handle_urls = default_registry.handle_urls diff --git a/web_poet/overrides.py b/web_poet/overrides.py new file mode 100644 index 00000000..b54e8ab0 --- /dev/null +++ b/web_poet/overrides.py @@ -0,0 +1,274 @@ +from __future__ import annotations # https://www.python.org/dev/peps/pep-0563/ + +import importlib +import importlib.util +import warnings +import pkgutil +from collections import deque +from dataclasses import dataclass, field +from operator import attrgetter +from typing import Iterable, Optional, Union, List, Callable, Dict, Any, TypeVar, Type + +from url_matcher import Patterns + +from web_poet.utils import as_list + +Strings = Union[str, Iterable[str]] + +PageObjectRegistryTV = TypeVar("PageObjectRegistryTV", bound="PageObjectRegistry") + + +@dataclass(frozen=True) +class OverrideRule: + """A single override rule that specifies when a Page Object should be used + in lieu of another. + + This is instantiated when using the :func:`web_poet.handle_urls` decorator. + It's also being returned as a ``List[OverrideRule]`` when calling the + ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_overrides` + method. + + You can access any of its attributes: + + * ``for_patterns`` - contains the list of URL patterns associated with + this rule. You can read the API documentation of the `url-matcher + <https://url-matcher.readthedocs.io/>`_ package for more information + about the patterns. + * ``use`` - The Page Object that will be **used**. + * ``instead_of`` - The Page Object that will be **replaced**. + * ``meta`` - Any other information you may want to store. This doesn't + do anything for now but may be useful for future API updates. + + .. tip:: + + The :class:`~.OverrideRule` is also hashable. This makes it easy to store + unique rules and identify any duplicates. + """ + + for_patterns: Patterns + use: Callable + instead_of: Callable + meta: Dict[str, Any] = field(default_factory=dict) + + def __hash__(self): + return hash((self.for_patterns, self.use, self.instead_of)) + + +class PageObjectRegistry(dict): + """This contains the mapping rules that associates the Page Objects available + for a given URL matching rule. + + Note that it's simply a ``dict`` subclass with added functionalities on + storing, retrieving, and searching for the :class:`~.OverrideRule` instances. + The **value** represents the :class:`~.OverrideRule` instance from which the + Page Object in the **key** is allowed to be used. Since it's essentially a + ``dict``, you can use any ``dict`` operations with it. + + ``web-poet`` already provides a default Registry named ``default_registry`` + for convenience. It can be directly accessed via: + + .. code-block:: python + + from web_poet import handle_urls, default_registry, ItemWebPage + + @handle_urls("example.com", overrides=ProductPageObject) + class ExampleComProductPage(ItemWebPage): + ... + + override_rules = default_registry.get_overrides() + + Notice that the ``@handle_urls`` that we're using is a part of the + ``default_registry``. This provides a shorter and quicker way to interact + with the built-in default :class:`~.PageObjectRegistry` instead of writing + the longer ``@default_registry.handle_urls``. + + .. note:: + + It is encouraged to simply use and import the already existing registry + via ``from web_poet import default_registry`` instead of creating your + own :class:`~.PageObjectRegistry` instance. Using multiple registries + would be unwieldy in most cases. + + However, it might be applicable in certain scenarios like storing custom + rules to separate it from the ``default_registry``. This :ref:`example + <overrides-custom-registry>` from the tutorial section may provide some + context. + """ + + @classmethod + def from_override_rules( + cls: Type[PageObjectRegistryTV], rules: List[OverrideRule] + ) -> PageObjectRegistryTV: + """An alternative constructor for creating a :class:`~.PageObjectRegistry` + instance by accepting a list of :class:`~.OverrideRule`. + + This is useful in cases wherein you need to store some selected rules + from multiple external packages. + """ + return cls({rule.use: rule for rule in rules}) + + def handle_urls( + self, + include: Strings, + *, + overrides: Callable, + exclude: Optional[Strings] = None, + priority: int = 500, + **kwargs, + ): + """ + Class decorator that indicates that the decorated Page Object should be + used instead of the overridden one for a particular set the URLs. + + The Page Object that is **overridden** is declared using the ``overrides`` + parameter. + + The **override** mechanism only works on certain URLs that match the + ``include`` and ``exclude`` parameters. See the documentation of the + `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more + information about them. + + Any extra parameters are stored as meta information that can be later used. + + :param include: The URLs that should be handled by the decorated Page Object. + :param overrides: The Page Object that should be `replaced`. + :param exclude: The URLs over which the override should **not** happen. + :param priority: The resolution priority in case of `conflicting` rules. + A conflict happens when the ``include``, ``override``, and ``exclude`` + parameters are the same. If so, the `highest priority` will be + chosen. + """ + + def wrapper(cls): + rule = OverrideRule( + for_patterns=Patterns( + include=as_list(include), + exclude=as_list(exclude), + priority=priority, + ), + use=cls, + instead_of=overrides, + meta=kwargs, + ) + # If it was already defined, we don't want to override it + if cls not in self: + self[cls] = rule + else: + warnings.warn( + f"Multiple @handle_urls annotations with the same 'overrides' " + f"are ignored in the same Registry. Ignoring duplicate " + f"annotation on '{include}' for {cls}." + ) + + return cls + + return wrapper + + def get_overrides(self) -> List[OverrideRule]: + """Returns all of the :class:`~.OverrideRule` that were declared using + the ``@handle_urls`` annotation. + + .. warning:: + + Remember to consider calling :func:`~.web_poet.overrides.consume_modules` + beforehand to recursively import all submodules which contains the + ``@handle_urls`` annotations from external Page Objects. + """ + return list(self.values()) + + def search_overrides(self, **kwargs) -> List[OverrideRule]: + """Returns any :class:`OverrideRule` that has any of its attributes + match the rules inside the registry. + + Sample usage: + + .. code-block:: python + + rules = registry.search_overrides(use=ProductPO, instead_of=GenericPO) + print(len(rules)) # 1 + + """ + + # Short-circuit operation if "use" is the only search param used, since + # we know that it's being used as the dict key. + if {"use"} == kwargs.keys(): + rule = self.get(kwargs["use"]) + if rule: + return [rule] + return [] + + getter = attrgetter(*kwargs.keys()) + + def matcher(rule: OverrideRule): + attribs = getter(rule) + if not isinstance(attribs, tuple): + attribs = tuple([attribs]) + if list(attribs) == list(kwargs.values()): + return True + return False + + results = [] + for rule in self.get_overrides(): + if matcher(rule): + results.append(rule) + return results + + +def _walk_module(module: str) -> Iterable: + """Return all modules from a module recursively. + + Note that this will import all the modules and submodules. It returns the + provided module as well. + """ + + def onerror(err): + raise err # pragma: no cover + + spec = importlib.util.find_spec(module) + if not spec: + raise ImportError(f"Module {module} not found") + mod = importlib.import_module(spec.name) + yield mod + if spec.submodule_search_locations: + for info in pkgutil.walk_packages( + spec.submodule_search_locations, f"{spec.name}.", onerror + ): + mod = importlib.import_module(info.name) + yield mod + + +def consume_modules(*modules: str) -> None: + """This recursively imports all packages/modules so that the ``@handle_urls`` + annotation are properly discovered and imported. + + Let's take a look at an example: + + .. code-block:: python + + # FILE: my_page_obj_project/load_rules.py + + from web_poet import default_registry, consume_modules + + consume_modules("other_external_pkg.po", "another_pkg.lib") + rules = default_registry.get_overrides() + + For this case, the :class:`~.OverrideRule` are coming from: + + - ``my_page_obj_project`` `(since it's the same module as the file above)` + - ``other_external_pkg.po`` + - ``another_pkg.lib`` + - any other modules that was imported in the same process inside the + packages/modules above. + + If the ``default_registry`` had other ``@handle_urls`` annotations outside + of the packages/modules listed above, then the corresponding + :class:`~.OverrideRule` won't be returned. Unless, they were recursively + imported in some way similar to :func:`~.web_poet.overrides.consume_modules`. + """ + + for module in modules: + gen = _walk_module(module) + + # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html + # Using a deque() results in a tiny bit performance improvement that list(). + deque(gen, maxlen=0) diff --git a/web_poet/utils.py b/web_poet/utils.py index 63f3c9c4..ae4ff5c2 100644 --- a/web_poet/utils.py +++ b/web_poet/utils.py @@ -1,5 +1,7 @@ import weakref +from collections.abc import Iterable from functools import wraps +from typing import Any, Optional, List def memoizemethod_noargs(method): @@ -15,3 +17,33 @@ def new_method(self, *args, **kwargs): return cache[self] return new_method + + +def as_list(value: Optional[Any]) -> List[Any]: + """Normalizes the value input as a list. + + >>> as_list(None) + [] + >>> as_list("foo") + ['foo'] + >>> as_list(123) + [123] + >>> as_list(["foo", "bar", 123]) + ['foo', 'bar', 123] + >>> as_list(("foo", "bar", 123)) + ['foo', 'bar', 123] + >>> as_list(range(5)) + [0, 1, 2, 3, 4] + >>> def gen(): + ... yield 1 + ... yield 2 + >>> as_list(gen()) + [1, 2] + """ + if value is None: + return [] + if isinstance(value, str): + return [value] + if not isinstance(value, Iterable): + return [value] + return list(value)