
url-matcher integration with scrapy-poet #56

Merged · 22 commits · May 19, 2022

Commits
b4ac789
url-matcher integration with scrapy-poet
ivanprado Dec 8, 2021
35e7876
Remove a print line
ivanprado Dec 9, 2021
a2902f5
Merge branch 'master' into url-matcher-integration
BurnzZ Dec 21, 2021
327139e
improve docs and example code
BurnzZ Dec 21, 2021
d85766e
deprecate PerDomainOverridesRegistry in lieu of OverridesRegistry
BurnzZ Dec 21, 2021
670715a
improve readability of OverridesRegistry's docs
BurnzZ Dec 21, 2021
706e4ac
improve type annotations and errors in OverridesRegistry
BurnzZ Dec 21, 2021
bf4e61b
improve test coverage
BurnzZ Dec 21, 2021
c865c60
update docs in-line with recent web-poet refactoring
BurnzZ Dec 23, 2021
63029dc
add integration tests for web-poet
BurnzZ Dec 23, 2021
5305da4
fix and improve docs
BurnzZ Jan 5, 2022
2d0c3bc
update docs to reflect new changes from web-poet
BurnzZ Jan 7, 2022
ce23923
update docs with respect to new Override Rules interface from web-poet
BurnzZ Jan 12, 2022
0c94cf6
update docs to reflect web-poet's new 'registry_pool'
BurnzZ Jan 13, 2022
1f52f3b
update docs with web-poet's new MVP version and POP definition
BurnzZ Mar 2, 2022
17689b5
Merge branch 'master' into url-matcher-integration
BurnzZ Mar 2, 2022
10ba139
slight doc improvements
BurnzZ Mar 25, 2022
da93452
improve docs after web-poet PR#27 has been merged
BurnzZ May 2, 2022
e305751
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ May 16, 2022
dd2a302
update imports after web_poet refactoring
BurnzZ May 16, 2022
0588105
fix return type annotation of get_scrapy_data_path()
BurnzZ May 19, 2022
0bc51b8
add override examples using @handle_urls
BurnzZ May 19, 2022
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -6,6 +6,8 @@ TBR:
------------------

* Cache mechanism using SCRAPY_POET_CACHE setting
* New and richer SCRAPY_POET_OVERRIDES registry that uses the
url-matcher patterns to configure the overrides

0.2.1 (2021-06-11)
------------------
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -188,7 +188,8 @@
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None, ),
'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
'web_poet': ('https://web-poet.readthedocs.io/en/stable/', None),
'web-poet': ('https://web-poet.readthedocs.io/en/stable/', None),
'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None),
}

autodoc_default_options = {
44 changes: 26 additions & 18 deletions docs/intro/tutorial.rst
@@ -348,12 +348,10 @@ be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``:

.. code-block:: python

SCRAPY_POET_OVERRIDES = {
"toscrape.com": {
BookListPage: BTSBookListPage,
BookPage: BTSBookPage
}
}
SCRAPY_POET_OVERRIDES = [
    ("toscrape.com", BTSBookListPage, BookListPage),
    ("toscrape.com", BTSBookPage, BookPage)
]

The spider is back to life!
``SCRAPY_POET_OVERRIDES`` contains rules that override the Page Objects
@@ -381,15 +379,15 @@ to implement new ones:
class BPBookListPage(WebPage):

def book_urls(self):
return self.css('.article-info a::attr(href)').getall()
return self.css('article.post h4 a::attr(href)').getall()


class BPBookPage(ItemWebPage):

def to_item(self):
return {
'url': self.url,
'name': self.css(".book-data h4::text").get().strip(),
'name': self.css("body div > h1::text").get().strip(),
}

The last step is configuring the overrides so that these new Page Objects
@@ -399,16 +397,12 @@ are used for the domain

.. code-block:: python

SCRAPY_POET_OVERRIDES = {
"toscrape.com": {
BookListPage: BTSBookListPage,
BookPage: BTSBookPage
},
"bookpage.com": {
BookListPage: BPBookListPage,
BookPage: BPBookPage
}
}
SCRAPY_POET_OVERRIDES = [
    ("toscrape.com", BTSBookListPage, BookListPage),
    ("toscrape.com", BTSBookPage, BookPage),
    ("bookpage.com", BPBookListPage, BookListPage),
    ("bookpage.com", BPBookPage, BookPage)
]

The spider is now ready to extract books from both sites 😀.
The full example
@@ -418,6 +412,20 @@ On a surface, it looks just like a different way to organize Scrapy spider
code - and indeed, it *is* just a different way to organize the code,
but it opens some cool possibilities.

.. note::

In the examples above we have been configuring the overrides
for a particular domain, but more complex URL patterns are also possible.
For example, the pattern ``books.toscrape.com/catalogue/category/``
would restrict the override to category pages only.

It is even possible to configure more complex rules by
using the ``OverrideRule`` class instead of a triplet in
the configuration.

Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
documentation for more information about the pattern syntax.
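To build intuition for how such URL patterns narrow an override's scope, here is a small self-contained sketch. It uses plain substring checks as a stand-in for the real url-matcher semantics (which also support wildcards and ``|`` anchors), and the ``rule_applies`` helper is invented for this illustration:

```python
from urllib.parse import urlparse

def rule_applies(url: str, include, exclude=()) -> bool:
    """Very simplified stand-in for url-matcher semantics:
    a rule applies when the URL's domain+path contains an
    ``include`` pattern and no ``exclude`` pattern."""
    parsed = urlparse(url)
    target = parsed.netloc + parsed.path
    if any(pattern in target for pattern in exclude):
        return False
    return any(pattern in target for pattern in include)

# A pattern such as "books.toscrape.com/catalogue/category/" restricts
# the override to category pages only:
category = "books.toscrape.com/catalogue/category/"
print(rule_applies(
    "http://books.toscrape.com/catalogue/category/books/travel_2/index.html",
    include=[category]))   # a category page: True
print(rule_applies(
    "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
    include=[category]))   # a book detail page: False
```

The real library additionally ranks competing patterns by specificity, which this sketch does not attempt.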

Next steps
==========

85 changes: 76 additions & 9 deletions docs/overrides.rst
@@ -47,11 +47,9 @@ And then override it for a particular domain using ``settings.py``:

.. code-block:: python

SCRAPY_POET_OVERRIDES = {
"example.com": {
BookPage: ISBNBookPage
}
}
SCRAPY_POET_OVERRIDES = [
    ("example.com", ISBNBookPage, BookPage)
]

This new Page Object gets the original ``BookPage`` as a dependency and enriches
the obtained item with the ISBN from the page HTML.
@@ -79,13 +77,82 @@ the obtained item from the page HTML.
return item


Overrides rules
===============

The default way of configuring override rules is with triplets
of the form (``url pattern``, ``override_type``, ``overridden_type``). More
complex rules can be expressed with the ``OverrideRule``
class. The following example configures an override that
is only applied to book pages from ``books.toscrape.com``:

.. code-block:: python


SCRAPY_POET_OVERRIDES = [
    OverrideRule(
        for_patterns=Patterns(
            include=["books.toscrape.com/catalogue/*index.html|"],
            exclude=["/catalogue/category/"]),
        use=MyBookPage,
        instead_of=BookPage
    )
]

Note how category pages are excluded by using an ``exclude`` pattern.
You can find more information about the pattern syntax in the
`url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
documentation.
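To illustrate how a rule like the one above selects a replacement type, here is a hedged, self-contained sketch. The ``SimplePatterns``/``SimpleOverrideRule`` classes and the substring matching are simplifications invented for this example; they are not the real ``Patterns``/``OverrideRule`` classes:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class SimplePatterns:
    include: List[str]
    exclude: List[str] = field(default_factory=list)

    def matches(self, url: str) -> bool:
        # Plain substring checks; the real Patterns class supports
        # wildcards and the "|" end-of-URL anchor.
        if any(pattern in url for pattern in self.exclude):
            return False
        return any(pattern in url for pattern in self.include)

@dataclass
class SimpleOverrideRule:
    for_patterns: SimplePatterns
    use: type
    instead_of: type

def resolve(rules, url: str, requested_type: type) -> type:
    """Return the type to instantiate for ``requested_type`` on ``url``:
    the first matching rule's ``use``, or the requested type itself."""
    for rule in rules:
        if rule.instead_of is requested_type and rule.for_patterns.matches(url):
            return rule.use
    return requested_type

class BookPage: ...
class MyBookPage(BookPage): ...

rules = [
    SimpleOverrideRule(
        for_patterns=SimplePatterns(
            include=["books.toscrape.com/catalogue/"],
            exclude=["/catalogue/category/"]),
        use=MyBookPage,
        instead_of=BookPage,
    )
]
# A book page matches the include pattern, so the override kicks in;
# a category page is excluded, so the original type is kept.
print(resolve(rules, "http://books.toscrape.com/catalogue/some-book_1/index.html", BookPage))
print(resolve(rules, "http://books.toscrape.com/catalogue/category/books/index.html", BookPage))
```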


Decorate Page Objects with the rules
====================================

Keeping the rules alongside the Page Objects is a good idea,
as you can see at a glance what a Page Object does
and where it is applied. This can be done by decorating the
Page Objects with ``handle_urls`` and then
configuring the overrides automatically with the help of the
``find_page_object_overrides`` function.

Let's see an example:

.. code-block:: python

@handle_urls("toscrape.com", BookPage)
class BTSBookPage(BookPage):

    def to_item(self):
        return {
            'url': self.url,
            'name': self.css("title::text").get(),
        }

The ``handle_urls`` decorator in this case indicates that
the class ``BTSBookPage`` should be used instead of ``BookPage``
for the domain ``toscrape.com``.

In order to configure the scrapy-poet overrides automatically
using these annotations,
you can use the function ``find_page_object_overrides``.
For example:

.. code-block:: python

SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")

The function collects all the ``handle_urls`` annotations from
``my_page_objects_module`` and its submodules, and converts them
into rules ready to be used with ``SCRAPY_POET_OVERRIDES``.

Overrides registry
==================

The overrides registry is responsible for informing whether there exists an
override for a particular type for a given response. The default overrides
registry keeps a map of overrides for each domain and read this configuration
from settings ``SCRAPY_POET_OVERRIDES`` as has been seen in the :ref:`intro-tutorial`
The overrides registry is responsible for informing whether there exists an
override for a particular type for a given request. The default overrides
registry allows configuring these rules using patterns that follow the
`url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_ syntax. These rules can be configured using the
``SCRAPY_POET_OVERRIDES`` setting, as seen in the :ref:`intro-tutorial`
example.
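The registry's job can be sketched with a toy implementation of the same idea. ``SubstringOverridesRegistry`` and its naive substring matching are invented for illustration; the real default registry uses url-matcher, and scrapy-poet's actual base-class API may differ:

```python
from abc import ABC, abstractmethod

class Request:
    """Stand-in for scrapy.Request: only the URL matters here."""
    def __init__(self, url: str):
        self.url = url

class OverridesRegistryBase(ABC):
    @abstractmethod
    def overrides_for(self, request):
        """Return a mapping {overridden_type: replacement_type} for the request."""

class SubstringOverridesRegistry(OverridesRegistryBase):
    """Toy registry: a rule applies when its pattern is a substring of the URL."""
    def __init__(self, rules):
        self.rules = rules  # iterable of (pattern, use, instead_of) triplets

    def overrides_for(self, request):
        return {instead_of: use
                for pattern, use, instead_of in self.rules
                if pattern in request.url}

class BookPage: ...
class BTSBookPage(BookPage): ...

registry = SubstringOverridesRegistry([("toscrape.com", BTSBookPage, BookPage)])
print(registry.overrides_for(Request("http://books.toscrape.com/")))
print(registry.overrides_for(Request("https://bookpage.com/reviews")))
```

The injector can then consult ``overrides_for`` each time a callback dependency is resolved, substituting the replacement type when one is returned.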

But the registry implementation can be changed as needed. A different
14 changes: 6 additions & 8 deletions example/example/spiders/books_04_overrides_01.py
@@ -28,15 +28,15 @@ def to_item(self):
class BPBookListPage(WebPage):
"""Logic to extract listings from pages like https://bookpage.com/reviews"""
def book_urls(self):
return self.css('.article-info a::attr(href)').getall()
return self.css('article.post h4 a::attr(href)').getall()


class BPBookPage(ItemWebPage):
"""Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
def to_item(self):
return {
'url': self.url,
'name': self.css(".book-data h4::text").get().strip(),
'name': self.css("body div > h1::text").get().strip(),
}


@@ -45,12 +45,10 @@ class BooksSpider(scrapy.Spider):
start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
# Configuring different page objects pages from the bookpage.com domain
custom_settings = {
"SCRAPY_POET_OVERRIDES": {
"bookpage.com": {
BookListPage: BPBookListPage,
BookPage: BPBookPage
}
}
"SCRAPY_POET_OVERRIDES": [
("bookpage.com", BPBookListPage, BookListPage),
("bookpage.com", BPBookPage, BookPage)
]
}

def parse(self, response, page: BookListPage):
20 changes: 8 additions & 12 deletions example/example/spiders/books_04_overrides_02.py
@@ -41,15 +41,15 @@ def to_item(self):
class BPBookListPage(BookListPage):
"""Logic to extract listings from pages like https://bookpage.com/reviews"""
def book_urls(self):
return self.css('.article-info a::attr(href)').getall()
return self.css('article.post h4 a::attr(href)').getall()


class BPBookPage(BookPage):
"""Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
def to_item(self):
return {
'url': self.url,
'name': self.css(".book-data h4::text").get().strip(),
'name': self.css("body div > h1::text").get().strip(),
}


@@ -58,16 +58,12 @@ class BooksSpider(scrapy.Spider):
start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
# Configuring different page objects pages for different domains
custom_settings = {
"SCRAPY_POET_OVERRIDES": {
"toscrape.com": {
BookListPage: BTSBookListPage,
BookPage: BTSBookPage
},
"bookpage.com": {
BookListPage: BPBookListPage,
BookPage: BPBookPage
},
}
"SCRAPY_POET_OVERRIDES": [
Member commented:

    I wonder if we should provide an example with handle_urls decorator

BurnzZ (Contributor) replied on May 19, 2022:

    A good point! Added such an example in 0bc51b8.

("toscrape.com", BTSBookListPage, BookListPage),
("toscrape.com", BTSBookPage, BookPage),
("bookpage.com", BPBookListPage, BookListPage),
("bookpage.com", BPBookPage, BookPage)
]
}

def parse(self, response, page: BookListPage):
8 changes: 5 additions & 3 deletions scrapy_poet/injection.py
@@ -15,14 +15,14 @@
from scrapy.statscollectors import StatsCollector
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import maybeDeferred_coro
from scrapy.utils.misc import load_object
from scrapy.utils.misc import load_object, create_instance

from scrapy_poet.cache import SqlitedictCache
from scrapy_poet.injection_errors import (UndeclaredProvidedTypeError,
NonCallableProviderError,
InjectionError)
from scrapy_poet.overrides import OverridesRegistryBase, \
PerDomainOverridesRegistry
OverridesRegistry
from scrapy_poet.page_input_providers import PageObjectInputProvider
from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse
from web_poet.pages import is_injectable
@@ -43,7 +43,7 @@ def __init__(self,
overrides_registry: Optional[OverridesRegistryBase] = None):
self.crawler = crawler
self.spider = crawler.spider
self.overrides_registry = overrides_registry or PerDomainOverridesRegistry()
self.overrides_registry = overrides_registry or OverridesRegistry()
self.load_providers(default_providers)
self.init_cache()

@@ -348,6 +348,8 @@ class MySpider(Spider):
spider = MySpider()
spider.settings = settings
crawler.spider = spider
if not overrides_registry:
overrides_registry = create_instance(OverridesRegistry, settings, crawler)
return Injector(crawler, overrides_registry=overrides_registry)


4 changes: 2 additions & 2 deletions scrapy_poet/middleware.py
@@ -11,7 +11,7 @@

from scrapy.utils.misc import create_instance, load_object
from . import api
from .overrides import PerDomainOverridesRegistry
from .overrides import OverridesRegistry
from .page_input_providers import ResponseDataProvider
from .injection import Injector

@@ -35,7 +35,7 @@ def __init__(self, crawler: Crawler):
self.crawler = crawler
settings = self.crawler.settings
registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY",
PerDomainOverridesRegistry))
OverridesRegistry))
self.overrides_registry = create_instance(registry_cls, settings, crawler)
self.injector = Injector(crawler,
default_providers=DEFAULT_PROVIDERS,