scrapinghub · ivanprado · Aug 11, 2021
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,13 @@
 Changelog
 =========
 
+TBR
+------------------
+
+* More powerful overrides configuration by the introduction of
+  the ``HierarchicalOverridesRegistry``. The old registry was
+  removed.
+
 0.2.1 (2021-06-11)
 ------------------
 

diff --git a/docs/overrides.rst b/docs/overrides.rst
@@ -78,20 +78,36 @@ the obtained item with the ISBN from the page HTML.
                 item['isbn'] = self.css(".isbn-class::text").get()
                 return item
 
+Overrides rules
+---------------
+
+The previous example showed how to configure the overrides for a particular
+domain. This is by far the most common case, but sometimes this is not
+enough: in some cases you may need different overrides for some subdomains
+(e.g. ``uk.somesite.com`` and ``us.somesite.com``); in other cases
+you may want to have specific overrides for a subsection of a site
+(e.g. ``somesite.com`` and ``somesite.com/deals``). This is entirely possible.
+In fact, the examples presented above are already valid keys to be used
+in the setting dictionary ``SCRAPY_POET_OVERRIDES``.
+
+There is more information about how to configure ``SCRAPY_POET_OVERRIDES``
+and the supported rules in :class:`scrapy_poet.overrides.HierarchicalOverridesRegistry`
+documentation.
+
 
 Overrides registry
 ==================
 
 The overrides registry is responsible for informing whether there exists an
 override for a particular type for a given response. The default overrides
-registry keeps a map of overrides for each domain and read this configuration
-from settings ``SCRAPY_POET_OVERRIDES`` as has been seen in the :ref:`intro-tutorial`
-example.
+registry allows configuring the overriding rules and reads the configuration
+from settings ``SCRAPY_POET_OVERRIDES``. See :class:`scrapy_poet.overrides.HierarchicalOverridesRegistry`
+for more information.
 
 But the registry implementation can be changed at convenience. A different
 registry implementation can be configured using the property
 ``SCRAPY_POET_OVERRIDES_REGISTRY`` in ``settings.py``. The new registry
-must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase``
+must be a subclass of :class:`scrapy_poet.overrides.OverridesRegistryBase`
 and must implement the method ``overrides_for``. As other Scrapy components,
 it can be initialized from the ``from_crawler`` class method if implemented.
 This might be handy to be able to access settings, stats, request meta, etc.

diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py
@@ -18,7 +18,7 @@
                                           NonCallableProviderError,
                                           InjectionError)
 from scrapy_poet.overrides import OverridesRegistryBase, \
-    PerDomainOverridesRegistry
+    HierarchicalOverridesRegistry
 from scrapy_poet.page_input_providers import PageObjectInputProvider
 from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse
 from web_poet.pages import is_injectable
@@ -39,7 +39,7 @@ def __init__(self,
                  overrides_registry: Optional[OverridesRegistryBase] = None):
         self.crawler = crawler
         self.spider = crawler.spider
-        self.overrides_registry = overrides_registry or PerDomainOverridesRegistry()
+        self.overrides_registry = overrides_registry or HierarchicalOverridesRegistry()
         self.load_providers(default_providers)
 
     def load_providers(self, default_providers: Optional[Mapping] = None):

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
@@ -11,7 +11,7 @@
 
 from scrapy.utils.misc import create_instance, load_object
 from . import api
-from .overrides import PerDomainOverridesRegistry
+from .overrides import HierarchicalOverridesRegistry
 from .page_input_providers import ResponseDataProvider
 from .injection import Injector
 
@@ -35,7 +35,7 @@ def __init__(self, crawler: Crawler):
         self.crawler = crawler
         settings = self.crawler.settings
         registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY",
-                                                PerDomainOverridesRegistry))
+                                                HierarchicalOverridesRegistry))
         self.overrides_registry = create_instance(registry_cls, settings, crawler)
         self.injector = Injector(crawler,
                                  default_providers=DEFAULT_PROVIDERS,

diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
@@ -1,9 +1,11 @@
 from abc import ABC, abstractmethod
-from typing import Dict, Mapping, Callable
+from typing import Dict, Mapping, Callable, Optional, List
+
+from marisa_trie import Trie
 
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet.utils import get_domain
+from scrapy_poet.utils import get_domain, url_hierarchical_str
 
 
 class OverridesRegistryBase(ABC):
@@ -42,3 +44,101 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         return self.get(get_domain(request.url), {})
 
 
+class OverridesRecord:
+    """Pairs a hierarchical URL string with the overrides mapping registered for it."""
+    def __init__(self, hierarchical_url: str, overrides: Mapping[Callable, Callable]):
+        self.hierarchical_url = hierarchical_url
+        self.overrides = overrides
+
+
+class HierarchicalOverridesRegistry(OverridesRegistryBase):
+    """
+    Overrides registry that reads the overrides
+    from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings.
+
+    Example of overrides configuration:
+
+    .. code-block:: python
+
+        SCRAPY_POET_OVERRIDES = {
+            "example.com": {
+                BookPage: ExampleBookPage,
+                BookListPage: ExampleListBookPage,
+            }
+        }
+
+    The example above configures ``ExampleBookPage``
+    and ``ExampleListBookPage`` to be used instead
+    of ``BookPage`` and ``BookListPage`` respectively
+    for any request to the domain ``example.com``.
+
+    Each set of rules can be configured to override a particular
+    domain, subdomain or even a specific path. The following
+    table shows some examples of keys and what their effect is.
+
+    .. list-table:: Overrides keys examples
+       :widths: auto
+       :width: 80%
+       :header-rows: 1
+
+       * - Key
+         - The overrides apply to
+       * - ``"subdomain.example.com"``
+         - any request belonging to ``subdomain.example.com`` or any of its
+           subdomains
+       * - ``"example.com/path_to_content"``
+         - any request to the netlocs ``example.com`` or ``www.example.com`` whose
+           URL path is a child of ``/path_to_content``
+       * - ``""``
+         - any request. Useful to set default overrides
+
+    **The most specific rule is applied** when several rules could be
+    applied to the same URL. Imagine, for example, the case where you have rules
+    for ``""``, ``"toscrape.com"``, ``"books.toscrape.com"`` and ``"books.toscrape.com/catalogue"``:
+
+    * The rules for ``""`` would be applied for the URL ``http://example.com``
+    * The rules for ``"toscrape.com"`` would be applied for the URL ``http://toscrape.com/index.html``
+    * The rules for ``"books.toscrape.com"`` would be applied for the URL ``http://books.toscrape.com``
+    * The rules for ``"books.toscrape.com/catalogue"`` would be applied for the URL ``http://books.toscrape.com/catalogue/category``
+
+    This is useful as it allows configuring some general overrides for a site
+    and also some more specific overrides for some subsections of the site.
+    """
+
+    def __init__(self, all_overrides: Optional[Mapping[str, Mapping[Callable, Callable]]] = None) -> None:
+        super().__init__()
+        self.overrides: List[OverridesRecord] = []
+        self.trie = Trie()
+        for domain_or_more, overrides in (all_overrides or {}).items():
+            self.register(domain_or_more, overrides)
+
+    def register(self, domain_or_more: str, overrides: Mapping[Callable, Callable]):
+        url = f"http://{domain_or_more}"
+        hurl = url_hierarchical_str(url)
+        record = OverridesRecord(hurl, overrides)
+        # Update case
+        if hurl in self.trie:
+            self.overrides[self.trie[hurl]] = record
+            return
+
+        # Insert case. We have to rebuild the trie and then reindex the
+        # overrides list based on the new trie.
+        # Note that this is O(N), but register is expected to be executed only
+        # at initialization and we expect N to be low enough.
+        new_overrides = self.overrides + [record]
+        self.trie = Trie([override.hierarchical_url for override in new_overrides])
+        self.overrides = [None] * len(new_overrides)  # type: ignore
+        for override in new_overrides:
+            self.overrides[self.trie[override.hierarchical_url]] = override
+
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        return cls(crawler.settings.getdict("SCRAPY_POET_OVERRIDES", {}))
+
+    def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
+        hurl = url_hierarchical_str(request.url)  # NOTE(review): looks like plain string-prefix matching, so "/deals" presumably also matches "/deals-x" - confirm
+        max_prefix = max(self.trie.prefixes(hurl), default=None)  # candidates are all prefixes of hurl, so the greatest is the longest
+        if max_prefix is not None:
+            return self.overrides[self.trie[max_prefix]].overrides
+        else:
+            return {}
diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py
@@ -1,3 +1,6 @@
+import re
+from urllib.parse import urlsplit
+
 from tldextract import tldextract
 
 
@@ -11,5 +14,85 @@ def get_domain(url):
     'example.com'
     >>> get_domain("http://deeper.blog.example.co.uk")
     'example.co.uk'
+    >>> get_domain("http://127.0.0.1")
+    '127.0.0.1'
+    """
+    return ".".join(el for el in tldextract.extract(url)[-2:] if el)
+
+
+# Matches a dotted-quad IPv4 address (four 0-255 octets), from https://www.oreilly.com/library/view/regular-expressions-cookbook/9780596802837/ch07s16.html
+_IS_IP_ADDRESS_RE = re.compile(
+    r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
+)
+
+
+def url_hierarchical_str(url: str) -> str:
+    """
+    Return a string that represents the url in a way that its
+    components are ordered by their hierarchical importance. That is, the top
+    level domain is the most important, so it is the first element in the string.
+    Then goes the rest of the levels in the domain, the port and finally the path.
+
+    Can be very useful to verify if a URL is a subpath of another just
+    by checking if one url hierarchical str is the prefix of the other.
+
+    Trailing slash for the path is removed; the query, the fragment, a leading
+    ``www`` label and the ports 80 and 443 are ignored. The host is lowercased.
+
+    >>> url_hierarchical_str("http://")
+    ''
+    >>> url_hierarchical_str("http://example.com:343")
+    'com.example.:343'
+    >>> url_hierarchical_str("http://example.com:343/")
+    'com.example.:343'
+    >>> url_hierarchical_str("http://WWW.example.com:343/")
+    'com.example.:343'
+    >>> url_hierarchical_str("http://www.EXAMPLE.com:343/?id=23")
+    'com.example.:343'
+    >>> url_hierarchical_str("http://www.example.com:343/page?id=23")
+    'com.example.:343/page'
+    >>> url_hierarchical_str("http://www.example.com:343/page?id=23;params#fragment")
+    'com.example.:343/page'
+    >>> url_hierarchical_str("http://127.0.0.1:80/page?id=23;params#fragment")
+    '127.0.0.1./page'
+    >>> url_hierarchical_str("https://127.0.0.1:443/page?id=23;params#fragment")
+    '127.0.0.1./page'
+    >>> url_hierarchical_str("https://127.0.0.1:333/page?id=23;params#fragment")
+    '127.0.0.1.:333/page'
+    >>> url_hierarchical_str("http://example.com:333/path/to/something")
+    'com.example.:333/path/to/something'
+    >>> url_hierarchical_str("mailto://example.com")
+    Traceback (most recent call last):
+    ...
+    ValueError: Unsupported scheme for URL mailto://example.com
+    >>> url_hierarchical_str("http://example.com:k34")  # doctest: +IGNORE_EXCEPTION_DETAIL
+    Traceback (most recent call last):
+    ...
+    ValueError: Port could not be cast to integer value as 'k34'
+    >>> url_hierarchical_str("/path")
+    Traceback (most recent call last):
+    ...
+    ValueError: Unsupported scheme for URL /path
     """
-    return ".".join(tldextract.extract(url)[-2:])
+    parts = urlsplit(url.strip())
+    scheme, netloc, path, query, fragment = parts
+    if scheme.lower() not in ["http", "https"]:
+        raise ValueError(f"Unsupported scheme for URL {url}")
+    host = (parts.hostname or "").lower()
+    port = f":{parts.port}" if parts.port and parts.port not in [80, 443] else ""  # 80 and 443 are dropped whatever the scheme
+
+    if not _IS_IP_ADDRESS_RE.match(host):
+        # Remove www and reverse the domains
+        dom_secs = host.split(".")
+        if dom_secs:
+            if dom_secs[0] == "www":
+                dom_secs = dom_secs[1:]
+        host = ".".join(reversed(dom_secs))
+    if host:
+        host += "."  # a non-empty host always ends with "." before the port/path part
+
+    if path.endswith("/"):
+        path = path[:-1]
+
+    return f"{host}{port}{path}"
+
diff --git a/setup.py b/setup.py
@@ -15,7 +15,8 @@
         'attrs',
         'parsel',
         'web-poet',
-        'tldextract'],
+        'tldextract',
+        'marisa-trie'],
     classifiers=[
         'Development Status :: 3 - Alpha',
         'Intended Audience :: Developers',

diff --git a/tests/test_injection.py b/tests/test_injection.py
@@ -11,7 +11,7 @@
     get_injector_for_testing, get_response_for_testing
 from scrapy_poet.injection_errors import NonCallableProviderError, \
     InjectionError, UndeclaredProvidedTypeError
-from scrapy_poet.overrides import PerDomainOverridesRegistry
+from scrapy_poet.overrides import HierarchicalOverridesRegistry
 from web_poet import Injectable, ItemPage
 from web_poet.mixins import ResponseShortcutsMixin
 
@@ -303,7 +303,7 @@ def test_overrides(self, providers, override_should_happen):
                 EurDollarRate: OtherEurDollarRate
             }
         }
-        registry = PerDomainOverridesRegistry(overrides)
+        registry = HierarchicalOverridesRegistry(overrides)
         injector = get_injector_for_testing(providers,
                                             overrides_registry=registry)