From a39ef5c5a9714dfdae979a5cba6d766d25699d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Nov 2023 15:12:45 +0100 Subject: [PATCH 01/19] Implement a request fingerprinter that accounts for dependencies --- README.rst | 1 + docs/intro/install.rst | 1 + docs/settings.rst | 15 ++ scrapy_poet/__init__.py | 1 + scrapy_poet/_request_fingerprinter.py | 91 +++++++++++++ tests/__init__.py | 36 +++++ tests/test_request_fingerprinting.py | 189 ++++++++++++++++++++++++++ 7 files changed, 334 insertions(+) create mode 100644 scrapy_poet/_request_fingerprinter.py create mode 100644 tests/test_request_fingerprinting.py diff --git a/README.rst b/README.rst index 548aeaf0..d8011864 100644 --- a/README.rst +++ b/README.rst @@ -63,6 +63,7 @@ Add the following inside Scrapy's ``settings.py`` file: SPIDER_MIDDLEWARES = { "scrapy_poet.RetryMiddleware": 275, } + REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter" Developing ========== diff --git a/docs/intro/install.rst b/docs/intro/install.rst index 9011a252..c64bd386 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -32,6 +32,7 @@ of your Scrapy project: SPIDER_MIDDLEWARES = { "scrapy_poet.RetryMiddleware": 275, } + REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter" Things that are good to know ============================ diff --git a/docs/settings.rst b/docs/settings.rst index 3cfaa9ee..9fdc3ef1 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -107,3 +107,18 @@ Sets the class, or its import path, that will be used as an adapter in the generated test fixtures. More info at :ref:`fixture-adapter`. + + +SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS +-------------------------------------------- + +The default value is the default value of the ``REQUEST_FINGERPRINTER_CLASS`` +setting for the version of Scrapy currently installed (e.g. +``"scrapy.utils.request.RequestFingerprinter"``). + +You can assign a request fingerprinter class to this setting to configure a +custom request fingerprinter class to use for requests. + +This class is used to generate a base fingerprint for a request. If that +request uses dependency injection, that fingerprint is then modified to account +for requested dependencies. Otherwise, the fingerprint is used as is. 
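Illustration (not part of the patch): the setting documented above can be combined with ``REQUEST_FINGERPRINTER_CLASS`` in a project's ``settings.py`` as sketched below. The ``myproject.fingerprinters.MyFingerprinter`` import path and class are hypothetical; as the tests added in this patch show, any class exposing a ``fingerprint(request)`` method that returns bytes can act as the base fingerprinter, passed either as an import path or as the class object itself:

    # settings.py -- illustrative sketch only
    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

    # Optional: swap the base fingerprinter whose output is combined with the
    # dependency-based fingerprint. Hypothetical class, for example:
    #
    #     class MyFingerprinter:
    #         def fingerprint(self, request):
    #             return request.url.encode()  # naive, URL-only base fingerprint
    #
    SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS = "myproject.fingerprinters.MyFingerprinter"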
diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py index 337188a9..215a40dc 100644 --- a/scrapy_poet/__init__.py +++ b/scrapy_poet/__init__.py @@ -1,3 +1,4 @@ +from ._request_fingerprinter import ScrapyPoetRequestFingerprinter from .api import DummyResponse, callback_for from .downloadermiddlewares import InjectionMiddleware from .page_input_providers import HttpResponseProvider, PageObjectInputProvider diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py new file mode 100644 index 00000000..aaee5745 --- /dev/null +++ b/scrapy_poet/_request_fingerprinter.py @@ -0,0 +1,91 @@ +from typing import TYPE_CHECKING + +try: + from scrapy.utils.request import RequestFingerprinter # NOQA +except ImportError: + if not TYPE_CHECKING: + ScrapyPoetRequestFingerprinter = None +else: + import hashlib + import json + from functools import cached_property + from typing import Callable, Dict, List, Optional + from weakref import WeakKeyDictionary + + from scrapy import Request + from scrapy.crawler import Crawler + from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS + from scrapy.utils.misc import create_instance, load_object + + from scrapy_poet import InjectionMiddleware + from scrapy_poet.injection import get_callback + + class ScrapyPoetRequestFingerprinter: + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def __init__(self, crawler: Crawler) -> None: + settings = crawler.settings + self._fallback_request_fingerprinter = create_instance( + load_object( + settings.get( + "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS", + REQUEST_FINGERPRINTER_CLASS, + ) + ), + settings=crawler.settings, + crawler=crawler, + ) + self._callback_cache: Dict[Callable, bytes] = {} + self._request_cache: "WeakKeyDictionary[Request, bytes]" = ( + WeakKeyDictionary() + ) + self._crawler: Crawler = crawler + + @cached_property + def _injector(self): + middlewares = self._crawler.engine.downloader.middleware.middlewares + for middleware in middlewares: + if isinstance(middleware, InjectionMiddleware): + return middleware.injector + raise RuntimeError( + "scrapy_poet.InjectionMiddleware not found at run time, has it " + "been configured in the DOWNLOADER_MIDDLEWARES setting?" 
+ ) + + def _get_deps(self, request: Request) -> Optional[List[str]]: + """Return a JSON-serializable structure that uniquely identifies the + dependencies requested by the request, or None if dependency injection + is not required.""" + plan = self._injector.build_plan(request) + root_deps = plan[-1][1] + if not root_deps: + return None + return [repr(cls) for cls in root_deps.values()] + + def fingerprint_deps(self, request: Request) -> Optional[bytes]: + """Return a fingerprint based on dependencies requested through + scrapy-poet injection, or None if no injection was requested.""" + callback = get_callback(request, self._crawler.spider) + if callback in self._callback_cache: + return self._callback_cache[callback] + + deps = self._get_deps(request) + if deps is None: + return None + + deps_key = json.dumps(deps, sort_keys=True).encode() + self._callback_cache[callback] = hashlib.sha1(deps_key).digest() + return self._callback_cache[callback] + + def fingerprint(self, request: Request) -> bytes: + if request in self._request_cache: + return self._request_cache[request] + fingerprint = self._fallback_request_fingerprinter.fingerprint(request) + deps_fingerprint = self.fingerprint_deps(request) + if deps_fingerprint is None: + return fingerprint + fingerprints = fingerprint + deps_fingerprint + self._request_cache[request] = hashlib.sha1(fingerprints).digest() + return self._request_cache[request] diff --git a/tests/__init__.py b/tests/__init__.py index 54de4c80..e0ba6150 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,8 +1,44 @@ import os +from scrapy import Spider +from scrapy.crawler import Crawler +from scrapy.utils.test import get_crawler as _get_crawler + # Note that tox.ini should only set the REACTOR env variable when running # pytest with "--reactor=asyncio". if os.environ.get("REACTOR", "") == "asyncio": from scrapy.utils.reactor import install_reactor install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") + + +def get_download_handler(crawler, schema): + return crawler.engine.downloader.handlers._get_handler(schema) + + +def setup_crawler_engine(crawler: Crawler): + """Run the crawl steps until engine setup, so that crawler.engine is not + None. + + https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122 + """ + + crawler.crawling = True + crawler.spider = crawler._create_spider() + crawler.engine = crawler._create_engine() + + handler = get_download_handler(crawler, "https") + if hasattr(handler, "engine_started"): + handler.engine_started() + + +class DummySpider(Spider): + name = "dummy" + + +def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True): + settings = settings or {} + crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls) + if setup_engine: + setup_crawler_engine(crawler) + return crawler diff --git a/tests/test_request_fingerprinting.py b/tests/test_request_fingerprinting.py new file mode 100644 index 00000000..57564bf2 --- /dev/null +++ b/tests/test_request_fingerprinting.py @@ -0,0 +1,189 @@ +import sys + +import pytest +from packaging.version import Version +from scrapy import __version__ as SCRAPY_VERSION + +if Version(SCRAPY_VERSION) < Version("2.7"): + pytest.skip("Skipping tests for Scrapy < 2.7", allow_module_level=True) + +from importlib.metadata import version as package_version + +from scrapy import Request, Spider +from web_poet import ItemPage, WebPage + +from scrapy_poet import ScrapyPoetRequestFingerprinter + +from . 
import get_crawler as _get_crawler + +ANDI_VERSION = Version(package_version("andi")) + +SETTINGS = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + }, + "REQUEST_FINGERPRINTER_CLASS": ScrapyPoetRequestFingerprinter, +} + + +def get_crawler(spider_cls=None, settings=None): + settings = SETTINGS if settings is None else settings + kwargs = {} + if spider_cls is not None: + kwargs["spider_cls"] = spider_cls + return _get_crawler(settings=settings, **kwargs) + + +def test_no_deps_vs_dep(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_page(self, response, page: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com") + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 != fingerprint2 + + +def test_same_deps(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_a(self, response, a: WebPage): + pass + + async def parse_b(self, response, b: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 == fingerprint2 + + +def test_same_deps_different_callbacks(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_page(self, response, page: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 == fingerprint2 + + +def test_different_deps(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_item(self, response, item: ItemPage): + pass + + async def parse_web(self, response, web: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_item) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_web) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 != fingerprint2 + + +@pytest.mark.skipif( + sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9" +) +@pytest.mark.skipif( + ANDI_VERSION <= Version("0.4.1"), + reason="https://github.com/scrapinghub/andi/pull/25", +) +def test_different_annotations(): + from typing import Annotated + + class TestSpider(Spider): + name = "test_spider" + + async def parse_a(self, response, a: Annotated[WebPage, "a"]): + pass + + async def parse_b(self, response, b: Annotated[WebPage, "b"]): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", 
callback=crawler.spider.parse_b) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 != fingerprint2 + + +def test_fallback_default(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_page(self, response, page: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + fallback_fingerprinter = ( + crawler.request_fingerprinter._fallback_request_fingerprinter + ) + + request1 = Request("https://toscrape.com") + fingerprint1 = fingerprinter.fingerprint(request1) + fallback_fingerprint = fallback_fingerprinter.fingerprint(request1) + assert fingerprint1 == fallback_fingerprint + + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fallback_fingerprint == fallback_fingerprinter.fingerprint(request2) + assert fingerprint2 != fallback_fingerprint + + +def test_fallback_custom(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_page(self, response, page: WebPage): + pass + + class CustomFingerprinter: + def fingerprint(self, request): + return b"foo" + + settings = { + **SETTINGS, + "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS": CustomFingerprinter, + } + crawler = get_crawler(spider_cls=TestSpider, settings=settings) + fingerprinter = crawler.request_fingerprinter + + request = Request("https://example.com") + assert fingerprinter.fingerprint(request) == b"foo" + request = Request("https://example.com", callback=crawler.spider.parse_page) + assert fingerprinter.fingerprint(request) != b"foo" + + +def test_missing_middleware(): + settings = {**SETTINGS, "DOWNLOADER_MIDDLEWARES": {}} + crawler = get_crawler(settings=settings) + fingerprinter = crawler.request_fingerprinter + request = Request("https://example.com") + with pytest.raises(RuntimeError): + fingerprinter.fingerprint(request) From 1e8af9a5993196c7f072f57b2e70eabe5d9bc94f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Nov 2023 15:21:20 +0100 Subject: [PATCH 02/19] Fix circular dependency caused by isort --- pyproject.toml | 2 ++ scrapy_poet/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 530b2f7a..f8e62ea7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,8 @@ line-length = 88 [tool.isort] profile = "black" multi_line_output = 3 +# scrapy_poet/__init__.py: Automatic sorting causes circular dependencies. 
+skip = ["scrapy_poet/__init__.py"] [[tool.mypy.overrides]] module = [ diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py index 215a40dc..bd50c143 100644 --- a/scrapy_poet/__init__.py +++ b/scrapy_poet/__init__.py @@ -1,5 +1,5 @@ -from ._request_fingerprinter import ScrapyPoetRequestFingerprinter from .api import DummyResponse, callback_for from .downloadermiddlewares import InjectionMiddleware from .page_input_providers import HttpResponseProvider, PageObjectInputProvider from .spidermiddlewares import RetryMiddleware +from ._request_fingerprinter import ScrapyPoetRequestFingerprinter From c3bb97d1c89711407fe95896fcc0c64a415f4b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Nov 2023 15:24:07 +0100 Subject: [PATCH 03/19] Fix test naming --- tests/test_request_fingerprinting.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_request_fingerprinting.py b/tests/test_request_fingerprinting.py index 57564bf2..2437a81f 100644 --- a/tests/test_request_fingerprinting.py +++ b/tests/test_request_fingerprinting.py @@ -54,17 +54,14 @@ def test_same_deps(): class TestSpider(Spider): name = "test_spider" - async def parse_a(self, response, a: WebPage): - pass - - async def parse_b(self, response, b: WebPage): + async def parse_page(self, response, page: WebPage): pass crawler = get_crawler(spider_cls=TestSpider) fingerprinter = crawler.request_fingerprinter - request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_page) fingerprint1 = fingerprinter.fingerprint(request1) - request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) fingerprint2 = fingerprinter.fingerprint(request2) assert fingerprint1 == fingerprint2 @@ -73,14 +70,17 @@ def test_same_deps_different_callbacks(): class TestSpider(Spider): name = "test_spider" - async def parse_page(self, response, page: WebPage): + async def parse_a(self, response, a: WebPage): + pass + + async def parse_b(self, response, b: WebPage): pass crawler = get_crawler(spider_cls=TestSpider) fingerprinter = crawler.request_fingerprinter - request1 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) fingerprint1 = fingerprinter.fingerprint(request1) - request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) fingerprint2 = fingerprinter.fingerprint(request2) assert fingerprint1 == fingerprint2 From a1d9d04fc7d61a04b76afc499e02ba7aefd9f3d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Nov 2023 15:25:24 +0100 Subject: [PATCH 04/19] =?UTF-8?q?fallback=20=E2=86=92=20base?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_poet/_request_fingerprinter.py | 4 ++-- tests/test_request_fingerprinting.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index aaee5745..f26ca7b2 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -27,7 +27,7 @@ def from_crawler(cls, crawler): def __init__(self, crawler: Crawler) -> None: settings = crawler.settings - 
self._fallback_request_fingerprinter = create_instance( + self._base_request_fingerprinter = create_instance( load_object( settings.get( "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS", @@ -82,7 +82,7 @@ def fingerprint_deps(self, request: Request) -> Optional[bytes]: def fingerprint(self, request: Request) -> bytes: if request in self._request_cache: return self._request_cache[request] - fingerprint = self._fallback_request_fingerprinter.fingerprint(request) + fingerprint = self._base_request_fingerprinter.fingerprint(request) deps_fingerprint = self.fingerprint_deps(request) if deps_fingerprint is None: return fingerprint diff --git a/tests/test_request_fingerprinting.py b/tests/test_request_fingerprinting.py index 2437a81f..257a0b4a 100644 --- a/tests/test_request_fingerprinting.py +++ b/tests/test_request_fingerprinting.py @@ -132,7 +132,7 @@ async def parse_b(self, response, b: Annotated[WebPage, "b"]): assert fingerprint1 != fingerprint2 -def test_fallback_default(): +def test_base_default(): class TestSpider(Spider): name = "test_spider" @@ -141,22 +141,20 @@ async def parse_page(self, response, page: WebPage): crawler = get_crawler(spider_cls=TestSpider) fingerprinter = crawler.request_fingerprinter - fallback_fingerprinter = ( - crawler.request_fingerprinter._fallback_request_fingerprinter - ) + base_fingerprinter = crawler.request_fingerprinter._base_request_fingerprinter request1 = Request("https://toscrape.com") fingerprint1 = fingerprinter.fingerprint(request1) - fallback_fingerprint = fallback_fingerprinter.fingerprint(request1) - assert fingerprint1 == fallback_fingerprint + base_fingerprint = base_fingerprinter.fingerprint(request1) + assert fingerprint1 == base_fingerprint request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) fingerprint2 = fingerprinter.fingerprint(request2) - assert fallback_fingerprint == fallback_fingerprinter.fingerprint(request2) - assert fingerprint2 != fallback_fingerprint + assert base_fingerprint == base_fingerprinter.fingerprint(request2) + assert fingerprint2 != base_fingerprint -def test_fallback_custom(): +def test_base_custom(): class TestSpider(Spider): name = "test_spider" From d740418237775e387b2d8f73fa0633ecb6bb4eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Nov 2023 12:28:10 +0100 Subject: [PATCH 05/19] Do not let dependency order affect fingerprints --- scrapy_poet/_request_fingerprinter.py | 2 +- tests/test_request_fingerprinting.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index f26ca7b2..c5223043 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -62,7 +62,7 @@ def _get_deps(self, request: Request) -> Optional[List[str]]: root_deps = plan[-1][1] if not root_deps: return None - return [repr(cls) for cls in root_deps.values()] + return sorted([repr(cls) for cls in root_deps.values()]) def fingerprint_deps(self, request: Request) -> Optional[bytes]: """Return a fingerprint based on dependencies requested through diff --git a/tests/test_request_fingerprinting.py b/tests/test_request_fingerprinting.py index 257a0b4a..3dd74c3d 100644 --- a/tests/test_request_fingerprinting.py +++ b/tests/test_request_fingerprinting.py @@ -85,6 +85,25 @@ async def parse_b(self, response, b: WebPage): assert fingerprint1 == fingerprint2 +def test_same_deps_different_order(): + class TestSpider(Spider): + name = "test_spider" + + 
async def parse_a(self, response, a: WebPage, b: ItemPage): + pass + + async def parse_b(self, response, a: ItemPage, b: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 == fingerprint2 + + def test_different_deps(): class TestSpider(Spider): name = "test_spider" From 6bb32a22fdf761c2c2c77e95bd2db15b6de57612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Nov 2023 12:31:35 +0100 Subject: [PATCH 06/19] =?UTF-8?q?test=20file:=20fingerprinting=20=E2=86=92?= =?UTF-8?q?=20fingerprinter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...st_request_fingerprinting.py => test_request_fingerprinter.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_request_fingerprinting.py => test_request_fingerprinter.py} (100%) diff --git a/tests/test_request_fingerprinting.py b/tests/test_request_fingerprinter.py similarity index 100% rename from tests/test_request_fingerprinting.py rename to tests/test_request_fingerprinter.py From 99ba579a3d25454774c01f44a02df15e60e1a0eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Nov 2023 12:47:37 +0100 Subject: [PATCH 07/19] Test caching --- tests/test_request_fingerprinter.py | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 3dd74c3d..db9597ef 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -1,4 +1,5 @@ import sys +from unittest.mock import patch import pytest from packaging.version import Version @@ -204,3 +205,39 @@ def test_missing_middleware(): request = Request("https://example.com") with pytest.raises(RuntimeError): fingerprinter.fingerprint(request) + + +def test_callback_cache(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_page(self, response, page: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + to_wrap = fingerprinter._get_deps + request1 = Request("https://example.com", callback=crawler.spider.parse_page) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + with patch.object(fingerprinter, "_get_deps", wraps=to_wrap) as mock: + fingerprinter.fingerprint(request1) + fingerprinter.fingerprint(request2) + mock.assert_called_once_with(request1) + + +def test_request_cache(): + class TestSpider(Spider): + name = "test_spider" + + async def parse_page(self, response, page: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + base_fingerprinter = fingerprinter._base_request_fingerprinter + to_wrap = base_fingerprinter.fingerprint + request = Request("https://example.com", callback=crawler.spider.parse_page) + with patch.object(base_fingerprinter, "fingerprint", wraps=to_wrap) as mock: + fingerprinter.fingerprint(request) + fingerprinter.fingerprint(request) + mock.assert_called_once_with(request) From cc2bb243a9a5f4b7418f38cac51612f8e59f4870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 13 Nov 2023 17:50:00 +0100 Subject: [PATCH 
08/19] =?UTF-8?q?tests/=5F=5Finit=5F=5F.py=20=E2=86=92=20s?= =?UTF-8?q?crapy=5Fpoet/utils/testing.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_poet/utils/testing.py | 34 ++++++++++++++++++++++++++- tests/__init__.py | 36 ----------------------------- tests/test_request_fingerprinter.py | 3 +-- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/scrapy_poet/utils/testing.py b/scrapy_poet/utils/testing.py index 394fccd8..b334c8cb 100644 --- a/scrapy_poet/utils/testing.py +++ b/scrapy_poet/utils/testing.py @@ -3,11 +3,12 @@ from typing import Dict from unittest import mock -from scrapy import signals +from scrapy import Spider, signals from scrapy.crawler import Crawler from scrapy.exceptions import CloseSpider from scrapy.settings import Settings from scrapy.utils.python import to_bytes +from scrapy.utils.test import get_crawler as _get_crawler from twisted.internet import reactor from twisted.internet.defer import inlineCallbacks from twisted.internet.task import deferLater @@ -151,6 +152,10 @@ def crawl_single_item( return item, url, crawler +def get_download_handler(crawler, schema): + return crawler.engine.downloader.handlers._get_handler(schema) + + def make_crawler(spider_cls, settings): if not getattr(spider_cls, "name", None): @@ -163,6 +168,33 @@ class Spider(spider_cls): return Crawler(spider_cls, settings) +def setup_crawler_engine(crawler: Crawler): + """Run the crawl steps until engine setup, so that crawler.engine is not + None. + https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122 + """ + + crawler.crawling = True + crawler.spider = crawler._create_spider() + crawler.engine = crawler._create_engine() + + handler = get_download_handler(crawler, "https") + if hasattr(handler, "engine_started"): + handler.engine_started() + + +class DummySpider(Spider): + name = "dummy" + + +def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True): + settings = settings or {} + crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls) + if setup_engine: + setup_crawler_engine(crawler) + return crawler + + class CollectorPipeline: def open_spider(self, spider): spider.collected_items = [] diff --git a/tests/__init__.py b/tests/__init__.py index e0ba6150..54de4c80 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,44 +1,8 @@ import os -from scrapy import Spider -from scrapy.crawler import Crawler -from scrapy.utils.test import get_crawler as _get_crawler - # Note that tox.ini should only set the REACTOR env variable when running # pytest with "--reactor=asyncio". if os.environ.get("REACTOR", "") == "asyncio": from scrapy.utils.reactor import install_reactor install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") - - -def get_download_handler(crawler, schema): - return crawler.engine.downloader.handlers._get_handler(schema) - - -def setup_crawler_engine(crawler: Crawler): - """Run the crawl steps until engine setup, so that crawler.engine is not - None. 
- - https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122 - """ - - crawler.crawling = True - crawler.spider = crawler._create_spider() - crawler.engine = crawler._create_engine() - - handler = get_download_handler(crawler, "https") - if hasattr(handler, "engine_started"): - handler.engine_started() - - -class DummySpider(Spider): - name = "dummy" - - -def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True): - settings = settings or {} - crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls) - if setup_engine: - setup_crawler_engine(crawler) - return crawler diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index db9597ef..d19313eb 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -14,8 +14,7 @@ from web_poet import ItemPage, WebPage from scrapy_poet import ScrapyPoetRequestFingerprinter - -from . import get_crawler as _get_crawler +from scrapy_poet.utils.testing import get_crawler as _get_crawler ANDI_VERSION = Version(package_version("andi")) From 5bb01ec9b6bdd4191a37e4ab7fde214b5296273c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 13 Nov 2023 17:53:18 +0100 Subject: [PATCH 09/19] Merge 2 test cases with a lot of shared code --- tests/test_request_fingerprinter.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index d19313eb..2913f2c0 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -34,7 +34,7 @@ def get_crawler(spider_cls=None, settings=None): return _get_crawler(settings=settings, **kwargs) -def test_no_deps_vs_dep(): +def test_no_and_same_deps(): class TestSpider(Spider): name = "test_spider" @@ -47,23 +47,10 @@ async def parse_page(self, response, page: WebPage): fingerprint1 = fingerprinter.fingerprint(request1) request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) fingerprint2 = fingerprinter.fingerprint(request2) + request3 = Request("https://toscrape.com", callback=crawler.spider.parse_page) + fingerprint3 = fingerprinter.fingerprint(request3) assert fingerprint1 != fingerprint2 - - -def test_same_deps(): - class TestSpider(Spider): - name = "test_spider" - - async def parse_page(self, response, page: WebPage): - pass - - crawler = get_crawler(spider_cls=TestSpider) - fingerprinter = crawler.request_fingerprinter - request1 = Request("https://toscrape.com", callback=crawler.spider.parse_page) - fingerprint1 = fingerprinter.fingerprint(request1) - request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page) - fingerprint2 = fingerprinter.fingerprint(request2) - assert fingerprint1 == fingerprint2 + assert fingerprint2 == fingerprint3 def test_same_deps_different_callbacks(): From 2af611d8ed065f26922de240a93df9806f452b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 13 Nov 2023 17:56:43 +0100 Subject: [PATCH 10/19] Test different URLs --- tests/test_request_fingerprinter.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 2913f2c0..383af080 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -34,7 +34,7 @@ def get_crawler(spider_cls=None, settings=None): return _get_crawler(settings=settings, **kwargs) -def 
test_no_and_same_deps(): +def test_single_callback(): class TestSpider(Spider): name = "test_spider" @@ -49,8 +49,11 @@ async def parse_page(self, response, page: WebPage): fingerprint2 = fingerprinter.fingerprint(request2) request3 = Request("https://toscrape.com", callback=crawler.spider.parse_page) fingerprint3 = fingerprinter.fingerprint(request3) - assert fingerprint1 != fingerprint2 - assert fingerprint2 == fingerprint3 + request4 = Request("https://books.toscrape.com", callback=crawler.spider.parse_page) + fingerprint4 = fingerprinter.fingerprint(request4) + assert fingerprint1 != fingerprint2 # same url, no deps vs deps + assert fingerprint2 == fingerprint3 # same url, same callback + assert fingerprint2 != fingerprint4 # different url, same callback def test_same_deps_different_callbacks(): From dc400402ed7793a9ecaa3912ed6c1bc4988d3e9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 13 Nov 2023 18:52:14 +0100 Subject: [PATCH 11/19] Add tests for DummyResponse, responseless inputs and (lack of) dependency resolution --- tests/test_request_fingerprinter.py | 96 ++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 383af080..35f3cb5c 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -11,9 +11,10 @@ from importlib.metadata import version as package_version from scrapy import Request, Spider -from web_poet import ItemPage, WebPage +from scrapy.http import Response +from web_poet import HttpResponse, ItemPage, PageParams, RequestUrl, WebPage -from scrapy_poet import ScrapyPoetRequestFingerprinter +from scrapy_poet import DummyResponse, ScrapyPoetRequestFingerprinter from scrapy_poet.utils.testing import get_crawler as _get_crawler ANDI_VERSION = Version(package_version("andi")) @@ -113,6 +114,97 @@ async def parse_web(self, response, web: WebPage): assert fingerprint1 != fingerprint2 +def test_response_typing(): + """The type of the response parameter is ignored, even when it is + DummyResponse.""" + + class TestSpider(Spider): + name = "test_spider" + + async def parse_untyped(self, response, web: WebPage): + pass + + async def parse_typed(self, response: Response, web: WebPage): + pass + + async def parse_dummy(self, response: DummyResponse, web: WebPage): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_untyped) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_typed) + fingerprint2 = fingerprinter.fingerprint(request2) + request3 = Request("https://toscrape.com", callback=crawler.spider.parse_dummy) + fingerprint3 = fingerprinter.fingerprint(request3) + assert fingerprint1 == fingerprint2 + assert fingerprint1 == fingerprint3 + + +def test_responseless_inputs(): + """Inputs that have no impact on the actual requests sent because they do + not require sending a request at all are considered valid, different + dependencies for fingerprinting purposes nonetheless.""" + + class TestSpider(Spider): + name = "test_spider" + + async def parse_nothing(self, response: DummyResponse): + pass + + async def parse_page_params( + self, response: DummyResponse, page_params: PageParams + ): + # NOTE: requesting PageParams or not should not affect the request + # fingerprinting, setting page_params on the 
request should. + pass + + async def parse_request_url( + self, response: DummyResponse, request_url: RequestUrl + ): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_nothing) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request( + "https://toscrape.com", callback=crawler.spider.parse_page_params + ) + fingerprint2 = fingerprinter.fingerprint(request2) + request3 = Request( + "https://toscrape.com", callback=crawler.spider.parse_request_url + ) + fingerprint3 = fingerprinter.fingerprint(request3) + assert fingerprint1 != fingerprint2 + assert fingerprint1 != fingerprint3 + assert fingerprint2 != fingerprint3 + + +def test_dep_resolution(): + """We do not resolve dependencies, so it is possible for 2 callbacks that + when resolved have identical dependencies to get a different + fingerprint.""" + + class TestSpider(Spider): + name = "test_spider" + + async def parse_a(self, response, web: WebPage): + pass + + async def parse_b(self, response, web: WebPage, http_response: HttpResponse): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 != fingerprint2 + + @pytest.mark.skipif( sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9" ) From 3af6693e2766a9209154ffce8d259a31478ce2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 13 Nov 2023 19:46:53 +0100 Subject: [PATCH 12/19] Use fully-qualified names for dependency serialization --- scrapy_poet/_request_fingerprinter.py | 19 +++++++++++++++---- tests/test_request_fingerprinter.py | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index c5223043..9b9cbdac 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -1,15 +1,15 @@ -from typing import TYPE_CHECKING - try: from scrapy.utils.request import RequestFingerprinter # NOQA except ImportError: + from typing import TYPE_CHECKING + if not TYPE_CHECKING: ScrapyPoetRequestFingerprinter = None else: import hashlib import json from functools import cached_property - from typing import Callable, Dict, List, Optional + from typing import Callable, Dict, List, Optional, get_args, get_origin from weakref import WeakKeyDictionary from scrapy import Request @@ -20,6 +20,17 @@ from scrapy_poet import InjectionMiddleware from scrapy_poet.injection import get_callback + def _serialize_dep(cls): + try: + from typing import Annotated + except ImportError: + pass + else: + if get_origin(cls) is Annotated: + annotated, *annotations = get_args(cls) + return f"{_serialize_dep(annotated)}{repr(annotations)}" + return f"{cls.__module__}.{cls.__qualname__}" + class ScrapyPoetRequestFingerprinter: @classmethod def from_crawler(cls, crawler): @@ -62,7 +73,7 @@ def _get_deps(self, request: Request) -> Optional[List[str]]: root_deps = plan[-1][1] if not root_deps: return None - return sorted([repr(cls) for cls in root_deps.values()]) + return sorted([_serialize_dep(cls) for cls in root_deps.values()]) def fingerprint_deps(self, request: Request) 
-> Optional[bytes]: """Return a fingerprint based on dependencies requested through diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 35f3cb5c..61f6af9d 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -15,6 +15,7 @@ from web_poet import HttpResponse, ItemPage, PageParams, RequestUrl, WebPage from scrapy_poet import DummyResponse, ScrapyPoetRequestFingerprinter +from scrapy_poet._request_fingerprinter import _serialize_dep from scrapy_poet.utils.testing import get_crawler as _get_crawler ANDI_VERSION = Version(package_version("andi")) @@ -233,6 +234,22 @@ async def parse_b(self, response, b: Annotated[WebPage, "b"]): assert fingerprint1 != fingerprint2 +def test_serialize_dep(): + assert _serialize_dep(HttpResponse) == "web_poet.page_inputs.http.HttpResponse" + + +@pytest.mark.skipif( + sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9" +) +def test_serialize_dep_annotated(): + from typing import Annotated + + assert ( + _serialize_dep(Annotated[HttpResponse, "foo"]) + == "web_poet.page_inputs.http.HttpResponse['foo']" + ) + + def test_base_default(): class TestSpider(Spider): name = "test_spider" From a611fd2714f60135af8299d184697dda1f9c3ac0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Nov 2023 11:18:06 +0100 Subject: [PATCH 13/19] Ignore some (unannotated) page input dependencies --- scrapy_poet/_request_fingerprinter.py | 35 ++++- tests/test_request_fingerprinter.py | 183 ++++++++++++++++++++++---- 2 files changed, 188 insertions(+), 30 deletions(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index 9b9cbdac..f341fee7 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -16,6 +16,15 @@ from scrapy.crawler import Crawler from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS from scrapy.utils.misc import create_instance, load_object + from web_poet import ( + HttpClient, + HttpRequest, + HttpRequestBody, + HttpRequestHeaders, + PageParams, + RequestUrl, + Stats, + ) from scrapy_poet import InjectionMiddleware from scrapy_poet.injection import get_callback @@ -32,6 +41,22 @@ def _serialize_dep(cls): return f"{cls.__module__}.{cls.__qualname__}" class ScrapyPoetRequestFingerprinter: + + IGNORED_UNANNOTATED_DEPS = { + # These dependencies are tools for page objects that should have no + # bearing on the request itself. + HttpClient, + Stats, + # These dependencies do not impact the fingerprint as dependencies, + # it is their value on the request itself that should have an + # impact on the request fingerprint. 
+ HttpRequest, + HttpRequestBody, + HttpRequestHeaders, + PageParams, + RequestUrl, + } + @classmethod def from_crawler(cls, crawler): return cls(crawler) @@ -73,7 +98,13 @@ def _get_deps(self, request: Request) -> Optional[List[str]]: root_deps = plan[-1][1] if not root_deps: return None - return sorted([_serialize_dep(cls) for cls in root_deps.values()]) + return sorted( + [ + _serialize_dep(cls) + for cls in root_deps.values() + if cls not in self.IGNORED_UNANNOTATED_DEPS + ] + ) def fingerprint_deps(self, request: Request) -> Optional[bytes]: """Return a fingerprint based on dependencies requested through @@ -83,7 +114,7 @@ def fingerprint_deps(self, request: Request) -> Optional[bytes]: return self._callback_cache[callback] deps = self._get_deps(request) - if deps is None: + if not deps: return None deps_key = json.dumps(deps, sort_keys=True).encode() diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 61f6af9d..d4aeda05 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -1,4 +1,6 @@ import sys +from itertools import combinations +from typing import Callable, Set from unittest.mock import patch import pytest @@ -12,10 +14,30 @@ from scrapy import Request, Spider from scrapy.http import Response -from web_poet import HttpResponse, ItemPage, PageParams, RequestUrl, WebPage +from scrapy.utils.misc import load_object +from web_poet import ( + BrowserHtml, + BrowserResponse, + HttpClient, + HttpRequest, + HttpRequestBody, + HttpRequestHeaders, + HttpResponse, + HttpResponseBody, + HttpResponseHeaders, + ItemPage, + PageParams, + RequestUrl, + ResponseUrl, + Stats, + WebPage, +) from scrapy_poet import DummyResponse, ScrapyPoetRequestFingerprinter from scrapy_poet._request_fingerprinter import _serialize_dep +from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS +from scrapy_poet.injection import Injector, is_class_provided_by_any_provider_fn +from scrapy_poet.page_input_providers import PageObjectInputProvider from scrapy_poet.utils.testing import get_crawler as _get_crawler ANDI_VERSION = Version(package_version("andi")) @@ -28,14 +50,50 @@ } -def get_crawler(spider_cls=None, settings=None): - settings = SETTINGS if settings is None else settings +def get_crawler(spider_cls=None, settings=None, ensure_providers_for=None): + settings = {**SETTINGS} if settings is None else settings + kwargs = {} if spider_cls is not None: kwargs["spider_cls"] = spider_cls + + ensure_providers_for = ensure_providers_for or tuple() + if ensure_providers_for: + dummy_providers = get_dummy_providers(*ensure_providers_for) + if dummy_providers: + settings["SCRAPY_POET_PROVIDERS"] = { + provider: 0 for provider in dummy_providers + } + return _get_crawler(settings=settings, **kwargs) +dummy_injector = Injector(crawler=get_crawler()) +default_providers = [load_object(cls)(dummy_injector) for cls in DEFAULT_PROVIDERS] +is_class_provided_by_any_default_provider = is_class_provided_by_any_provider_fn( + default_providers +) + + +def get_dummy_providers(*input_classes): + dummy_providers = [] + + for input_cls in input_classes: + if is_class_provided_by_any_default_provider(input_cls): + continue + + class DummyProvider(PageObjectInputProvider): + provided_classes = {input_cls} + + def __call__(self, to_provide: Set[Callable]): + input_cls = next(iter(self.provided_classes)) + return [input_cls()] + + dummy_providers.append(DummyProvider) + + return dummy_providers + + def test_single_callback(): class TestSpider(Spider): 
name = "test_spider" @@ -117,7 +175,8 @@ async def parse_web(self, response, web: WebPage): def test_response_typing(): """The type of the response parameter is ignored, even when it is - DummyResponse.""" + DummyResponse. It’s the other injected parameters that should alter the + fingerprint.""" class TestSpider(Spider): name = "test_spider" @@ -143,40 +202,102 @@ async def parse_dummy(self, response: DummyResponse, web: WebPage): assert fingerprint1 == fingerprint3 -def test_responseless_inputs(): - """Inputs that have no impact on the actual requests sent because they do - not require sending a request at all are considered valid, different - dependencies for fingerprinting purposes nonetheless.""" +@pytest.mark.parametrize( + "input_cls", + ( + HttpClient, + HttpRequest, + HttpRequestBody, + HttpRequestHeaders, + PageParams, + RequestUrl, + Stats, + ), +) +def test_ignored_unannotated_page_inputs(input_cls): + """These web-poet page input classes, unless annotated, cannot have any + bearing on the request on their own, so they should not alter the request + fingerprint.""" class TestSpider(Spider): name = "test_spider" - async def parse_nothing(self, response: DummyResponse): + async def parse_input(self, response, some_input: input_cls): pass - async def parse_page_params( - self, response: DummyResponse, page_params: PageParams - ): - # NOTE: requesting PageParams or not should not affect the request - # fingerprinting, setting page_params on the request should. - pass + crawler = get_crawler(spider_cls=TestSpider, ensure_providers_for=[input_cls]) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com") + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_input) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 == fingerprint2 + + +# Inputs that affect the fingerprint. +# +# We do not try to be smart. e.g. although ResponseUrl should always be +# available, that could technically not be the case given a custom user +# provider. 
+FINGERPRINTING_INPUTS = ( + BrowserHtml, + BrowserResponse, + HttpResponse, + HttpResponseBody, + HttpResponseHeaders, + ResponseUrl, +) + - async def parse_request_url( - self, response: DummyResponse, request_url: RequestUrl - ): +@pytest.mark.parametrize("input_cls", FINGERPRINTING_INPUTS) +def test_fingerprinting_unannotated_page_inputs(input_cls): + """Inputs that may have an impact on the actual request sent even without + annotations.""" + + class TestSpider(Spider): + name = "test_spider" + + async def parse_input(self, response, some_input: input_cls): pass - crawler = get_crawler(spider_cls=TestSpider) + crawler = get_crawler(spider_cls=TestSpider, ensure_providers_for=[input_cls]) fingerprinter = crawler.request_fingerprinter - request1 = Request("https://toscrape.com", callback=crawler.spider.parse_nothing) + request1 = Request("https://toscrape.com") fingerprint1 = fingerprinter.fingerprint(request1) - request2 = Request( - "https://toscrape.com", callback=crawler.spider.parse_page_params - ) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_input) fingerprint2 = fingerprinter.fingerprint(request2) - request3 = Request( - "https://toscrape.com", callback=crawler.spider.parse_request_url + assert fingerprint1 != fingerprint2 + + +@pytest.mark.parametrize( + "input_cls_a, input_cls_b", + (tuple(combination) for combination in combinations(FINGERPRINTING_INPUTS, 2)), +) +def test_fingerprinting_unannotated_page_input_combinations(input_cls_a, input_cls_b): + """Make sure that a combination of known inputs that alter the request + fingerprint does not result in the same fingerprint.""" + + class TestSpider(Spider): + name = "test_spider" + + async def parse_a(self, response, input_a: input_cls_a): + pass + + async def parse_b(self, response, input_b: input_cls_b): + pass + + async def parse_all(self, response, input_a: input_cls_a, input_b: input_cls_b): + pass + + crawler = get_crawler( + spider_cls=TestSpider, ensure_providers_for=[input_cls_a, input_cls_b] ) + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) + fingerprint2 = fingerprinter.fingerprint(request2) + request3 = Request("https://toscrape.com", callback=crawler.spider.parse_all) fingerprint3 = fingerprinter.fingerprint(request3) assert fingerprint1 != fingerprint2 assert fingerprint1 != fingerprint3 @@ -184,9 +305,15 @@ async def parse_request_url( def test_dep_resolution(): - """We do not resolve dependencies, so it is possible for 2 callbacks that - when resolved have identical dependencies to get a different - fingerprint.""" + """Currently we do not resolve dependencies, so it is possible to get a + different fingerprint for callbacks that resolve to identical + dependencies. + + The reason for not resolving dependencies is that it could be hard where + items are involved. This might be addressed in the future, in which case + we should consider to fingerprint based on leaf dependencies (those with a + provider) and not on the root dependencies present in the callback. 
+ """ class TestSpider(Spider): name = "test_spider" From 23b7c33922bc832c243c458510f6ec1236e0183e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Nov 2023 12:09:40 +0100 Subject: [PATCH 14/19] Count page params towards request fingerprinting --- scrapy_poet/_request_fingerprinter.py | 54 +++++++++++++++++++++++---- tests/test_request_fingerprinter.py | 32 ++++++++++++++++ 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index f341fee7..5eb25ba4 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -9,6 +9,7 @@ import hashlib import json from functools import cached_property + from logging import getLogger from typing import Callable, Dict, List, Optional, get_args, get_origin from weakref import WeakKeyDictionary @@ -29,6 +30,8 @@ from scrapy_poet import InjectionMiddleware from scrapy_poet.injection import get_callback + logger = getLogger(__name__) + def _serialize_dep(cls): try: from typing import Annotated @@ -78,6 +81,7 @@ def __init__(self, crawler: Crawler) -> None: WeakKeyDictionary() ) self._crawler: Crawler = crawler + self._saw_unserializable_page_params = False @cached_property def _injector(self): @@ -106,9 +110,10 @@ def _get_deps(self, request: Request) -> Optional[List[str]]: ] ) - def fingerprint_deps(self, request: Request) -> Optional[bytes]: - """Return a fingerprint based on dependencies requested through - scrapy-poet injection, or None if no injection was requested.""" + def get_deps_key(self, request: Request) -> Optional[bytes]: + """Return a JSON array as bytes that uniquely identifies the + dependencies requested through scrapy-poet injection that could + impact the request, or None if there are no such dependencies.""" callback = get_callback(request, self._crawler.spider) if callback in self._callback_cache: return self._callback_cache[callback] @@ -118,16 +123,49 @@ def fingerprint_deps(self, request: Request) -> Optional[bytes]: return None deps_key = json.dumps(deps, sort_keys=True).encode() - self._callback_cache[callback] = hashlib.sha1(deps_key).digest() + self._callback_cache[callback] = deps_key return self._callback_cache[callback] + def serialize_page_params(self, request: Request) -> Optional[bytes]: + """Returns a JSON object as bytes that represents the page params, + or None if there are no page params or they are not + JSON-serializable.""" + page_params = request.meta.get("page_params", None) + if not page_params: + return None + + try: + return json.dumps(page_params, sort_keys=True).encode() + except TypeError: + if not self._saw_unserializable_page_params: + self._saw_unserializable_page_params = True + logger.warning( + f"Cannot serialize page params {page_params!r} of " + f"request {request} as JSON. This can be an issue if " + f"you have requests that are identical except for " + f"their page params, because unserializable page " + f"params are treated the same as missing or empty " + f"page params for purposes of request fingerprinting " + f"(see " + f"https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprints). " + f"This will be the only warning about this issue, " + f"other requests might be also affected." 
+ ) + return None + def fingerprint(self, request: Request) -> bytes: if request in self._request_cache: return self._request_cache[request] + fingerprint = self._base_request_fingerprinter.fingerprint(request) - deps_fingerprint = self.fingerprint_deps(request) - if deps_fingerprint is None: + deps_key = self.get_deps_key(request) + serialized_page_params = self.serialize_page_params(request) + if deps_key is None and serialized_page_params is None: return fingerprint - fingerprints = fingerprint + deps_fingerprint - self._request_cache[request] = hashlib.sha1(fingerprints).digest() + if deps_key is not None: + fingerprint += deps_key + if serialized_page_params is not None: + fingerprint += serialized_page_params + + self._request_cache[request] = hashlib.sha1(fingerprint).digest() return self._request_cache[request] diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index d4aeda05..e56440a1 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -333,6 +333,38 @@ async def parse_b(self, response, web: WebPage, http_response: HttpResponse): assert fingerprint1 != fingerprint2 +def test_page_params(caplog): + class TestSpider(Spider): + name = "test_spider" + + Unserializable = object() + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + + request1 = Request("https://toscrape.com") + fingerprint1 = fingerprinter.fingerprint(request1) + + request2 = Request("https://toscrape.com", meta={"page_params": {"a": "1"}}) + fingerprint2 = fingerprinter.fingerprint(request2) + + request3 = Request("https://toscrape.com", meta={"page_params": {"a": "2"}}) + fingerprint3 = fingerprinter.fingerprint(request3) + + request4 = Request( + "https://toscrape.com", meta={"page_params": {"a": Unserializable}} + ) + assert "Cannot serialize page params" not in caplog.text + caplog.clear() + fingerprint4 = fingerprinter.fingerprint(request4) + assert "Cannot serialize page params" in caplog.text + + assert fingerprint1 != fingerprint2 + assert fingerprint1 != fingerprint3 + assert fingerprint2 != fingerprint3 + assert fingerprint1 == fingerprint4 + + @pytest.mark.skipif( sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9" ) From e8f0322b4be7af128d833ecedbfd4f85730df560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Nov 2023 12:25:33 +0100 Subject: [PATCH 15/19] Test that meta (empty page params and arbitrary keys) does not impact fingerprinting --- tests/test_request_fingerprinter.py | 36 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index e56440a1..4145d3dc 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -334,12 +334,9 @@ async def parse_b(self, response, web: WebPage, http_response: HttpResponse): def test_page_params(caplog): - class TestSpider(Spider): - name = "test_spider" - Unserializable = object() - crawler = get_crawler(spider_cls=TestSpider) + crawler = get_crawler() fingerprinter = crawler.request_fingerprinter request1 = Request("https://toscrape.com") @@ -352,17 +349,44 @@ class TestSpider(Spider): fingerprint3 = fingerprinter.fingerprint(request3) request4 = Request( + "https://toscrape.com", meta={"page_params": {"a": "2"}, "foo": "bar"} + ) + fingerprint4 = fingerprinter.fingerprint(request4) + + request5 = Request( "https://toscrape.com", meta={"page_params": 
{"a": Unserializable}} ) assert "Cannot serialize page params" not in caplog.text caplog.clear() - fingerprint4 = fingerprinter.fingerprint(request4) + fingerprint5 = fingerprinter.fingerprint(request5) assert "Cannot serialize page params" in caplog.text assert fingerprint1 != fingerprint2 assert fingerprint1 != fingerprint3 assert fingerprint2 != fingerprint3 - assert fingerprint1 == fingerprint4 + assert fingerprint3 == fingerprint4 + assert fingerprint1 == fingerprint5 + + +@pytest.mark.parametrize( + "meta", + ( + {}, + {"page_params": None}, + {"page_params": {}}, + {"foo": "bar"}, + {"foo": "bar", "page_params": None}, + {"foo": "bar", "page_params": {}}, + ), +) +def test_meta(meta): + crawler = get_crawler() + fingerprinter = crawler.request_fingerprinter + request1 = Request("https://toscrape.com") + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request("https://toscrape.com", meta=meta) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 == fingerprint2 @pytest.mark.skipif( From e2c5b17fefd0d6168d032e524ebb04a3c7efdfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 13 Dec 2023 16:45:34 +0100 Subject: [PATCH 16/19] Address feedback --- scrapy_poet/_request_fingerprinter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index 5eb25ba4..7f1c2db4 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -26,6 +26,7 @@ RequestUrl, Stats, ) + from web_poet.utils import get_fq_class_name from scrapy_poet import InjectionMiddleware from scrapy_poet.injection import get_callback @@ -41,7 +42,7 @@ def _serialize_dep(cls): if get_origin(cls) is Annotated: annotated, *annotations = get_args(cls) return f"{_serialize_dep(annotated)}{repr(annotations)}" - return f"{cls.__module__}.{cls.__qualname__}" + return get_fq_class_name(cls) class ScrapyPoetRequestFingerprinter: @@ -120,6 +121,7 @@ def get_deps_key(self, request: Request) -> Optional[bytes]: deps = self._get_deps(request) if not deps: + self._callback_cache[callback] = None return None deps_key = json.dumps(deps, sort_keys=True).encode() @@ -127,7 +129,7 @@ def get_deps_key(self, request: Request) -> Optional[bytes]: return self._callback_cache[callback] def serialize_page_params(self, request: Request) -> Optional[bytes]: - """Returns a JSON object as bytes that represents the page params, + """Return a JSON object as bytes that represents the page params, or None if there are no page params or they are not JSON-serializable.""" page_params = request.meta.get("page_params", None) From a72f7c2591677f1d5586dcf0b176f91664cea1b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 13 Dec 2023 16:49:42 +0100 Subject: [PATCH 17/19] Fix typing --- scrapy_poet/_request_fingerprinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index 7f1c2db4..674b3b81 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -77,7 +77,7 @@ def __init__(self, crawler: Crawler) -> None: settings=crawler.settings, crawler=crawler, ) - self._callback_cache: Dict[Callable, bytes] = {} + self._callback_cache: Dict[Callable, Optional[bytes]] = {} self._request_cache: "WeakKeyDictionary[Request, bytes]" = ( WeakKeyDictionary() ) From 596431b1c271c13144b1fd45546261b53bba5cb3 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Tue, 26 Dec 2023 20:31:08 +0100
Subject: [PATCH 18/19] Document the use of repr() for annotation
 fingerprinting

---
 docs/providers.rst | 3 +++
 docs/settings.rst  | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/docs/providers.rst b/docs/providers.rst
index 6fbbf2fe..a7433dcf 100644
--- a/docs/providers.rst
+++ b/docs/providers.rst
@@ -313,6 +313,9 @@ To have other settings respected, in addition to ``CONCURRENT_REQUESTS``, you'd
 need to use ``crawler.engine.download`` or something like that. Alternatively,
 you could implement those limits in the library itself.

+
+.. _annotated:
+
 Attaching metadata to dependencies
 ==================================

diff --git a/docs/settings.rst b/docs/settings.rst
index 9fdc3ef1..0c2642ba 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -122,3 +122,8 @@ custom request fingerprinter class to use for requests.
 This class is used to generate a base fingerprint for a request. If that
 request uses dependency injection, that fingerprint is then modified to account
 for requested dependencies. Otherwise, the fingerprint is used as is.
+
+.. note:: Annotations of :ref:`annotated dependencies <annotated>` are
+    serialized with :func:`repr` for fingerprinting purposes. If you find a
+    real-world scenario where this is a problem, please `open an issue
+    <https://github.com/scrapinghub/scrapy-poet/issues>`_.

From 55326a4fd5e6c0643853829bb197ac0033adbfa7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Tue, 26 Dec 2023 20:52:55 +0100
Subject: [PATCH 19/19] Implement dependency resolution

---
 scrapy_poet/_request_fingerprinter.py | 12 +++---------
 tests/test_request_fingerprinter.py   | 12 +-----------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py
index 674b3b81..f6593a16 100644
--- a/scrapy_poet/_request_fingerprinter.py
+++ b/scrapy_poet/_request_fingerprinter.py
@@ -100,16 +100,10 @@ def _get_deps(self, request: Request) -> Optional[List[str]]:
             dependencies requested by the request, or None if dependency injection
             is not required."""
             plan = self._injector.build_plan(request)
-            root_deps = plan[-1][1]
-            if not root_deps:
+            deps = {dep for dep, params in plan[:-1]} - self.IGNORED_UNANNOTATED_DEPS
+            if not deps:
                 return None
-            return sorted(
-                [
-                    _serialize_dep(cls)
-                    for cls in root_deps.values()
-                    if cls not in self.IGNORED_UNANNOTATED_DEPS
-                ]
-            )
+            return sorted([_serialize_dep(cls) for cls in deps])

         def get_deps_key(self, request: Request) -> Optional[bytes]:
             """Return a JSON array as bytes that uniquely identifies the
diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py
index 4145d3dc..b329590d 100644
--- a/tests/test_request_fingerprinter.py
+++ b/tests/test_request_fingerprinter.py
@@ -305,16 +305,6 @@ async def parse_all(self, response, input_a: input_cls_a, input_b: input_cls_b):


 def test_dep_resolution():
-    """Currently we do not resolve dependencies, so it is possible to get a
-    different fingerprint for callbacks that resolve to identical
-    dependencies.
-
-    The reason for not resolving dependencies is that it could be hard where
-    items are involved. This might be addressed in the future, in which case
-    we should consider to fingerprint based on leaf dependencies (those with a
-    provider) and not on the root dependencies present in the callback.
- """ - class TestSpider(Spider): name = "test_spider" @@ -330,7 +320,7 @@ async def parse_b(self, response, web: WebPage, http_response: HttpResponse): fingerprint1 = fingerprinter.fingerprint(request1) request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b) fingerprint2 = fingerprinter.fingerprint(request2) - assert fingerprint1 != fingerprint2 + assert fingerprint1 == fingerprint2 def test_page_params(caplog):