diff --git a/README.rst b/README.rst
index 548aeaf0..d8011864 100644
--- a/README.rst
+++ b/README.rst
@@ -63,6 +63,7 @@ Add the following inside Scrapy's ``settings.py`` file:
     SPIDER_MIDDLEWARES = {
         "scrapy_poet.RetryMiddleware": 275,
     }
+    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

 Developing
 ==========
diff --git a/docs/intro/install.rst b/docs/intro/install.rst
index 9011a252..c64bd386 100644
--- a/docs/intro/install.rst
+++ b/docs/intro/install.rst
@@ -32,6 +32,7 @@ of your Scrapy project:
     SPIDER_MIDDLEWARES = {
         "scrapy_poet.RetryMiddleware": 275,
     }
+    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

 Things that are good to know
 ============================
diff --git a/docs/providers.rst b/docs/providers.rst
index 6fbbf2fe..a7433dcf 100644
--- a/docs/providers.rst
+++ b/docs/providers.rst
@@ -313,6 +313,9 @@ To have other settings respected, in addition to ``CONCURRENT_REQUESTS``, you'd
 need to use ``crawler.engine.download`` or something like that. Alternatively,
 you could implement those limits in the library itself.

+
+.. _annotated:
+
 Attaching metadata to dependencies
 ==================================

diff --git a/docs/settings.rst b/docs/settings.rst
index 3cfaa9ee..0c2642ba 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -107,3 +107,23 @@ Sets the class, or its import path, that will be used as an adapter in the
 generated test fixtures.

 More info at :ref:`fixture-adapter`.
+
+
+SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS
+--------------------------------------------
+
+The default value is the default value of the ``REQUEST_FINGERPRINTER_CLASS``
+setting for the version of Scrapy currently installed (e.g.
+``"scrapy.utils.request.RequestFingerprinter"``).
+
+You can assign a custom request fingerprinter class to this setting to replace
+the default base request fingerprinter.
+
+This class is used to generate a base fingerprint for a request. If that
+request uses dependency injection, that fingerprint is then modified to account
+for requested dependencies. Otherwise, the fingerprint is used as is.
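+
+For example, assuming a hypothetical ``myproject.fingerprinting.MyFingerprinter``
+class that implements Scrapy's request fingerprinter protocol:
+
+.. code-block:: python
+
+    SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS = "myproject.fingerprinting.MyFingerprinter"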
+
+.. note:: Annotations of :ref:`annotated dependencies <annotated>` are
+    serialized with :func:`repr` for fingerprinting purposes. If you find a
+    real-world scenario where this is a problem, please `open an issue
+    <https://github.com/scrapinghub/scrapy-poet/issues>`_.
diff --git a/pyproject.toml b/pyproject.toml
index 530b2f7a..f8e62ea7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,6 +4,8 @@ line-length = 88

 [tool.isort]
 profile = "black"
 multi_line_output = 3
+# scrapy_poet/__init__.py: Automatic sorting causes circular dependencies.
+skip = ["scrapy_poet/__init__.py"]

 [[tool.mypy.overrides]]
 module = [
diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py
index 6b948a29..f3d76a1a 100644
--- a/scrapy_poet/__init__.py
+++ b/scrapy_poet/__init__.py
@@ -2,3 +2,4 @@
 from .downloadermiddlewares import InjectionMiddleware
 from .page_input_providers import HttpResponseProvider, PageObjectInputProvider
 from .spidermiddlewares import RetryMiddleware
+from ._request_fingerprinter import ScrapyPoetRequestFingerprinter
diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py
new file mode 100644
index 00000000..f6593a16
--- /dev/null
+++ b/scrapy_poet/_request_fingerprinter.py
@@ -0,0 +1,167 @@
+try:
+    from scrapy.utils.request import RequestFingerprinter  # NOQA
+except ImportError:
+    from typing import TYPE_CHECKING
+
+    if not TYPE_CHECKING:
+        ScrapyPoetRequestFingerprinter = None
+else:
+    import hashlib
+    import json
+    from functools import cached_property
+    from logging import getLogger
+    from typing import Callable, Dict, List, Optional, get_args, get_origin
+    from weakref import WeakKeyDictionary
+
+    from scrapy import Request
+    from scrapy.crawler import Crawler
+    from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
+    from scrapy.utils.misc import create_instance, load_object
+    from web_poet import (
+        HttpClient,
+        HttpRequest,
+        HttpRequestBody,
+        HttpRequestHeaders,
+        PageParams,
+        RequestUrl,
+        Stats,
+    )
+    from web_poet.utils import get_fq_class_name
+
+    from scrapy_poet import InjectionMiddleware
+    from scrapy_poet.injection import get_callback
+
+    logger = getLogger(__name__)
+
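+    # Serialize a dependency class for fingerprinting: its fully qualified
+    # class name, with the repr() of any typing.Annotated metadata appended.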
+    def _serialize_dep(cls):
+        try:
+            from typing import Annotated
+        except ImportError:
+            pass
+        else:
+            if get_origin(cls) is Annotated:
+                annotated, *annotations = get_args(cls)
+                return f"{_serialize_dep(annotated)}{repr(annotations)}"
+        return get_fq_class_name(cls)
+
+    class ScrapyPoetRequestFingerprinter:
+
+        IGNORED_UNANNOTATED_DEPS = {
+            # These dependencies are tools for page objects that should have no
+            # bearing on the request itself.
+            HttpClient,
+            Stats,
+            # These dependencies do not impact the fingerprint as dependencies,
+            # it is their value on the request itself that should have an
+            # impact on the request fingerprint.
+            HttpRequest,
+            HttpRequestBody,
+            HttpRequestHeaders,
+            PageParams,
+            RequestUrl,
+        }
+
+        @classmethod
+        def from_crawler(cls, crawler):
+            return cls(crawler)
+
+        def __init__(self, crawler: Crawler) -> None:
+            settings = crawler.settings
+            self._base_request_fingerprinter = create_instance(
+                load_object(
+                    settings.get(
+                        "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS",
+                        REQUEST_FINGERPRINTER_CLASS,
+                    )
+                ),
+                settings=crawler.settings,
+                crawler=crawler,
+            )
+            self._callback_cache: Dict[Callable, Optional[bytes]] = {}
+            self._request_cache: "WeakKeyDictionary[Request, bytes]" = (
+                WeakKeyDictionary()
+            )
+            self._crawler: Crawler = crawler
+            self._saw_unserializable_page_params = False
+
+        @cached_property
+        def _injector(self):
+            middlewares = self._crawler.engine.downloader.middleware.middlewares
+            for middleware in middlewares:
+                if isinstance(middleware, InjectionMiddleware):
+                    return middleware.injector
+            raise RuntimeError(
+                "scrapy_poet.InjectionMiddleware not found at run time, has it "
+                "been configured in the DOWNLOADER_MIDDLEWARES setting?"
+            )
+
+        def _get_deps(self, request: Request) -> Optional[List[str]]:
+            """Return a JSON-serializable structure that uniquely identifies the
+            dependencies requested by the request, or None if dependency
+            injection is not required."""
+            plan = self._injector.build_plan(request)
+            deps = {dep for dep, params in plan[:-1]} - self.IGNORED_UNANNOTATED_DEPS
+            if not deps:
+                return None
+            return sorted([_serialize_dep(cls) for cls in deps])
+
+        def get_deps_key(self, request: Request) -> Optional[bytes]:
+            """Return a JSON array as bytes that uniquely identifies the
+            dependencies requested through scrapy-poet injection that could
+            impact the request, or None if there are no such dependencies."""
+            callback = get_callback(request, self._crawler.spider)
+            if callback in self._callback_cache:
+                return self._callback_cache[callback]
+
+            deps = self._get_deps(request)
+            if not deps:
+                self._callback_cache[callback] = None
+                return None
+
+            deps_key = json.dumps(deps, sort_keys=True).encode()
+            self._callback_cache[callback] = deps_key
+            return self._callback_cache[callback]
+
+        def serialize_page_params(self, request: Request) -> Optional[bytes]:
+            """Return a JSON object as bytes that represents the page params,
+            or None if there are no page params or they are not
+            JSON-serializable."""
+            page_params = request.meta.get("page_params", None)
+            if not page_params:
+                return None
+
+            try:
+                return json.dumps(page_params, sort_keys=True).encode()
+            except TypeError:
+                if not self._saw_unserializable_page_params:
+                    self._saw_unserializable_page_params = True
+                    logger.warning(
+                        f"Cannot serialize page params {page_params!r} of "
+                        f"request {request} as JSON. This can be an issue if "
+                        f"you have requests that are identical except for "
+                        f"their page params, because unserializable page "
+                        f"params are treated the same as missing or empty "
+                        f"page params for purposes of request fingerprinting "
+                        f"(see "
+                        f"https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprints). "
+                        f"This will be the only warning about this issue; "
+                        f"other requests might also be affected."
+                    )
+                return None
+
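+        # The final fingerprint is the SHA-1 digest of the base fingerprint
+        # concatenated with the serialized dependency list and the serialized
+        # page params, when either is present. Requests with neither keep
+        # their base fingerprint unchanged.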
+        def fingerprint(self, request: Request) -> bytes:
+            if request in self._request_cache:
+                return self._request_cache[request]
+
+            fingerprint = self._base_request_fingerprinter.fingerprint(request)
+            deps_key = self.get_deps_key(request)
+            serialized_page_params = self.serialize_page_params(request)
+            if deps_key is None and serialized_page_params is None:
+                return fingerprint
+            if deps_key is not None:
+                fingerprint += deps_key
+            if serialized_page_params is not None:
+                fingerprint += serialized_page_params
+
+            self._request_cache[request] = hashlib.sha1(fingerprint).digest()
+            return self._request_cache[request]
diff --git a/scrapy_poet/utils/testing.py b/scrapy_poet/utils/testing.py
index 394fccd8..b334c8cb 100644
--- a/scrapy_poet/utils/testing.py
+++ b/scrapy_poet/utils/testing.py
@@ -3,11 +3,12 @@
 from typing import Dict
 from unittest import mock

-from scrapy import signals
+from scrapy import Spider, signals
 from scrapy.crawler import Crawler
 from scrapy.exceptions import CloseSpider
 from scrapy.settings import Settings
 from scrapy.utils.python import to_bytes
+from scrapy.utils.test import get_crawler as _get_crawler
 from twisted.internet import reactor
 from twisted.internet.defer import inlineCallbacks
 from twisted.internet.task import deferLater
@@ -151,6 +152,10 @@ def crawl_single_item(
     return item, url, crawler


+def get_download_handler(crawler, schema):
+    return crawler.engine.downloader.handlers._get_handler(schema)
+
+
 def make_crawler(spider_cls, settings):
     if not getattr(spider_cls, "name", None):

@@ -163,6 +168,33 @@ class Spider(spider_cls):
     return Crawler(spider_cls, settings)


+def setup_crawler_engine(crawler: Crawler):
+    """Run the crawl steps until engine setup, so that crawler.engine is not
+    None.
+    https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122
+    """
+
+    crawler.crawling = True
+    crawler.spider = crawler._create_spider()
+    crawler.engine = crawler._create_engine()
+
+    handler = get_download_handler(crawler, "https")
+    if hasattr(handler, "engine_started"):
+        handler.engine_started()
+
+
+class DummySpider(Spider):
+    name = "dummy"
+
+
+def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True):
+    settings = settings or {}
+    crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls)
+    if setup_engine:
+        setup_crawler_engine(crawler)
+    return crawler
+
+
 class CollectorPipeline:
     def open_spider(self, spider):
         spider.collected_items = []
diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py
new file mode 100644
index 00000000..b329590d
--- /dev/null
+++ b/tests/test_request_fingerprinter.py
@@ -0,0 +1,514 @@
+import sys
+from itertools import combinations
+from typing import Callable, Set
+from unittest.mock import patch
+
+import pytest
+from packaging.version import Version
+from scrapy import __version__ as SCRAPY_VERSION
+
+if Version(SCRAPY_VERSION) < Version("2.7"):
+    pytest.skip("Skipping tests for Scrapy < 2.7", allow_module_level=True)
+
+from importlib.metadata import version as package_version
+
+from scrapy import Request, Spider
+from scrapy.http import Response
+from scrapy.utils.misc import load_object
+from web_poet import (
+    BrowserHtml,
+    BrowserResponse,
+    HttpClient,
+    HttpRequest,
+    HttpRequestBody,
+    HttpRequestHeaders,
+    HttpResponse,
+    HttpResponseBody,
+    HttpResponseHeaders,
+    ItemPage,
+    PageParams,
+    RequestUrl,
+    ResponseUrl,
+    Stats,
+    WebPage,
+)
+
+from scrapy_poet import DummyResponse, ScrapyPoetRequestFingerprinter
+from scrapy_poet._request_fingerprinter import _serialize_dep
+from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS
+from scrapy_poet.injection import Injector, is_class_provided_by_any_provider_fn
+from scrapy_poet.page_input_providers import PageObjectInputProvider
+from scrapy_poet.utils.testing import get_crawler as _get_crawler
+
+ANDI_VERSION = Version(package_version("andi"))
+
+SETTINGS = {
+    "DOWNLOADER_MIDDLEWARES": {
+        "scrapy_poet.InjectionMiddleware": 543,
+    },
+    "REQUEST_FINGERPRINTER_CLASS": ScrapyPoetRequestFingerprinter,
+}
+
+
+def get_crawler(spider_cls=None, settings=None, ensure_providers_for=None):
+    settings = {**SETTINGS} if settings is None else settings
+
+    kwargs = {}
+    if spider_cls is not None:
+        kwargs["spider_cls"] = spider_cls
+
+    ensure_providers_for = ensure_providers_for or tuple()
+    if ensure_providers_for:
+        dummy_providers = get_dummy_providers(*ensure_providers_for)
+        if dummy_providers:
+            settings["SCRAPY_POET_PROVIDERS"] = {
+                provider: 0 for provider in dummy_providers
+            }
+
+    return _get_crawler(settings=settings, **kwargs)
+
+
+dummy_injector = Injector(crawler=get_crawler())
+default_providers = [load_object(cls)(dummy_injector) for cls in DEFAULT_PROVIDERS]
+is_class_provided_by_any_default_provider = is_class_provided_by_any_provider_fn(
+    default_providers
+)
+
+
+def get_dummy_providers(*input_classes):
+    dummy_providers = []
+
+    for input_cls in input_classes:
+        if is_class_provided_by_any_default_provider(input_cls):
+            continue
+
+        class DummyProvider(PageObjectInputProvider):
+            provided_classes = {input_cls}
+
+            def __call__(self, to_provide: Set[Callable]):
+                input_cls = next(iter(self.provided_classes))
+                return [input_cls()]
+
+        dummy_providers.append(DummyProvider)
+
+    return dummy_providers
+
+
+def test_single_callback():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_page(self, response, page: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com")
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    request3 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
+    fingerprint3 = fingerprinter.fingerprint(request3)
+    request4 = Request(
+        "https://books.toscrape.com", callback=crawler.spider.parse_page
+    )
+    fingerprint4 = fingerprinter.fingerprint(request4)
+    assert fingerprint1 != fingerprint2  # same url, no deps vs deps
+    assert fingerprint2 == fingerprint3  # same url, same callback
+    assert fingerprint2 != fingerprint4  # different url, same callback
+
+
+def test_same_deps_different_callbacks():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_a(self, response, a: WebPage):
+            pass
+
+        async def parse_b(self, response, b: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 == fingerprint2
+
+
+def test_same_deps_different_order():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_a(self, response, a: WebPage, b: ItemPage):
+            pass
+
+        async def parse_b(self, response, a: ItemPage, b: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 == fingerprint2
+
+
+def test_different_deps():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_item(self, response, item: ItemPage):
+            pass
+
+        async def parse_web(self, response, web: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_item)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_web)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 != fingerprint2
+
+
+def test_response_typing():
+    """The type of the response parameter is ignored, even when it is
+    DummyResponse. It’s the other injected parameters that should alter the
+    fingerprint."""
+
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_untyped(self, response, web: WebPage):
+            pass
+
+        async def parse_typed(self, response: Response, web: WebPage):
+            pass
+
+        async def parse_dummy(self, response: DummyResponse, web: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_untyped)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_typed)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    request3 = Request("https://toscrape.com", callback=crawler.spider.parse_dummy)
+    fingerprint3 = fingerprinter.fingerprint(request3)
+    assert fingerprint1 == fingerprint2
+    assert fingerprint1 == fingerprint3
+
+
+@pytest.mark.parametrize(
+    "input_cls",
+    (
+        HttpClient,
+        HttpRequest,
+        HttpRequestBody,
+        HttpRequestHeaders,
+        PageParams,
+        RequestUrl,
+        Stats,
+    ),
+)
+def test_ignored_unannotated_page_inputs(input_cls):
+    """These web-poet page input classes, unless annotated, cannot have any
+    bearing on the request on their own, so they should not alter the request
+    fingerprint."""
+
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_input(self, response, some_input: input_cls):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider, ensure_providers_for=[input_cls])
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com")
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_input)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 == fingerprint2
+
+
+# Inputs that affect the fingerprint.
+#
+# We do not try to be smart: e.g., although ResponseUrl should always be
+# available, that could technically not be the case given a custom user
+# provider.
+FINGERPRINTING_INPUTS = (
+    BrowserHtml,
+    BrowserResponse,
+    HttpResponse,
+    HttpResponseBody,
+    HttpResponseHeaders,
+    ResponseUrl,
+)
+
+
+@pytest.mark.parametrize("input_cls", FINGERPRINTING_INPUTS)
+def test_fingerprinting_unannotated_page_inputs(input_cls):
+    """Inputs that may have an impact on the actual request sent even without
+    annotations."""
+
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_input(self, response, some_input: input_cls):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider, ensure_providers_for=[input_cls])
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com")
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_input)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 != fingerprint2
+
+
+@pytest.mark.parametrize(
+    "input_cls_a, input_cls_b",
+    (tuple(combination) for combination in combinations(FINGERPRINTING_INPUTS, 2)),
+)
+def test_fingerprinting_unannotated_page_input_combinations(input_cls_a, input_cls_b):
+    """Make sure that a combination of known inputs that alter the request
+    fingerprint does not result in the same fingerprint."""
+
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_a(self, response, input_a: input_cls_a):
+            pass
+
+        async def parse_b(self, response, input_b: input_cls_b):
+            pass
+
+        async def parse_all(self, response, input_a: input_cls_a, input_b: input_cls_b):
+            pass
+
+    crawler = get_crawler(
+        spider_cls=TestSpider, ensure_providers_for=[input_cls_a, input_cls_b]
+    )
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    request3 = Request("https://toscrape.com", callback=crawler.spider.parse_all)
+    fingerprint3 = fingerprinter.fingerprint(request3)
+    assert fingerprint1 != fingerprint2
+    assert fingerprint1 != fingerprint3
+    assert fingerprint2 != fingerprint3
+
+
+def test_dep_resolution():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_a(self, response, web: WebPage):
+            pass
+
+        async def parse_b(self, response, web: WebPage, http_response: HttpResponse):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 == fingerprint2
+
+
+def test_page_params(caplog):
+    Unserializable = object()
+
+    crawler = get_crawler()
+    fingerprinter = crawler.request_fingerprinter
+
+    request1 = Request("https://toscrape.com")
+    fingerprint1 = fingerprinter.fingerprint(request1)
+
+    request2 = Request("https://toscrape.com", meta={"page_params": {"a": "1"}})
+    fingerprint2 = fingerprinter.fingerprint(request2)
+
+    request3 = Request("https://toscrape.com", meta={"page_params": {"a": "2"}})
+    fingerprint3 = fingerprinter.fingerprint(request3)
+
+    request4 = Request(
+        "https://toscrape.com", meta={"page_params": {"a": "2"}, "foo": "bar"}
+    )
+    fingerprint4 = fingerprinter.fingerprint(request4)
+
+    request5 = Request(
+        "https://toscrape.com", meta={"page_params": {"a": Unserializable}}
+    )
+    assert "Cannot serialize page params" not in caplog.text
+    caplog.clear()
+    fingerprint5 = fingerprinter.fingerprint(request5)
+    assert "Cannot serialize page params" in caplog.text
+
+    assert fingerprint1 != fingerprint2
+    assert fingerprint1 != fingerprint3
+    assert fingerprint2 != fingerprint3
+    assert fingerprint3 == fingerprint4
+    assert fingerprint1 == fingerprint5
+
+
+@pytest.mark.parametrize(
+    "meta",
+    (
+        {},
+        {"page_params": None},
+        {"page_params": {}},
+        {"foo": "bar"},
+        {"foo": "bar", "page_params": None},
+        {"foo": "bar", "page_params": {}},
+    ),
+)
+def test_meta(meta):
+    crawler = get_crawler()
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com")
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", meta=meta)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 == fingerprint2
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9"
+)
+@pytest.mark.skipif(
+    ANDI_VERSION <= Version("0.4.1"),
+    reason="https://github.com/scrapinghub/andi/pull/25",
+)
+def test_different_annotations():
+    from typing import Annotated
+
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_a(self, response, a: Annotated[WebPage, "a"]):
+            pass
+
+        async def parse_b(self, response, b: Annotated[WebPage, "b"]):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert fingerprint1 != fingerprint2
+
+
+def test_serialize_dep():
+    assert _serialize_dep(HttpResponse) == "web_poet.page_inputs.http.HttpResponse"
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9"
+)
+def test_serialize_dep_annotated():
+    from typing import Annotated
+
+    assert (
+        _serialize_dep(Annotated[HttpResponse, "foo"])
+        == "web_poet.page_inputs.http.HttpResponse['foo']"
+    )
+
+
+def test_base_default():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_page(self, response, page: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    base_fingerprinter = crawler.request_fingerprinter._base_request_fingerprinter
+
+    request1 = Request("https://toscrape.com")
+    fingerprint1 = fingerprinter.fingerprint(request1)
+    base_fingerprint = base_fingerprinter.fingerprint(request1)
+    assert fingerprint1 == base_fingerprint
+
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
+    fingerprint2 = fingerprinter.fingerprint(request2)
+    assert base_fingerprint == base_fingerprinter.fingerprint(request2)
+    assert fingerprint2 != base_fingerprint
+
+
+def test_base_custom():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_page(self, response, page: WebPage):
+            pass
+
+    class CustomFingerprinter:
+        def fingerprint(self, request):
+            return b"foo"
+
+    settings = {
+        **SETTINGS,
+        "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS": CustomFingerprinter,
+    }
+    crawler = get_crawler(spider_cls=TestSpider, settings=settings)
+    fingerprinter = crawler.request_fingerprinter
+
+    request = Request("https://example.com")
+    assert fingerprinter.fingerprint(request) == b"foo"
+    request = Request("https://example.com", callback=crawler.spider.parse_page)
+    assert fingerprinter.fingerprint(request) != b"foo"
+
+
+def test_missing_middleware():
+    settings = {**SETTINGS, "DOWNLOADER_MIDDLEWARES": {}}
+    crawler = get_crawler(settings=settings)
+    fingerprinter = crawler.request_fingerprinter
+    request = Request("https://example.com")
+    with pytest.raises(RuntimeError):
+        fingerprinter.fingerprint(request)
+
+
+def test_callback_cache():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_page(self, response, page: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    to_wrap = fingerprinter._get_deps
+    request1 = Request("https://example.com", callback=crawler.spider.parse_page)
+    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
+    with patch.object(fingerprinter, "_get_deps", wraps=to_wrap) as mock:
+        fingerprinter.fingerprint(request1)
+        fingerprinter.fingerprint(request2)
+        mock.assert_called_once_with(request1)
+
+
+def test_request_cache():
+    class TestSpider(Spider):
+        name = "test_spider"
+
+        async def parse_page(self, response, page: WebPage):
+            pass
+
+    crawler = get_crawler(spider_cls=TestSpider)
+    fingerprinter = crawler.request_fingerprinter
+    base_fingerprinter = fingerprinter._base_request_fingerprinter
+    to_wrap = base_fingerprinter.fingerprint
+    request = Request("https://example.com", callback=crawler.spider.parse_page)
+    with patch.object(base_fingerprinter, "fingerprint", wraps=to_wrap) as mock:
+        fingerprinter.fingerprint(request)
+        fingerprinter.fingerprint(request)
+        mock.assert_called_once_with(request)