Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a request fingerprinter that accounts for dependencies #172

Merged
merged 20 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ Add the following inside Scrapy's ``settings.py`` file:
SPIDER_MIDDLEWARES = {
"scrapy_poet.RetryMiddleware": 275,
}
REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

Developing
==========
Expand Down
1 change: 1 addition & 0 deletions docs/intro/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ of your Scrapy project:
SPIDER_MIDDLEWARES = {
"scrapy_poet.RetryMiddleware": 275,
}
REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

Things that are good to know
============================
Expand Down
3 changes: 3 additions & 0 deletions docs/providers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,9 @@ To have other settings respected, in addition to ``CONCURRENT_REQUESTS``, you'd
need to use ``crawler.engine.download`` or something like that. Alternatively,
you could implement those limits in the library itself.


.. _annotated:

Attaching metadata to dependencies
==================================

Expand Down
20 changes: 20 additions & 0 deletions docs/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,23 @@ Sets the class, or its import path, that will be used as an adapter in the
generated test fixtures.

More info at :ref:`fixture-adapter`.


SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS
--------------------------------------------

The default value is the default value of the ``REQUEST_FINGERPRINTER_CLASS``
setting for the version of Scrapy currently installed (e.g.
``"scrapy.utils.request.RequestFingerprinter"``).

You can assign a request fingerprinter class to this setting to configure a
custom request fingerprinter class to use for requests.

This class is used to generate a base fingerprint for a request. If that
request uses dependency injection, that fingerprint is then modified to account
for requested dependencies. Otherwise, the fingerprint is used as is.

.. note:: Annotations of :ref:`annotated dependencies <annotated>` are
serialized with :func:`repr` for fingerprinting purposes. If you find a
real-world scenario where this is a problem, please `open an issue
<https://github.com/scrapinghub/scrapy-poet/issues>`_.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ line-length = 88
[tool.isort]
profile = "black"
multi_line_output = 3
# scrapy_poet/__init__.py: Automatic sorting causes circular dependencies.
skip = ["scrapy_poet/__init__.py"]

[[tool.mypy.overrides]]
module = [
Expand Down
1 change: 1 addition & 0 deletions scrapy_poet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .downloadermiddlewares import InjectionMiddleware
from .page_input_providers import HttpResponseProvider, PageObjectInputProvider
from .spidermiddlewares import RetryMiddleware
from ._request_fingerprinter import ScrapyPoetRequestFingerprinter
167 changes: 167 additions & 0 deletions scrapy_poet/_request_fingerprinter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
try:
from scrapy.utils.request import RequestFingerprinter # NOQA
except ImportError:
from typing import TYPE_CHECKING

if not TYPE_CHECKING:
ScrapyPoetRequestFingerprinter = None
else:
import hashlib
import json
from functools import cached_property
from logging import getLogger
from typing import Callable, Dict, List, Optional, get_args, get_origin
from weakref import WeakKeyDictionary

from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
from scrapy.utils.misc import create_instance, load_object
from web_poet import (
HttpClient,
HttpRequest,
HttpRequestBody,
HttpRequestHeaders,
PageParams,
RequestUrl,
Stats,
)
from web_poet.utils import get_fq_class_name

from scrapy_poet import InjectionMiddleware
from scrapy_poet.injection import get_callback

logger = getLogger(__name__)

def _serialize_dep(cls):
try:
from typing import Annotated
except ImportError:
pass
else:
if get_origin(cls) is Annotated:
annotated, *annotations = get_args(cls)
return f"{_serialize_dep(annotated)}{repr(annotations)}"
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
return get_fq_class_name(cls)

class ScrapyPoetRequestFingerprinter:

IGNORED_UNANNOTATED_DEPS = {
# These dependencies are tools for page objects that should have no
# bearing on the request itself.
HttpClient,
Stats,
# These dependencies do not impact the fingerprint as dependencies,
# it is their value on the request itself that should have an
# impact on the request fingerprint.
HttpRequest,
HttpRequestBody,
HttpRequestHeaders,
PageParams,
RequestUrl,
}

@classmethod
def from_crawler(cls, crawler):
return cls(crawler)

def __init__(self, crawler: Crawler) -> None:
settings = crawler.settings
self._base_request_fingerprinter = create_instance(
load_object(
settings.get(
"SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS",
REQUEST_FINGERPRINTER_CLASS,
)
),
settings=crawler.settings,
crawler=crawler,
)
self._callback_cache: Dict[Callable, Optional[bytes]] = {}
self._request_cache: "WeakKeyDictionary[Request, bytes]" = (
WeakKeyDictionary()
)
self._crawler: Crawler = crawler
self._saw_unserializable_page_params = False

@cached_property
def _injector(self):
middlewares = self._crawler.engine.downloader.middleware.middlewares
for middleware in middlewares:
if isinstance(middleware, InjectionMiddleware):
return middleware.injector
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
raise RuntimeError(
"scrapy_poet.InjectionMiddleware not found at run time, has it "
"been configured in the DOWNLOADER_MIDDLEWARES setting?"
)

def _get_deps(self, request: Request) -> Optional[List[str]]:
"""Return a JSON-serializable structure that uniquely identifies the
dependencies requested by the request, or None if dependency injection
is not required."""
plan = self._injector.build_plan(request)
deps = {dep for dep, params in plan[:-1]} - self.IGNORED_UNANNOTATED_DEPS
if not deps:
return None
return sorted([_serialize_dep(cls) for cls in deps])

def get_deps_key(self, request: Request) -> Optional[bytes]:
"""Return a JSON array as bytes that uniquely identifies the
dependencies requested through scrapy-poet injection that could
impact the request, or None if there are no such dependencies."""
callback = get_callback(request, self._crawler.spider)
if callback in self._callback_cache:
return self._callback_cache[callback]

deps = self._get_deps(request)
if not deps:
self._callback_cache[callback] = None
return None
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

deps_key = json.dumps(deps, sort_keys=True).encode()
self._callback_cache[callback] = deps_key
return self._callback_cache[callback]

def serialize_page_params(self, request: Request) -> Optional[bytes]:
"""Return a JSON object as bytes that represents the page params,
or None if there are no page params or they are not
JSON-serializable."""
page_params = request.meta.get("page_params", None)
if not page_params:
return None

try:
return json.dumps(page_params, sort_keys=True).encode()
except TypeError:
if not self._saw_unserializable_page_params:
self._saw_unserializable_page_params = True
logger.warning(
f"Cannot serialize page params {page_params!r} of "
f"request {request} as JSON. This can be an issue if "
f"you have requests that are identical except for "
f"their page params, because unserializable page "
f"params are treated the same as missing or empty "
f"page params for purposes of request fingerprinting "
f"(see "
f"https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprints). "
f"This will be the only warning about this issue, "
f"other requests might be also affected."
)
return None

def fingerprint(self, request: Request) -> bytes:
if request in self._request_cache:
return self._request_cache[request]

fingerprint = self._base_request_fingerprinter.fingerprint(request)
deps_key = self.get_deps_key(request)
serialized_page_params = self.serialize_page_params(request)
if deps_key is None and serialized_page_params is None:
return fingerprint
if deps_key is not None:
fingerprint += deps_key
if serialized_page_params is not None:
fingerprint += serialized_page_params

self._request_cache[request] = hashlib.sha1(fingerprint).digest()
return self._request_cache[request]
34 changes: 33 additions & 1 deletion scrapy_poet/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from typing import Dict
from unittest import mock

from scrapy import signals
from scrapy import Spider, signals
from scrapy.crawler import Crawler
from scrapy.exceptions import CloseSpider
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes
from scrapy.utils.test import get_crawler as _get_crawler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from twisted.internet.task import deferLater
Expand Down Expand Up @@ -151,6 +152,10 @@
return item, url, crawler


def get_download_handler(crawler, schema):
return crawler.engine.downloader.handlers._get_handler(schema)


def make_crawler(spider_cls, settings):
if not getattr(spider_cls, "name", None):

Expand All @@ -163,6 +168,33 @@
return Crawler(spider_cls, settings)


def setup_crawler_engine(crawler: Crawler):
"""Run the crawl steps until engine setup, so that crawler.engine is not
None.
https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122
"""

crawler.crawling = True
crawler.spider = crawler._create_spider()
crawler.engine = crawler._create_engine()

handler = get_download_handler(crawler, "https")
if hasattr(handler, "engine_started"):
handler.engine_started()

Check warning on line 183 in scrapy_poet/utils/testing.py

View check run for this annotation

Codecov / codecov/patch

scrapy_poet/utils/testing.py#L183

Added line #L183 was not covered by tests


class DummySpider(Spider):
name = "dummy"


def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True):
settings = settings or {}
crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls)
if setup_engine:
setup_crawler_engine(crawler)
return crawler


class CollectorPipeline:
def open_spider(self, spider):
spider.collected_items = []
Expand Down
Loading
Loading