Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HubstorageDownloaderMiddleware: centralized request fingerprinting #87

Merged
merged 2 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions sh_scrapy/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from weakref import WeakKeyDictionary

from scrapy import Request
from scrapy.utils.request import request_fingerprint

from sh_scrapy.writer import pipe_writer

Expand Down Expand Up @@ -35,7 +34,7 @@ def process_spider_output(self, response, result, spider):
yield x


class HubstorageDownloaderMiddleware(object):
class HubstorageDownloaderMiddleware:
"""Hubstorage dowloader middleware.

What it does:
Expand All @@ -46,10 +45,37 @@ class HubstorageDownloaderMiddleware(object):

"""

def __init__(self):
    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from a :class:`~scrapy.crawler.Crawler`.

        Tries the new-style constructor (``cls(crawler)``) first; if a
        subclass still defines a legacy zero-argument ``__init__``, the
        ``TypeError`` is caught, a deprecation warning is issued, and the
        instance is built the old way instead.

        :param crawler: the running Scrapy crawler; stored on the instance
            and used to pick the request-fingerprinting implementation.
        :return: a configured middleware instance.
        """
        try:
            result = cls(crawler)
        except TypeError:
            # Legacy subclass whose __init__ takes no crawler argument.
            warn(
                (
                    "Subclasses of HubstorageDownloaderMiddleware must now "
                    "accept a crawler parameter in their __init__ method. "
                    "This will become an error in the future."
                ),
                DeprecationWarning,
            )
            result = cls()
        # Set unconditionally so both construction paths (and subclasses that
        # skipped super().__init__) end up with a crawler and a fingerprinter.
        result._crawler = crawler
        result._load_fingerprinter()
        return result

    def __init__(self, crawler):
        """Initialize the middleware.

        :param crawler: the running Scrapy crawler; kept for
            :meth:`_load_fingerprinter`, which reads
            ``crawler.request_fingerprinter`` when available.
        """
        self._crawler = crawler
        # Module-level shared state (defined elsewhere in this file):
        # mapping of requests already assigned an id.
        self._seen_requests = seen_requests
        # Writer used to emit request records to the Hubstorage pipe.
        self.pipe_writer = pipe_writer
        # Shared counter producing monotonically increasing request ids.
        self.request_id_sequence = request_id_sequence
        self._load_fingerprinter()

def _load_fingerprinter(self):
if hasattr(self._crawler, "request_fingerprinter"):
self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex()
else:
from scrapy.utils.request import request_fingerprint
self._fingerprint = request_fingerprint

def process_request(self, request, spider):
# Check if request id is set, which usually happens for retries or redirects because
Expand All @@ -72,7 +98,7 @@ def process_response(self, request, response, spider):
rs=len(response.body),
duration=request.meta.get('download_latency', 0) * 1000,
parent=request.meta.setdefault(HS_PARENT_ID_KEY),
fp=request_fingerprint(request),
fp=self._fingerprint(response.request),
)
# Generate and set request id.
request_id = next(self.request_id_sequence)
Expand Down
14 changes: 12 additions & 2 deletions tests/test_middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
from scrapy import Spider, Request, Item
from scrapy.http import Response
from scrapy.utils.test import get_crawler
from typing import Optional

from sh_scrapy.middlewares import (
Expand All @@ -26,7 +27,8 @@ def hs_spider_middleware(monkeypatch_globals):

@pytest.fixture()
def hs_downloader_middleware(monkeypatch_globals):
return HubstorageDownloaderMiddleware()
crawler = get_crawler()
return HubstorageDownloaderMiddleware.from_crawler(crawler)


def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
Expand All @@ -46,13 +48,13 @@ def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
assert len(hs_spider_middleware._seen_requests) == 0
assert len(hs_downloader_middleware._seen_requests) == 0

response_0.request = request_0
hs_downloader_middleware.process_response(request_0, response_0, spider)

assert request_0.meta[HS_REQUEST_ID_KEY] == 0
assert request_0.meta[HS_PARENT_ID_KEY] is None
assert hs_spider_middleware._seen_requests[request_0] == 0

response_0.request = request_0
request_1 = Request(url)
request_2 = Request(url)
item1 = {}
Expand All @@ -69,12 +71,14 @@ def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):

response_1 = Response(url)
hs_downloader_middleware.process_request(request_1, spider)
response_1.request = request_1
hs_downloader_middleware.process_response(request_1, response_1, spider)
assert request_1.meta[HS_REQUEST_ID_KEY] == 1
assert request_1.meta[HS_PARENT_ID_KEY] == 0

response_2 = Response(url)
hs_downloader_middleware.process_request(request_2, spider)
response_2.request = request_2
hs_downloader_middleware.process_response(request_2, response_2, spider)
assert request_2.meta[HS_REQUEST_ID_KEY] == 2
assert request_2.meta[HS_PARENT_ID_KEY] == 0
Expand All @@ -101,12 +105,14 @@ def __init__(self, url: str, request: Optional[Request] = None):
response_1 = DummyResponse(url, request)
response_2 = Response(url)
hs_downloader_middleware.process_request(request, spider)
response_1.request = request
hs_downloader_middleware.process_response(request, response_1, spider)

with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file:
assert tmp_file.readline() == ""
assert request.meta == {}

response_2.request = request
hs_downloader_middleware.process_response(request, response_2, spider)
with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file:
assert tmp_file.readline().startswith('REQ')
Expand Down Expand Up @@ -138,6 +144,7 @@ def __init__(self, url: str, request: Optional[Request] = None):
assert len(hs_spider_middleware._seen_requests) == 0
assert len(hs_downloader_middleware._seen_requests) == 0

response_0.request = request_0
hs_downloader_middleware.process_response(request_0, response_0, spider)

assert request_0.meta[HS_REQUEST_ID_KEY] == 0
Expand All @@ -154,6 +161,7 @@ def __init__(self, url: str, request: Optional[Request] = None):
assert HS_REQUEST_ID_KEY not in request_1.meta
assert request_1.meta[HS_PARENT_ID_KEY] == 0

response_1.request = request_1
hs_downloader_middleware.process_response(request_1, response_1, spider)

assert request_1.meta[HS_REQUEST_ID_KEY] == 1
Expand All @@ -163,11 +171,13 @@ def __init__(self, url: str, request: Optional[Request] = None):
response_2_1 = DummyResponse(url, request_2)
response_2_2 = Response(url)

response_2_1.request = request_2
hs_downloader_middleware.process_response(request_2, response_2_1, spider)

assert request_2.meta[HS_REQUEST_ID_KEY] == 1
assert request_2.meta[HS_PARENT_ID_KEY] == 0

response_2_2.request = request_2
hs_downloader_middleware.process_response(request_2, response_2_2, spider)

assert request_2.meta[HS_REQUEST_ID_KEY] == 2
Expand Down
Loading