diff --git a/docs/index.rst b/docs/index.rst
index 0be78fa6..2751d6bd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -28,6 +28,7 @@ either :ref:`globally ` or :ref:`per request `, or
     usage/scrapy-poet
     usage/session
     usage/stats
+    usage/actions
     usage/fingerprint
     usage/proxy
 
diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst
index 563052f9..f6d0528c 100644
--- a/docs/reference/settings.rst
+++ b/docs/reference/settings.rst
@@ -6,6 +6,44 @@ Settings
 
 :ref:`Settings ` for scrapy-zyte-api.
 
+.. setting:: ZYTE_API_ACTION_ERROR_RETRY_ENABLED
+
+ZYTE_API_ACTION_ERROR_RETRY_ENABLED
+===================================
+
+Default: ``True``
+
+Enables retries of Zyte API requests if responses contain an action error.
+
+Maximum retries and priority adjustment are handled with the same settings and
+request metadata keywords as regular retries in Scrapy; see
+:setting:`RETRY_TIMES ` and
+:setting:`RETRY_PRIORITY_ADJUST `.
+
+See also :setting:`ZYTE_API_ACTION_ERROR_HANDLING`.
+
+
+.. setting:: ZYTE_API_ACTION_ERROR_HANDLING
+
+ZYTE_API_ACTION_ERROR_HANDLING
+==============================
+
+Default: ``"pass"``
+
+Determines how to handle Zyte API responses that contain an action error:
+
+- ``"pass"``: Responses are treated as valid responses, i.e. they will reach
+  your spider callback.
+
+- ``"ignore"``: Responses are dropped, i.e. they will not reach your spider.
+
+- ``"err"``: :class:`~scrapy_zyte_api.exceptions.ActionError` is raised, and
+  will reach your spider errback if you set one.
+
+.. autoexception:: scrapy_zyte_api.exceptions.ActionError
+   :members:
+
+
 .. setting:: ZYTE_API_AUTO_FIELD_STATS
 
 ZYTE_API_AUTO_FIELD_STATS
@@ -40,6 +78,7 @@ If for any request a custom page object class is used to override some
 .. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
     whether a field has been overridden or not.
 
+
 .. setting:: ZYTE_API_AUTOMAP_PARAMS
 
 ZYTE_API_AUTOMAP_PARAMS
diff --git a/docs/usage/actions.rst b/docs/usage/actions.rst
new file mode 100644
index 00000000..a7d054a3
--- /dev/null
+++ b/docs/usage/actions.rst
@@ -0,0 +1,47 @@
+.. _actions:
+
+======================
+Handling action errors
+======================
+
+Although Zyte API considers a response successful :ref:`even if a browser
+action fails `, scrapy-zyte-api retries such
+responses by default. See :setting:`ZYTE_API_ACTION_ERROR_RETRY_ENABLED`.
+
+You can also use :setting:`ZYTE_API_ACTION_ERROR_HANDLING` to determine how
+such responses are handled when they are not retried or when retries are
+exceeded: treated as a success (default), ignored, or treated as an error.
+
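+For example, with :setting:`ZYTE_API_ACTION_ERROR_HANDLING` set to ``"err"``,
+you can inspect the failed actions from an errback. The following is a minimal
+sketch; the spider name, URL and action are placeholders, and it assumes
+scrapy-zyte-api is already enabled for the request:
+
+.. code-block:: python
+
+    from scrapy import Request, Spider
+    from scrapy_zyte_api.exceptions import ActionError
+
+
+    class SampleSpider(Spider):
+        name = "sample"
+        custom_settings = {"ZYTE_API_ACTION_ERROR_HANDLING": "err"}
+
+        def start_requests(self):
+            yield Request(
+                "https://example.com",
+                callback=self.parse,
+                errback=self.handle_error,
+                meta={
+                    "zyte_api_automap": {
+                        "browserHtml": True,
+                        "actions": [{"action": "scrollBottom"}],
+                    },
+                },
+            )
+
+        def parse(self, response):
+            ...
+
+        def handle_error(self, failure):
+            if failure.check(ActionError):
+                # The outcome of every action is available under the "actions"
+                # key of the raw API response of the offending response.
+                actions = failure.value.response.raw_api_response["actions"]
+                self.logger.warning(f"Actions failed or errored: {actions}")
+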
+Action error caching
+====================
+
+If you use
+:class:`~scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware`, you might
+want to use a custom :setting:`HTTPCACHE_POLICY ` to
+prevent responses with failed actions (i.e. after exceeding retries) from
+being cached:
+
+.. code-block:: python
+    :caption: myproject/extensions.py
+
+    from scrapy import Request
+    from scrapy.extensions.httpcache import DummyPolicy
+    from scrapy.http import Response
+    from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse
+
+
+    class ZyteAPIFailedActionsPolicy(DummyPolicy):
+
+        def should_cache_response(self, response: Response, request: Request):
+            if isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)) and any(
+                "error" in action for action in response.raw_api_response.get("actions", [])
+            ):
+                return False
+            return super().should_cache_response(response, request)
+
+And enable it in your settings:
+
+.. code-block:: python
+    :caption: myproject/settings.py
+
+    HTTPCACHE_POLICY = "myproject.extensions.ZyteAPIFailedActionsPolicy"
diff --git a/pyproject.toml b/pyproject.toml
index 19622e41..326a79d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,12 +15,12 @@ files = [
 ]
 
 [tool.pytest.ini_options]
+filterwarnings = [
+    "ignore:::twisted",
+]
 junit_family = "xunit2"
 testpaths = [
     "scrapy_zyte_api/",
     "tests/"
 ]
 minversion = "6.0"
-filterwarnings = [
-    "ignore::DeprecationWarning:twisted.web.http",
-]
diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py
index 080ce52b..6812d9f6 100644
--- a/scrapy_zyte_api/_middlewares.py
+++ b/scrapy_zyte_api/_middlewares.py
@@ -1,11 +1,116 @@
 from logging import getLogger
-from typing import cast
+from typing import Optional, Union, cast
 
-from scrapy import Request
+from scrapy import Request, Spider
 from scrapy.exceptions import IgnoreRequest
+from scrapy.http import Response
+from scrapy.utils.python import global_object_name
 from zyte_api import RequestError
 
 from ._params import _ParamParser
+from .exceptions import ActionError
+from .responses import ZyteAPIResponse, ZyteAPITextResponse
+
+try:
+    from scrapy.downloadermiddlewares.retry import get_retry_request
+except ImportError:
+    # Backport get_retry_request for Scrapy < 2.5.0
+
+    from logging import Logger
+    from typing import Type
+
+    from scrapy.downloadermiddlewares.retry import (  # type: ignore[attr-defined] # isort: skip
+        logger as retry_logger,
+    )
+
+    def get_retry_request(
+        request: Request,
+        *,
+        spider: Spider,
+        reason: Union[str, Exception, Type[Exception]] = "unspecified",
+        max_retry_times: Optional[int] = None,
+        priority_adjust: Optional[int] = None,
+        logger: Logger = retry_logger,
+        stats_base_key: str = "retry",
+    ) -> Optional[Request]:
+        """
+        Returns a new :class:`~scrapy.Request` object to retry the specified
+        request, or ``None`` if retries of the specified request have been
+        exhausted.
+
+        For example, in a :class:`~scrapy.Spider` callback, you could use it as
+        follows::
+
+            def parse(self, response):
+                if not response.text:
+                    new_request_or_none = get_retry_request(
+                        response.request,
+                        spider=self,
+                        reason='empty',
+                    )
+                    return new_request_or_none
+
+        *spider* is the :class:`~scrapy.Spider` instance which is asking for the
+        retry request. It is used to access the :ref:`settings `
+        and :ref:`stats `, and to provide extra logging context (see
+        :func:`logging.debug`).
+
+        *reason* is a string or an :class:`Exception` object that indicates the
+        reason why the request needs to be retried. It is used to name retry stats.
+
+        *max_retry_times* is a number that determines the maximum number of times
+        that *request* can be retried. If not specified or ``None``, the number is
+        read from the :reqmeta:`max_retry_times` meta key of the request. If the
+        :reqmeta:`max_retry_times` meta key is not defined or ``None``, the number
+        is read from the :setting:`RETRY_TIMES` setting.
+
+        *priority_adjust* is a number that determines how the priority of the new
+        request changes in relation to *request*. If not specified, the number is
+        read from the :setting:`RETRY_PRIORITY_ADJUST` setting.
+
+        *logger* is the logging.Logger object to be used when logging messages
+
+        *stats_base_key* is a string to be used as the base key for the
+        retry-related job stats
+        """
+        settings = spider.crawler.settings
+        assert spider.crawler.stats
+        stats = spider.crawler.stats
+        retry_times = request.meta.get("retry_times", 0) + 1
+        if max_retry_times is None:
+            max_retry_times = request.meta.get("max_retry_times")
+            if max_retry_times is None:
+                max_retry_times = settings.getint("RETRY_TIMES")
+        if retry_times <= max_retry_times:
+            logger.debug(
+                "Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
+                {"request": request, "retry_times": retry_times, "reason": reason},
+                extra={"spider": spider},
+            )
+            new_request: Request = request.copy()
+            new_request.meta["retry_times"] = retry_times
+            new_request.dont_filter = True
+            if priority_adjust is None:
+                priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
+            new_request.priority = request.priority + priority_adjust
+
+            if callable(reason):
+                reason = reason()
+            if isinstance(reason, Exception):
+                reason = global_object_name(reason.__class__)
+
+            stats.inc_value(f"{stats_base_key}/count")
+            stats.inc_value(f"{stats_base_key}/reason_count/{reason}")
+            return new_request
+        stats.inc_value(f"{stats_base_key}/max_reached")
+        logger.error(
+            "Gave up retrying %(request)s (failed %(retry_times)d times): "
+            "%(reason)s",
+            {"request": request, "retry_times": retry_times, "reason": reason},
+            extra={"spider": spider},
+        )
+        return None
+
 
 logger = getLogger(__name__)
 _start_requests_processed = object()
@@ -49,6 +154,13 @@ def __init__(self, crawler) -> None:
         self._forbidden_domain_start_request_count = 0
         self._total_start_request_count = 0
 
+        self._retry_action_errors = crawler.settings.getbool(
+            "ZYTE_API_ACTION_ERROR_RETRY_ENABLED", True
+        )
+        self._max_retry_times = crawler.settings.getint("RETRY_TIMES")
+        self._priority_adjust = crawler.settings.getint("RETRY_PRIORITY_ADJUST")
+        self._load_action_error_handling()
+
         self._max_requests = crawler.settings.getint("ZYTE_API_MAX_REQUESTS")
         if self._max_requests:
             logger.info(
@@ -62,6 +174,18 @@ def __init__(self, crawler) -> None:
             self._start_requests_processed, signal=_start_requests_processed
         )
 
+    def _load_action_error_handling(self):
+        value = self._crawler.settings.get("ZYTE_API_ACTION_ERROR_HANDLING", "pass")
+        if value in ("pass", "ignore", "err"):
+            self._action_error_handling = value
+        else:
+            fallback_value = "pass"
+            logger.error(
+                f"Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected "
+                f"value: {value!r}. Falling back to {fallback_value!r}."
+            )
+            self._action_error_handling = fallback_value
+
     def _get_spm_mw(self):
         spm_mw_classes = []
@@ -162,6 +286,52 @@ def _maybe_close(self):
                 self._crawler.spider, "failed_forbidden_domain"
             )
 
+    def _handle_action_error(self, response):
+        if self._action_error_handling == "pass":
+            return response
+        elif self._action_error_handling == "ignore":
+            raise IgnoreRequest
+        else:
+            assert self._action_error_handling == "err"
+            raise ActionError(response)
+
+    def process_response(
+        self, request: Request, response: Response, spider: Spider
+    ) -> Union[Request, Response]:
+        if not isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)):
+            return response
+
+        assert response.raw_api_response is not None
+        action_error = any(
+            "error" in action for action in response.raw_api_response.get("actions", [])
+        )
+        if not action_error:
+            return response
+
+        if not self._retry_action_errors or request.meta.get("dont_retry", False):
+            return self._handle_action_error(response)
+
+        return self._retry(
+            request, reason="action-error", spider=spider
+        ) or self._handle_action_error(response)
+
+    def _retry(
+        self,
+        request: Request,
+        *,
+        reason: str,
+        spider: Spider,
+    ) -> Optional[Request]:
+        max_retry_times = request.meta.get("max_retry_times", self._max_retry_times)
+        priority_adjust = request.meta.get("priority_adjust", self._priority_adjust)
+        return get_retry_request(
+            request,
+            reason=reason,
+            spider=spider,
+            max_retry_times=max_retry_times,
+            priority_adjust=priority_adjust,
+        )
+
 
 class ScrapyZyteAPISpiderMiddleware(_BaseMiddleware):
     def __init__(self, crawler):
diff --git a/scrapy_zyte_api/exceptions.py b/scrapy_zyte_api/exceptions.py
new file mode 100644
index 00000000..ae14da97
--- /dev/null
+++ b/scrapy_zyte_api/exceptions.py
@@ -0,0 +1,17 @@
+from typing import Union
+
+from .responses import ZyteAPIResponse, ZyteAPITextResponse
+
+
+class ActionError(ValueError):
+    """Exception raised when a Zyte API response contains an action error."""
+
+    def __init__(self, response, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        #: Offending Zyte API response.
+        #:
+        #: You can inspect the outcome of actions in the ``"actions"`` key of
+        #: :attr:`response.raw_api_response
+        #: `.
+        self.response: Union[ZyteAPIResponse, ZyteAPITextResponse] = response
diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py
index 3959b0f6..3c4f3c3c 100644
--- a/tests/test_middlewares.py
+++ b/tests/test_middlewares.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, cast
 from unittest import SkipTest
 
@@ -5,6 +6,7 @@
 from packaging.version import Version
 from pytest_twisted import ensureDeferred
 from scrapy import Request, Spider
+from scrapy.exceptions import IgnoreRequest
 from scrapy.http.response import Response
 from scrapy.item import Item
 from scrapy.utils.misc import create_instance
@@ -14,6 +16,8 @@
     ScrapyZyteAPIDownloaderMiddleware,
     ScrapyZyteAPISpiderMiddleware,
 )
+from scrapy_zyte_api.exceptions import ActionError
+from scrapy_zyte_api.responses import ZyteAPIResponse
 
 from . import SETTINGS
 from .mockserver import DelayedResource, MockServer
@@ -465,3 +469,332 @@ class CrawleraSpider(Spider):
             attribute,
             conflict,
         )
+
+
+@pytest.mark.parametrize(
+    "settings,meta,enabled",
+    [
+        # ZYTE_API_ACTION_ERROR_RETRY_ENABLED enables, RETRY_ENABLED has no
+        # effect.
+        *(
+            (
+                {
+                    "RETRY_ENABLED": scrapy,
+                    "ZYTE_API_ACTION_ERROR_RETRY_ENABLED": zyte_api,
+                },
+                {},
+                zyte_api,
+            )
+            for zyte_api in (True, False)
+            for scrapy in (True, False)
+        ),
+        *(
+            (
+                {
+                    "RETRY_ENABLED": scrapy,
+                },
+                {},
+                True,
+            )
+            for scrapy in (True, False)
+        ),
+        # dont_retry=True overrides.
+        *(
+            (
+                {"ZYTE_API_ACTION_ERROR_RETRY_ENABLED": zyte_api},
+                {"dont_retry": dont_retry},
+                zyte_api and not dont_retry,
+            )
+            for zyte_api in (True, False)
+            for dont_retry in (True, False)
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_retry_enabled(settings, meta, enabled):
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com", meta=meta)
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+    result = middleware.process_response(request, response, crawler.spider)
+    if enabled:
+        assert isinstance(result, Request)
+        assert result.meta["retry_times"] == 1
+    else:
+        assert result is response
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,meta,max_retries",
+    [
+        (
+            {"RETRY_TIMES": 1},
+            {},
+            1,
+        ),
+        (
+            {},
+            {"max_retry_times": 1},
+            1,
+        ),
+        (
+            {"RETRY_TIMES": 1},
+            {"max_retry_times": 2},
+            2,
+        ),
+        (
+            {"RETRY_TIMES": 2},
+            {"max_retry_times": 1},
+            1,
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_retry_times(settings, meta, max_retries):
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request(
+        "https://example.com", meta={**meta, "retry_times": max_retries - 1}
+    )
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    request2 = middleware.process_response(request, response, crawler.spider)
+    assert isinstance(request2, Request)
+    assert request2.meta["retry_times"] == max_retries
+
+    result = middleware.process_response(request2, response, crawler.spider)
+    assert result is response
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,meta,priority",
+    [
+        (
+            {"RETRY_PRIORITY_ADJUST": 1},
+            {},
+            1,
+        ),
+        (
+            {},
+            {"priority_adjust": 1},
+            1,
+        ),
+        (
+            {"RETRY_PRIORITY_ADJUST": 1},
+            {"priority_adjust": 2},
+            2,
+        ),
+        (
+            {"RETRY_PRIORITY_ADJUST": 2},
+            {"priority_adjust": 1},
+            1,
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_retry_priority_adjust(settings, meta, priority):
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com", meta=meta)
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    request2 = middleware.process_response(request, response, crawler.spider)
+    assert isinstance(request2, Request)
+    assert request2.meta["retry_times"] == 1
+    assert request2.priority == priority
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,expected,setup_errors",
+    [
+        (
+            {},
+            Response,
+            [],
+        ),
+        (
{"ZYTE_API_ACTION_ERROR_HANDLING": "pass"}, + Response, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "ignore"}, + IgnoreRequest, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "err"}, + ActionError, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "foo"}, + Response, + [ + ( + "Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected " + "value: 'foo'. Falling back to 'pass'." + ) + ], + ), + ], +) +@ensureDeferred +async def test_action_error_handling_no_retries( + settings, expected, setup_errors, caplog +): + settings["ZYTE_API_ACTION_ERROR_RETRY_ENABLED"] = False + crawler = get_crawler(NamedSpider, settings_dict=settings) + await crawler.crawl() + + middleware = create_instance( + ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler + ) + if setup_errors: + assert caplog.record_tuples == [ + ("scrapy_zyte_api._middlewares", logging.ERROR, error) + for error in setup_errors + ] + else: + assert not caplog.records + + request = Request("https://example.com") + raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]} + response = ZyteAPIResponse.from_api_response(raw_api_response, request=request) + + try: + result = middleware.process_response(request, response, crawler.spider) + except (ActionError, IgnoreRequest) as e: + result = e + assert isinstance(result, expected) + + await crawler.stop() + + +@pytest.mark.parametrize( + "settings,expected,setup_errors", + [ + ( + {}, + Response, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "pass"}, + Response, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "ignore"}, + IgnoreRequest, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "err"}, + ActionError, + [], + ), + ( + {"ZYTE_API_ACTION_ERROR_HANDLING": "foo"}, + Response, + [ + ( + "Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected " + "value: 'foo'. Falling back to 'pass'." 
+                )
+            ]
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_handling_retries(settings, expected, setup_errors, caplog):
+    settings["RETRY_TIMES"] = 1
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+    if setup_errors:
+        assert caplog.record_tuples == [
+            ("scrapy_zyte_api._middlewares", logging.ERROR, error)
+            for error in setup_errors
+        ]
+    else:
+        assert not caplog.records
+
+    request = Request("https://example.com")
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    request2 = middleware.process_response(request, response, crawler.spider)
+    assert isinstance(request2, Request)
+    assert request2.meta["retry_times"] == 1
+
+    try:
+        result = middleware.process_response(request2, response, crawler.spider)
+    except (ActionError, IgnoreRequest) as e:
+        result = e
+    assert isinstance(result, expected)
+
+    await crawler.stop()
+
+
+@ensureDeferred
+async def test_process_response_non_zyte_api():
+    crawler = get_crawler(NamedSpider)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com")
+    response = Response(request.url)
+    result = middleware.process_response(request, response, crawler.spider)
+    assert result is response
+
+    await crawler.stop()
+
+
+@ensureDeferred
+async def test_process_response_no_action_error():
+    crawler = get_crawler(NamedSpider)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com")
+    raw_api_response = {"url": request.url, "actions": [{"action": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    result = middleware.process_response(request, response, crawler.spider)
+    assert result is response
+
+    await crawler.stop()
diff --git a/tests/test_sessions.py b/tests/test_sessions.py
index cd5e401b..0284b4d2 100644
--- a/tests/test_sessions.py
+++ b/tests/test_sessions.py
@@ -588,6 +588,7 @@ async def test_checker_location(postal_code, url, close_reason, stats, mockserve
     """The default checker looks into the outcome of the ``setLocation`` action
     if a location meta/setting was used."""
     settings = {
+        "ZYTE_API_ACTION_ERROR_RETRY_ENABLED": False,
         "ZYTE_API_URL": mockserver.urljoin("/"),
         "ZYTE_API_SESSION_ENABLED": True,
         "ZYTE_API_SESSION_MAX_BAD_INITS": 1,