diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py
index cc6b3391..49e01d43 100644
--- a/scrapy_zyte_api/handler.py
+++ b/scrapy_zyte_api/handler.py
@@ -92,6 +92,9 @@ def __init__(
                 f"({self._truncate_limit}) is invalid. It must be 0 or a "
                 f"positive integer."
             )
+        self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
+        self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")
+
         crawler.signals.connect(self.engine_started, signal=signals.engine_started)
         self._crawler = crawler
         self._fallback_handler = None
@@ -231,7 +234,9 @@ async def _download_request(
         finally:
             self._update_stats(api_params)
 
-        return _process_response(api_response, request, self._cookie_jars)
+        return _process_response(
+            api_response, request, self._cookie_jars, self._default_maxsize, self._default_warnsize
+        )
 
     def _process_request_error(self, request, error):
         detail = (error.parsed.data or {}).get("detail", error.message)
diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py
index dd5cb55a..b9655571 100644
--- a/scrapy_zyte_api/responses.py
+++ b/scrapy_zyte_api/responses.py
@@ -1,3 +1,5 @@
+import logging
+
 from base64 import b64decode
 from copy import copy
 from datetime import datetime
@@ -15,6 +17,9 @@
     _RESPONSE_HAS_PROTOCOL,
 )
 
+logger = logging.getLogger(__name__)
+
+
 _DEFAULT_ENCODING = "utf-8"
 
 
@@ -113,7 +118,9 @@ def _prepare_headers(cls, api_response: Dict[str, Any]):
 
 class ZyteAPITextResponse(ZyteAPIMixin, HtmlResponse):
     @classmethod
-    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+    def from_api_response(
+        cls, api_response: Dict, maxsize: Optional[int], warnsize: Optional[int], *, request: Request = None
+    ):
         """Alternative constructor to instantiate the response from the raw
         Zyte API response.
         """
@@ -126,6 +133,9 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None):
         elif api_response.get("httpResponseBody"):
             body = b64decode(api_response["httpResponseBody"])
 
+        if _body_max_size_exceeded(len(body), maxsize, warnsize, request.url):
+            return None
+
         return cls(
             url=api_response["url"],
             status=api_response.get("statusCode") or 200,
@@ -144,14 +154,20 @@ def replace(self, *args, **kwargs):
 
 class ZyteAPIResponse(ZyteAPIMixin, Response):
     @classmethod
-    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+    def from_api_response(
+        cls, api_response: Dict, maxsize: Optional[int], warnsize: Optional[int], *, request: Request = None
+    ):
         """Alternative constructor to instantiate the response from the raw
         Zyte API response.
         """
+        body = b64decode(api_response.get("httpResponseBody") or "")
+        if _body_max_size_exceeded(len(body), maxsize, warnsize, request.url):
+            return None
+
         return cls(
             url=api_response["url"],
             status=api_response.get("statusCode") or 200,
-            body=b64decode(api_response.get("httpResponseBody") or ""),
+            body=body,
             request=request,
             flags=["zyte-api"],
             headers=cls._prepare_headers(api_response),
@@ -166,10 +182,38 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None):
 _API_RESPONSE = Dict[str, _JSON]
 
 
+def _body_max_size_exceeded(
+    body_size: int,
+    maxsize: Optional[int],
+    warnsize: Optional[int],
+    request_url: str,
+) -> bool:
+    """Log when ``body_size`` is over a limit; return ``True`` if over ``maxsize``.
+
+    ``maxsize`` comes before ``warnsize`` to match the positional order used
+    by every caller. A falsy limit (``None`` or ``0``) disables that check.
+    """
+    if warnsize and body_size > warnsize:
+        logger.warning(
+            f"Actual response size {body_size} larger than "
+            f"download warn size {warnsize} in request {request_url}."
+        )
+
+    if maxsize and body_size > maxsize:
+        logger.warning(
+            f"Cancelling download of {request_url}: actual response size "
+            f"{body_size} larger than download max size {maxsize}."
+        )
+        return True
+    return False
+
+
 def _process_response(
     api_response: _API_RESPONSE,
     request: Request,
     cookie_jars: Optional[Dict[Any, CookieJar]],
+    default_maxsize: Optional[int],
+    default_warnsize: Optional[int],
 ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
     """Given a Zyte API Response and the ``scrapy.Request`` that asked for it,
     this returns either a ``ZyteAPITextResponse`` or ``ZyteAPIResponse`` depending
@@ -184,10 +228,13 @@ def _process_response(
 
     _process_cookies(api_response, request, cookie_jars)
 
+    maxsize = request.meta.get("download_maxsize", default_maxsize)
+    warnsize = request.meta.get("download_warnsize", default_warnsize)
+
     if api_response.get("browserHtml"):
         # Using TextResponse because browserHtml always returns a browser-rendered page
         # even when requesting files (like images)
-        return ZyteAPITextResponse.from_api_response(api_response, request=request)
+        return ZyteAPITextResponse.from_api_response(api_response, maxsize, warnsize, request=request)
 
     if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"):
         response_cls = responsetypes.from_args(
@@ -197,6 +244,6 @@ def _process_response(
             body=b64decode(api_response["httpResponseBody"]),  # type: ignore
         )
         if issubclass(response_cls, TextResponse):
-            return ZyteAPITextResponse.from_api_response(api_response, request=request)
+            return ZyteAPITextResponse.from_api_response(api_response, maxsize, warnsize, request=request)
 
-    return ZyteAPIResponse.from_api_response(api_response, request=request)
+    return ZyteAPIResponse.from_api_response(api_response, maxsize, warnsize, request=request)