
Adding DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE #227

Merged · 22 commits · Nov 12, 2024
Changes from 4 commits
36 changes: 36 additions & 0 deletions docs/reference/settings.rst
@@ -206,6 +206,42 @@ when :setting:`ZYTE_API_LOG_REQUESTS` is enabled, excluding object keys.
To disable truncation, set this to ``0``.


.. setting:: DOWNLOAD_MAXSIZE

DOWNLOAD_MAXSIZE
================
Member:
I don't think we need to copy the documentation here; we should just mention that these two standard Scrapy settings are supported.


Default: ``1073741824`` (1 GiB)

The maximum response body size (in bytes) allowed. Bigger responses are
aborted and ignored.

This applies both before and after compression. If decompressing a response
body would exceed this limit, decompression is aborted and the response is
ignored.

Use ``0`` to disable this limit.

This limit can be set per spider using the :attr:`download_maxsize` spider
attribute and per request using the :reqmeta:`download_maxsize` Request.meta
key.

.. setting:: DOWNLOAD_WARNSIZE

DOWNLOAD_WARNSIZE
=================

Default: ``33554432`` (32 MiB)

If the size of a response exceeds this value, before or after compression, a
warning will be logged about it.

Use ``0`` to disable this limit.

This limit can be set per spider using the :attr:`download_warnsize` spider
attribute and per request using the :reqmeta:`download_warnsize` Request.meta
key.
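For example, both limits can be overridden for a single request through ``Request.meta`` (a minimal sketch; the URL and spider name are placeholders):

.. code-block:: python

    import scrapy


    class BigPageSpider(scrapy.Spider):
        name = "big_page"

        def start_requests(self):
            yield scrapy.Request(
                "https://example.com/large-page",
                meta={
                    # Allow up to 64 MiB for this request only.
                    "download_maxsize": 64 * 1024 * 1024,
                    # Log a warning once the response exceeds 16 MiB.
                    "download_warnsize": 16 * 1024 * 1024,
                },
            )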

.. setting:: ZYTE_API_MAX_COOKIES

ZYTE_API_MAX_COOKIES
7 changes: 6 additions & 1 deletion scrapy_zyte_api/handler.py
@@ -92,6 +92,9 @@ def __init__(
f"({self._truncate_limit}) is invalid. It must be 0 or a "
f"positive integer."
)
self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")

crawler.signals.connect(self.engine_started, signal=signals.engine_started)
self._crawler = crawler
self._fallback_handler = None
@@ -231,7 +234,9 @@ async def _download_request(
finally:
self._update_stats(api_params)

return _process_response(api_response, request, self._cookie_jars)
return _process_response(
api_response, request, self._cookie_jars, self._default_maxsize, self._default_warnsize
)

def _process_request_error(self, request, error):
detail = (error.parsed.data or {}).get("detail", error.message)
65 changes: 65 additions & 0 deletions scrapy_zyte_api/responses.py
@@ -1,3 +1,5 @@
import logging

from base64 import b64decode
from copy import copy
from datetime import datetime
@@ -15,6 +17,9 @@
_RESPONSE_HAS_PROTOCOL,
)

logger = logging.getLogger(__name__)


_DEFAULT_ENCODING = "utf-8"


@@ -166,10 +171,67 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None):
_API_RESPONSE = Dict[str, _JSON]


def _check_response_size_limits(
expected_size: int,
warnsize: Optional[int],
maxsize: Optional[int],
request_url: str,
) -> bool:
if warnsize and expected_size > warnsize:
logger.warning(
f"Expected response size {expected_size} larger than "
f"download warn size {warnsize} in request {request_url}."
)

if maxsize and expected_size > maxsize:
logger.warning(
f"Cancelling download of {request_url}: expected response size "
f"{expected_size} larger than download max size {maxsize}."
)
return False
return True


def _response_max_size_exceeded(
api_response: _API_RESPONSE,
request: Request,
default_maxsize: Optional[int],
default_warnsize: Optional[int],
) -> bool:
maxsize = request.meta.get("download_maxsize", default_maxsize)
warnsize = request.meta.get("download_warnsize", default_warnsize)

if "browserHtml" in api_response:
expected_size = len(api_response["browserHtml"].encode(_DEFAULT_ENCODING))

Member:
Here while trying to limit the memory using DOWNLOAD_MAXSIZE we might create an additional memory spike, because we create another duplicate of browserHtml in memory, temporarily.

It seems we need to ensure _response_max_size_exceeded never makes copies of large data from the response.

Member:
I think you can either use sys.getsizeof (and maybe subtract the fixed overhead Python Unicode objects have), or consider the length of the unicode object instead of the length of the actual data; it could be good enough as well (though worse). Maybe there is some other solution.
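A rough sketch of the two estimates suggested above (``api_response`` is assumed to hold the decoded Zyte API JSON; neither option copies or re-encodes the HTML):

```python
import sys

html = api_response["browserHtml"]  # a Python str

# Option 1: memory footprint of the str object, minus the fixed overhead
# of an empty str, as an approximation of the data size.
approx_size = sys.getsizeof(html) - sys.getsizeof("")

# Option 2: character count; cheaper still, but undercounts multi-byte
# UTF-8 characters compared to len(html.encode("utf-8")).
approx_size = len(html)
```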

Collaborator Author (@PyExplorer), Oct 18, 2024:
We also calculate the size of the response body here
https://github.com/scrapy-plugins/scrapy-zyte-api/blob/main/scrapy_zyte_api/responses.py#L114
and here
https://github.com/scrapy-plugins/scrapy-zyte-api/blob/main/scrapy_zyte_api/responses.py#L145
What do you think if we just move the check into these two functions and check separately? Then return None if the size is too big.
In this case we would only additionally need to check content-length in ZyteAPIResponse.


Collaborator Author (@PyExplorer), Oct 18, 2024:

Another approach is to calculate the size and the decoded/encoded version of the response body here, before calling from_api_response, and pass the prepared body to from_api_response. In this case we make this expensive calculation only once and can also use the calculated body here: https://github.com/scrapy-plugins/scrapy-zyte-api/blob/main/scrapy_zyte_api/responses.py#L197.

Member (@kmike), Oct 18, 2024:

I was also thinking about moving it down the stack - check the size of the API response received by the client library before the json decoding.

But it could make the library less compatible with Scrapy. Let's say you have an existing spider, which uses some download limit. You switch to scrapy-zyte-api for downloads, and maybe also enable data extraction. But the API response is larger than that raw response size. So, the limit becomes more aggressive, and you might drop some pages which were working before.

Because of this, the approach you've taken originally - checking httpResponseBody size and browserHtml size, ignoring everything else (e.g. structured data sizes, or screenshot sizes) makes sense to me.


Collaborator Author (@PyExplorer), Oct 18, 2024:

@kmike a new version is here (in this PR).
Now the number of encoding/decoding operations is the same as before. No additional calculations except for getting the length.

elif api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"):
expected_size = None
for header in api_response.get("httpResponseHeaders"):
if header["name"].lower() == "content-length":
expected_size = int(header["value"])
break

Member:
Why is computing expected size needed? We already got the httpResponseBody, and can check its real size.

Collaborator Author:
The idea was to check it first (as this is faster), and if "content-length" exceeds the limit, return without checking the length of the real body.

Member (@kmike), Oct 18, 2024:
If you don't decode from base64, but use 0.75 approximation, then using content-length will not be any faster - it'd be slower, and also less reliable, as content-length might lie.

Collaborator Author:
Do you think we should remove this content-length check entirely?
Actually, I added it because we are supposed to check both compressed and decompressed data (this is also mentioned for DOWNLOAD_MAXSIZE in Scrapy), and the only way I found to check the compressed size was to check content-length.

Member:
Yes, drop it. In Scrapy it's different, because by checking content-length Scrapy can prevent the download before it happens.

For decompression, there is also special support in Scrapy; it's unrelated to content-length. Scrapy decompresses in chunks and keeps track of the total size of the decompressed data. If the size grows over the limit, an exception is raised and decompression is stopped. See https://github.com/scrapy/scrapy/blob/6d65708cb7f7b49b72fc17486fecfc1caf62e0af/scrapy/utils/_compression.py#L53. This also looks like something we can't apply here.
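For reference, a minimal sketch of that chunked pattern (assuming raw zlib input; Scrapy's real helper differs in detail and, as noted above, doesn't apply here because Zyte API already returns decompressed data):

```python
import zlib
from io import BytesIO

_CHUNK_SIZE = 16 * 1024  # decompress input in 16 KiB slices


class DecompressionMaxSizeExceeded(ValueError):
    pass


def decompress_with_limit(data: bytes, max_size: int = 0) -> bytes:
    """Decompress zlib data chunk by chunk, aborting as soon as the total
    decompressed size grows past max_size (0 disables the limit)."""
    decompressor = zlib.decompressobj()
    input_stream = BytesIO(data)
    output = bytearray()
    while True:
        input_chunk = input_stream.read(_CHUNK_SIZE)
        output.extend(decompressor.decompress(input_chunk))
        if max_size and len(output) > max_size:
            raise DecompressionMaxSizeExceeded(
                f"decompressed size exceeded the {max_size}-byte limit"
            )
        if not input_chunk:  # end of input reached
            output.extend(decompressor.flush())
            return bytes(output)
```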

Collaborator Author:
Got it, thanks!


if expected_size is None or (
(maxsize and expected_size < maxsize)
and (warnsize and expected_size < warnsize)
):
expected_size = len(b64decode(api_response.get("httpResponseBody", b"")))

Member:
Is there a way to get size of base64 data without decoding it? Decoding can be costly.

Contributor:
*.75

Contributor:
(assuming no linebreaks or ignoring them)

Member:
It looks fine not to be byte-precise.
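A minimal sketch of that estimate (assumes standard base64 with ``=`` padding; linebreaks, if present, are stripped first):

```python
def approx_base64_decoded_size(encoded: str) -> int:
    """Estimate the decoded size of base64 data without decoding it:
    every 4 base64 characters encode 3 bytes, minus the '=' padding."""
    stripped = encoded.replace("\n", "").replace("\r", "")
    return (len(stripped) * 3) // 4 - stripped.count("=")


# e.g. expected_size = approx_base64_decoded_size(api_response["httpResponseBody"])
```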

else:
return False

if expected_size is not None and not _check_response_size_limits(
expected_size, warnsize, maxsize, request.url
):
return True

return False


def _process_response(
api_response: _API_RESPONSE,
request: Request,
cookie_jars: Optional[Dict[Any, CookieJar]],
default_maxsize: Optional[int],
default_warnsize: Optional[int],
) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
"""Given a Zyte API Response and the ``scrapy.Request`` that asked for it,
this returns either a ``ZyteAPITextResponse`` or ``ZyteAPIResponse`` depending
@@ -184,6 +246,9 @@

_process_cookies(api_response, request, cookie_jars)

if _response_max_size_exceeded(api_response, request, default_maxsize, default_warnsize):
return None

if api_response.get("browserHtml"):
# Using TextResponse because browserHtml always returns a browser-rendered page
# even when requesting files (like images)