Adding DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE #227

Merged
merged 22 commits on Nov 12, 2024
Changes from 20 commits
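
For context, DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE are standard Scrapy settings for capping response body sizes; this PR makes the Zyte API download handler read and honour them. A minimal sketch of setting them in a project (illustrative values, not defaults):

# settings.py of a Scrapy project using scrapy-zyte-api (illustrative limits).
DOWNLOAD_WARNSIZE = 1 * 1024 * 1024    # log a warning for bodies over 1 MiB
DOWNLOAD_MAXSIZE = 10 * 1024 * 1024    # with this PR, the handler drops bodies over 10 MiB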
37 changes: 36 additions & 1 deletion scrapy_zyte_api/handler.py
@@ -23,6 +23,27 @@
logger = logging.getLogger(__name__)


def _body_max_size_exceeded(
body_size: int,
warnsize: Optional[int],
maxsize: Optional[int],
request_url: str,
) -> bool:
if warnsize and body_size > warnsize:
logger.warning(
f"Actual response size {body_size} larger than "
f"download warn size {warnsize} in request {request_url}."
)

if maxsize and body_size > maxsize:
logger.warning(
f"Dropping the response for {request_url}: actual response size "
f"{body_size} larger than download max size {maxsize}."
)
return True
return False


def _truncate_str(obj, index, text, limit):
if len(text) <= limit:
return
@@ -92,6 +113,9 @@ def __init__(
f"({self._truncate_limit}) is invalid. It must be 0 or a "
f"positive integer."
)
self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")

crawler.signals.connect(self.engine_started, signal=signals.engine_started)
self._crawler = crawler
self._fallback_handler = None
@@ -231,7 +255,18 @@ async def _download_request(
finally:
self._update_stats(api_params)

return _process_response(api_response, request, self._cookie_jars)
process_response = _process_response(
api_response=api_response, request=request, cookie_jars=self._cookie_jars
)
if process_response and _body_max_size_exceeded(
len(process_response.body),
self._default_warnsize,
self._default_maxsize,
request.url,
):
return None

return process_response

def _process_request_error(self, request, error):
detail = (error.parsed.data or {}).get("detail", error.message)
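
For a quick feel of the new helper outside the handler, a hedged sketch (not part of the diff; the import path matches this PR, the sizes are arbitrary). Arguments are passed in the order (body_size, warnsize, maxsize, request_url):

from scrapy_zyte_api.handler import _body_max_size_exceeded

# Over warnsize only: a warning is logged, but the response is kept.
assert _body_max_size_exceeded(1200, 1000, 1500, "http://example.com") is False

# Over maxsize: a warning is logged and True is returned, so
# _download_request returns None instead of the response.
assert _body_max_size_exceeded(1600, 1000, 1500, "http://example.com") is True

# No limits configured: nothing is logged and False is returned.
assert _body_max_size_exceeded(500, None, None, "http://example.com") is False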
104 changes: 103 additions & 1 deletion tests/test_handler.py
@@ -17,7 +17,11 @@
from zyte_api.aio.retry import RetryFactory
from zyte_api.constants import API_URL

from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler
from scrapy_zyte_api.handler import (
ScrapyZyteAPIDownloadHandler,
_body_max_size_exceeded,
)
from scrapy_zyte_api.responses import ZyteAPITextResponse
from scrapy_zyte_api.utils import USER_AGENT

from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, UNSET
@@ -557,3 +561,101 @@ async def test_fallback_setting():
handler = get_download_handler(crawler, "https")
assert isinstance(handler, ScrapyZyteAPIDownloadHandler)
assert isinstance(handler._fallback_handler, HTTPDownloadHandler)


@pytest.mark.parametrize(
"body_size, warnsize, maxsize, expected_result, expected_warnings",
[
# Warning only (exceeds warnsize but not maxsize)
(
1200,
1000,
1500,
False,
[
"Actual response size 1200 larger than download warn size 1000 in request http://example.com."
],
),
# Cancel download (exceeds both warnsize and maxsize)
(
1600,
1000,
1500,
True,
[
"Actual response size 1600 larger than download warn size 1000 in request http://example.com.",
"Dropping the response for http://example.com: actual response size 1600 larger than download max size 1500.",
],
),
# No limits - no warnings expected
(500, None, None, False, []),
],
)
def test_body_max_size_exceeded(
body_size, warnsize, maxsize, expected_result, expected_warnings
):
with mock.patch("scrapy_zyte_api.handler.logger") as logger:
result = _body_max_size_exceeded(
body_size=body_size,
warnsize=warnsize,
maxsize=maxsize,
request_url="http://example.com",
)

assert result == expected_result

if expected_warnings:
for call, expected_warning in zip(
logger.warning.call_args_list, expected_warnings
):
assert call[0][0] == expected_warning
else:
logger.warning.assert_not_called()


@ensureDeferred
@pytest.mark.parametrize(
"body_size, warnsize, maxsize, expect_null",
[
(500, None, None, False), # No limits, should return response
(
1500,
1000,
None,
False,
), # Exceeds warnsize, should log warning but return response
(2500, 1000, 2000, True), # Exceeds maxsize, should return None
(500, 1000, 2000, False), # Within limits, should return response
(
1500,
None,
1000,
True,
), # Exceeds maxsize with no warnsize, should return None
],
)
async def test_download_request_limits(
body_size, warnsize, maxsize, expect_null, mockserver
):
settings: SETTINGS_T = {"DOWNLOAD_WARNSIZE": warnsize, "DOWNLOAD_MAXSIZE": maxsize}
async with make_handler(settings, mockserver.urljoin("/")) as handler:
handler._session = mock.AsyncMock()
handler._session.get.return_value = mock.Mock(body=b"x" * body_size)

mock_api_response = mock.Mock(body=b"x" * body_size)

# Patch the `from_api_response` method of ZyteAPITextResponse only for the test
with mock.patch.object(
ZyteAPITextResponse, "from_api_response", return_value=mock_api_response
):
with mock.patch(
"scrapy_zyte_api.responses._process_response",
return_value=mock_api_response,
):
request = Request("https://example.com")
result = await handler._download_request({}, request, None)

if expect_null:
assert result is None
else:
assert result is not None