ZYTE_API_PRESERVE_DELAY added to provide the possibility to set a custom delay #204

Merged (6 commits) on Jun 26, 2024
25 changes: 25 additions & 0 deletions docs/reference/settings.rst
@@ -205,6 +205,31 @@ Note that requests with error responses that cannot be retried or exceed their
 retry limit also count here.


+.. setting:: ZYTE_API_PRESERVE_DELAY
+
+ZYTE_API_PRESERVE_DELAY
+=======================
+
+Default: ``False if`` :setting:`AUTOTHROTTLE_ENABLED
+<scrapy:AUTOTHROTTLE_ENABLED>` ``else True``
+
+By default, requests for which use of scrapy-zyte-api is enabled get
+``zyte-api@`` prepended to their download slot ID, and if
+:setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``True``, the
+corresponding download slot gets its download delay reset to 0. This nullifies
+the effects of the :ref:`AutoThrottle extension <topics-autothrottle>` for Zyte
+API requests, delegating throttling management to Zyte API.
+
+If :setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``False``,
+but you have a download delay set through :setting:`DOWNLOAD_DELAY
+<scrapy:DOWNLOAD_DELAY>` and you do not want that delay to affect Zyte API
+requests, set this setting to ``False``.
+
+If you have :setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>`
+enabled, and you want it to also work on Zyte API requests, set this setting to
+``True``.
+
+
 .. setting:: ZYTE_API_PROVIDER_PARAMS

 ZYTE_API_PROVIDER_PARAMS
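
For illustration, here is a minimal settings.py sketch (hypothetical project values, not part of this diff) for the case described in the new documentation, where AutoThrottle is disabled but a global download delay is set and Zyte API requests should not be slowed down by it:

    # settings.py (hypothetical example values)
    DOWNLOAD_DELAY = 5  # fixed delay for regular (non-Zyte-API) requests
    AUTOTHROTTLE_ENABLED = False

    # With AutoThrottle disabled, ZYTE_API_PRESERVE_DELAY defaults to True,
    # so the 5-second delay would also apply to Zyte API download slots.
    # Setting it to False keeps Zyte API slots at zero delay and leaves
    # throttling to Zyte API itself.
    ZYTE_API_PRESERVE_DELAY = False
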
9 changes: 7 additions & 2 deletions scrapy_zyte_api/_middlewares.py
@@ -21,6 +21,10 @@ def from_crawler(cls, crawler):
     def __init__(self, crawler):
         self._param_parser = _ParamParser(crawler, cookies_enabled=False)
         self._crawler = crawler
+        self._preserve_delay = crawler.settings.getbool(
+            "ZYTE_API_PRESERVE_DELAY",
+            not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"),
+        )

     def slot_request(self, request, spider, force=False):
         if not force and self._param_parser.parse(request) is None:
@@ -31,8 +35,9 @@ def slot_request(self, request, spider, force=False):
         if not isinstance(slot_id, str) or not slot_id.startswith(self._slot_prefix):
             slot_id = f"{self._slot_prefix}{slot_id}"
             request.meta["download_slot"] = slot_id
-        _, slot = downloader._get_slot(request, spider)
-        slot.delay = 0
+        if not self._preserve_delay:
+            _, slot = downloader._get_slot(request, spider)
+            slot.delay = 0


 class ScrapyZyteAPIDownloaderMiddleware(_BaseMiddleware):
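
As a quick sanity check of the default computed in __init__ above, the resolution of ZYTE_API_PRESERVE_DELAY can be reproduced outside the middleware. This is a standalone sketch, not code from this pull request; it only assumes Scrapy's Settings.getbool():

    from scrapy.settings import Settings


    def preserve_delay(settings_dict):
        # Mirrors the __init__ logic above: an explicit ZYTE_API_PRESERVE_DELAY
        # wins; otherwise the default is the inverse of AUTOTHROTTLE_ENABLED.
        settings = Settings(settings_dict)
        return settings.getbool(
            "ZYTE_API_PRESERVE_DELAY",
            not settings.getbool("AUTOTHROTTLE_ENABLED"),
        )


    assert preserve_delay({})
    assert not preserve_delay({"AUTOTHROTTLE_ENABLED": True})
    assert preserve_delay({"AUTOTHROTTLE_ENABLED": True, "ZYTE_API_PRESERVE_DELAY": True})
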
31 changes: 21 additions & 10 deletions tests/test_middlewares.py
@@ -39,16 +39,26 @@ def spider_output_processor(middleware, request, spider):


 @pytest.mark.parametrize(
-    "mw_cls,processor",
+    ["mw_cls", "processor"],
     [
         (ScrapyZyteAPIDownloaderMiddleware, request_processor),
         (ScrapyZyteAPISpiderMiddleware, start_request_processor),
         (ScrapyZyteAPISpiderMiddleware, spider_output_processor),
     ],
 )
+@pytest.mark.parametrize(
+    ["settings", "preserve"],
+    [
+        ({}, True),
+        ({"ZYTE_API_PRESERVE_DELAY": False}, False),
+        ({"ZYTE_API_PRESERVE_DELAY": True}, True),
+        ({"AUTOTHROTTLE_ENABLED": True}, False),
+        ({"AUTOTHROTTLE_ENABLED": True, "ZYTE_API_PRESERVE_DELAY": True}, True),
+    ],
+)
 @ensureDeferred
-async def test_autothrottle_handling(mw_cls, processor):
-    crawler = get_crawler()
+async def test_preserve_delay(mw_cls, processor, settings, preserve):
+    crawler = get_crawler(settings_dict=settings)
     await crawler.crawl("a")
     spider = crawler.spider

@@ -64,13 +74,13 @@ async def test_autothrottle_handling(mw_cls, processor):
     _, slot = crawler.engine.downloader._get_slot(request, spider)
     assert slot.delay == spider.download_delay

-    # On Zyte API requests, the download slot is changed, and its delay is set
-    # to 0.
+    # On Zyte API requests, the download slot is changed, and its delay may be
+    # set to 0 depending on settings.
     request = Request("https://example.com", meta={"zyte_api": {}})
     processor(middleware, request, spider)
     assert request.meta["download_slot"] == "zyte-api@example.com"
     _, slot = crawler.engine.downloader._get_slot(request, spider)
-    assert slot.delay == 0
+    assert slot.delay == (5 if preserve else 0)

     # Requests that happen to already have the right download slot assigned
     # work the same.
@@ -79,17 +89,18 @@ async def test_autothrottle_handling(mw_cls, processor):
     processor(middleware, request, spider)
     assert request.meta["download_slot"] == "zyte-api@example.com"
     _, slot = crawler.engine.downloader._get_slot(request, spider)
-    assert slot.delay == 0
+    assert slot.delay == (5 if preserve else 0)

-    # The slot delay is set to 0 every time a request for the slot is
+    # The slot delay is taken into account every time a request for the slot is
     # processed, so even if it gets changed later on somehow, the downloader
-    # middleware will reset it to 0 again the next time it processes a request.
+    # middleware may reset it to 0 again the next time it processes a request
+    # depending on settings.
     slot.delay = 10
     request = Request("https://example.com", meta={"zyte_api": {}})
     processor(middleware, request, spider)
     assert request.meta["download_slot"] == "zyte-api@example.com"
     _, slot = crawler.engine.downloader._get_slot(request, spider)
-    assert slot.delay == 0
+    assert slot.delay == (10 if preserve else 0)

     await crawler.stop()

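
Assuming a development checkout with the test requirements installed (an assumption, not something stated in this diff), the new parametrizations can be exercised on their own with, for example:

    pytest tests/test_middlewares.py -k test_preserve_delay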