Skip to content

Commit

Permalink
Implement ZYTE_API_PRESERVE_DELAY (#204)
Browse files Browse the repository at this point in the history
Co-authored-by: Georgiy Zatserklianyi <[email protected]>
Co-authored-by: Adrián Chaves <[email protected]>
  • Loading branch information
3 people authored Jun 26, 2024
1 parent beaf8ca commit 5856129
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 12 deletions.
25 changes: 25 additions & 0 deletions docs/reference/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,31 @@ Note that requests with error responses that cannot be retried or exceed their
retry limit also count here.


.. setting:: ZYTE_API_PRESERVE_DELAY

ZYTE_API_PRESERVE_DELAY
=======================

Default: ``False if`` :setting:`AUTOTHROTTLE_ENABLED
<scrapy:AUTOTHROTTLE_ENABLED>` ``else True``

By default, requests for which use of scrapy-zyte-api is enabled get
``zyte-api@`` prepended to their download slot ID, and if
:setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``True``, the
corresponding download slot gets its download delay reset to 0. This nullifies
the effects of the :ref:`AutoThrottle extension <topics-autothrottle>` for Zyte
API requests, delegating throttling management to Zyte API.

If :setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``False``,
but you have a download delay set through :setting:`DOWNLOAD_DELAY
<scrapy:DOWNLOAD_DELAY>` and you do not want that delay to affect Zyte API
requests, set this setting to ``False``.

If :setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``True``
and you want AutoThrottle to also apply to Zyte API requests, set this setting
to ``True``.


.. setting:: ZYTE_API_PROVIDER_PARAMS

ZYTE_API_PROVIDER_PARAMS
Expand Down
9 changes: 7 additions & 2 deletions scrapy_zyte_api/_middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ def from_crawler(cls, crawler):
def __init__(self, crawler):
self._param_parser = _ParamParser(crawler, cookies_enabled=False)
self._crawler = crawler
self._preserve_delay = crawler.settings.getbool(
"ZYTE_API_PRESERVE_DELAY",
not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"),
)

def slot_request(self, request, spider, force=False):
if not force and self._param_parser.parse(request) is None:
Expand All @@ -31,8 +35,9 @@ def slot_request(self, request, spider, force=False):
if not isinstance(slot_id, str) or not slot_id.startswith(self._slot_prefix):
slot_id = f"{self._slot_prefix}{slot_id}"
request.meta["download_slot"] = slot_id
_, slot = downloader._get_slot(request, spider)
slot.delay = 0
if not self._preserve_delay:
_, slot = downloader._get_slot(request, spider)
slot.delay = 0


class ScrapyZyteAPIDownloaderMiddleware(_BaseMiddleware):
Expand Down
31 changes: 21 additions & 10 deletions tests/test_middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,26 @@ def spider_output_processor(middleware, request, spider):


@pytest.mark.parametrize(
"mw_cls,processor",
["mw_cls", "processor"],
[
(ScrapyZyteAPIDownloaderMiddleware, request_processor),
(ScrapyZyteAPISpiderMiddleware, start_request_processor),
(ScrapyZyteAPISpiderMiddleware, spider_output_processor),
],
)
@pytest.mark.parametrize(
["settings", "preserve"],
[
({}, True),
({"ZYTE_API_PRESERVE_DELAY": False}, False),
({"ZYTE_API_PRESERVE_DELAY": True}, True),
({"AUTOTHROTTLE_ENABLED": True}, False),
({"AUTOTHROTTLE_ENABLED": True, "ZYTE_API_PRESERVE_DELAY": True}, True),
],
)
@ensureDeferred
async def test_autothrottle_handling(mw_cls, processor):
crawler = get_crawler()
async def test_preserve_delay(mw_cls, processor, settings, preserve):
crawler = get_crawler(settings_dict=settings)
await crawler.crawl("a")
spider = crawler.spider

Expand All @@ -64,13 +74,13 @@ async def test_autothrottle_handling(mw_cls, processor):
_, slot = crawler.engine.downloader._get_slot(request, spider)
assert slot.delay == spider.download_delay

# On Zyte API requests, the download slot is changed, and its delay is set
# to 0.
# On Zyte API requests, the download slot is changed, and its delay may be
# set to 0 depending on settings.
request = Request("https://example.com", meta={"zyte_api": {}})
processor(middleware, request, spider)
assert request.meta["download_slot"] == "[email protected]"
_, slot = crawler.engine.downloader._get_slot(request, spider)
assert slot.delay == 0
assert slot.delay == (5 if preserve else 0)

# Requests that happen to already have the right download slot assigned
# work the same.
Expand All @@ -79,17 +89,18 @@ async def test_autothrottle_handling(mw_cls, processor):
processor(middleware, request, spider)
assert request.meta["download_slot"] == "[email protected]"
_, slot = crawler.engine.downloader._get_slot(request, spider)
assert slot.delay == 0
assert slot.delay == (5 if preserve else 0)

# The slot delay is set to 0 every time a request for the slot is
# The slot delay is taken into account every time a request for the slot is
# processed, so even if it gets changed later on somehow, the downloader
# middleware will reset it to 0 again the next time it processes a request.
# middleware may reset it to 0 again the next time it processes a request
# depending on settings.
slot.delay = 10
request = Request("https://example.com", meta={"zyte_api": {}})
processor(middleware, request, spider)
assert request.meta["download_slot"] == "[email protected]"
_, slot = crawler.engine.downloader._get_slot(request, spider)
assert slot.delay == 0
assert slot.delay == (10 if preserve else 0)

await crawler.stop()

Expand Down

0 comments on commit 5856129

Please sign in to comment.