From 98fb441ea45722b4c534ed1bc2b6edf24adad684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Oct 2024 14:24:33 +0200 Subject: [PATCH] Count Zyte API requests from the downloader middleware itself (#228) --- scrapy_zyte_api/_middlewares.py | 21 +++---------- tests/test_middlewares.py | 55 ++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index 6a3a84c3..665289ce 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -53,6 +53,7 @@ def __init__(self, crawler) -> None: f"{self._max_requests}. The spider will close when it's " f"reached." ) + self._request_count = 0 crawler.signals.connect( self._start_requests_processed, signal=_start_requests_processed @@ -124,29 +125,15 @@ def process_request(self, request, spider): if self._param_parser.parse(request) is None: return - self.slot_request(request, spider, force=True) - - if self._max_requests_reached(self._crawler.engine.downloader): + self._request_count += 1 + if self._max_requests and self._request_count > self._max_requests: self._crawler.engine.close_spider(spider, "closespider_max_zapi_requests") raise IgnoreRequest( f"The request {request} is skipped as {self._max_requests} max " f"Zyte API requests have been reached." ) - def _max_requests_reached(self, downloader) -> bool: - if not self._max_requests: - return False - - zapi_req_count = self._crawler.stats.get_value("scrapy-zyte-api/processed", 0) - download_req_count = sum( - [ - len(slot.transferring) - for slot_id, slot in downloader.slots.items() - if slot_id.startswith(self._slot_prefix) - ] - ) - total_requests = zapi_req_count + download_req_count - return total_requests >= self._max_requests + self.slot_request(request, spider, force=True) def process_exception(self, request, exception, spider): if ( diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 698b3c37..7804bca3 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -136,8 +136,8 @@ def start_requests(self): for i in range(spider_requests): meta = {"zyte_api": {"browserHtml": True}} - # Alternating requests between ZAPI and non-ZAPI tests if - # ZYTE_API_MAX_REQUESTS solely limits ZAPI Requests. + # Alternating requests between ZAPI and non-ZAPI verifies + # that ZYTE_API_MAX_REQUESTS solely limits ZAPI requests. if i % 2: yield Request( @@ -166,8 +166,8 @@ def parse(self, response): f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}" in caplog.text ) - assert crawler.stats.get_value("scrapy-zyte-api/success") <= zapi_max_requests - assert crawler.stats.get_value("scrapy-zyte-api/processed") <= zapi_max_requests + assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests + assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests assert crawler.stats.get_value("item_scraped_count") <= zapi_max_requests + 6 assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests" assert ( @@ -178,6 +178,53 @@ def parse(self, response): ) +@ensureDeferred +async def test_max_requests_race_condition(caplog): + spider_requests = 8 + zapi_max_requests = 1 + + with MockServer(DelayedResource) as server: + + class TestSpider(Spider): + name = "test_spider" + + def start_requests(self): + for i in range(spider_requests): + meta = {"zyte_api": {"browserHtml": True}} + yield Request("https://example.com", meta=meta, dont_filter=True) + + def parse(self, response): + yield Item() + + settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633 + }, + "ZYTE_API_MAX_REQUESTS": zapi_max_requests, + "ZYTE_API_URL": server.urljoin("/"), + **SETTINGS, + } + + crawler = get_crawler(TestSpider, settings_dict=settings) + with caplog.at_level("INFO"): + await crawler.crawl() + + assert ( + f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}" + in caplog.text + ) + assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests + assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests + assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests + assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests" + assert ( + crawler.stats.get_value( + "downloader/exception_type_count/scrapy.exceptions.IgnoreRequest" + ) + > 0 + ) + + @ensureDeferred async def test_forbidden_domain_start_url(): class TestSpider(Spider):