Count Zyte API requests from the downloader middleware itself (#228)
Gallaecio authored Oct 22, 2024
1 parent b08c735 commit 98fb441
Showing 2 changed files with 55 additions and 21 deletions.
21 changes: 4 additions & 17 deletions scrapy_zyte_api/_middlewares.py
@@ -53,6 +53,7 @@ def __init__(self, crawler) -> None:
                 f"{self._max_requests}. The spider will close when it's "
                 f"reached."
             )
+        self._request_count = 0
 
         crawler.signals.connect(
             self._start_requests_processed, signal=_start_requests_processed
@@ -124,29 +125,15 @@ def process_request(self, request, spider):
         if self._param_parser.parse(request) is None:
             return
 
-        self.slot_request(request, spider, force=True)
-
-        if self._max_requests_reached(self._crawler.engine.downloader):
+        self._request_count += 1
+        if self._max_requests and self._request_count > self._max_requests:
             self._crawler.engine.close_spider(spider, "closespider_max_zapi_requests")
             raise IgnoreRequest(
                 f"The request {request} is skipped as {self._max_requests} max "
                 f"Zyte API requests have been reached."
             )
 
-    def _max_requests_reached(self, downloader) -> bool:
-        if not self._max_requests:
-            return False
-
-        zapi_req_count = self._crawler.stats.get_value("scrapy-zyte-api/processed", 0)
-        download_req_count = sum(
-            [
-                len(slot.transferring)
-                for slot_id, slot in downloader.slots.items()
-                if slot_id.startswith(self._slot_prefix)
-            ]
-        )
-        total_requests = zapi_req_count + download_req_count
-        return total_requests >= self._max_requests
+        self.slot_request(request, spider, force=True)
 
     def process_exception(self, request, exception, spider):
         if (
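The race condition this change addresses: the removed _max_requests_reached() derived its count from the "scrapy-zyte-api/processed" stat plus the requests currently transferring in the downloader slots, so a request that had already passed process_request() but was not yet reflected in either number was invisible to the check, and a burst of concurrent requests could each see a count below the limit and all be allowed through. The new code instead increments self._request_count at the one point every Zyte API request passes. A minimal standalone sketch of that count-then-compare pattern (the MaxRequestsGuard class is an illustration, not part of the commit):

# Hypothetical sketch of the guard logic process_request() now uses:
# increment first, then compare, so no two calls can both observe a
# stale count and slip past the limit together.
class MaxRequestsGuard:
    def __init__(self, max_requests: int) -> None:
        self.max_requests = max_requests  # falsy value disables the limit
        self._request_count = 0

    def allow(self) -> bool:
        self._request_count += 1
        # Mirrors: if self._max_requests and self._request_count > self._max_requests
        return not (self.max_requests and self._request_count > self.max_requests)


guard = MaxRequestsGuard(2)
assert [guard.allow() for _ in range(4)] == [True, True, False, False]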
55 changes: 51 additions & 4 deletions tests/test_middlewares.py
@@ -136,8 +136,8 @@ def start_requests(self):
                 for i in range(spider_requests):
                     meta = {"zyte_api": {"browserHtml": True}}
 
-                    # Alternating requests between ZAPI and non-ZAPI tests if
-                    # ZYTE_API_MAX_REQUESTS solely limits ZAPI Requests.
+                    # Alternating requests between ZAPI and non-ZAPI verifies
+                    # that ZYTE_API_MAX_REQUESTS solely limits ZAPI requests.
 
                     if i % 2:
                         yield Request(
@@ -166,8 +166,8 @@ def parse(self, response):
         f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
         in caplog.text
     )
-    assert crawler.stats.get_value("scrapy-zyte-api/success") <= zapi_max_requests
-    assert crawler.stats.get_value("scrapy-zyte-api/processed") <= zapi_max_requests
+    assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
+    assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
     assert crawler.stats.get_value("item_scraped_count") <= zapi_max_requests + 6
     assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
     assert (
@@ -178,6 +178,53 @@ def parse(self, response):
     )
 
 
+@ensureDeferred
+async def test_max_requests_race_condition(caplog):
+    spider_requests = 8
+    zapi_max_requests = 1
+
+    with MockServer(DelayedResource) as server:
+
+        class TestSpider(Spider):
+            name = "test_spider"
+
+            def start_requests(self):
+                for i in range(spider_requests):
+                    meta = {"zyte_api": {"browserHtml": True}}
+                    yield Request("https://example.com", meta=meta, dont_filter=True)
+
+            def parse(self, response):
+                yield Item()
+
+        settings = {
+            "DOWNLOADER_MIDDLEWARES": {
+                "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633
+            },
+            "ZYTE_API_MAX_REQUESTS": zapi_max_requests,
+            "ZYTE_API_URL": server.urljoin("/"),
+            **SETTINGS,
+        }
+
+        crawler = get_crawler(TestSpider, settings_dict=settings)
+        with caplog.at_level("INFO"):
+            await crawler.crawl()
+
+    assert (
+        f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
+        in caplog.text
+    )
+    assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
+    assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
+    assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests
+    assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
+    assert (
+        crawler.stats.get_value(
+            "downloader/exception_type_count/scrapy.exceptions.IgnoreRequest"
+        )
+        > 0
+    )
+
+
 @ensureDeferred
 async def test_forbidden_domain_start_url():
     class TestSpider(Spider):
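The new test drives eight identical Zyte API requests against a cap of one and asserts that exactly one succeeds, which only holds if the counting is race-free. In a project, the settings the tests build correspond to something like the following sketch (the value 100 is an arbitrary example; the middleware path and priority are taken from the test above):

# Hypothetical settings.py excerpt: after 100 Zyte API requests the crawl
# closes with the "closespider_max_zapi_requests" finish reason; requests
# that do not go through Zyte API are not counted against the limit.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633,
}
ZYTE_API_MAX_REQUESTS = 100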
