Count Zyte API requests from the downloader middleware itself #228

Merged · 2 commits · Oct 22, 2024
21 changes: 4 additions & 17 deletions scrapy_zyte_api/_middlewares.py
@@ -53,6 +53,7 @@ def __init__(self, crawler) -> None:
f"{self._max_requests}. The spider will close when it's "
f"reached."
)
self._request_count = 0

crawler.signals.connect(
self._start_requests_processed, signal=_start_requests_processed
Expand Down Expand Up @@ -124,29 +125,15 @@ def process_request(self, request, spider):
         if self._param_parser.parse(request) is None:
             return
 
-        self.slot_request(request, spider, force=True)
-
-        if self._max_requests_reached(self._crawler.engine.downloader):
+        self._request_count += 1
+        if self._max_requests and self._request_count > self._max_requests:
             self._crawler.engine.close_spider(spider, "closespider_max_zapi_requests")
             raise IgnoreRequest(
                 f"The request {request} is skipped as {self._max_requests} max "
                 f"Zyte API requests have been reached."
             )
 
-    def _max_requests_reached(self, downloader) -> bool:
-        if not self._max_requests:
-            return False
-
-        zapi_req_count = self._crawler.stats.get_value("scrapy-zyte-api/processed", 0)
-        download_req_count = sum(
-            [
-                len(slot.transferring)
-                for slot_id, slot in downloader.slots.items()
-                if slot_id.startswith(self._slot_prefix)
-            ]
-        )
-        total_requests = zapi_req_count + download_req_count
-        return total_requests >= self._max_requests
+        self.slot_request(request, spider, force=True)
Contributor Author (review comment on the moved slot_request() line): Moved after the count to skip this part if the request is going to be ignored anyway.

     def process_exception(self, request, exception, spider):
         if (
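Why the diff above matters: the removed _max_requests_reached() derived its total from the scrapy-zyte-api/processed stat plus the requests currently transferring in the Zyte API downloader slots. Both figures update later in the request lifecycle than process_request(), so a burst of concurrent requests could each pass the check before any of them was reflected in either number, letting the crawl overshoot ZYTE_API_MAX_REQUESTS. Counting inside process_request() itself closes that window. Below is a minimal standalone sketch of the two strategies; DerivedCount and InlineCount are illustrative names, not classes from this repository:

# Sketch: why deriving the count races, while counting inline does not.
# DerivedCount/InlineCount are hypothetical stand-ins, not repo code.

class DerivedCount:
    """Old approach: recompute the total from lagging sources."""

    def __init__(self, max_requests: int) -> None:
        self.max_requests = max_requests
        self.processed = 0  # akin to the "scrapy-zyte-api/processed" stat
        self.in_flight = 0  # akin to len(slot.transferring) per slot

    def allow(self) -> bool:
        # Neither counter moves until later in the request lifecycle, so
        # several queued requests can all pass this check first.
        return self.processed + self.in_flight < self.max_requests


class InlineCount:
    """New approach: bump the counter at the decision point."""

    def __init__(self, max_requests: int) -> None:
        self.max_requests = max_requests
        self.count = 0

    def allow(self) -> bool:
        # Check and increment happen in one step, so at most
        # max_requests calls ever return True.
        self.count += 1
        return self.count <= self.max_requests

With max_requests set to 1 and neither lagging source yet updated, eight back-to-back DerivedCount.allow() calls all return True, while exactly one InlineCount.allow() call does — the behavior the new test below pins down.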
55 changes: 51 additions & 4 deletions tests/test_middlewares.py
@@ -136,8 +136,8 @@ def start_requests(self):
                 for i in range(spider_requests):
                     meta = {"zyte_api": {"browserHtml": True}}
 
-                    # Alternating requests between ZAPI and non-ZAPI tests if
-                    # ZYTE_API_MAX_REQUESTS solely limits ZAPI Requests.
+                    # Alternating requests between ZAPI and non-ZAPI verifies
+                    # that ZYTE_API_MAX_REQUESTS solely limits ZAPI requests.
 
                     if i % 2:
                         yield Request(
@@ -166,8 +166,8 @@ def parse(self, response):
f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
in caplog.text
)
assert crawler.stats.get_value("scrapy-zyte-api/success") <= zapi_max_requests
assert crawler.stats.get_value("scrapy-zyte-api/processed") <= zapi_max_requests
assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
assert crawler.stats.get_value("item_scraped_count") <= zapi_max_requests + 6
assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
assert (
@@ -178,6 +178,53 @@
         )
 
 
+@ensureDeferred
+async def test_max_requests_race_condition(caplog):
+    spider_requests = 8
+    zapi_max_requests = 1
+
+    with MockServer(DelayedResource) as server:
+
+        class TestSpider(Spider):
+            name = "test_spider"
+
+            def start_requests(self):
+                for i in range(spider_requests):
+                    meta = {"zyte_api": {"browserHtml": True}}
+                    yield Request("https://example.com", meta=meta, dont_filter=True)
+
+            def parse(self, response):
+                yield Item()
+
+        settings = {
+            "DOWNLOADER_MIDDLEWARES": {
+                "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633
+            },
+            "ZYTE_API_MAX_REQUESTS": zapi_max_requests,
+            "ZYTE_API_URL": server.urljoin("/"),
+            **SETTINGS,
+        }
+
+        crawler = get_crawler(TestSpider, settings_dict=settings)
+        with caplog.at_level("INFO"):
+            await crawler.crawl()
+
+        assert (
+            f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
+            in caplog.text
+        )
+        assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
+        assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
+        assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests
+        assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
+        assert (
+            crawler.stats.get_value(
+                "downloader/exception_type_count/scrapy.exceptions.IgnoreRequest"
+            )
+            > 0
+        )
+
+
 @ensureDeferred
 async def test_forbidden_domain_start_url():
     class TestSpider(Spider):
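A note on the new test: it relies on the mock server's DelayedResource to hold every response open long enough for all eight requests to reach process_request() before any completes; with ZYTE_API_MAX_REQUESTS set to 1, the inline counter must admit exactly one of them. DelayedResource itself is defined in the test suite's mock server and is not shown in this diff; the following is only a guess at the general shape such a Twisted resource could take (class name, delay, and response body are assumptions, not the repository's code):

# Hypothetical delayed endpoint: holds each reply open briefly so that
# concurrent requests overlap in flight. Not the repo's DelayedResource.
from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import NOT_DONE_YET


class SlowEndpoint(Resource):
    isLeaf = True

    def render_POST(self, request):
        def respond():
            # Minimal Zyte-API-like JSON body; contents are illustrative.
            request.write(
                b'{"url": "https://example.com", "browserHtml": "<html></html>"}'
            )
            request.finish()

        # Answer after a short delay instead of immediately, so several
        # requests can be in flight at once.
        reactor.callLater(0.2, respond)
        return NOT_DONE_YET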