diff --git a/docs/setup.rst b/docs/setup.rst
index 96304a4..0d97881 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -70,6 +70,14 @@ The following additional settings are recommended:
   ``"zyte_crawlers.middlewares.CrawlingLogsMiddleware": 1000``, to log crawl
   data in JSON format for debugging purposes.
 
+- Update :setting:`DOWNLOADER_MIDDLEWARES` to
+  include
+  ``"zyte_spider_templates.middlewares.ForbiddenDomainDownloaderMiddleware":
+  1100`` and :setting:`SPIDER_MIDDLEWARES` to
+  include
+  ``"zyte_spider_templates.middlewares.ForbiddenDomainSpiderMiddleware":
+  100``.
+
 For an example of a properly configured ``settings.py`` file, see `the one
 in zyte-spider-templates-project`_.
 
diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py
index 6c2ecac..5fb30bb 100644
--- a/zyte_spider_templates/middlewares.py
+++ b/zyte_spider_templates/middlewares.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict
 
 from scrapy import Request
-from scrapy.exceptions import CloseSpider, ScrapyDeprecationWarning
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.utils.request import request_fingerprint
 from zyte_api.aio.errors import RequestError
 
@@ -112,7 +112,7 @@ def crawl_logs(self, response, result):
         return "\n".join(report)
 
 
-start_requests_processed = object()
+_start_requests_processed = object()
 
 
 class ForbiddenDomainSpiderMiddleware:
@@ -133,7 +133,7 @@ def process_start_requests(self, start_requests, spider):
             request.meta["is_start_request"] = True
             yield request
             count += 1
-        self._send_signal(start_requests_processed, count=count)
+        self._send_signal(_start_requests_processed, count=count)
 
 
 class ForbiddenDomainDownloaderMiddleware:
@@ -148,12 +148,13 @@ def __init__(self, crawler):
         self._failed_start_request_count = 0
         self._total_start_request_count = 0
         crawler.signals.connect(
-            self.start_requests_processed, signal=start_requests_processed
+            self._start_requests_processed, signal=_start_requests_processed
         )
+        self._crawler = crawler
 
-    def start_requests_processed(self, count):
+    def _start_requests_processed(self, count):
         self._total_start_request_count = count
-        self.maybe_close()  # TODO: Ensure that raising here works.
+        self._maybe_close()
 
     def process_exception(self, request, exception, spider):
         if (
@@ -164,16 +165,17 @@ def process_exception(self, request, exception, spider):
             return
 
         self._failed_start_request_count += 1
+        self._maybe_close()
 
+    def _maybe_close(self):
         if not self._total_start_request_count:
             return
-        else:
-            self.maybe_close()  # TODO: Ensure that raising here works.
-
-    def maybe_close(self):
-        if self._failed_start_request_count >= self._total_start_request_count:
-            logger.error(
-                "Stopping the spider, all start request failed because they "
-                "were pointing to a domain forbidden by Zyte API."
-            )
-            raise CloseSpider("failed-forbidden-domain")
+        if self._failed_start_request_count < self._total_start_request_count:
+            return
+        logger.error(
+            "Stopping the spider, all start requests failed because they "
+            "were pointing to a domain forbidden by Zyte API."
+        )
+        self._crawler.engine.close_spider(
+            self._crawler.spider, "failed-forbidden-domain"
+        )
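
For reference, a minimal sketch of the `settings.py` entries that the docs hunk above describes. The two middleware paths and their priority values come from this diff; treating them as the only entries in each dict is an assumption, and a real project would merge them into its existing middleware dicts (e.g. alongside the `CrawlingLogsMiddleware` entry mentioned in the surrounding docs, whose setting dict is not shown in this hunk).

```python
# Sketch of the settings.py additions described in docs/setup.rst above.
# The middleware paths and priorities are taken from this diff; everything
# else about the project settings is assumed.
DOWNLOADER_MIDDLEWARES = {
    "zyte_spider_templates.middlewares.ForbiddenDomainDownloaderMiddleware": 1100,
}
SPIDER_MIDDLEWARES = {
    "zyte_spider_templates.middlewares.ForbiddenDomainSpiderMiddleware": 100,
}
```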
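The `middlewares.py` changes do two things: the signal object and the handler methods gain leading underscores, marking them module-private, and the spider is now stopped via `crawler.engine.close_spider()` instead of `raise CloseSpider(...)`. Scrapy documents `CloseSpider` as raisable from spider callbacks, not from a signal handler or a downloader middleware's `process_exception()`, which is presumably why the old code carried the `# TODO: Ensure that raising here works.` comments; calling the engine directly sidesteps that. Below is a minimal sketch of the custom-signal pattern the two middlewares use to communicate. The handler and the count are illustrative, not taken from the diff, and the diff's `_send_signal()` helper is not shown here; this sketch assumes it wraps something like `SignalManager.send_catch_log()`.

```python
from scrapy.signalmanager import SignalManager

# A custom Scrapy signal is just a unique sentinel object; the leading
# underscore added in this diff marks it as private to the module.
_start_requests_processed = object()

signals = SignalManager()


def _on_start_requests_processed(count):
    # Illustrative handler: in the real downloader middleware this stores
    # the total and may close the spider once every start request failed.
    print(f"start requests processed: {count}")


# The same connect/send pair the two middlewares use: the spider middleware
# sends the signal after yielding all start requests, and the downloader
# middleware receives the final count.
signals.connect(_on_start_requests_processed, signal=_start_requests_processed)
signals.send_catch_log(_start_requests_processed, count=3)
```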