From d701f48e5e6038e89aef1484630a580e58bc6b22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 30 Nov 2023 08:32:29 +0100 Subject: [PATCH] Fail on SPM + transparent mode (#152) --- .github/workflows/test.yml | 5 ++ scrapy_zyte_api/_middlewares.py | 55 +++++++++++++++++ tests/test_middlewares.py | 105 ++++++++++++++++++++++++++++++++ tox.ini | 14 +++++ 4 files changed, 179 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ee74037b..e683c9cf 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -39,6 +39,11 @@ jobs: - python-version: '3.11' toxenv: provider + - python-version: '3.7' + toxenv: pinned-extra + - python-version: '3.11' + toxenv: extra + steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index 0c53cbe1..629aed4f 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -1,5 +1,7 @@ import logging +from typing import cast +from scrapy import signals from scrapy.exceptions import IgnoreRequest from zyte_api.aio.errors import RequestError @@ -32,10 +34,63 @@ def __init__(self, crawler) -> None: f"reached." ) + crawler.signals.connect(self.open_spider, signal=signals.spider_opened) crawler.signals.connect( self._start_requests_processed, signal=_start_requests_processed ) + def _get_spm_mw(self): + spm_mw_classes = [] + + try: + from scrapy_crawlera import CrawleraMiddleware + except ImportError: + pass + else: + spm_mw_classes.append(CrawleraMiddleware) + + try: + from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware + except ImportError: + pass + else: + spm_mw_classes.append(ZyteSmartProxyMiddleware) + + middlewares = self._crawler.engine.downloader.middleware.middlewares + for middleware in middlewares: + if isinstance(middleware, tuple(spm_mw_classes)): + return middleware + return None + + def open_spider(self, spider): + settings = self._crawler.settings + in_transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) + spm_mw = self._get_spm_mw() + spm_is_enabled = spm_mw and spm_mw.is_enabled(spider) + if not in_transparent_mode or not spm_is_enabled: + return + logger.error( + "Both scrapy-zyte-smartproxy and the transparent mode of " + "scrapy-zyte-api are enabled. You should only enable one of " + "those at the same time.\n" + "\n" + "To combine requests that use scrapy-zyte-api and requests " + "that use scrapy-zyte-smartproxy in the same spider:\n" + "\n" + "1. Leave scrapy-zyte-smartproxy enabled.\n" + "2. Disable the transparent mode of scrapy-zyte-api.\n" + "3. To send a specific request through Zyte API, use " + "request.meta to set dont_proxy to True and zyte_api_automap " + "either to True or to a dictionary of extra request fields." + ) + from twisted.internet import reactor + from twisted.internet.interfaces import IReactorCore + + reactor = cast(IReactorCore, reactor) + reactor.callLater( + 0, self._crawler.engine.close_spider, spider, "plugin_conflict" + ) + def _start_requests_processed(self, count): self._total_start_request_count = count self._maybe_close() diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index bb882c93..2305d22a 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -1,3 +1,7 @@ +from typing import Any, Dict, cast +from unittest import SkipTest + +from packaging.version import Version from pytest_twisted import ensureDeferred from scrapy import Request, Spider from scrapy.item import Item @@ -10,6 +14,10 @@ from .mockserver import DelayedResource, MockServer +class NamedSpider(Spider): + name = "named" + + @ensureDeferred async def test_autothrottle_handling(): crawler = get_crawler() @@ -259,3 +267,100 @@ def parse(self, response): await crawler.crawl() assert crawler.stats.get_value("finish_reason") == "failed_forbidden_domain" + + +@ensureDeferred +async def test_spm_conflict_smartproxy(): + try: + import scrapy_zyte_smartproxy # noqa: F401 + except ImportError: + raise SkipTest("scrapy-zyte-smartproxy missing") + + for setting, attribute, conflict in ( + (None, None, False), + (None, False, False), + (None, True, True), + (False, None, False), + (False, False, False), + (False, True, True), + (True, None, True), + (True, False, False), + (True, True, True), + ): + + class SPMSpider(Spider): + name = "spm_spider" + + if attribute is not None: + SPMSpider.zyte_smartproxy_enabled = attribute + + settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_SMARTPROXY_APIKEY": "foo", + **SETTINGS, + } + mws = dict(cast(Dict[Any, int], settings["DOWNLOADER_MIDDLEWARES"])) + mws["scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware"] = 610 + settings["DOWNLOADER_MIDDLEWARES"] = mws + + if setting is not None: + settings["ZYTE_SMARTPROXY_ENABLED"] = setting + + crawler = get_crawler(SPMSpider, settings_dict=settings) + await crawler.crawl() + expected = "plugin_conflict" if conflict else "finished" + assert crawler.stats.get_value("finish_reason") == expected, ( + setting, + attribute, + conflict, + ) + + +@ensureDeferred +async def test_spm_conflict_crawlera(): + try: + import scrapy_crawlera # noqa: F401 + except ImportError: + raise SkipTest("scrapy-crawlera missing") + else: + SCRAPY_CRAWLERA_VERSION = Version(scrapy_crawlera.__version__) + + for setting, attribute, conflict in ( + (None, None, False), + (None, False, False), + (None, True, True), + (False, None, False), + (False, False, False), + (False, True, True), + (True, None, True), + # https://github.com/scrapy-plugins/scrapy-zyte-smartproxy/commit/49ebedd8b1d48cf2667db73f18da3e2c2c7fbfa7 + (True, False, SCRAPY_CRAWLERA_VERSION < Version("1.7")), + (True, True, True), + ): + + class CrawleraSpider(Spider): + name = "crawlera_spider" + + if attribute is not None: + CrawleraSpider.crawlera_enabled = attribute + + settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + "CRAWLERA_APIKEY": "foo", + **SETTINGS, + } + mws = dict(cast(Dict[Any, int], settings["DOWNLOADER_MIDDLEWARES"])) + mws["scrapy_crawlera.CrawleraMiddleware"] = 610 + settings["DOWNLOADER_MIDDLEWARES"] = mws + + if setting is not None: + settings["CRAWLERA_ENABLED"] = setting + + crawler = get_crawler(CrawleraSpider, settings_dict=settings) + await crawler.crawl() + expected = "plugin_conflict" if conflict else "finished" + assert crawler.stats.get_value("finish_reason") == expected, ( + setting, + attribute, + conflict, + ) diff --git a/tox.ini b/tox.ini index eb006d0e..1f6c28e7 100644 --- a/tox.ini +++ b/tox.ini @@ -92,6 +92,20 @@ deps = web-poet==0.13.0 zyte-common-items==0.7.0 +[testenv:pinned-extra] +basepython=python3.7 +deps = + {[testenv:pinned-scrapy-2x0]deps} + scrapy-crawlera==1.1.0 + scrapy-zyte-smartproxy==2.0.0 + +[testenv:extra] +basepython=python3.11 +deps = + {[testenv]deps} + scrapy-crawlera + scrapy-zyte-smartproxy + [testenv:mypy] deps = mypy==1.4.1