Fail on SPM + transparent mode (#152)
Gallaecio authored Nov 30, 2023
1 parent 702cc63 commit d701f48
Showing 4 changed files with 179 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/test.yml
@@ -39,6 +39,11 @@ jobs:
          - python-version: '3.11'
            toxenv: provider

          - python-version: '3.7'
            toxenv: pinned-extra
          - python-version: '3.11'
            toxenv: extra

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
55 changes: 55 additions & 0 deletions scrapy_zyte_api/_middlewares.py
@@ -1,5 +1,7 @@
import logging
from typing import cast

from scrapy import signals
from scrapy.exceptions import IgnoreRequest
from zyte_api.aio.errors import RequestError

@@ -32,10 +34,63 @@ def __init__(self, crawler) -> None:
f"reached."
)

crawler.signals.connect(self.open_spider, signal=signals.spider_opened)
crawler.signals.connect(
self._start_requests_processed, signal=_start_requests_processed
)

def _get_spm_mw(self):
spm_mw_classes = []

try:
from scrapy_crawlera import CrawleraMiddleware
except ImportError:
pass
else:
spm_mw_classes.append(CrawleraMiddleware)

try:
from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware
except ImportError:
pass
else:
spm_mw_classes.append(ZyteSmartProxyMiddleware)

middlewares = self._crawler.engine.downloader.middleware.middlewares
for middleware in middlewares:
if isinstance(middleware, tuple(spm_mw_classes)):
return middleware
return None

def open_spider(self, spider):
settings = self._crawler.settings
in_transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False)
spm_mw = self._get_spm_mw()
spm_is_enabled = spm_mw and spm_mw.is_enabled(spider)
if not in_transparent_mode or not spm_is_enabled:
return
logger.error(
"Both scrapy-zyte-smartproxy and the transparent mode of "
"scrapy-zyte-api are enabled. You should only enable one of "
"those at the same time.\n"
"\n"
"To combine requests that use scrapy-zyte-api and requests "
"that use scrapy-zyte-smartproxy in the same spider:\n"
"\n"
"1. Leave scrapy-zyte-smartproxy enabled.\n"
"2. Disable the transparent mode of scrapy-zyte-api.\n"
"3. To send a specific request through Zyte API, use "
"request.meta to set dont_proxy to True and zyte_api_automap "
"either to True or to a dictionary of extra request fields."
)
from twisted.internet import reactor
from twisted.internet.interfaces import IReactorCore

reactor = cast(IReactorCore, reactor)
reactor.callLater(
0, self._crawler.engine.close_spider, spider, "plugin_conflict"
)

def _start_requests_processed(self, count):
self._total_start_request_count = count
self._maybe_close()
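For context, the error message added above recommends a workaround rather than enabling both plugins globally. A minimal sketch of what that could look like follows; the spider name, URLs and callback are illustrative assumptions, while the meta keys (dont_proxy, zyte_api_automap) and the settings names come from the error message and from the settings exercised in this changeset:

    # Hypothetical spider combining both plugins as the error message suggests:
    # scrapy-zyte-smartproxy stays enabled, transparent mode stays off, and
    # individual requests opt into Zyte API via request.meta.
    from scrapy import Request, Spider


    class MixedProxySpider(Spider):
        name = "mixed_proxy"  # illustrative name
        custom_settings = {
            "ZYTE_SMARTPROXY_ENABLED": True,
            "ZYTE_API_TRANSPARENT_MODE": False,
        }

        def start_requests(self):
            # Sent through Zyte Smart Proxy Manager (the default with SPM enabled).
            yield Request("https://example.com/a", callback=self.parse)
            # Skips the SPM middleware and is sent through Zyte API instead,
            # with automatic request mapping.
            yield Request(
                "https://example.com/b",
                callback=self.parse,
                meta={"dont_proxy": True, "zyte_api_automap": True},
            )

        def parse(self, response):
            pass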
105 changes: 105 additions & 0 deletions tests/test_middlewares.py
@@ -1,3 +1,7 @@
from typing import Any, Dict, cast
from unittest import SkipTest

from packaging.version import Version
from pytest_twisted import ensureDeferred
from scrapy import Request, Spider
from scrapy.item import Item
@@ -10,6 +14,10 @@
from .mockserver import DelayedResource, MockServer


class NamedSpider(Spider):
name = "named"


@ensureDeferred
async def test_autothrottle_handling():
    crawler = get_crawler()
@@ -259,3 +267,100 @@ def parse(self, response):
    await crawler.crawl()

    assert crawler.stats.get_value("finish_reason") == "failed_forbidden_domain"


@ensureDeferred
async def test_spm_conflict_smartproxy():
    try:
        import scrapy_zyte_smartproxy  # noqa: F401
    except ImportError:
        raise SkipTest("scrapy-zyte-smartproxy missing")

    for setting, attribute, conflict in (
        (None, None, False),
        (None, False, False),
        (None, True, True),
        (False, None, False),
        (False, False, False),
        (False, True, True),
        (True, None, True),
        (True, False, False),
        (True, True, True),
    ):

        class SPMSpider(Spider):
            name = "spm_spider"

        if attribute is not None:
            SPMSpider.zyte_smartproxy_enabled = attribute

        settings = {
            "ZYTE_API_TRANSPARENT_MODE": True,
            "ZYTE_SMARTPROXY_APIKEY": "foo",
            **SETTINGS,
        }
        mws = dict(cast(Dict[Any, int], settings["DOWNLOADER_MIDDLEWARES"]))
        mws["scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware"] = 610
        settings["DOWNLOADER_MIDDLEWARES"] = mws

        if setting is not None:
            settings["ZYTE_SMARTPROXY_ENABLED"] = setting

        crawler = get_crawler(SPMSpider, settings_dict=settings)
        await crawler.crawl()
        expected = "plugin_conflict" if conflict else "finished"
        assert crawler.stats.get_value("finish_reason") == expected, (
            setting,
            attribute,
            conflict,
        )


@ensureDeferred
async def test_spm_conflict_crawlera():
    try:
        import scrapy_crawlera  # noqa: F401
    except ImportError:
        raise SkipTest("scrapy-crawlera missing")
    else:
        SCRAPY_CRAWLERA_VERSION = Version(scrapy_crawlera.__version__)

    for setting, attribute, conflict in (
        (None, None, False),
        (None, False, False),
        (None, True, True),
        (False, None, False),
        (False, False, False),
        (False, True, True),
        (True, None, True),
        # https://github.com/scrapy-plugins/scrapy-zyte-smartproxy/commit/49ebedd8b1d48cf2667db73f18da3e2c2c7fbfa7
        (True, False, SCRAPY_CRAWLERA_VERSION < Version("1.7")),
        (True, True, True),
    ):

        class CrawleraSpider(Spider):
            name = "crawlera_spider"

        if attribute is not None:
            CrawleraSpider.crawlera_enabled = attribute

        settings = {
            "ZYTE_API_TRANSPARENT_MODE": True,
            "CRAWLERA_APIKEY": "foo",
            **SETTINGS,
        }
        mws = dict(cast(Dict[Any, int], settings["DOWNLOADER_MIDDLEWARES"]))
        mws["scrapy_crawlera.CrawleraMiddleware"] = 610
        settings["DOWNLOADER_MIDDLEWARES"] = mws

        if setting is not None:
            settings["CRAWLERA_ENABLED"] = setting

        crawler = get_crawler(CrawleraSpider, settings_dict=settings)
        await crawler.crawl()
        expected = "plugin_conflict" if conflict else "finished"
        assert crawler.stats.get_value("finish_reason") == expected, (
            setting,
            attribute,
            conflict,
        )
14 changes: 14 additions & 0 deletions tox.ini
@@ -92,6 +92,20 @@ deps =
    web-poet==0.13.0
    zyte-common-items==0.7.0

[testenv:pinned-extra]
basepython=python3.7
deps =
    {[testenv:pinned-scrapy-2x0]deps}
    scrapy-crawlera==1.1.0
    scrapy-zyte-smartproxy==2.0.0

[testenv:extra]
basepython=python3.11
deps =
    {[testenv]deps}
    scrapy-crawlera
    scrapy-zyte-smartproxy

[testenv:mypy]
deps =
    mypy==1.4.1
