From f58207606c79139f78ad147372e71406bb7d0ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 19 Jun 2024 11:04:09 +0200 Subject: [PATCH] Add session management (#193) --- .coveragerc | 12 + .github/workflows/test.yml | 5 +- CHANGES.rst | 11 +- docs/conf.py | 4 + docs/index.rst | 1 + docs/reference/meta.rst | 43 + docs/reference/request.rst | 2 +- docs/reference/settings.rst | 258 +++- docs/setup.rst | 13 +- docs/usage/session.rst | 350 ++++++ scrapy_zyte_api/__init__.py | 23 + scrapy_zyte_api/_middlewares.py | 8 +- scrapy_zyte_api/_session.py | 813 ++++++++++++ scrapy_zyte_api/addon.py | 39 +- scrapy_zyte_api/utils.py | 8 + tests/__init__.py | 28 +- tests/mockserver.py | 63 +- tests/test_addon.py | 44 +- tests/test_api_requests.py | 3 + tests/test_middlewares.py | 2 +- tests/test_sessions.py | 2046 +++++++++++++++++++++++++++++++ 21 files changed, 3724 insertions(+), 52 deletions(-) create mode 100644 .coveragerc create mode 100644 docs/usage/session.rst create mode 100644 scrapy_zyte_api/_session.py create mode 100644 tests/test_sessions.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..d34ea0ef --- /dev/null +++ b/.coveragerc @@ -0,0 +1,12 @@ +[run] +branch = true +include = scrapy_zyte_api/* +omit = + tests/* +disable_warnings = include-ignored + +[report] +# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 +exclude_lines = + pragma: no cover + if TYPE_CHECKING: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3d4c4520..5586cb21 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -58,8 +58,9 @@ jobs: run: | tox -e ${{ matrix.toxenv || 'py' }} - name: coverage - if: ${{ success() }} - run: bash <(curl -s https://codecov.io/bash) + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} check: runs-on: ubuntu-latest diff --git a/CHANGES.rst b/CHANGES.rst index f8361a38..f55df45b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,15 @@ Changes ======= +N.N.N (YYYY-MM-DD) +------------------ + +* The recommended position for ``ScrapyZyteAPIDownloaderMiddleware`` changed + from 1000 to 633, to accommodate for the new + ``ScrapyZyteAPISessionDownloaderMiddleware``, which needs to be after + ``ScrapyZyteAPIDownloaderMiddleware`` and before the Scrapy cookie downloader + middleware (700). + 0.18.4 (2024-06-10) ------------------- @@ -396,7 +405,7 @@ When upgrading, you should set the following in your Scrapy settings: .. 
code-block:: python
 
     DOWNLOADER_MIDDLEWARES = {
-        "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000,
+        "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633,
     }
     # only applicable for Scrapy 2.7+
     REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter"
diff --git a/docs/conf.py b/docs/conf.py
index 54792f92..55f7e7ae 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -56,4 +56,8 @@
         "https://zyte-common-items.readthedocs.io/en/latest",
         None,
     ),
+    "zyte-spider-templates": (
+        "https://zyte-spider-templates.readthedocs.io/en/latest",
+        None,
+    ),
 }
diff --git a/docs/index.rst b/docs/index.rst
index 1d2cb524..0be78fa6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -26,6 +26,7 @@ either :ref:`globally ` or :ref:`per request `, or
    usage/default
    usage/retry
    usage/scrapy-poet
+   usage/session
    usage/stats
    usage/fingerprint
    usage/proxy
diff --git a/docs/reference/meta.rst b/docs/reference/meta.rst
index 2bea6669..1a11b267 100644
--- a/docs/reference/meta.rst
+++ b/docs/reference/meta.rst
@@ -86,3 +86,46 @@ string.
 `_.
 
 See :ref:`retry`.
+
+
+.. reqmeta:: zyte_api_session_enabled
+
+zyte_api_session_enabled
+========================
+
+Default: :setting:`ZYTE_API_SESSION_ENABLED`
+
+Whether to use :ref:`scrapy-zyte-api session management <session>` for the
+request (``True``) or not (``False``).
+
+
+.. reqmeta:: zyte_api_session_location
+
+zyte_api_session_location
+=========================
+
+Default: ``{}``
+
+Address for ``setLocation``-based session initialization. See
+:setting:`ZYTE_API_SESSION_LOCATION` for details.
+
+This request metadata key, if not empty, takes precedence over the
+:setting:`ZYTE_API_SESSION_LOCATION` setting, the
+:setting:`ZYTE_API_SESSION_PARAMS` setting, and the
+:reqmeta:`zyte_api_session_params` request metadata key.
+
+
+.. reqmeta:: zyte_api_session_params
+
+zyte_api_session_params
+=======================
+
+Default: ``{}``
+
+Parameters to use for session initialization. See
+:setting:`ZYTE_API_SESSION_PARAMS` for details.
+
+This request metadata key, if not empty, takes precedence over the
+:setting:`ZYTE_API_SESSION_PARAMS` setting, but it can be overridden
+by the :setting:`ZYTE_API_SESSION_LOCATION` setting or the
+:reqmeta:`zyte_api_session_location` request metadata key.
diff --git a/docs/reference/request.rst b/docs/reference/request.rst
index cd039830..70c04daf 100644
--- a/docs/reference/request.rst
+++ b/docs/reference/request.rst
@@ -173,7 +173,7 @@ They will be mapped even if defined with their default value. Headers will
 also be mapped if set to a non-default value elsewhere, e.g. in a custom
 downloader middleware, as long as it is done before the scrapy-zyte-api
 downloader middleware, which is responsible for the mapping, processes the
-request. Here “before” means a lower value than ``1000`` in the
+request. Here “before” means a lower value than ``633`` in the
 :setting:`DOWNLOADER_MIDDLEWARES ` setting.
 
 Similarly, you can add any of those headers to the
diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst
index 3b0dc4ba..d98e5f72 100644
--- a/docs/reference/settings.rst
+++ b/docs/reference/settings.rst
@@ -198,7 +198,8 @@ ZYTE_API_MAX_REQUESTS
 Default: ``None``
 
 When set to an integer value > 0, the spider will close when the number of Zyte
-API requests reaches it.
+API requests reaches it, with ``closespider_max_zapi_requests`` as the close
+reason.
 
 Note that requests with error responses that cannot be retried or exceed their
 retry limit also count here.
@@ -246,6 +247,261 @@ subclass.
 See :ref:`retry`.
 
+
+.. setting:: ZYTE_API_SESSION_CHECKER
+
+ZYTE_API_SESSION_CHECKER
+========================
+
+Default: ``None``
+
+A :ref:`Scrapy component ` (or its import path as a string)
+that defines a ``check`` method.
+
+If ``check`` returns ``True``, the response session is considered valid; if
+``check`` returns ``False``, the response session is considered invalid, and
+will be discarded. ``check`` can also raise a
+:exc:`~scrapy.exceptions.CloseSpider` exception to close the spider.
+
+If defined, the ``check`` method is called on every response that is using a
+:ref:`session managed by scrapy-zyte-api <session>`. If not defined, the
+default implementation checks the outcome of the ``setLocation`` action if
+session initialization was location-based, as described in
+:ref:`session-check`.
+
+Example:
+
+.. code-block:: python
+    :caption: settings.py
+
+    from scrapy import Request
+    from scrapy.http.response import Response
+
+
+    class MySessionChecker:
+
+        def check(self, response: Response, request: Request) -> bool:
+            return bool(response.css(".is_valid"))
+
+
+    ZYTE_API_SESSION_CHECKER = MySessionChecker
+
+Because the session checker is a Scrapy component, you can access the crawler
+object, for example to read settings:
+
+.. code-block:: python
+    :caption: settings.py
+
+    from scrapy import Request
+    from scrapy.http.response import Response
+
+
+    class MySessionChecker:
+
+        @classmethod
+        def from_crawler(cls, crawler):
+            return cls(crawler)
+
+        def __init__(self, crawler):
+            location = crawler.settings["ZYTE_API_SESSION_LOCATION"]
+            self.postal_code = location["postalCode"]
+
+        def check(self, response: Response, request: Request) -> bool:
+            return response.css(".postal_code::text").get() == self.postal_code
+
+
+    ZYTE_API_SESSION_CHECKER = MySessionChecker
+
+
+.. setting:: ZYTE_API_SESSION_ENABLED
+
+ZYTE_API_SESSION_ENABLED
+========================
+
+Default: ``False``
+
+Enables :ref:`scrapy-zyte-api session management <session>`.
+
+
+.. setting:: ZYTE_API_SESSION_LOCATION
+
+ZYTE_API_SESSION_LOCATION
+=========================
+
+Default: ``{}``
+
+If defined, sessions are initialized using the ``setLocation``
+:http:`action `, and the value of this setting must be the
+target address :class:`dict`. For example:
+
+.. code-block:: python
+    :caption: settings.py
+
+    ZYTE_API_SESSION_LOCATION = {"postalCode": "10001"}
+
+If the :setting:`ZYTE_API_SESSION_PARAMS` setting or the
+:reqmeta:`zyte_api_session_params` request metadata key set a ``"url"``, it
+will be used for session initialization as well. Otherwise, the URL of the
+request for which the session is being initialized will be used instead.
+
+This setting, if not empty, takes precedence over the
+:setting:`ZYTE_API_SESSION_PARAMS` setting and the
+:reqmeta:`zyte_api_session_params` request metadata key, but it can be
+overridden by the :reqmeta:`zyte_api_session_location` request metadata key.
+
+To disable the :setting:`ZYTE_API_SESSION_LOCATION` setting on a specific
+request, e.g. to use the :setting:`ZYTE_API_SESSION_PARAMS` setting or the
+:reqmeta:`zyte_api_session_params` request metadata key instead, set
+the :reqmeta:`zyte_api_session_location` request metadata key to ``{}``.
+
+
+.. setting:: ZYTE_API_SESSION_MAX_BAD_INITS
+
+ZYTE_API_SESSION_MAX_BAD_INITS
+==============================
+
+Default: ``8``
+
+The maximum number of :ref:`scrapy-zyte-api sessions <session>` per pool that
+are allowed to fail their session check right after creation in a row. If the
+maximum is reached, the spider closes with ``bad_session_inits`` as the close
+reason.
+
+To override this value for specific pools, use
+:setting:`ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL`.
+
+
+.. setting:: ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL
+
+ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL
+=======================================
+
+Default: ``{}``
+
+:class:`dict` where keys are :ref:`pool <session-pools>` IDs and values are
+overrides of :setting:`ZYTE_API_SESSION_MAX_BAD_INITS` for those pools.
+
+
+.. setting:: ZYTE_API_SESSION_MAX_ERRORS
+
+ZYTE_API_SESSION_MAX_ERRORS
+===========================
+
+Default: ``1``
+
+Maximum number of :ref:`unsuccessful responses
+<zyte-api-unsuccessful-responses>` allowed for any given session before
+discarding the session.
+
+You might want to increase this number if you find that a session may continue
+to work even after an unsuccessful response. See :ref:`optimize-sessions`.
+
+.. note:: This setting does not affect session checks
+    (:setting:`ZYTE_API_SESSION_CHECKER`). A session is always discarded the
+    first time it fails its session check.
+
+
+.. setting:: ZYTE_API_SESSION_PARAMS
+
+ZYTE_API_SESSION_PARAMS
+=======================
+
+Default: ``{"browserHtml": True}``
+
+Parameters to use for session initialization.
+
+It works similarly to :http:`request:sessionContextParameters` from
+:ref:`server-managed sessions `, but it supports
+arbitrary Zyte API parameters instead of a specific subset.
+
+If it does not define a ``"url"``, the URL of the request for which the session
+is being initialized will be used.
+
+This setting can be overridden by the :setting:`ZYTE_API_SESSION_LOCATION`
+setting, the :reqmeta:`zyte_api_session_location` request metadata key, or the
+:reqmeta:`zyte_api_session_params` request metadata key.
+
+Example:
+
+.. code-block:: python
+    :caption: settings.py
+
+    ZYTE_API_SESSION_PARAMS = {
+        "browserHtml": True,
+        "actions": [
+            {
+                "action": "setLocation",
+                "address": {"postalCode": "10001"},
+            }
+        ],
+    }
+
+.. tip:: The example above is equivalent to setting
+    :setting:`ZYTE_API_SESSION_LOCATION` to ``{"postalCode": "10001"}``.
+
+
+.. setting:: ZYTE_API_SESSION_POOL_SIZE
+
+ZYTE_API_SESSION_POOL_SIZE
+==========================
+
+Default: ``8``
+
+The maximum number of active :ref:`scrapy-zyte-api sessions <session>` to keep
+per :ref:`pool <session-pools>`.
+
+To override this value for specific pools, use
+:setting:`ZYTE_API_SESSION_POOL_SIZES`.
+
+Increase this number to lower the frequency with which requests are sent
+through each session, which on some websites may increase the lifetime of each
+session. See :ref:`optimize-sessions`.
+
+
+.. setting:: ZYTE_API_SESSION_POOL_SIZES
+
+ZYTE_API_SESSION_POOL_SIZES
+===========================
+
+Default: ``{}``
+
+:class:`dict` where keys are :ref:`pool <session-pools>` IDs and values are
+overrides of :setting:`ZYTE_API_SESSION_POOL_SIZE` for those pools.
+
+
+.. setting:: ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS
+
+ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS
+===================================
+
+Default: ``60``
+
+scrapy-zyte-api maintains a rotation queue of ready-to-use sessions per
+:ref:`pool <session-pools>`. At some points, the queue might be empty for a
+given pool because all its sessions are in the process of being initialized or
+refreshed.
+
+If the queue is empty when trying to assign a session to a request,
+scrapy-zyte-api will wait some time
+(:setting:`ZYTE_API_SESSION_QUEUE_WAIT_TIME`), and then try to get a session
+from the queue again.
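+
+With the default values of this setting and
+:setting:`ZYTE_API_SESSION_QUEUE_WAIT_TIME`, that amounts to waiting up to
+about a minute (60 attempts, 1 second apart) before giving up. For example,
+to tolerate a slower session initialization turnaround, you could allow twice
+as many attempts (the values below are illustrative):
+
+.. code-block:: python
+    :caption: settings.py
+
+    ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS = 120
+    ZYTE_API_SESSION_QUEUE_WAIT_TIME = 1.0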
+ +Use this setting to configure the maximum number of attempts before giving up +and raising a :exc:`RuntimeError` exception. + + +.. setting:: ZYTE_API_SESSION_QUEUE_WAIT_TIME + +ZYTE_API_SESSION_QUEUE_WAIT_TIME +=================================== + +Default: ``1.0`` + +Number of seconds to wait between attempts to get a session from a rotation +queue. + +See :setting:`ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS` for details. + + .. setting:: ZYTE_API_SKIP_HEADERS ZYTE_API_SKIP_HEADERS diff --git a/docs/setup.rst b/docs/setup.rst index 6395d870..5efe15ec 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -104,7 +104,7 @@ scrapy-zyte-api integration as follows: "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", } DOWNLOADER_MIDDLEWARES = { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, } SPIDER_MIDDLEWARES = { "scrapy_zyte_api.ScrapyZyteAPISpiderMiddleware": 100, @@ -139,6 +139,17 @@ If you already had a custom value for :setting:`REQUEST_FINGERPRINTER_CLASS ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS = "myproject.CustomRequestFingerprinter" +For :ref:`session management support `, add the following downloader +middleware to the :setting:`DOWNLOADER_MIDDLEWARES +` setting: + +.. code-block:: python + :caption: settings.py + + DOWNLOADER_MIDDLEWARES = { + "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, + } + .. _reactor-change: diff --git a/docs/usage/session.rst b/docs/usage/session.rst new file mode 100644 index 00000000..a53d7a8c --- /dev/null +++ b/docs/usage/session.rst @@ -0,0 +1,350 @@ +.. _session: + +================== +Session management +================== + +Zyte API provides powerful session APIs: + +- :ref:`Client-managed sessions ` give you full control + over session management. + +- :ref:`Server-managed sessions ` let Zyte API + handle session management for you. + +When using scrapy-zyte-api, you can use these session APIs through the +corresponding Zyte API fields (:http:`request:session`, +:http:`request:sessionContext`). + +However, scrapy-zyte-api also provides its own session management API, similar +to that of :ref:`server-managed sessions `, but +built on top of :ref:`client-managed sessions `. + +scrapy-zyte-api session management offers some advantages over +:ref:`server-managed sessions `: + +- You can perform :ref:`session validity checks `, so that the + sessions of responses that do not pass those checks are refreshed, and the + responses retried with a different session. + +- You can use arbitrary Zyte API parameters for :ref:`session initialization + `, beyond those that :http:`request:sessionContextParameters` + supports. + +- You have granular control over the session pool size, max errors, etc. See + :ref:`optimize-sessions` and :ref:`session-configs`. + +However, scrapy-zyte-api session manager is not a replacement for +:ref:`server-managed sessions ` or +:ref:`client-managed sessions `: + +- :ref:`Server-managed sessions ` offer a longer + life time than the :ref:`client-managed sessions ` + that scrapy-zyte-api session management uses, so as long as you do not need + one of the scrapy-zyte-api session management features, they can be + significantly more efficient (fewer total sessions needed per crawl). + + Zyte API can also optimize server-managed sessions based on the target + website. With scrapy-zyte-api session management, you need to :ref:`handle + optimization yourself `. 
+ +- :ref:`Client-managed sessions ` offer full control + over session management, while scrapy-zyte-api session management removes + some of that control to provide an easier API for supported use cases. + +.. _enable-sessions: + +Enabling session management +=========================== + +To enable session management for all requests, set +:setting:`ZYTE_API_SESSION_ENABLED` to ``True``. You can also toggle session +management on or off for specific requests using the +:reqmeta:`zyte_api_session_enabled` request metadata key. + +By default, scrapy-zyte-api will maintain up to 8 sessions per domain, each +initialized with a :ref:`browser request ` targeting the URL +of the first request that will use the session. Sessions will be automatically +rotated among requests, and refreshed as they expire or get banned. + +For session management to work as expected, your +:setting:`ZYTE_API_RETRY_POLICY` should not retry 520 and 521 responses: + +- If you are using the default retry policy + (:data:`~zyte_api.zyte_api_retrying`) or + :data:`~zyte_api.aggressive_retrying`: + + - If you are :ref:`using the add-on `, they are + automatically replaced with a matching session-specific retry policy, + either :data:`~scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY` or + :data:`~scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY`. + + - If you are not using the add-on, set :setting:`ZYTE_API_RETRY_POLICY` + manually to either + :data:`~scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY` or + :data:`~scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY`. For example: + + .. code-block:: python + :caption: settings.py + + ZYTE_API_RETRY_POLICY = "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY" + +- If you are using a custom retry policy, modify it to not retry 520 and 521 + responses. + +.. _session-init: + +Initializing sessions +===================== + +To change how sessions are initialized, you have the following options: + +- To run the ``setLocation`` :http:`action ` for session + initialization, use the :setting:`ZYTE_API_SESSION_LOCATION` setting or the + :reqmeta:`zyte_api_session_location` request metadata key. + +- For session initialization with arbitrary Zyte API request fields, use the + :setting:`ZYTE_API_SESSION_PARAMS` setting or the + :reqmeta:`zyte_api_session_params` request metadata key. + +- To customize session initialization per request, define + :meth:`~scrapy_zyte_api.SessionConfig.params` in a :ref:`session config + override `. + +.. _session-check: + +Checking sessions +================= + +Responses from a session can be checked for session validity. If a response +does not pass a session validity check, the session is discarded, and the +request is retried with a different session. + +Session checking can be useful to work around scenarios where session +initialization fails, e.g. due to rendering issues, IP-geolocation mismatches, +A-B tests, etc. It can also help in cases where website sessions expire before +Zyte API sessions. + +By default, for sessions that are initialized with a location, the outcome of +the ``setLocation`` action is checked. If the action fails, the session is +discarded. If the action is not even available for a given website, the spider +is closed with ``unsupported_set_location`` as the close reason, so that you +can set a proper :ref:`session initialization logic ` for +requests targeting that website. + +For sessions initialized with arbitrary or no parameters, no session check is +performed, sessions are assumed to be fine until they expire or are banned. 
+That is so even if those arbitrary parameters include a ``setLocation`` action. + +To implement your own code to check session responses and determine whether +their session should be kept or discarded, use the +:setting:`ZYTE_API_SESSION_CHECKER` setting. + +If you need to check session validity for multiple websites, it is better to +define a separate :ref:`session config override ` for each +website, each with its own implementation of +:meth:`~scrapy_zyte_api.SessionConfig.check`. + +If your session checking implementation relies on the response body (e.g. it +uses CSS or XPath expressions), you should make sure that you are getting one, +which might not be the case if you are mostly using :ref:`Zyte API automatic +extraction `, e.g. when using :doc:`Zyte spider templates +`. For example, you can use +:setting:`ZYTE_API_AUTOMAP_PARAMS` and :setting:`ZYTE_API_PROVIDER_PARAMS` to +force :http:`request:browserHtml` or :http:`request:httpResponseBody` to be set +on every Zyte API request: + +.. code-block:: python + :caption: setting.py + + ZYTE_API_AUTOMAP_PARAMS = {"browserHtml": True} + ZYTE_API_PROVIDER_PARAMS = {"browserHtml": True} + +.. _optimize-sessions: + +Optimizing sessions +=================== + +For faster crawls and lower costs, specially where session initialization +requests are more expensive than session usage requests (e.g. because +initialization relies on ``browserHtml`` and usage relies on +``httpResponseBody``), you should try to make your sessions live as long as +possible before they are discarded. + +Here are some things you can try: + +- On some websites, sending too many requests too fast through a session can + cause the target website to ban that session. + + On those websites, you can increase the number of sessions in the pool + (:setting:`ZYTE_API_SESSION_POOL_SIZE`). The more different sessions you + use, the more slowly you send requests through each session. + + Mind, however, that :ref:`client-managed sessions ` + expire after `15 minutes since creation or 2 minutes since the last request + `_. + At a certain point, increasing :setting:`ZYTE_API_SESSION_POOL_SIZE` + without increasing :setting:`CONCURRENT_REQUESTS + ` and :setting:`CONCURRENT_REQUESTS_PER_DOMAIN + ` accordingly can be + counterproductive. + +- By default, sessions are discarded as soon as an :ref:`unsuccessful + response ` is received. + + However, on some websites sessions may remain valid even after a few + unsuccessful responses. If that is the case, you might want to increase + :setting:`ZYTE_API_SESSION_MAX_ERRORS` to require a higher number of + unsuccessful responses before discarding a session. + +If you do not need :ref:`session checking ` and your +:ref:`initialization parameters ` are only +:http:`request:browserHtml` and :http:`request:actions`, :ref:`server-managed +sessions ` might be a more cost-effective choice, as +they live much longer than :ref:`client-managed sessions +`. + + +.. _session-configs: + +Overriding session configs +========================== + +For spiders that target a single website, using settings and request metadata +keys for :ref:`session initialization ` and :ref:`session +checking ` should do the job. However, for broad crawls or +:doc:`multi-website spiders `, you might want to +define different session configs for different websites. + +The default session config is implemented by the +:class:`~scrapy_zyte_api.SessionConfig` class: + +.. 
autoclass:: scrapy_zyte_api.SessionConfig + :members: + +To define a different session config for a given URL pattern, install +:doc:`web-poet ` and define a subclass of +:class:`~scrapy_zyte_api.SessionConfig` decorated with +:func:`~scrapy_zyte_api.session_config`: + +.. autofunction:: scrapy_zyte_api.session_config + +.. _session-cookies: + +Cookie handling +=============== + +All requests involved in session management, both requests to initialize a +session and requests that are assigned a session, have their +:reqmeta:`dont_merge_cookies ` request metadata key +set to ``True`` if not already defined. Each Zyte API session handles its own +cookies instead. + +If you set :reqmeta:`dont_merge_cookies ` to +``False`` in a request that uses a session, that request will include cookies +managed by Scrapy. However, session initialization requests will still have +:reqmeta:`dont_merge_cookies ` set to ``True``, you +cannot override that. + +To include cookies in session initialization requests, use +:http:`request:requestCookies` in :ref:`session initialization parameters +`. But mind that those cookies are only set during that request, +:ref:`they are not added to the session cookie jar +`. + + +Session retry policies +====================== + +The following retry policies are designed to work well with session management +(see :ref:`enable-sessions`): + +.. autodata:: scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY + :annotation: + +.. autodata:: scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY + :annotation: + + +Spider closers +============== + +Session management can close your spider early in the following scenarios: + +- ``bad_session_inits``: Too many session initializations failed in a row for + a given session pool. + + You can use the :setting:`ZYTE_API_SESSION_MAX_BAD_INITS` and + :setting:`ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL` settings to adjust that + maximum. + +- ``pool_error``: There was an error determining the session pool ID for some + request. + + It is most likely the result of a bad implementation of + :meth:`SessionConfig.pool `; the + logs should contain an error message with a traceback for such errors. + +A custom :meth:`SessionConfig.check ` +implementation may also close your spider with a custom reason by raising a +:exc:`~scrapy.exceptions.CloseSpider` exception. + + +.. _session-stats: + +Session stats +============= + +The following stats exist for scrapy-zyte-api session management: + +``scrapy-zyte-api/sessions/pools/{pool}/init/check-error`` + Number of times that a session for pool ``{pool}`` triggered an unexpected + exception during its session validation check right after initialization. + + It is most likely the result of a bad implementation of + :meth:`SessionConfig.check `; the + logs should contain an error message with a traceback for such errors. + +``scrapy-zyte-api/sessions/pools/{pool}/init/check-failed`` + Number of times that a session from pool ``{pool}`` failed its session + validation check right after initialization. + +``scrapy-zyte-api/sessions/pools/{pool}/init/check-passed`` + Number of times that a session from pool ``{pool}`` passed its session + validation check right after initialization. + +``scrapy-zyte-api/sessions/pools/{pool}/init/failed`` + Number of times that initializing a session for pool ``{pool}`` resulted in + an :ref:`unsuccessful response `. 
+ +``scrapy-zyte-api/sessions/pools/{pool}/init/param-error`` + Number of times that initializing a session for pool ``{pool}`` triggered + an unexpected exception when obtaining the Zyte API parameters for session + initialization. + + It is most likely the result of a bad implementation of + :meth:`SessionConfig.params `; the + logs should contain an error message with a traceback for such errors. + +``scrapy-zyte-api/sessions/pools/{pool}/use/check-error`` + Number of times that a response that used a session from pool ``{pool}`` + triggered an unexpected exception during its session validation check. + + It is most likely the result of a bad implementation of + :meth:`SessionConfig.check `; the + logs should contain an error message with a traceback for such errors. + +``scrapy-zyte-api/sessions/pools/{pool}/use/check-failed`` + Number of times that a response that used a session from pool ``{pool}`` + failed its session validation check. + +``scrapy-zyte-api/sessions/pools/{pool}/use/check-passed`` + Number of times that a response that used a session from pool ``{pool}`` + passed its session validation check. + +``scrapy-zyte-api/sessions/pools/{pool}/use/expired`` + Number of times that a session from pool ``{pool}`` expired. + +``scrapy-zyte-api/sessions/pools/{pool}/use/failed`` + Number of times that a request that used a session from pool ``{pool}`` + got an :ref:`unsuccessful response `. diff --git a/scrapy_zyte_api/__init__.py b/scrapy_zyte_api/__init__.py index 8ac1810f..8767d177 100644 --- a/scrapy_zyte_api/__init__.py +++ b/scrapy_zyte_api/__init__.py @@ -12,5 +12,28 @@ ) from ._page_inputs import Actions, Geolocation, Screenshot from ._request_fingerprinter import ScrapyZyteAPIRequestFingerprinter +from ._session import ( + SESSION_AGGRESSIVE_RETRY_POLICY as _SESSION_AGGRESSIVE_RETRY_POLICY, +) +from ._session import SESSION_DEFAULT_RETRY_POLICY as _SESSION_DEFAULT_RETRY_POLICY +from ._session import ( + ScrapyZyteAPISessionDownloaderMiddleware, + SessionConfig, + session_config, +) from .addon import Addon from .handler import ScrapyZyteAPIDownloadHandler + +# We re-define the variables here for Sphinx to pick the documentation. + +#: Alternative to the :ref:`default retry policy ` for +#: :ref:`session management ` that does not retry 520 responses. +SESSION_DEFAULT_RETRY_POLICY = _SESSION_DEFAULT_RETRY_POLICY + +#: Alternative to the :ref:`aggresive retry policy ` +#: for :ref:`session management ` that does not retry 520 and 521 +#: responses. +#: +#: .. note:: When using python-zyte-api 0.5.2 or lower, this is the same as +#: :data:`~scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY`. 
+SESSION_AGGRESSIVE_RETRY_POLICY = _SESSION_AGGRESSIVE_RETRY_POLICY diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index 5ec10a08..2c4e73a2 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -1,15 +1,13 @@ -import logging +from logging import getLogger from typing import cast from scrapy import Request from scrapy.exceptions import IgnoreRequest -from zyte_api.aio.errors import RequestError +from zyte_api import RequestError from ._params import _ParamParser -logger = logging.getLogger(__name__) - - +logger = getLogger(__name__) _start_requests_processed = object() diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py new file mode 100644 index 00000000..fec3e5b0 --- /dev/null +++ b/scrapy_zyte_api/_session.py @@ -0,0 +1,813 @@ +from asyncio import Task, create_task, sleep +from collections import defaultdict, deque +from copy import deepcopy +from functools import partial +from logging import getLogger +from typing import Any, Deque, Dict, Optional, Set, Type, TypeVar, Union, cast +from uuid import uuid4 +from weakref import WeakKeyDictionary + +from scrapy import Request, Spider +from scrapy.crawler import Crawler +from scrapy.exceptions import CloseSpider, IgnoreRequest +from scrapy.http import Response +from scrapy.utils.httpobj import urlparse_cached +from scrapy.utils.misc import create_instance, load_object +from scrapy.utils.python import global_object_name +from tenacity import stop_after_attempt +from zyte_api import RequestError, RetryFactory + +from scrapy_zyte_api.utils import _DOWNLOAD_NEEDS_SPIDER + +logger = getLogger(__name__) +SESSION_INIT_META_KEY = "_is_session_init_request" +ZYTE_API_META_KEYS = ("zyte_api", "zyte_api_automap", "zyte_api_provider") + + +class SessionRetryFactory(RetryFactory): + temporary_download_error_stop = stop_after_attempt(1) + + +SESSION_DEFAULT_RETRY_POLICY = SessionRetryFactory().build() + +try: + from zyte_api import AggressiveRetryFactory, stop_on_count +except ImportError: + SESSION_AGGRESSIVE_RETRY_POLICY = SESSION_DEFAULT_RETRY_POLICY +else: + + class AggressiveSessionRetryFactory(AggressiveRetryFactory): + download_error_stop = stop_on_count(1) + + SESSION_AGGRESSIVE_RETRY_POLICY = AggressiveSessionRetryFactory().build() + + +try: + from scrapy_poet import DummyResponse +except ImportError: + + class DummyResponse: # type: ignore[no-redef] + pass + + +try: + from scrapy.downloadermiddlewares.retry import get_retry_request +except ImportError: # pragma: no cover + # https://github.com/scrapy/scrapy/blob/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9/scrapy/downloadermiddlewares/retry.py#L57-L142 + def get_retry_request( + request, + *, + spider, + reason="unspecified", + max_retry_times=None, + priority_adjust=None, + stats_base_key="retry", + ): + settings = spider.crawler.settings + assert spider.crawler.stats + stats = spider.crawler.stats + retry_times = request.meta.get("retry_times", 0) + 1 + if max_retry_times is None: + max_retry_times = request.meta.get("max_retry_times") + if max_retry_times is None: + max_retry_times = settings.getint("RETRY_TIMES") + if retry_times <= max_retry_times: + logger.debug( + "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + new_request: Request = request.copy() + new_request.meta["retry_times"] = retry_times + new_request.dont_filter = True + if priority_adjust is None: + priority_adjust = 
settings.getint("RETRY_PRIORITY_ADJUST") + new_request.priority = request.priority + priority_adjust + + if callable(reason): + reason = reason() + if isinstance(reason, Exception): + reason = global_object_name(reason.__class__) + + stats.inc_value(f"{stats_base_key}/count") + stats.inc_value(f"{stats_base_key}/reason_count/{reason}") + return new_request + stats.inc_value(f"{stats_base_key}/max_reached") + logger.error( + "Gave up retrying %(request)s (failed %(retry_times)d times): " + "%(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + return None + + +try: + from scrapy.http.request import NO_CALLBACK +except ImportError: + + def NO_CALLBACK(response): + pass # pragma: no cover + + +try: + from scrapy.utils.defer import deferred_to_future +except ImportError: # pragma: no cover + import asyncio + from warnings import catch_warnings, filterwarnings + + # https://github.com/scrapy/scrapy/blob/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9/scrapy/utils/reactor.py#L119-L147 + def set_asyncio_event_loop(): + try: + with catch_warnings(): + # In Python 3.10.9, 3.11.1, 3.12 and 3.13, a DeprecationWarning + # is emitted about the lack of a current event loop, because in + # Python 3.14 and later `get_event_loop` will raise a + # RuntimeError in that event. Because our code is already + # prepared for that future behavior, we ignore the deprecation + # warning. + filterwarnings( + "ignore", + message="There is no current event loop", + category=DeprecationWarning, + ) + event_loop = asyncio.get_event_loop() + except RuntimeError: + # `get_event_loop` raises RuntimeError when called with no asyncio + # event loop yet installed in the following scenarios: + # - Previsibly on Python 3.14 and later. + # https://github.com/python/cpython/issues/100160#issuecomment-1345581902 + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + return event_loop + + # https://github.com/scrapy/scrapy/blob/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9/scrapy/utils/reactor.py#L115-L116 + def _get_asyncio_event_loop(): + return set_asyncio_event_loop() + + # https://github.com/scrapy/scrapy/blob/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9/scrapy/utils/defer.py#L360-L379 + def deferred_to_future(d): + return d.asFuture(_get_asyncio_event_loop()) + + +try: + from scrapy.utils.misc import build_from_crawler +except ImportError: + T = TypeVar("T") + + def build_from_crawler( + objcls: Type[T], crawler: Crawler, /, *args: Any, **kwargs: Any + ) -> T: + return create_instance(objcls, settings=None, crawler=crawler, *args, **kwargs) + + +class PoolError(ValueError): + pass + + +class TooManyBadSessionInits(RuntimeError): + pass + + +class SessionConfig: + """Default session configuration for :ref:`scrapy-zyte-api sessions + `.""" + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def __init__(self, crawler): + self.crawler = crawler + + settings = crawler.settings + self._fallback_location = settings.getdict("ZYTE_API_SESSION_LOCATION") + self._fallback_params = settings.getdict( + "ZYTE_API_SESSION_PARAMS", {"browserHtml": True} + ) + + checker_cls = settings.get("ZYTE_API_SESSION_CHECKER", None) + if checker_cls: + self._checker = build_from_crawler(load_object(checker_cls), crawler) + else: + self._checker = None + + def pool(self, request: Request) -> str: + """Return the ID of the session pool to use for *request*. + + .. _session-pools: + + The default implementation returns the request URL netloc, e.g. 
+ ``"books.toscrape.com"`` for a request targeting + https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html. + + scrapy-zyte-api can maintain multiple session pools, each pool with up + to :setting:`ZYTE_API_SESSION_POOL_SIZE` sessions. + """ + return urlparse_cached(request).netloc + + def location(self, request: Request) -> Dict[str, str]: + """Return the address :class:`dict` to use for ``setLocation``-based + session initialization for *request*. + + The default implementation is based on settings and request metadata + keys as described in :ref:`session-init`. + """ + return request.meta.get("zyte_api_session_location", self._fallback_location) + + def params(self, request: Request) -> Dict[str, Any]: + """Return the Zyte API request parameters to use to initialize a + session for *request*. + + The default implementation is based on settings and request metadata + keys as described in :ref:`session-init`. + """ + location = self.location(request) + params = request.meta.get("zyte_api_session_params", self._fallback_params) + if not location: + return params + return { + "url": params.get("url", request.url), + "browserHtml": True, + "actions": [ + { + "action": "setLocation", + "address": location, + } + ], + } + + def check(self, response: Response, request: Request) -> bool: + """Return ``True`` if the session used to fetch *response* should be + kept, return ``False`` if it should be discarded, or raise + :exc:`~scrapy.exceptions.CloseSpider` if the spider should be closed. + + The default implementation checks the outcome of the ``setLocation`` + action if session initialization was location-based, as described in + :ref:`session-check`. + """ + if self._checker: + return self._checker.check(response, request) + location = self.location(request) + if not location: + return True + for action in response.raw_api_response.get("actions", []): + if action.get("action", None) != "setLocation": + continue + if action.get("error", "").startswith("Action setLocation not supported "): + logger.error( + f"Stopping the spider, tried to use the setLocation " + f"action on an unsupported website " + f"({urlparse_cached(request).netloc})." + ) + raise CloseSpider("unsupported_set_location") + return action.get("status", None) == "success" + return True + + +try: + from web_poet import RulesRegistry +except ImportError: + + class SessionConfigRulesRegistry: + + def session_config_cls(self, request: Request) -> Type[SessionConfig]: + return SessionConfig + + def session_config( + self, + include, + *, + instead_of: Optional[Type] = SessionConfig, + exclude=None, + priority: int = 500, + **kwargs, + ): + """Mark the decorated :class:`SessionConfig` subclass as the + :ref:`session config ` to use for the specified + URL patterns. + + Usage example: + + .. 
code-block:: python + + from typing import Any, Dict + + from scrapy import Request + from scrapy.http.response import Response + from scrapy_zyte_api import SessionConfig, session_config + + + @session_config(["ecommerce.de.example, ecommerce.us.example"]) + class EcommerceExampleSessionConfig(SessionConfig): + + def pool(self, request: Request) -> str: + return "ecommerce.example" + + def params(self, request: Request) -> Dict[str, Any]: + return { + "url": request.url, + "browserHtml": True, + "actions": [ + { + "action": "type", + "selector": {"type": "css", "value": ".zipcode"}, + "text": self.location(request)["postalCode"], + }, + { + "action": "click", + "selector": {"type": "css", "value": "[type='submit']"}, + }, + ], + } + + def check(self, response: Response, request: Request) -> bool: + return ( + response.css(".zipcode::text").get() == self.location(request)["postalCode"] + ) + + Your :class:`~scrapy_zyte_api.SessionConfig` subclass must be + defined in a module that gets imported at run time. See + ``SCRAPY_POET_DISCOVER`` in the :ref:`scrapy-poet setting reference + `. + + The parameters of this decorator are those of + :func:`web_poet.handle_urls`, only *instead_of* is + :class:`SessionConfig` by default, *to_return* is not supported, + and session configs are registered in their own rule registry. + """ + raise RuntimeError( + "To use the @session_config decorator you first must install " + "web-poet." + ) + +else: + from url_matcher import Patterns + from web_poet import ApplyRule + from web_poet.rules import Strings + + class SessionConfigRulesRegistry(RulesRegistry): # type: ignore[no-redef] + + def __init__(self): + rules = [ApplyRule(for_patterns=Patterns(include=[""]), use=SessionConfig)] # type: ignore[arg-type] + super().__init__(rules=rules) + + def session_config_cls(self, request: Request) -> Type[SessionConfig]: + cls = SessionConfig + overrides: Dict[Type[SessionConfig], Type[SessionConfig]] = self.overrides_for(request.url) # type: ignore[assignment] + while cls in overrides: + cls = overrides[cls] + return cls + + def session_config( + self, + include: Strings, + *, + instead_of: Optional[Type[SessionConfig]] = SessionConfig, + exclude: Optional[Strings] = None, + priority: int = 500, + **kwargs, + ): + return self.handle_urls( + include=include, + instead_of=instead_of, # type: ignore[arg-type] + exclude=exclude, + priority=priority, + **kwargs, + ) + + +session_config_registry = SessionConfigRulesRegistry() +session_config = session_config_registry.session_config + + +class _SessionManager: + + def __init__(self, crawler: Crawler): + self._crawler = crawler + + settings = crawler.settings + + pool_size = settings.getint("ZYTE_API_SESSION_POOL_SIZE", 8) + self._pending_initial_sessions: Dict[str, int] = defaultdict(lambda: pool_size) + pool_sizes = settings.getdict("ZYTE_API_SESSION_POOL_SIZES", {}) + for pool, size in pool_sizes.items(): + self._pending_initial_sessions[pool] = size + + self._max_errors = settings.getint("ZYTE_API_SESSION_MAX_ERRORS", 1) + self._errors: Dict[str, int] = defaultdict(int) + + max_bad_inits = settings.getint("ZYTE_API_SESSION_MAX_BAD_INITS", 8) + self._max_bad_inits: Dict[str, int] = defaultdict(lambda: max_bad_inits) + max_bad_inits_per_pool = settings.getdict( + "ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL", {} + ) + for pool, pool_max_bad_inits in max_bad_inits_per_pool.items(): + self._max_bad_inits[pool] = pool_max_bad_inits + self._bad_inits: Dict[str, int] = defaultdict(int) + + # Transparent mode, needed to determine 
whether to set the session + # using ``zyte_api`` or ``zyte_api_automap``. + self._transparent_mode: bool = settings.getbool( + "ZYTE_API_TRANSPARENT_MODE", False + ) + + # Each pool contains the IDs of sessions that have not expired yet. + # + # While the initial sessions of a pool have not all been started, for + # every request needing a session, a new session is initialized and + # then added to the pool. + # + # Once a pool is full, sessions are picked from the pool queue, which + # should contain all pool sessions that have been initialized. + # + # As soon as a session expires, it is removed from its pool, and a task + # to initialize that new session is started. + self._pools: Dict[str, Set[str]] = defaultdict(set) + self._pool_cache: WeakKeyDictionary[Request, str] = WeakKeyDictionary() + + # The queue is a rotating list of session IDs to use. + # + # The way to use the queue is to get a session ID with popleft(), and + # put it back to the end of the queue with append(). + # + # The queue may contain session IDs from expired sessions. If the + # popped session ID cannot be found in the pool, then it should be + # discarded instead of being put back in the queue. + # + # When a new session ID is added to the pool, it is still not added to + # the queue until the session is actually initialized, when it is + # appended to the queue. + # + # If the queue is empty, sleep and try again. Sessions from the pool + # will be appended to the queue as they are initialized and ready to + # use. + self._queues: Dict[str, Deque[str]] = defaultdict(deque) + self._queue_max_attempts = settings.getint( + "ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS", 60 + ) + self._queue_wait_time = settings.getfloat( + "ZYTE_API_SESSION_QUEUE_WAIT_TIME", 1.0 + ) + + # Contains the on-going tasks to create new sessions. + # + # Keeping a reference to those tasks until they are done is necessary + # to prevent garbage collection to remove the tasks. + self._init_tasks: Set[Task] = set() + + self._session_config_cache: WeakKeyDictionary[Request, SessionConfig] = ( + WeakKeyDictionary() + ) + self._session_config_map: Dict[Type[SessionConfig], SessionConfig] = {} + + def _get_session_config(self, request: Request) -> SessionConfig: + try: + return self._session_config_cache[request] + except KeyError: + cls = session_config_registry.session_config_cls(request) + if cls not in self._session_config_map: + self._session_config_map[cls] = build_from_crawler(cls, self._crawler) + self._session_config_cache[request] = self._session_config_map[cls] + return self._session_config_map[cls] + + def _get_pool(self, request): + try: + return self._pool_cache[request] + except KeyError: + session_config = self._get_session_config(request) + try: + pool = session_config.pool(request) + except Exception: + raise PoolError + self._pool_cache[request] = pool + return pool + + async def _init_session(self, session_id: str, request: Request, pool: str) -> bool: + session_config = self._get_session_config(request) + try: + session_params = session_config.params(request) + except Exception: + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/init/param-error" + ) + logger.exception( + f"Unexpected exception raised while obtaining session " + f"initialization parameters for request {request}." 
+ ) + return False + session_params = deepcopy(session_params) + session_init_url = session_params.pop("url", request.url) + spider = self._crawler.spider + session_init_request = Request( + session_init_url, + meta={ + SESSION_INIT_META_KEY: True, + "dont_merge_cookies": True, + "zyte_api": {**session_params, "session": {"id": session_id}}, + }, + callback=NO_CALLBACK, + ) + if _DOWNLOAD_NEEDS_SPIDER: + deferred = self._crawler.engine.download( + session_init_request, spider=spider + ) + else: + deferred = self._crawler.engine.download(session_init_request) + try: + response = await deferred_to_future(deferred) + except Exception: + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/init/failed" + ) + return False + else: + try: + result = session_config.check(response, session_init_request) + except CloseSpider: + raise + except Exception: + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/init/check-error" + ) + logger.exception( + f"Unexpected exception raised while checking session " + f"validity on response {response}." + ) + return False + outcome = "passed" if result else "failed" + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/init/check-{outcome}" + ) + return result + + async def _create_session(self, request: Request, pool: str) -> str: + while True: + session_id = str(uuid4()) + session_init_succeeded = await self._init_session(session_id, request, pool) + if session_init_succeeded: + self._pools[pool].add(session_id) + self._bad_inits[pool] = 0 + break + self._bad_inits[pool] += 1 + if self._bad_inits[pool] >= self._max_bad_inits[pool]: + raise TooManyBadSessionInits + self._queues[pool].append(session_id) + return session_id + + async def _next_from_queue(self, request: Request, pool: str) -> str: + session_id = None + attempts = 0 + while session_id not in self._pools[pool]: # After 1st loop: invalid session. + try: + session_id = self._queues[pool].popleft() + except IndexError: # No ready-to-use session available. + attempts += 1 + if attempts >= self._queue_max_attempts: + raise RuntimeError( + f"Could not get a session ID from the session " + f"rotation queue after {attempts} attempts, waiting " + f"at least {self._queue_wait_time} seconds between " + f"attempts. Either the values of the " + f"ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS and " + f"ZYTE_API_SESSION_QUEUE_WAIT_TIME settings are too " + f"low for your scenario, in which case you can modify " + f"them accordingly, or there might be a bug with " + f"scrapy-zyte-api session management. If you think it " + f"could be the later, please report the issue at " + f"https://github.com/scrapy-plugins/scrapy-zyte-api/issues/new " + f"providing a minimal reproducible example if " + f"possible, or debug logs and stats otherwise." + ) + await sleep(self._queue_wait_time) + assert session_id is not None + self._queues[pool].append(session_id) + return session_id + + async def _next(self, request) -> str: + """Return the ID of the next working session in the session pool + rotation. + + *request* is needed to determine the URL to use for request + initialization. 
+ """ + pool = self._get_pool(request) + if self._pending_initial_sessions[pool] >= 1: + self._pending_initial_sessions[pool] -= 1 + session_id = await self._create_session(request, pool) + else: + session_id = await self._next_from_queue(request, pool) + return session_id + + def is_init_request(self, request: Request) -> bool: + """Return ``True`` if the request is one of the requests being used + to initialize a session, or ``False`` otherwise. + + If ``True`` is returned for a request, the session ID of that request + should not be modified, or it will break the session management logic. + """ + return request.meta.get(SESSION_INIT_META_KEY, False) + + def _get_request_session_id(self, request: Request) -> Optional[str]: + for meta_key in ZYTE_API_META_KEYS: + if meta_key not in request.meta: + continue + session_id = request.meta[meta_key].get("session", {}).get("id", None) + if session_id: + return session_id + logger.warning( + f"Request {request} had no session ID assigned, unexpectedly. " + f"If you are sure this issue is not caused by your own code, " + f"please report this at " + f"https://github.com/scrapy-plugins/scrapy-zyte-api/issues/new " + f"providing a minimal, reproducible example." + ) + return None + + def _start_session_refresh(self, session_id: str, request: Request, pool: str): + try: + self._pools[pool].remove(session_id) + except KeyError: + # More than 1 request was using the same session concurrently. Do + # not refresh the session again. + pass + else: + task = create_task(self._create_session(request, pool)) + self._init_tasks.add(task) + task.add_done_callback(self._init_tasks.discard) + try: + del self._errors[session_id] + except KeyError: + pass + + def _start_request_session_refresh(self, request: Request, pool: str): + session_id = self._get_request_session_id(request) + if session_id is None: + return + self._start_session_refresh(session_id, request, pool) + + async def check(self, response: Response, request: Request) -> bool: + """Check the response for signs of session expiration, update the + internal session pool accordingly, and return ``False`` if the session + has expired or ``True`` if the session passed validation.""" + session_config = self._get_session_config(request) + pool = self._get_pool(request) + try: + passed = session_config.check(response, request) + except CloseSpider: + raise + except Exception: + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/use/check-error" + ) + logger.exception( + f"Unexpected exception raised while checking session " + f"validity on response {response}." + ) + else: + outcome = "passed" if passed else "failed" + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/use/check-{outcome}" + ) + if passed: + return True + self._start_request_session_refresh(request, pool) + return False + + async def assign(self, request: Request): + """Assign a working session to *request*.""" + session_id = await self._next(request) + # Note: If there is a session set already (e.g. a request being + # retried), it is overridden. 
+ request.meta.setdefault("zyte_api_provider", {})["session"] = {"id": session_id} + if ( + "zyte_api" in request.meta + or request.meta.get("zyte_api_automap", None) is False + or ( + "zyte_api_automap" not in request.meta + and self._transparent_mode is False + ) + ): + meta_key = "zyte_api" + else: + meta_key = "zyte_api_automap" + request.meta.setdefault(meta_key, {}) + if not isinstance(request.meta[meta_key], dict): + request.meta[meta_key] = {} + request.meta[meta_key]["session"] = {"id": session_id} + request.meta.setdefault("dont_merge_cookies", True) + + def handle_error(self, request: Request): + pool = self._get_pool(request) + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/use/failed" + ) + session_id = self._get_request_session_id(request) + if session_id is not None: + self._errors[session_id] += 1 + if self._errors[session_id] < self._max_errors: + return + self._start_request_session_refresh(request, pool) + + def handle_expiration(self, request: Request): + pool = self._get_pool(request) + self._crawler.stats.inc_value( + f"scrapy-zyte-api/sessions/pools/{pool}/use/expired" + ) + self._start_request_session_refresh(request, pool) + + +class FatalErrorHandler: + + def __init__(self, crawler): + self.crawler = crawler + + async def __aenter__(self): + return None + + async def __aexit__(self, exc_type, exc, tb): + if exc_type is None: + return + from twisted.internet import reactor + from twisted.internet.interfaces import IReactorCore + + reactor = cast(IReactorCore, reactor) + close = partial( + reactor.callLater, 0, self.crawler.engine.close_spider, self.crawler.spider + ) + if issubclass(exc_type, TooManyBadSessionInits): + close("bad_session_inits") + elif issubclass(exc_type, PoolError): + close("pool_error") + elif issubclass(exc_type, CloseSpider): + close(exc.reason) + + +class ScrapyZyteAPISessionDownloaderMiddleware: + + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler) + + def __init__(self, crawler: Crawler): + self._enabled = crawler.settings.getbool("ZYTE_API_SESSION_ENABLED", False) + self._crawler = crawler + self._sessions = _SessionManager(crawler) + self._fatal_error_handler = FatalErrorHandler(crawler) + + async def process_request(self, request: Request, spider: Spider) -> None: + if not request.meta.get( + "zyte_api_session_enabled", self._enabled + ) or self._sessions.is_init_request(request): + return + async with self._fatal_error_handler: + await self._sessions.assign(request) + + async def process_response( + self, request: Request, response: Response, spider: Spider + ) -> Union[Request, Response, None]: + if ( + isinstance(response, DummyResponse) + or not request.meta.get("zyte_api_session_enabled", self._enabled) + or self._sessions.is_init_request(request) + ): + return response + async with self._fatal_error_handler: + passed = await self._sessions.check(response, request) + if not passed: + new_request_or_none = get_retry_request( + request, + spider=spider, + reason="session_expired", + ) + if not new_request_or_none: + raise IgnoreRequest + return new_request_or_none + return response + + async def process_exception( + self, request: Request, exception: Exception, spider: Spider + ) -> Union[Request, None]: + if ( + not isinstance(exception, RequestError) + or not request.meta.get("zyte_api_session_enabled", self._enabled) + or self._sessions.is_init_request(request) + ): + return None + + if exception.parsed.type == "/problem/session-expired": + async with self._fatal_error_handler: 
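+                # The expired session is removed from its pool, and a
+                # replacement session starts initializing in the background;
+                # the request itself is retried below with a new session.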
+ self._sessions.handle_expiration(request) + reason = "session_expired" + elif exception.status in {520, 521}: + async with self._fatal_error_handler: + self._sessions.handle_error(request) + reason = "download_error" + else: + return None + + return get_retry_request( + request, + spider=spider, + reason=reason, + ) diff --git a/scrapy_zyte_api/addon.py b/scrapy_zyte_api/addon.py index 916d8591..7cf90257 100644 --- a/scrapy_zyte_api/addon.py +++ b/scrapy_zyte_api/addon.py @@ -1,8 +1,10 @@ from scrapy.settings import BaseSettings from scrapy.utils.misc import load_object +from zyte_api import zyte_api_retrying from scrapy_zyte_api import ( ScrapyZyteAPIDownloaderMiddleware, + ScrapyZyteAPISessionDownloaderMiddleware, ScrapyZyteAPISpiderMiddleware, ) @@ -22,6 +24,24 @@ def _setdefault(settings, setting, cls, pos): settings[setting][cls] = pos +# NOTE: We use import paths instead of the classes because retry policy classes +# are not pickleable (https://github.com/jd/tenacity/issues/147), which is a +# Scrapy requirement +# (https://doc.scrapy.org/en/latest/topics/settings.html#compatibility-with-pickle). +_SESSION_RETRY_POLICIES = { + zyte_api_retrying: "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", +} + +try: + from zyte_api import aggressive_retrying +except ImportError: + pass +else: + _SESSION_RETRY_POLICIES[aggressive_retrying] = ( + "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + ) + + class Addon: def update_settings(self, settings: BaseSettings) -> None: from scrapy.settings.default_settings import ( @@ -70,7 +90,13 @@ def update_settings(self, settings: BaseSettings) -> None: "https" ] = "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPSDownloadHandler" _setdefault( - settings, "DOWNLOADER_MIDDLEWARES", ScrapyZyteAPIDownloaderMiddleware, 1000 + settings, "DOWNLOADER_MIDDLEWARES", ScrapyZyteAPIDownloaderMiddleware, 633 + ) + _setdefault( + settings, + "DOWNLOADER_MIDDLEWARES", + ScrapyZyteAPISessionDownloaderMiddleware, + 667, ) _setdefault(settings, "SPIDER_MIDDLEWARES", ScrapyZyteAPISpiderMiddleware, 100) settings.set( @@ -89,3 +115,14 @@ def update_settings(self, settings: BaseSettings) -> None: _setdefault(settings, "DOWNLOADER_MIDDLEWARES", InjectionMiddleware, 543) _setdefault(settings, "SCRAPY_POET_PROVIDERS", ZyteApiProvider, 1100) + + if settings.getbool("ZYTE_API_SESSION_ENABLED", False): + retry_policy = settings.get( + "ZYTE_API_RETRY_POLICY", "zyte_api.zyte_api_retrying" + ) + loaded_retry_policy = load_object(retry_policy) + settings.set( + "ZYTE_API_RETRY_POLICY", + _SESSION_RETRY_POLICIES.get(loaded_retry_policy, retry_policy), + settings.getpriority("ZYTE_API_RETRY_POLICY"), + ) diff --git a/scrapy_zyte_api/utils.py b/scrapy_zyte_api/utils.py index 17776ad2..2ab50755 100644 --- a/scrapy_zyte_api/utils.py +++ b/scrapy_zyte_api/utils.py @@ -1,3 +1,5 @@ +from importlib.metadata import version + import scrapy from packaging.version import Version from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT @@ -6,6 +8,9 @@ USER_AGENT = f"scrapy-zyte-api/{__version__} {PYTHON_ZYTE_API_USER_AGENT}" +_PYTHON_ZYTE_API_VERSION = Version(version("zyte_api")) +_PYTHON_ZYTE_API_0_5_2 = Version("0.5.2") + _SCRAPY_VERSION = Version(scrapy.__version__) _SCRAPY_2_1_0 = Version("2.1.0") _SCRAPY_2_4_0 = Version("2.4.0") @@ -19,6 +24,9 @@ # https://github.com/scrapy/scrapy/commit/e4bdd1cb958b7d89b86ea66f0af1cec2d91a6d44 _NEEDS_EARLY_REACTOR = _SCRAPY_2_4_0 <= _SCRAPY_VERSION < _SCRAPY_2_6_0 +_DOWNLOAD_NEEDS_SPIDER = _SCRAPY_VERSION < _SCRAPY_2_6_0 +_RAW_CLASS_SETTING_SUPPORT = 
_SCRAPY_VERSION >= _SCRAPY_2_4_0
+_REQUEST_ERROR_HAS_QUERY = _PYTHON_ZYTE_API_VERSION >= _PYTHON_ZYTE_API_0_5_2
 _RESPONSE_HAS_ATTRIBUTES = _SCRAPY_VERSION >= _SCRAPY_2_6_0
 _RESPONSE_HAS_IP_ADDRESS = _SCRAPY_VERSION >= _SCRAPY_2_1_0
 _RESPONSE_HAS_PROTOCOL = _SCRAPY_VERSION >= _SCRAPY_2_5_0
diff --git a/tests/__init__.py b/tests/__init__.py
index bdcdbad2..55a5b470 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -4,6 +4,7 @@
 from scrapy import Spider
 from scrapy.crawler import Crawler
+from scrapy.utils.misc import load_object
 from scrapy.utils.test import get_crawler as _get_crawler
 from zyte_api.aio.client import AsyncClient
 
@@ -20,7 +21,8 @@
         "https": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler",
     },
     "DOWNLOADER_MIDDLEWARES": {
-        "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000,
+        "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633,
+        "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667,
     },
     "REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter",
     "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",  # Silence deprecation warning
@@ -95,6 +97,30 @@ async def make_handler(
     await handler._close()  # NOQA
 
 
+def serialize_settings(settings):
+    result = dict(settings)
+    for setting in (
+        "ADDONS",
+        "ZYTE_API_FALLBACK_HTTP_HANDLER",
+        "ZYTE_API_FALLBACK_HTTPS_HANDLER",
+    ):
+        if setting in settings:
+            del result[setting]
+    for setting in (
+        "DOWNLOADER_MIDDLEWARES",
+        "SCRAPY_POET_PROVIDERS",
+        "SPIDER_MIDDLEWARES",
+    ):
+        if setting in result:
+            for key in list(result[setting]):
+                if isinstance(key, str):
+                    obj = load_object(key)
+                    result[setting][obj] = result[setting].pop(key)
+    for key in result["DOWNLOAD_HANDLERS"]:
+        result["DOWNLOAD_HANDLERS"][key] = result["DOWNLOAD_HANDLERS"][key].__class__
+    return result
+
+
 @contextmanager
 def set_env(**env_vars):
     old_environ = dict(environ)
diff --git a/tests/mockserver.py b/tests/mockserver.py
index de93e9e0..dc709dd5 100644
--- a/tests/mockserver.py
+++ b/tests/mockserver.py
@@ -24,6 +24,16 @@
 from . import SETTINGS, make_handler
 
 
+# https://github.com/scrapy/scrapy/blob/02b97f98e74a994ad3e4d74e7ed55207e508a576/tests/mockserver.py#L27C1-L33C19
+def getarg(request, name, default=None, type=None):
+    if name in request.args:
+        value = request.args[name][0]
+        if type is not None:
+            value = type(value)
+        return value
+    return default
+
+
 def get_ephemeral_port():
     s = socket.socket()
     s.bind(("", 0))
@@ -110,6 +120,15 @@ def render_POST(self, request):
             "detail": "Account is suspended, check billing details.",
         }
         return json.dumps(response_data).encode()
+        if "temporary-download-error" in request_data["url"]:
+            request.setResponseCode(520)
+            response_data = {
+                "status": 520,
+                "type": "/download/temporary-error",
+                "title": "...",
+                "detail": "...",
+            }
+            return json.dumps(response_data).encode()
         html = "<html><body>Hello<h1>World!</h1></body></html>"
         if "browserHtml" in request_data:
@@ -128,6 +147,25 @@ def render_POST(self, request):
             response_data["screenshot"] = b64encode(
                 b"screenshot-body-contents"
             ).decode()
+
+        if "session" in request_data:
+            # See test_sessions.py::test_param_precedence
+            if domain.startswith("postal-code-10001"):
+                postal_code = None
+                for action in request_data.get("actions", []):
+                    try:
+                        postal_code = action["address"]["postalCode"]
+                    except (KeyError, IndexError, TypeError):
+                        pass
+                    else:
+                        break
+                if postal_code != "10001" and not domain.startswith(
+                    "postal-code-10001-soft"
+                ):
+                    request.setResponseCode(500)
+                    return b""
+            response_data["session"] = request_data["session"]
+
         if "httpResponseBody" in request_data:
             headers = request_data.get("customHttpRequestHeaders", [])
             for header in headers:
@@ -151,13 +189,24 @@ def render_POST(self, request):
         if actions:
             results: List[_ActionResult] = []
             for action in actions:
-                results.append(
-                    {
-                        "action": action["action"],
-                        "elapsedTime": 1.0,
-                        "status": "success",
-                    }
-                )
+                result: _ActionResult = {
+                    "action": action["action"],
+                    "elapsedTime": 1.0,
+                    "status": "success",
+                }
+                if action["action"] == "setLocation":
+                    if domain.startswith("postal-code-10001"):
+                        try:
+                            postal_code = action["address"]["postalCode"]
+                        except (KeyError, IndexError, TypeError):
+                            postal_code = None
+                        if postal_code != "10001":
+                            result["status"] = "returned"
+                            result["error"] = "Action setLocation failed"
+                    elif domain.startswith("no-location-support"):
+                        result["status"] = "returned"
+                        result["error"] = "Action setLocation not supported on …"
+                results.append(result)
         response_data["actions"] = results  # type: ignore[assignment]
     if request_data.get("product") is True:
diff --git a/tests/test_addon.py b/tests/test_addon.py
index 0155d6bb..a69d7030 100644
--- a/tests/test_addon.py
+++ b/tests/test_addon.py
@@ -4,17 +4,17 @@
 from pytest_twisted import ensureDeferred
 from scrapy import Request
 from scrapy.core.downloader.handlers.http import HTTP10DownloadHandler
-from scrapy.utils.misc import load_object
 from scrapy.utils.test import get_crawler
 
 from scrapy_zyte_api import (
     ScrapyZyteAPIDownloaderMiddleware,
+    ScrapyZyteAPISessionDownloaderMiddleware,
     ScrapyZyteAPISpiderMiddleware,
 )
 from scrapy_zyte_api.handler import ScrapyZyteAPIHTTPDownloadHandler
 
 from . import get_crawler as get_crawler_zyte_api
-from . import get_download_handler, make_handler
+from . 
import get_download_handler, make_handler, serialize_settings pytest.importorskip("scrapy.addons") @@ -81,34 +81,11 @@ async def test_addon_fallback_explicit(): @ensureDeferred async def test_addon_matching_settings(): - def serialize(settings): - result = dict(settings) - for setting in ( - "ADDONS", - "ZYTE_API_FALLBACK_HTTP_HANDLER", - "ZYTE_API_FALLBACK_HTTPS_HANDLER", - ): - if setting in settings: - del result[setting] - for setting in ( - "DOWNLOADER_MIDDLEWARES", - "SCRAPY_POET_PROVIDERS", - "SPIDER_MIDDLEWARES", - ): - if setting in result: - for key in list(result[setting]): - if isinstance(key, str): - obj = load_object(key) - result[setting][obj] = result[setting].pop(key) - for key in result["DOWNLOAD_HANDLERS"]: - result["DOWNLOAD_HANDLERS"][key] = result["DOWNLOAD_HANDLERS"][ - key - ].__class__ - return result - crawler = await get_crawler_zyte_api({"ZYTE_API_TRANSPARENT_MODE": True}) addon_crawler = await get_crawler_zyte_api(use_addon=True) - assert serialize(crawler.settings) == serialize(addon_crawler.settings) + assert serialize_settings(crawler.settings) == serialize_settings( + addon_crawler.settings + ) @ensureDeferred @@ -161,7 +138,8 @@ def _test_setting_changes(initial_settings, expected_settings): BASE_EXPECTED = { "DOWNLOADER_MIDDLEWARES": { - ScrapyZyteAPIDownloaderMiddleware: 1000, + ScrapyZyteAPIDownloaderMiddleware: 633, + ScrapyZyteAPISessionDownloaderMiddleware: 667, }, "DOWNLOAD_HANDLERS": { "http": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPDownloadHandler", @@ -198,7 +176,8 @@ def _test_setting_changes(initial_settings, expected_settings): **BASE_EXPECTED, "DOWNLOADER_MIDDLEWARES": { "builtins.str": 123, - ScrapyZyteAPIDownloaderMiddleware: 1000, + ScrapyZyteAPIDownloaderMiddleware: 633, + ScrapyZyteAPISessionDownloaderMiddleware: 667, }, }, ), @@ -212,6 +191,7 @@ def _test_setting_changes(initial_settings, expected_settings): **BASE_EXPECTED, "DOWNLOADER_MIDDLEWARES": { ScrapyZyteAPIDownloaderMiddleware: 999, + ScrapyZyteAPISessionDownloaderMiddleware: 667, }, }, ), @@ -225,6 +205,7 @@ def _test_setting_changes(initial_settings, expected_settings): **BASE_EXPECTED, "DOWNLOADER_MIDDLEWARES": { "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 999, + ScrapyZyteAPISessionDownloaderMiddleware: 667, }, }, ), @@ -245,7 +226,8 @@ def test_no_poet_setting_changes(initial_settings, expected_settings): { **BASE_EXPECTED, "DOWNLOADER_MIDDLEWARES": { - ScrapyZyteAPIDownloaderMiddleware: 1000, + ScrapyZyteAPIDownloaderMiddleware: 633, + ScrapyZyteAPISessionDownloaderMiddleware: 667, InjectionMiddleware: 543, }, "SCRAPY_POET_PROVIDERS": { diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index aa42074e..b9ca96fc 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -588,6 +588,7 @@ async def test_default_params_merging( crawler = await get_crawler({setting_key: setting}) handler = get_download_handler(crawler, "https") param_parser = handler._param_parser + caplog.clear() with caplog.at_level("WARNING"): api_params = param_parser.parse(request) for key in ignore_keys: @@ -2896,6 +2897,7 @@ async def test_automap_cookie_limit(meta, caplog): ) cookiejar += 1 cookie_middleware.process_request(request, spider=None) + caplog.clear() with caplog.at_level("WARNING"): api_params = param_parser.parse(request) assert api_params["experimental"]["requestCookies"] == [ @@ -3215,6 +3217,7 @@ async def test_default_params_automap(default_params, meta, expected, warnings, crawler = await get_crawler(settings) handler = 
get_download_handler(crawler, "https") param_parser = handler._param_parser + caplog.clear() with caplog.at_level("WARNING"): api_params = param_parser.parse(request) api_params.pop("url") diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 82db1d75..9e5e662d 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -140,7 +140,7 @@ def parse(self, response): settings = { "DOWNLOADER_MIDDLEWARES": { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000 + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633 }, "ZYTE_API_MAX_REQUESTS": zapi_max_requests, "ZYTE_API_URL": server.urljoin("/"), diff --git a/tests/test_sessions.py b/tests/test_sessions.py new file mode 100644 index 00000000..045e16e5 --- /dev/null +++ b/tests/test_sessions.py @@ -0,0 +1,2046 @@ +from collections import deque +from copy import copy, deepcopy +from math import floor +from typing import Any, Dict, Union +from unittest.mock import patch + +import pytest +from aiohttp.client_exceptions import ServerConnectionError +from pytest_twisted import ensureDeferred +from scrapy import Request, Spider, signals +from scrapy.exceptions import CloseSpider +from scrapy.http import Response +from scrapy.utils.httpobj import urlparse_cached +from zyte_api import RequestError + +from scrapy_zyte_api import ( + SESSION_AGGRESSIVE_RETRY_POLICY, + SESSION_DEFAULT_RETRY_POLICY, + SessionConfig, + session_config, +) +from scrapy_zyte_api._session import SESSION_INIT_META_KEY, session_config_registry +from scrapy_zyte_api.utils import _RAW_CLASS_SETTING_SUPPORT, _REQUEST_ERROR_HAS_QUERY + +from . import get_crawler, serialize_settings + +UNSET = object() + + +@pytest.mark.parametrize( + ("setting", "meta", "outcome"), + ( + (UNSET, UNSET, False), + (UNSET, True, True), + (UNSET, False, False), + (True, UNSET, True), + (True, True, True), + (True, False, False), + (False, UNSET, False), + (False, True, True), + (False, False, False), + ), +) +@ensureDeferred +async def test_enabled(setting, meta, outcome, mockserver): + settings = {"ZYTE_API_URL": mockserver.urljoin("/")} + if setting is not UNSET: + settings["ZYTE_API_SESSION_ENABLED"] = setting + meta_dict = {} + if meta is not UNSET: + meta_dict = {"zyte_api_session_enabled": meta} + + class TestSpider(Spider): + name = "test" + + def start_requests(self): + yield Request("https://example.com", meta=meta_dict) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + if outcome: + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + } + else: + assert session_stats == {} + + +@pytest.mark.parametrize( + ("params_setting", "params_meta", "location_setting", "location_meta", "outcome"), + ( + (UNSET, UNSET, UNSET, UNSET, False), + (UNSET, UNSET, UNSET, None, False), + (UNSET, UNSET, UNSET, False, False), + (UNSET, UNSET, UNSET, True, True), + (UNSET, UNSET, False, UNSET, False), + (UNSET, UNSET, False, None, False), + (UNSET, UNSET, False, False, False), + (UNSET, UNSET, False, True, True), + (UNSET, UNSET, True, UNSET, True), + (UNSET, UNSET, True, None, False), + (UNSET, UNSET, True, False, False), + (UNSET, UNSET, True, True, True), + (UNSET, False, UNSET, UNSET, False), + (UNSET, False, UNSET, None, False), + (UNSET, 
False, UNSET, False, False), + (UNSET, False, UNSET, True, True), + (UNSET, False, False, UNSET, False), + (UNSET, False, False, None, False), + (UNSET, False, False, False, False), + (UNSET, False, False, True, True), + (UNSET, False, True, UNSET, True), + (UNSET, False, True, None, False), + (UNSET, False, True, False, False), + (UNSET, False, True, True, True), + (UNSET, True, UNSET, UNSET, True), + (UNSET, True, UNSET, None, True), + (UNSET, True, UNSET, False, False), + (UNSET, True, UNSET, True, True), + (UNSET, True, False, UNSET, False), + (UNSET, True, False, None, True), + (UNSET, True, False, False, False), + (UNSET, True, False, True, True), + (UNSET, True, True, UNSET, True), + (UNSET, True, True, None, True), + (UNSET, True, True, False, False), + (UNSET, True, True, True, True), + (False, UNSET, UNSET, UNSET, False), + (False, UNSET, UNSET, None, False), + (False, UNSET, UNSET, False, False), + (False, UNSET, UNSET, True, True), + (False, UNSET, False, UNSET, False), + (False, UNSET, False, None, False), + (False, UNSET, False, False, False), + (False, UNSET, False, True, True), + (False, UNSET, True, UNSET, True), + (False, UNSET, True, None, False), + (False, UNSET, True, False, False), + (False, UNSET, True, True, True), + (False, False, UNSET, UNSET, False), + (False, False, UNSET, None, False), + (False, False, UNSET, False, False), + (False, False, UNSET, True, True), + (False, False, False, UNSET, False), + (False, False, False, None, False), + (False, False, False, False, False), + (False, False, False, True, True), + (False, False, True, UNSET, True), + (False, False, True, None, False), + (False, False, True, False, False), + (False, False, True, True, True), + (False, True, UNSET, UNSET, True), + (False, True, UNSET, None, True), + (False, True, UNSET, False, False), + (False, True, UNSET, True, True), + (False, True, False, UNSET, False), + (False, True, False, None, True), + (False, True, False, False, False), + (False, True, False, True, True), + (False, True, True, UNSET, True), + (False, True, True, None, True), + (False, True, True, False, False), + (False, True, True, True, True), + (True, UNSET, UNSET, UNSET, True), + (True, UNSET, UNSET, None, True), + (True, UNSET, UNSET, False, False), + (True, UNSET, UNSET, True, True), + (True, UNSET, False, UNSET, False), + (True, UNSET, False, None, True), + (True, UNSET, False, False, False), + (True, UNSET, False, True, True), + (True, UNSET, True, UNSET, True), + (True, UNSET, True, None, True), + (True, UNSET, True, False, False), + (True, UNSET, True, True, True), + (True, False, UNSET, UNSET, False), + (True, False, UNSET, None, False), + (True, False, UNSET, False, False), + (True, False, UNSET, True, True), + (True, False, False, UNSET, False), + (True, False, False, None, False), + (True, False, False, False, False), + (True, False, False, True, True), + (True, False, True, UNSET, True), + (True, False, True, None, False), + (True, False, True, False, False), + (True, False, True, True, True), + (True, True, UNSET, UNSET, True), + (True, True, UNSET, None, True), + (True, True, UNSET, False, False), + (True, True, UNSET, True, True), + (True, True, False, UNSET, False), + (True, True, False, None, True), + (True, True, False, False, False), + (True, True, False, True, True), + (True, True, True, UNSET, True), + (True, True, True, None, True), + (True, True, True, False, False), + (True, True, True, True, True), + ), +) +@ensureDeferred +async def test_param_precedence( + params_setting, params_meta, 
location_setting, location_meta, outcome, mockserver +): + postal_codes = {True: "10001", False: "10002"} + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + meta: Dict[str, Any] = {} + + if params_setting is not UNSET: + settings["ZYTE_API_SESSION_PARAMS"] = { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_codes[params_setting]}, + } + ] + } + if params_meta is not UNSET: + meta["zyte_api_session_params"] = { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_codes[params_meta]}, + } + ] + } + if location_setting is not UNSET: + settings["ZYTE_API_SESSION_LOCATION"] = { + "postalCode": postal_codes[location_setting] + } + if location_meta is None: + meta["zyte_api_session_location"] = {} + elif location_meta is not UNSET: + meta["zyte_api_session_location"] = {"postalCode": postal_codes[location_meta]} + + class TestSpider(Spider): + name = "test" + + def start_requests(self): + yield Request( + "https://postal-code-10001.example", + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": postal_codes[True]}, + } + ] + }, + **meta, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + if outcome: + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 1, + } + else: + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, + } + + +@pytest.mark.parametrize( + ("params", "close_reason", "stats"), + ( + ( + {"browserHtml": True}, + "bad_session_inits", + { + "scrapy-zyte-api/sessions/pools/forbidden.example/init/failed": 1, + }, + ), + ( + {"browserHtml": True, "url": "https://example.com"}, + "failed_forbidden_domain", + { + "scrapy-zyte-api/sessions/pools/forbidden.example/init/check-passed": 1, + }, + ), + ), +) +@ensureDeferred +async def test_url_override(params, close_reason, stats, mockserver): + """If session params define a URL, that URL is used for session + initialization. 
Otherwise, the URL from the request getting the session
+    assigned first is used for session initialization."""
+    settings = {
+        "RETRY_TIMES": 0,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_PARAMS": params,
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://forbidden.example"]
+
+        def parse(self, response):
+            pass
+
+        def closed(self, reason):
+            self.close_reason = reason
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert crawler.spider.close_reason == close_reason
+    assert session_stats == stats
+
+
+class ConstantChecker:
+
+    def __init__(self, result):
+        self._result = result
+
+    def check(self, response: Response, request: Request) -> bool:
+        if self._result in (True, False):
+            return self._result
+        raise self._result
+
+
+class TrueChecker(ConstantChecker):
+    def __init__(self):
+        super().__init__(True)
+
+
+class FalseChecker(ConstantChecker):
+    def __init__(self):
+        super().__init__(False)
+
+
+class CloseSpiderChecker(ConstantChecker):
+    def __init__(self):
+        super().__init__(CloseSpider("closed_by_checker"))
+
+
+class UnexpectedExceptionChecker(ConstantChecker):
+    def __init__(self):
+        super().__init__(Exception)
+
+
+class TrueCrawlerChecker(ConstantChecker):
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def __init__(self, crawler):
+        super().__init__(crawler.settings["ZYTE_API_SESSION_ENABLED"])
+
+
+class FalseCrawlerChecker(ConstantChecker):
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def __init__(self, crawler):
+        super().__init__(not crawler.settings["ZYTE_API_SESSION_ENABLED"])
+
+
+class UseChecker(ConstantChecker):
+    """Always pass for session initialization requests, apply the check logic
+    only on session use requests."""
+
+    def check(self, response: Response, request: Request) -> bool:
+        if response.meta.get(SESSION_INIT_META_KEY, False) is True:
+            return True
+        return super().check(response, request)
+
+
+class FalseUseChecker(FalseChecker, UseChecker):
+    pass
+
+
+class CloseSpiderUseChecker(CloseSpiderChecker, UseChecker):
+    pass
+
+
+class UnexpectedExceptionUseChecker(UnexpectedExceptionChecker, UseChecker):
+    pass
+
+
+# NOTE: There is no use checker subclass for TrueChecker because the outcome
+# would be the same (always return True), and there are no use checker
+# subclasses for the crawler classes because the init use is enough to verify
+# that using the crawler works.
+
+
+@pytest.mark.parametrize(
+    ("checker", "close_reason", "stats"),
+    (
+        *(
+            pytest.param(
+                checker,
+                close_reason,
+                stats,
+                marks=pytest.mark.skipif(
+                    not _RAW_CLASS_SETTING_SUPPORT,
+                    reason=(
+                        "Configuring component classes instead of their import "
+                        "paths requires Scrapy 2.4+."
+ ), + ), + ) + for checker, close_reason, stats in ( + ( + TrueChecker, + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + }, + ), + ( + FalseChecker, + "bad_session_inits", + {"scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1}, + ), + ( + FalseUseChecker, + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/example.com/use/check-failed": 1, + }, + ), + (CloseSpiderChecker, "closed_by_checker", {}), + ( + CloseSpiderUseChecker, + "closed_by_checker", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + }, + ), + ( + UnexpectedExceptionChecker, + "bad_session_inits", + {"scrapy-zyte-api/sessions/pools/example.com/init/check-error": 1}, + ), + ( + UnexpectedExceptionUseChecker, + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/example.com/use/check-error": 1, + }, + ), + ( + TrueCrawlerChecker, + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + }, + ), + ( + FalseCrawlerChecker, + "bad_session_inits", + {"scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1}, + ), + ) + ), + ( + "tests.test_sessions.TrueChecker", + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + }, + ), + ( + "tests.test_sessions.FalseChecker", + "bad_session_inits", + {"scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1}, + ), + ( + "tests.test_sessions.FalseUseChecker", + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/example.com/use/check-failed": 1, + }, + ), + ("tests.test_sessions.CloseSpiderChecker", "closed_by_checker", {}), + ( + "tests.test_sessions.CloseSpiderUseChecker", + "closed_by_checker", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + }, + ), + ( + "tests.test_sessions.UnexpectedExceptionChecker", + "bad_session_inits", + {"scrapy-zyte-api/sessions/pools/example.com/init/check-error": 1}, + ), + ( + "tests.test_sessions.UnexpectedExceptionUseChecker", + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/example.com/use/check-error": 1, + }, + ), + ( + "tests.test_sessions.TrueCrawlerChecker", + "finished", + { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + }, + ), + ( + "tests.test_sessions.FalseCrawlerChecker", + "bad_session_inits", + {"scrapy-zyte-api/sessions/pools/example.com/init/check-failed": 1}, + ), + ), +) +@ensureDeferred +async def test_checker(checker, close_reason, stats, mockserver): + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": checker, + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in 
crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert crawler.spider.close_reason == close_reason
+    assert session_stats == stats
+
+
+@pytest.mark.parametrize(
+    ("postal_code", "url", "close_reason", "stats"),
+    (
+        (
+            None,
+            "https://postal-code-10001-soft.example",
+            "finished",
+            {
+                "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/init/check-passed": 1,
+                "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/use/check-passed": 1,
+            },
+        ),
+        (
+            "10001",
+            "https://postal-code-10001-soft.example",
+            "finished",
+            {
+                "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/init/check-passed": 1,
+                "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/use/check-passed": 1,
+            },
+        ),
+        (
+            "10002",
+            "https://postal-code-10001-soft.example",
+            "bad_session_inits",
+            {
+                "scrapy-zyte-api/sessions/pools/postal-code-10001-soft.example/init/check-failed": 1
+            },
+        ),
+        (
+            "10001",
+            "https://no-location-support.example",
+            "unsupported_set_location",
+            {},
+        ),
+    ),
+)
+@ensureDeferred
+async def test_checker_location(postal_code, url, close_reason, stats, mockserver):
+    """The default checker looks into the outcome of the ``setLocation`` action
+    if a location meta/setting was used."""
+    settings = {
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+    if postal_code is not None:
+        settings["ZYTE_API_SESSION_LOCATION"] = {"postalCode": postal_code}
+
+    class TestSpider(Spider):
+        name = "test"
+
+        def start_requests(self):
+            yield Request(
+                url,
+                meta={
+                    "zyte_api_automap": {
+                        "actions": [
+                            {
+                                "action": "setLocation",
+                                "address": {"postalCode": postal_code},
+                            }
+                        ]
+                    },
+                },
+            )
+
+        def parse(self, response):
+            pass
+
+        def closed(self, reason):
+            self.close_reason = reason
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert crawler.spider.close_reason == close_reason
+    assert session_stats == stats
+
+
+class CloseSpiderURLChecker:
+
+    def check(self, response: Response, request: Request) -> bool:
+        if "fail" in request.url:
+            raise CloseSpider("closed_by_checker")
+        return True
+
+
+@ensureDeferred
+async def test_checker_close_spider_use(mockserver):
+    """A checker can raise CloseSpider not only during session initialization,
+    but also during session use."""
+    settings = {
+        "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.CloseSpiderURLChecker",
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+        "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"},
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://example.com/fail"]
+
+        def parse(self, response):
+            pass
+
+        def closed(self, reason):
+            self.close_reason = reason
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert crawler.spider.close_reason == "closed_by_checker"
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1,
+    }
+
+
+@pytest.mark.parametrize(
+    ("setting", "value"),
+    (
+        (0, 1),
+        (1, 1),
+        (2, 2),
+        (None, 8),
+    ),
+)
+@ensureDeferred
+async def test_max_bad_inits(setting, value, mockserver):
+    settings = {
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_PARAMS": {"browserHtml": True, "httpResponseBody": True},
+    }
+    if setting is not None:
+        settings["ZYTE_API_SESSION_MAX_BAD_INITS"] = setting
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://example.com"]
+
+        def parse(self, response):
+            pass
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/example.com/init/failed": value,
+    }
+
+
+@pytest.mark.parametrize(
+    ("global_setting", "pool_setting", "value"),
+    (
+        (None, 0, 1),
+        (None, 1, 1),
+        (None, 2, 2),
+        (3, None, 3),
+    ),
+)
+@ensureDeferred
+async def test_max_bad_inits_per_pool(global_setting, pool_setting, value, mockserver):
+    settings = {
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_PARAMS": {"browserHtml": True, "httpResponseBody": True},
+    }
+    if global_setting is not None:
+        settings["ZYTE_API_SESSION_MAX_BAD_INITS"] = global_setting
+    if pool_setting is not None:
+        settings["ZYTE_API_SESSION_MAX_BAD_INITS_PER_POOL"] = {
+            "pool.example": pool_setting
+        }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://example.com", "https://pool.example"]
+
+        def parse(self, response):
+            pass
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/example.com/init/failed": (
+            8 if global_setting is None else global_setting
+        ),
+        "scrapy-zyte-api/sessions/pools/pool.example/init/failed": value,
+    }
+
+
+@pytest.mark.parametrize(
+    ("setting", "value"),
+    (
+        (None, 1),
+        (0, 1),
+        (1, 1),
+        (2, 2),
+    ),
+)
+@ensureDeferred
+async def test_max_errors(setting, value, mockserver):
+    retry_times = 2
+    settings = {
+        "RETRY_TIMES": retry_times,
+        "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY",
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"},
+        "ZYTE_API_SESSION_POOL_SIZE": 1,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+    }
+    if setting is not None:
+        settings["ZYTE_API_SESSION_MAX_ERRORS"] = setting
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://temporary-download-error.example"]
+
+        def parse(self, response):
+            pass
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/temporary-download-error.example/init/check-passed": floor(
+            (retry_times + 1) / value
+        )
+        + 1,
+        "scrapy-zyte-api/sessions/pools/temporary-download-error.example/use/failed": retry_times
+        + 1,
+    }
+
+
+class DomainChecker:
+
+    def check(self, response: Response, request: Request) -> bool:
+        domain = urlparse_cached(request).netloc
+        return "fail" not in domain
+
+
+@ensureDeferred
+async def test_check_overrides_error(mockserver):
+    """Max errors are ignored if a session does not pass its session check."""
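+    # With RETRY_TIMES = 2, the initial attempt and both retries all fail the
+    # DomainChecker check, so ZYTE_API_SESSION_MAX_ERRORS never applies: every
+    # failed check discards the session immediately, giving retry_times + 1
+    # use/check-failed stats and retry_times + 2 session inits (the initial
+    # session plus one replacement per failed check).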
+ retry_times = 2 + settings = { + "RETRY_TIMES": retry_times, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.DomainChecker", + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_ERRORS": 2, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://session-check-fails.example"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/session-check-fails.example/init/check-passed": retry_times + + 2, + "scrapy-zyte-api/sessions/pools/session-check-fails.example/use/check-failed": retry_times + + 1, + } + + +@pytest.mark.parametrize( + ("setting", "value"), + ( + (1, 1), + (2, 2), + (None, 8), + ), +) +@ensureDeferred +async def test_pool_size(setting, value, mockserver): + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + } + if setting is not None: + settings["ZYTE_API_SESSION_POOL_SIZE"] = setting + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] * (value + 1) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": value, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": value + 1, + } + + +@pytest.mark.parametrize( + ("global_setting", "pool_setting", "value"), + ( + (None, 1, 1), + (None, 2, 2), + (3, None, 3), + ), +) +@ensureDeferred +async def test_pool_sizes(global_setting, pool_setting, value, mockserver): + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + } + if global_setting is not None: + settings["ZYTE_API_SESSION_POOL_SIZE"] = global_setting + if pool_setting is not None: + settings["ZYTE_API_SESSION_POOL_SIZES"] = {"pool.example": pool_setting} + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com", "https://pool.example"] * (value + 1) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": ( + value if pool_setting is None else min(value + 1, 8) + ), + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": value + 1, + "scrapy-zyte-api/sessions/pools/pool.example/init/check-passed": value, + "scrapy-zyte-api/sessions/pools/pool.example/use/check-passed": value + 1, + } + + +def mock_request_error(*, status=200, response_content=None): + kwargs: Dict[str, Any] = {} + if _REQUEST_ERROR_HAS_QUERY: + kwargs["query"] = {} + return RequestError( + history=None, + request_info=None, + response_content=response_content, + status=status, + **kwargs, + ) + + +# Number of times to test request errors that must be retried forever. 
+FOREVER_TIMES = 100 + + +class fast_forward: + def __init__(self, time): + self.time = time + + +@pytest.mark.parametrize( + ("retrying", "outcomes", "exhausted"), + ( + *( + (retry_policy, outcomes, exhausted) + for retry_policy in ( + SESSION_DEFAULT_RETRY_POLICY, + SESSION_AGGRESSIVE_RETRY_POLICY, + ) + for status in (520, 521) + for outcomes, exhausted in ( + ( + (mock_request_error(status=status),), + True, + ), + ( + (mock_request_error(status=429),), + False, + ), + ( + ( + mock_request_error(status=429), + mock_request_error(status=status), + ), + True, + ), + ) + ), + ), +) +@ensureDeferred +@patch("time.monotonic") +async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): + monotonic_mock.return_value = 0 + last_outcome = outcomes[-1] + outcomes = deque(outcomes) + + def wait(retry_state): + return 0.0 + + retrying = copy(retrying) + retrying.wait = wait + + async def run(): + while True: + try: + outcome = outcomes.popleft() + except IndexError: + return + else: + if isinstance(outcome, fast_forward): + monotonic_mock.return_value += outcome.time + continue + raise outcome + + run = retrying.wraps(run) + try: + await run() + except Exception as outcome: + assert exhausted + assert outcome is last_outcome + else: + assert not exhausted + + +try: + from scrapy import addons # noqa: F401 +except ImportError: + ADDON_SUPPORT = False +else: + ADDON_SUPPORT = True + + +@pytest.mark.parametrize( + ("manual_settings", "addon_settings"), + ( + ( + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + {}, + ), + ( + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + {"ZYTE_API_RETRY_POLICY": "zyte_api.zyte_api_retrying"}, + ), + ( + { + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + }, + {"ZYTE_API_RETRY_POLICY": "zyte_api.aggressive_retrying"}, + ), + ( + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + {"ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY"}, + ), + ( + { + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + }, + { + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_AGGRESSIVE_RETRY_POLICY" + }, + ), + ( + {"ZYTE_API_RETRY_POLICY": "tests.test_sessions.UNSET"}, + {"ZYTE_API_RETRY_POLICY": "tests.test_sessions.UNSET"}, + ), + ), +) +@ensureDeferred +@pytest.mark.skipif( + not ADDON_SUPPORT, reason="No add-on support in this version of Scrapy" +) +async def test_addon(manual_settings, addon_settings): + crawler = await get_crawler( + { + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_SESSION_ENABLED": True, + **manual_settings, + } + ) + addon_crawler = await get_crawler( + {"ZYTE_API_SESSION_ENABLED": True, **addon_settings}, use_addon=True + ) + assert serialize_settings(crawler.settings) == serialize_settings( + addon_crawler.settings + ) + + +@ensureDeferred +async def test_session_config(mockserver): + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001-a.example", + "postal-code-10001-a-fail.example", + "postal-code-10001-a-alternative.example", + ] + ) + class CustomSessionConfig(SessionConfig): + + def params(self, request: Request): + return { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + } + + def check(self, response: Response, request: Request) -> bool: + domain = urlparse_cached(request).netloc + return "fail" not in domain + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain 
== "postal-code-10001-a-alternative.example": + return "postal-code-10001-a.example" + return domain + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001-a.example", + "https://postal-code-10001-a-alternative.example", + "https://postal-code-10001-a-fail.example", + "https://postal-code-10001-b.example", + ] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001-a.example/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001-a.example/use/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001-a-fail.example/init/check-failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-b.example/init/failed": 1, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001-a.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-a-alternative.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-a-fail.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-b.example/init/failed": 1, + } + + +@ensureDeferred +async def test_session_config_location(mockserver): + """A custom session config can be used to customize the params for + location, e.g. 
to include extra actions, while still relying on the default + check to determine whether or not the session remains valid based on the + outcome of the ``setLocation`` action.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example"]) + class CustomSessionConfig(SessionConfig): + + def params(self, request: Request): + return { + "actions": [ + { + "action": "waitForNavigation", + }, + { + "action": "setLocation", + "address": self.location(request), + }, + ] + } + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example"] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 1, + } + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_session_config_location_no_set_location(mockserver): + """A custom session config can be used to customize the params for + location to the point where they do not use a ``setLocation`` action. In + that case, the default session check will return ``True`` by default, i.e. + it will not fail due to not finding ``setLocation`` in response actions + data.""" + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + + def params(self, request: Request): + postal_code = self.location(request)["postalCode"] + return { + "actions": [ + { + "action": "click", + "selector": {"type": "css", "value": f"#zip{postal_code}"}, + }, + ] + } + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + } + + # Clean up the session config registry. 
+ session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_session_config_param_error(mockserver): + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + + def params(self, request: Request): + raise Exception + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/param-error": 1, + } + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_session_config_pool_caching(mockserver): + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + def __init__(self, crawler): + super().__init__(crawler) + self.pools = deque(("example.com",)) + + def pool(self, request: Request): + # The following code would fail on the second call, which never + # happens due to pool caching. + return self.pools.popleft() + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + } + assert crawler.spider.close_reason == "finished" + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_session_config_pool_error(mockserver): + # NOTE: This error should only happen during the initial process_request + # call. By the time the code reaches process_response, the cached pool + # value for that request is reused, so there is no new call to + # SessionConfig.pool that could fail during process_response only. 
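+    # The exception surfaces as a PoolError, which FatalErrorHandler
+    # translates into closing the spider with the "pool_error" close reason
+    # asserted below.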
+ + pytest.importorskip("web_poet") + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + + def pool(self, request: Request): + raise Exception + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + pass + + def closed(self, reason): + self.close_reason = reason + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == {} + assert crawler.spider.close_reason == "pool_error" + + # Clean up the session config registry. + session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_session_config_no_web_poet(mockserver): + """If web-poet is not installed, @session_config raises a RuntimeError.""" + try: + import web_poet # noqa: F401 + except ImportError: + pass + else: + pytest.skip("Test only relevant when web-poet is not installed.") + + with pytest.raises(RuntimeError): + + @session_config(["example.com"]) + class CustomSessionConfig(SessionConfig): + pass + + +@ensureDeferred +async def test_session_refresh(mockserver): + """If a response does not pass a session validity check, the session is + discarded, and the request is retried with a different session.""" + + class Tracker: + def __init__(self): + self.sessions = [] + + def track_session(self, request: Request, spider: Spider): + self.sessions.append(request.meta["zyte_api"]["session"]["id"]) + + tracker = Tracker() + + settings = { + "RETRY_TIMES": 1, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_CHECKER": "tests.test_sessions.DomainChecker", + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://session-check-fails.example"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect( + tracker.track_session, signal=signals.request_reached_downloader + ) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/session-check-fails.example/init/check-passed": 3, + "scrapy-zyte-api/sessions/pools/session-check-fails.example/use/check-failed": 2, + } + assert len(tracker.sessions) == 5 + assert tracker.sessions[0] == tracker.sessions[1] + assert tracker.sessions[0] != tracker.sessions[2] + assert tracker.sessions[2] == tracker.sessions[3] + assert tracker.sessions[0] != tracker.sessions[4] + assert tracker.sessions[2] != tracker.sessions[4] + + +@ensureDeferred +async def test_session_refresh_concurrent(mockserver): + """When more than 1 request is using the same session concurrently, it can + happen that more than 1 response triggers a session refresh. 
In those + cases, the same session should be refreshed only once, not once per + response triggering a refresh.""" + settings = { + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + "ZYTE_API_SESSION_MAX_ERRORS": 1, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com/"] + + def parse(self, response): + for n in range(2): + yield Request(f"https://example.com/{n}?temporary-download-error") + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/init/failed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/failed": 2, + } + + +@ensureDeferred +async def test_cookies(mockserver): + class Tracker: + def __init__(self): + self.cookies = [] + + def track(self, request: Request, spider: Spider): + cookie = request.headers.get(b"Cookie", None) + self.cookies.append(cookie) + + tracker = Tracker() + + settings = { + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + + def start_requests(self): + yield Request( + "https://example.com", + cookies={"a": "b"}, + meta={"zyte_api_session_enabled": False}, + ) + + def parse(self, response): + yield Request( + "https://example.com/2", + meta={"zyte_api_session_enabled": False}, + callback=self.parse2, + ) + + def parse2(self, response): + yield Request( + "https://example.com/3", + callback=self.parse3, + ) + + def parse3(self, response): + yield Request( + "https://example.com/4", + meta={"dont_merge_cookies": False}, + callback=self.parse4, + ) + + def parse4(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 2, + } + + assert tracker.cookies == [ + # The 1st request sets cookies and disables session management, so + # cookies are set. + b"a=b", + # The 2nd request disables session management, and gets the cookies set + # by the previous request in the global cookiejar. + b"a=b", + # The 3rd request uses session management, and neither the session init + # request nor the actual request using the session get cookies. + None, + None, + # The 4th request uses session management but sets dont_merge_cookies + # to ``False``, so while session init does not use cookies, the actual + # request using the session gets the cookies. + None, + b"a=b", + ] + + +@ensureDeferred +async def test_empty_queue(mockserver): + """After a pool is full, there might be a situation when the middleware + tries to assign a session to a request but all sessions of the pool are + pending creation or a refresh. 
In those cases, the assign process should + wait until a session becomes available in the queue.""" + settings = { + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + # We send 2 requests in parallel, so only the first one gets a session + # created on demand, and the other one is forced to wait until that + # session is initialized. + start_urls = ["https://example.com/1", "https://example.com/2"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 2, + } + + +@ensureDeferred +async def test_empty_queue_limit(mockserver): + settings = { + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_QUEUE_MAX_ATTEMPTS": 1, + "ZYTE_API_SESSION_QUEUE_WAIT_TIME": 0, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com/1", "https://example.com/2"] + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + } + + +class SessionIDRemovingDownloaderMiddleware: + + def process_exception( + self, request: Request, exception: Exception, spider: Spider + ) -> Union[Request, None]: + if not isinstance(exception, RequestError) or request.meta.get( + "_is_session_init_request", False + ): + return None + + del request.meta["zyte_api_automap"]["session"] + del request.meta["zyte_api_provider"]["session"] + return None + + +@ensureDeferred +async def test_missing_session_id(mockserver, caplog): + """If a session ID is missing from a request that should have had it + assigned, a warning is logged about it.""" + + settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633, + "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, + "tests.test_sessions.SessionIDRemovingDownloaderMiddleware": 675, + }, + "RETRY_TIMES": 0, + "ZYTE_API_RETRY_POLICY": "scrapy_zyte_api.SESSION_DEFAULT_RETRY_POLICY", + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_PARAMS": {"url": "https://example.com"}, + "ZYTE_API_SESSION_POOL_SIZE": 1, + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://temporary-download-error.example"] + + def parse(self, response): + pass + + caplog.clear() + caplog.set_level("WARNING") + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/temporary-download-error.example/init/check-passed": 1, + 
"scrapy-zyte-api/sessions/pools/temporary-download-error.example/use/failed": 1, + } + assert "had no session ID assigned, unexpectedly" in caplog.text + + +@pytest.mark.parametrize( + ("settings", "meta", "meta_key"), + ( + ( + {}, + {}, + "zyte_api", + ), + ( + {}, + {"zyte_api": {}}, + "zyte_api", + ), + ( + {}, + {"zyte_api": {"httpResponseBody": True}}, + "zyte_api", + ), + ( + {}, + {"zyte_api_automap": True}, + "zyte_api_automap", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {}, + "zyte_api_automap", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {"zyte_api_automap": False}, + "zyte_api", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {"zyte_api_automap": {}}, + "zyte_api_automap", + ), + ( + {"ZYTE_API_TRANSPARENT_MODE": True}, + {"zyte_api_automap": True}, + "zyte_api_automap", + ), + ), +) +@ensureDeferred +async def test_assign_meta_key(settings, meta, meta_key, mockserver): + """Session ID is set in the zyte_api_provider meta key always, and in + either zyte_api or zyte_api_automap depending on some settings and meta + keys.""" + + class Tracker: + def __init__(self): + self.meta: Dict[str, Any] = {} + + def track(self, request: Request, spider: Spider): + self.meta = deepcopy(request.meta) + + tracker = Tracker() + + settings = { + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + **settings, + } + + class TestSpider(Spider): + name = "test" + + def start_requests(self): + yield Request( + "https://example.com", + meta=meta, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + } + + assert ( + tracker.meta["zyte_api_provider"]["session"] + == tracker.meta[meta_key]["session"] + ) + other_meta_key = "zyte_api" if meta_key != "zyte_api" else "zyte_api_automap" + assert tracker.meta.get(other_meta_key, False) is False + + +@ensureDeferred +async def test_provider(mockserver): + pytest.importorskip("scrapy_poet") + + from scrapy_poet import DummyResponse + from zyte_common_items import Product + + class Tracker: + def __init__(self): + self.query: Dict[str, Any] = {} + + def track(self, request: Request, spider: Spider): + self.query = request.meta["zyte_api"] + + tracker = Tracker() + + settings = { + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_URL": mockserver.urljoin("/"), + } + + class TestSpider(Spider): + name = "test" + + def start_requests(self): + yield Request("https://example.com", callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + crawler.signals.connect(tracker.track, signal=signals.request_reached_downloader) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1, + } + assert "product" in tracker.query + + +class ExceptionRaisingDownloaderMiddleware: + + async def 
+
+
+class ExceptionRaisingDownloaderMiddleware:
+
+    async def process_request(self, request: Request, spider: Spider) -> None:
+        # Let session init requests through; fail any other request with the
+        # exception configured on the spider.
+        if request.meta.get("_is_session_init_request", False):
+            return
+        raise spider.exception
+
+
+@pytest.mark.parametrize(
+    ("exception", "stat", "reason"),
+    (
+        (
+            mock_request_error(
+                status=422, response_content=b'{"type": "/problem/session-expired"}'
+            ),
+            "expired",
+            "session_expired",
+        ),
+        (
+            mock_request_error(status=520),
+            "failed",
+            "download_error",
+        ),
+        (
+            mock_request_error(status=521),
+            "failed",
+            "download_error",
+        ),
+        (
+            mock_request_error(status=500),
+            None,
+            None,
+        ),
+        (
+            ServerConnectionError(),
+            None,
+            None,
+        ),
+        (
+            RuntimeError(),
+            None,
+            None,
+        ),
+    ),
+)
+@ensureDeferred
+async def test_exceptions(exception, stat, reason, mockserver, caplog):
+    """Check which exceptions cause a session to be discarded as expired or
+    failed, and which are ignored by session management."""
+    settings = {
+        "DOWNLOADER_MIDDLEWARES": {
+            "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633,
+            "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667,
+            "tests.test_sessions.ExceptionRaisingDownloaderMiddleware": 675,
+        },
+        "RETRY_TIMES": 0,
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_TRANSPARENT_MODE": True,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://example.com"]
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self.exception = exception
+
+        def parse(self, response):
+            pass
+
+    caplog.clear()
+    caplog.set_level("ERROR")
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    if stat is not None:
+        assert session_stats == {
+            "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2,
+            f"scrapy-zyte-api/sessions/pools/example.com/use/{stat}": 1,
+        }
+    else:
+        assert session_stats == {
+            "scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 1,
+        }
+    if reason is not None:
+        assert reason in caplog.text
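+
+
+# Taken together, the tests in this module exercise the session-management
+# configuration surface. For quick reference, an illustrative project
+# configuration combining settings seen above (example values, not
+# recommended defaults):
+#
+#     ZYTE_API_SESSION_ENABLED = True
+#     ZYTE_API_SESSION_POOL_SIZE = 8
+#     ZYTE_API_SESSION_MAX_BAD_INITS = 1
+#     ZYTE_API_SESSION_MAX_ERRORS = 1
+#     ZYTE_API_SESSION_PARAMS = {"url": "https://example.com"}
+#
+# Session management can also be toggled per request:
+#
+#     yield Request(url, meta={"zyte_api_session_enabled": False})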