diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e2769671..34815c52 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -4,4 +4,6 @@ commit = True tag = True tag_name = {new_version} +[bumpversion:file:docs/conf.py] + [bumpversion:file:scrapy_zyte_api/__version__.py] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 66643b86..ee74037b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,7 +62,7 @@ jobs: fail-fast: false matrix: python-version: ["3.11"] - tox-job: ["mypy", "linters", "twine-check"] + tox-job: ["mypy", "linters", "twine-check", "docs"] steps: - uses: actions/checkout@v3 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..1519565e --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,12 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py +build: + os: ubuntu-22.04 + tools: + python: "3.11" # Keep in sync with .github/workflows/test.yml +python: + install: + - requirements: docs/requirements.txt + - path: . diff --git a/CHANGES.rst b/CHANGES.rst index 82474181..3f29f488 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -186,7 +186,7 @@ Changes cookiejar of the request. * A new boolean setting, ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED``, can be - set to ``True`` to enable automated mapping of cookies from a request + set to ``True`` to enable automatic mapping of cookies from a request cookiejar into the ``experimental.requestCookies`` Zyte API parameter. * ``ZyteAPITextResponse`` is now a subclass of ``HtmlResponse``, so that the @@ -258,10 +258,10 @@ When upgrading, you should set the following in your Scrapy settings: be set to ``True`` to make all requests use Zyte API by default, with request parameters being automatically mapped to Zyte API parameters. 
* Add a Request meta key, ``zyte_api_automap``, that can be used to enable - automated request parameter mapping for specific requests, or to modify the - outcome of automated request parameter mapping for specific requests. + automatic request parameter mapping for specific requests, or to modify the + outcome of automatic request parameter mapping for specific requests. * Add a ``ZYTE_API_AUTOMAP_PARAMS`` setting, which is a counterpart for - ``ZYTE_API_DEFAULT_PARAMS`` that applies to requests where automated request + ``ZYTE_API_DEFAULT_PARAMS`` that applies to requests where automatic request parameter mapping is enabled. * Add the ``ZYTE_API_SKIP_HEADERS`` and ``ZYTE_API_BROWSER_HEADERS`` settings to control the automatic mapping of request headers. diff --git a/README.rst b/README.rst index a5f89e9f..41965a56 100644 --- a/README.rst +++ b/README.rst @@ -18,12 +18,15 @@ scrapy-zyte-api :target: https://codecov.io/gh/scrapy-plugins/scrapy-zyte-api :alt: Coverage report +.. description starts -Scrapy plugin for `Zyte API`_. +Scrapy plugin for seamless `Zyte API`_ integration. .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html +.. description ends +<<<<<<< HEAD Requirements ============ @@ -978,3 +981,968 @@ Running behind a proxy If you require a proxy to access Zyte API (e.g. a corporate proxy), configure the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and set the ``ZYTE_API_USE_ENV_PROXY`` setting to ``True``. +||||||| 5e205ef +Requirements +============ + +* Python 3.7+ +* Scrapy 2.0.1+ + +scrapy-poet integration requires more recent software: + +* Python 3.8+ +* Scrapy 2.6+ + +Installation +============ + +.. code-block:: + + pip install scrapy-zyte-api + + +Quick start +=========== + +Get a `Zyte API`_ key, and add it to your project settings.py: + +.. code-block:: python + + ZYTE_API_KEY = "YOUR_API_KEY" + +Instead of adding API key to setting.py you can also set +``ZYTE_API_KEY`` environment variable. 
+ +Then, set up the scrapy-zyte-api integration: + +.. code-block:: python + + DOWNLOAD_HANDLERS = { + "http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", + "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", + } + DOWNLOADER_MIDDLEWARES = { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, + } + REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter" + TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" + +By default, scrapy-zyte-api doesn't change the spider behavior. +To switch your spider to use Zyte API for all requests, +set the following option: + +.. code-block:: python + + ZYTE_API_TRANSPARENT_MODE = True + +Configuration +============= + +To enable this plugin: + +- Set the ``http`` and ``https`` keys in the `DOWNLOAD_HANDLERS + `_ + Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIDownloadHandler"``. + +- Add ``"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware"`` to the + `DOWNLOADER_MIDDLEWARES + `_ + Scrapy setting with any value, e.g. ``1000``. + +- Set the `REQUEST_FINGERPRINTER_CLASS + `_ + Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter"``. + +- Set the `TWISTED_REACTOR + `_ + Scrapy setting to + ``"twisted.internet.asyncioreactor.AsyncioSelectorReactor"``. + + .. note:: On existing projects that were not using the asyncio Twisted + reactor, your existing code may need changes, such as: + + - `Handling a pre-installed Twisted reactor + `_. + + Some Twisted imports install the default, non-asyncio Twisted + reactor as a side effect. Once a reactor is installed, it cannot be + changed for the whole run time. + + - `Converting Twisted Deferreds into asyncio Futures + `_. + + Note that you might be using Deferreds without realizing it through + some Scrapy functions and methods. For example, when you yield the + return value of ``self.crawler.engine.download()`` from a spider + callback, you are yielding a Deferred. 
+ +- Set `your Zyte API key + `_ as + either the ``ZYTE_API_KEY`` Scrapy setting or as an environment variable of + the same name. + +The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to +``False`` to disable this plugin. + +If you want to use scrapy-poet integration, add a provider to +``SCRAPY_POET_PROVIDERS`` (see `scrapy-poet integration`_): + +.. code-block:: python + + SCRAPY_POET_PROVIDERS = { + "scrapy_zyte_api.providers.ZyteApiProvider": 1100, + } + +Usage +===== + +You can send requests through Zyte API in one of the following ways: + +- Send all request through Zyte API by default, letting Zyte API parameters + be chosen automatically based on your Scrapy request parameters. See + `Using transparent mode`_. + +- Send specific requests through Zyte API, setting all Zyte API parameters + manually, keeping full control of what is sent to Zyte API. + See `Sending requests with manually-defined parameters`_. + +- Send specific requests through Zyte API, letting Zyte API parameters be + chosen automatically based on your Scrapy request parameters. + See `Sending requests with automatically-mapped parameters`_. + +Zyte API response parameters are mapped into Scrapy response parameters where +possible. See `Response mapping`_ for details. + + +Using transparent mode +---------------------- + +Set the ``ZYTE_API_TRANSPARENT_MODE`` `Scrapy setting`_ to ``True`` to handle +Scrapy requests as follows: + +.. _Scrapy setting: https://docs.scrapy.org/en/latest/topics/settings.html + +- By default, requests are sent through Zyte API with automatically-mapped + parameters. See `Sending requests with automatically-mapped parameters`_ + for details about automatic request parameter mapping. + + You do not need to set the ``zyte_api_automap`` request meta key to + ``True``, but you can set it to a dictionary to extend your Zyte API + request parameters. 
+ +- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent + through Zyte API with manually-defined parameters. + See `Sending requests with manually-defined parameters`_. + +- Requests with the ``zyte_api_automap`` request meta key set to ``False`` + are *not* sent through Zyte API. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + start_urls = ["https://quotes.toscrape.com/"] + + custom_settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + } + + def parse(self, response): + print(response.text) + # "…" + + +Sending requests with manually-defined parameters +------------------------------------------------- + +To send a Scrapy request through Zyte API with manually-defined parameters, +define your Zyte API parameters in the ``zyte_api`` key in +`Request.meta `_ +as a ``dict``. + +The only exception is the ``url`` parameter, which should not be defined as a +Zyte API parameter. The value from ``Request.url`` is used automatically. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "browserHtml": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +Note that response headers are necessary for raw response decoding. When +defining parameters manually and requesting ``httpResponseBody`` extraction, +remember to also request ``httpResponseHeaders`` extraction: + +.. 
code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +To learn more about Zyte API parameters, see the `data extraction usage`_ and +`API reference`_ pages of the `Zyte API documentation`_. + +.. _API reference: https://docs.zyte.com/zyte-api/openapi.html +.. _data extraction usage: https://docs.zyte.com/zyte-api/usage/extract.html +.. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html + + +Sending requests with automatically-mapped parameters +----------------------------------------------------- + +To send a Scrapy request through Zyte API letting Zyte API parameters be +automatically chosen based on the parameters of that Scrapy request, set the +``zyte_api_automap`` key in +`Request.meta `_ +to ``True``. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api_automap": True, + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +See also `Using transparent mode`_ and `Automated request parameter mapping`_. + + +Response mapping +---------------- + +Zyte API responses are mapped with one of the following classes: + +- ``scrapy_zyte_api.responses.ZyteAPITextResponse``, a subclass of + ``scrapy.http.TextResponse``, is used to map text responses, i.e. responses + with ``browserHtml`` or responses with both ``httpResponseBody`` and + ``httpResponseHeaders`` with a text body (e.g. plain text, HTML, JSON). + +- ``scrapy_zyte_api.responses.ZyteAPIResponse``, a subclass of + ``scrapy.http.Response``, is used to map any other response. 
+
+Zyte API response parameters are mapped into response class attributes where
+possible:
+
+- ``url`` becomes ``response.url``.
+
+- ``statusCode`` becomes ``response.status``.
+
+- ``httpResponseHeaders`` and ``experimental.responseCookies`` become
+  ``response.headers``.
+
+- ``experimental.responseCookies`` is also mapped into the request cookiejar.
+
+- ``browserHtml`` and ``httpResponseBody`` are mapped into both
+  ``response.text`` (``str``) and ``response.body`` (``bytes``).
+
+  If none of these parameters were present, e.g. if the only requested output
+  was ``screenshot``, ``response.text`` and ``response.body`` would be empty.
+
+  If a future version of Zyte API supported requesting both outputs on the
+  same request, and both parameters were present, ``browserHtml`` would be
+  the one mapped into ``response.text`` and ``response.body``.
+
+Both response classes have a ``raw_api_response`` attribute that contains a
+``dict`` with the complete, raw response from Zyte API, where you can find all
+Zyte API response parameters, including those that are not mapped into other
+response class attributes.
+
+For example, for a request for ``httpResponseBody`` and
+``httpResponseHeaders``, you would get:
+
+.. code-block:: python
+
+    def parse(self, response):
+        print(response.url)
+        # "https://quotes.toscrape.com/"
+        print(response.status)
+        # 200
+        print(response.headers)
+        # {b"Content-Type": [b"text/html"], …}
+        print(response.text)
+        # "…"
+        print(response.body)
+        # b"…"
+        print(response.raw_api_response)
+        # {
+        #     "url": "https://quotes.toscrape.com/",
+        #     "statusCode": 200,
+        #     "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==",
+        #     "httpResponseHeaders": […],
+        # }
+
+For a request for ``screenshot``, on the other hand, the response would look
+as follows:
+
+.. 
code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {} + print(response.text) + # "" + print(response.body) + # b"" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "screenshot": "iVBORw0KGgoAAAANSUh…", + # } + from base64 import b64decode + print(b64decode(response.raw_api_response["screenshot"])) + # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' + + +Automated request parameter mapping +----------------------------------- + +When you enable automated request parameter mapping, be it through transparent +mode (see `Using transparent mode`_) or for a specific request (see +`Sending requests with automatically-mapped parameters`_), Zyte API +parameters are chosen as follows by default: + +- ``Request.url`` becomes ``url``, same as in requests with manually-defined + parameters. + +- If ``Request.method`` is something other than ``"GET"``, it becomes + ``httpRequestMethod``. + +- ``Request.headers`` become ``customHttpRequestHeaders``. + +- ``Request.body`` becomes ``httpRequestBody``. + +- If the ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED`` Scrapy setting is + ``True``, the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and + provided request metadata does not set dont_merge_cookies_ to ``True``: + + .. _COOKIES_ENABLED: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-setting-COOKIES_ENABLED + .. _dont_merge_cookies: https://docs.scrapy.org/en/latest/topics/request-response.html#std-reqmeta-dont_merge_cookies + + - ``experimental.responseCookies`` is set to ``True``. + + - Cookies from the request `cookie jar`_ become + ``experimental.requestCookies``. + + .. _cookie jar: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-reqmeta-cookiejar + + All cookies from the cookie jar are set, regardless of their cookie + domain. 
This is because Zyte API requests may involve requests to + different domains (e.g. when following cross-domain redirects, or + during browser rendering). + + If the cookies to be set exceed the limit defined in the + ``ZYTE_API_MAX_COOKIES`` setting (100 by default), a warning is logged, + and only as many cookies as the limit allows are set for the target + request. To silence this warning, set ``experimental.requestCookies`` + manually, e.g. to an empty dict. Alternatively, if Zyte API starts + supporting more than 100 request cookies, update the + ``ZYTE_API_MAX_COOKIES`` setting accordingly. + + If you are using a custom downloader middleware to handle request + cookiejars, you can point the ``ZYTE_API_COOKIE_MIDDLEWARE`` setting to + its import path to make scrapy-zyte-api work with it. The downloader + middleware is expected to have a ``jars`` property with the same + signature as in the built-in Scrapy downloader middleware for cookie + handling. + +- ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. + + This is subject to change without prior notice in future versions of + scrapy-zyte-api, so please account for the following: + + - If you are requesting a binary resource, such as a PDF file or an + image file, set ``httpResponseBody`` to ``True`` explicitly in your + requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/img/zyte.png", + meta={ + "zyte_api_automap": {"httpResponseBody": True}, + }, + ) + + In the future, we may stop setting ``httpResponseBody`` to ``True`` by + default, and instead use a different, new Zyte API parameter that only + works for non-binary responses (e.g. HMTL, JSON, plain text). + + - If you need to access response headers, be it through + ``response.headers`` or through + ``response.raw_api_response["httpResponseHeaders"]``, set + ``httpResponseHeaders`` to ``True`` explicitly in your requests: + + .. 
code-block:: python
+
+        Request(
+            url="https://toscrape.com/",
+            meta={
+                "zyte_api_automap": {"httpResponseHeaders": True},
+            },
+        )
+
+    At the moment we request response headers because some response headers
+    are necessary to properly decode the response body as text. In the
+    future, Zyte API may be able to handle this decoding automatically, so
+    we would stop setting ``httpResponseHeaders`` to ``True`` by default.
+
+For example, the following Scrapy request:
+
+.. code-block:: python
+
+    Request(
+        method="POST",
+        url="https://httpbin.org/anything",
+        headers={"Content-Type": "application/json"},
+        body=b'{"foo": "bar"}',
+        cookies={"a": "b"},
+    )
+
+Results in a request to the Zyte API data extraction endpoint with the
+following parameters:
+
+.. code-block:: javascript
+
+    {
+        "customHttpRequestHeaders": [
+            {
+                "name": "Content-Type",
+                "value": "application/json"
+            }
+        ],
+        "experimental": {
+            "requestCookies": [
+                {
+                    "name": "a",
+                    "value": "b",
+                    "domain": ""
+                }
+            ],
+            "responseCookies": true
+        },
+        "httpResponseBody": true,
+        "httpResponseHeaders": true,
+        "httpRequestBody": "eyJmb28iOiAiYmFyIn0=",
+        "httpRequestMethod": "POST",
+        "url": "https://httpbin.org/anything"
+    }
+
+You may set the ``zyte_api_automap`` key in
+`Request.meta <https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
+to a ``dict`` of Zyte API parameters to extend or override choices made by
+automated request parameter mapping.
+
+Enabling ``browserHtml``, ``screenshot``, or an automatic extraction property,
+unsets ``httpResponseBody`` and ``httpResponseHeaders``, and makes
+``Request.headers`` become ``requestHeaders`` instead of
+``customHttpRequestHeaders``. For example, the following Scrapy request:
+
+.. code-block:: python
+
+    Request(
+        url="https://quotes.toscrape.com",
+        headers={"Referer": "https://example.com/"},
+        meta={"zyte_api_automap": {"browserHtml": True}},
+    )
+
+Results in a request to the Zyte API data extraction endpoint with the
+following parameters:
+
+.. 
code-block:: javascript + + { + "browserHtml": true, + "experimental": { + "responseCookies": true + }, + "requestHeaders": {"referer": "https://example.com/"}, + "url": "https://quotes.toscrape.com" + } + +When mapping headers, headers not supported by Zyte API are excluded from the +mapping by default. Use the following `Scrapy settings`_ to change which +headers are included or excluded from header mapping: + +.. _Scrapy settings: https://docs.scrapy.org/en/latest/topics/settings.html + +- ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as + ``customHttpRequestHeaders``, and its default value is: + + .. code-block:: python + + ["User-Agent"] + +- ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as + ``requestHeaders``. It is a ``dict``, where keys are header names and + values are the key that represents them in ``requestHeaders``. Its default + value is: + + .. code-block:: python + + {"Referer": "referer"} + +To maximize support for potential future changes in Zyte API, automated +request parameter mapping allows some parameter values and parameter +combinations that Zyte API does not currently support, and may never support: + +- ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ + ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. + + .. _unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method + +- You can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``True`` + to force their mapping from ``Request.headers`` in scenarios where they + would not be mapped otherwise. + + Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` + to ``False`` to prevent their mapping from ``Request.headers``. + +- ``Request.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` + is unset. + +- You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), + and not set ``browserHtml`` or ``screenshot`` to ``True``. 
In this case,
+  ``Request.headers`` is mapped as ``requestHeaders``.
+
+- You can set ``httpResponseBody`` to ``True`` and also set ``browserHtml``
+  or ``screenshot`` to ``True``. In this case, ``Request.headers`` is mapped
+  both as ``customHttpRequestHeaders`` and as ``requestHeaders``, and
+  ``browserHtml`` is used as the Scrapy response body.
+
+
+Setting default parameters
+==========================
+
+Often the same configuration needs to be used for all Zyte API requests. For
+example, all requests may need to set the same geolocation, or the spider only
+uses ``browserHtml`` requests.
+
+The following settings allow you to define Zyte API parameters to be included
+in all requests:
+
+- ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with
+  manually-defined parameters. See `Sending requests with manually-defined parameters`_.
+
+  You may set the ``zyte_api`` request meta key to an empty ``dict`` to only
+  use default parameters for that request.
+
+- ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with
+  automatically-mapped parameters.
+  See `Sending requests with automatically-mapped parameters`_.
+
+For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to
+``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``,
+``{"url": "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API.
+
+Parameters in these settings are merged with request-specific parameters, with
+request-specific parameters taking precedence.
+
+``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated
+request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on
+requests that use manually-defined parameters.
+
+When using transparent mode (see `Using transparent mode`_), be careful
+of which parameters you define through ``ZYTE_API_AUTOMAP_PARAMS``.
In +transparent mode, all Scrapy requests go through Zyte API, even requests that +Scrapy sends automatically, such as those for ``robots.txt`` files when +ROBOTSTXT_OBEY_ is ``True``, or those for sitemaps when using a `sitemap +spider`_. Certain parameters, like ``browserHtml`` or ``screenshot``, are not +meant to be used for every single request. + +If the ``zyte_api_default_params`` request meta key is set to ``False``, the +value of the ``ZYTE_API_DEFAULT_PARAMS`` setting for this request is ignored. + +.. _ROBOTSTXT_OBEY: https://docs.scrapy.org/en/latest/topics/settings.html#robotstxt-obey +.. _sitemap spider: https://docs.scrapy.org/en/latest/topics/spiders.html#sitemapspider + + +Customizing the retry policy +============================ + +API requests are retried automatically using the default retry policy of +`python-zyte-api`_. + +API requests that exceed retries are dropped. You cannot manage API request +retries through Scrapy downloader middlewares. + +Use the ``ZYTE_API_RETRY_POLICY`` setting or the ``zyte_api_retry_policy`` +request meta key to override the default `python-zyte-api`_ retry policy with a +custom retry policy. + +A custom retry policy must be an instance of `tenacity.AsyncRetrying`_. + +Scrapy settings must be picklable, which `retry policies are not +`_, so you cannot assign retry +policy objects directly to the ``ZYTE_API_RETRY_POLICY`` setting, and must use +their import path string instead. + +When setting a retry policy through request meta, you can assign the +``zyte_api_retry_policy`` request meta key either the retry policy object +itself or its import path string. If you need your requests to be serializable, +however, you may also need to use the import path string. + +For example, to increase the maximum number of retries to 10 before dropping +the API request, you can subclass RetryFactory_ as follows: + +.. 
code-block:: python + + # project/retry_policies.py + from tenacity import stop_after_attempt + from zyte_api.aio.retry import RetryFactory + + class CustomRetryFactory(RetryFactory): + temporary_download_error_stop = stop_after_attempt(10) + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + # project/settings.py + ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" + + +To extend this retry policy, so it will also retry HTTP 521 errors, the same +as HTTP 520 errors, you can implement: + +.. code-block:: python + + # project/retry_policies.py + from tenacity import retry_if_exception, RetryCallState, stop_after_attempt + from zyte_api.aio.errors import RequestError + from zyte_api.aio.retry import RetryFactory + + def is_http_521(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status == 521 + + class CustomRetryFactory(RetryFactory): + + retry_condition = ( + RetryFactory.retry_condition + | retry_if_exception(is_http_521) + ) + temporary_download_error_stop = stop_after_attempt(10) + + def wait(self, retry_state: RetryCallState) -> float: + if is_http_521(retry_state.outcome.exception()): + return self.temporary_download_error_wait(retry_state=retry_state) + return super().wait(retry_state) + + def stop(self, retry_state: RetryCallState) -> bool: + if is_http_521(retry_state.outcome.exception()): + return self.temporary_download_error_stop(retry_state) + return super().stop(retry_state) + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + # project/settings.py + ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" + +.. _python-zyte-api: https://github.com/zytedata/python-zyte-api +.. _RetryFactory: https://github.com/zytedata/python-zyte-api/blob/main/zyte_api/aio/retry.py +.. 
_tenacity.AsyncRetrying: https://tenacity.readthedocs.io/en/latest/api.html#tenacity.AsyncRetrying + + +Misc settings +============= + +- ``ZYTE_API_MAX_REQUESTS`` + + Default: ``None`` + + When set to an integer value > 0, the spider will close when the number of + successful Zyte API requests reaches it. Note that in some cases, the actual + number of successful Zyte API requests would be below this number if some of + the in-progress requests fail or error out. + + +Stats +===== + +Stats from python-zyte-api_ are exposed as Scrapy stats with the +``scrapy-zyte-api`` prefix. + +For example, ``scrapy-zyte-api/status_codes/`` stats indicate the +status code of Zyte API responses (e.g. ``429`` for `rate limiting +`_ or +``520`` for `temporary download errors +`_). + +.. note:: The actual status code that is received from the target website, i.e. + the `statusCode + `_ + response field of a `Zyte API successful response + `_, + is accounted for in the ``downloader/response_status_count/`` + stat, as with any other Scrapy response. + + +Request fingerprinting +====================== + +The request fingerprinter class of this plugin ensures that Scrapy 2.7 and +later generate unique `request fingerprints +`_ +for Zyte API requests based on some of their parameters. + +For example, a request for ``browserHtml`` and a request for ``screenshot`` +with the same target URL are considered different requests. Similarly, requests +with the same target URL but different ``actions`` are also considered +different requests. 
+ +Zyte API parameters that affect request fingerprinting +------------------------------------------------------ + +The request fingerprinter class of this plugin generates request fingerprints +for Zyte API requests based on the following Zyte API parameters: + +- ``url`` (`canonicalized `_) + + For URLs that include a URL fragment, like ``https://example.com#foo``, URL + canonicalization keeps the URL fragment if ``browserHtml`` or + ``screenshot`` are enabled. + +- Request attribute parameters (``httpRequestBody``, + ``httpRequestMethod``) + +- Output parameters (``browserHtml``, ``httpResponseBody``, + ``httpResponseHeaders``, ``screenshot``) + +- Rendering option parameters (``actions``, ``javascript``, + ``screenshotOptions``) + +- ``geolocation`` + +The following Zyte API parameters are *not* taken into account for request +fingerprinting: + +- Request header parameters (``customHttpRequestHeaders``, + ``requestHeaders``) + +- Metadata parameters (``echoData``, ``jobId``) + +- Experimental parameters (``experimental``) + + +Changing the fingerprinting of non-Zyte-API requests +---------------------------------------------------- + +You can assign a request fingerprinter class to the +``ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS`` Scrapy setting to configure +a custom request fingerprinter class to use for requests that do not go through +Zyte API: + +.. code-block:: python + + ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS = "custom.RequestFingerprinter" + +By default, requests that do not go through Zyte API use the default request +fingerprinter class of the installed Scrapy version. + + +Request fingerprinting before Scrapy 2.7 +---------------------------------------- + +If you have a Scrapy version older than Scrapy 2.7, Zyte API parameters are not +taken into account for request fingerprinting. 
This can cause some Scrapy +components, like the filter of duplicate requests or the HTTP cache extension, +to interpret 2 different requests as being the same. + +To avoid most issues, use automated request parameter mapping, either through +transparent mode or setting ``zyte_api_automap`` to ``True`` in +``Request.meta``, and then use ``Request`` attributes instead of +``Request.meta`` as much as possible. Unlike ``Request.meta``, ``Request`` +attributes do affect request fingerprints in Scrapy versions older than Scrapy +2.7. + +For requests that must have the same ``Request`` attributes but should still +be considered different, such as browser-based requests with different URL +fragments, you can set ``dont_filter`` to ``True`` on ``Request.meta`` to +prevent the duplicate filter of Scrapy to filter any of them out. For example: + +.. code-block:: python + + yield Request( + "https://toscrape.com#1", + meta={"zyte_api_automap": {"browserHtml": True}}, + dont_filter=True, + ) + yield Request( + "https://toscrape.com#2", + meta={"zyte_api_automap": {"browserHtml": True}}, + dont_filter=True, + ) + +Note, however, that for other Scrapy components, like the HTTP cache +extensions, these 2 requests would still be considered identical. + + +Logging request parameters +========================== + +Set the ``ZYTE_API_LOG_REQUESTS`` setting to ``True`` and the ``LOG_LEVEL`` +setting to ``"DEBUG"`` to enable the logging of debug messages that indicate +the JSON object sent on every extract request to Zyte API. + +For example:: + + Sending Zyte API extract request: {"url": "https://example.com", "httpResponseBody": true} + +The ``ZYTE_API_LOG_REQUESTS_TRUNCATE``, 64 by default, determines the maximum +length of any string value in the logged JSON object, excluding object keys. To +disable truncation, set it to 0. 
+ +scrapy-poet integration +======================= + +``scrapy-zyte-api`` includes a `scrapy-poet provider`_ that you can use to get +data from Zyte API in page objects. It requires additional dependencies which +you can get by installing the optional ``provider`` feature: +``pip install scrapy-zyte-api[provider]``. Enable the provider in the Scrapy +settings:: + + SCRAPY_POET_PROVIDERS = { + "scrapy_zyte_api.providers.ZyteApiProvider": 1100, + } + +Request some supported dependencies in the page object:: + + @attrs.define + class ProductPage(BasePage): + response: BrowserResponse + product: Product + + + class ZyteApiSpider(scrapy.Spider): + ... + + def parse_page(self, response: DummyResponse, page: ProductPage): + ... + +Or request them directly in the callback:: + + class ZyteApiSpider(scrapy.Spider): + ... + + def parse_page(self, + response: DummyResponse, + browser_response: BrowserResponse, + product: Product, + ): + ... + +The currently supported dependencies are: + +* ``web_poet.BrowserHtml`` +* ``web_poet.BrowserResponse`` +* ``zyte_common_items.Product`` +* ``zyte_common_items.ProductList`` +* ``zyte_common_items.ProductNavigation`` +* ``zyte_common_items.Article`` +* ``zyte_common_items.ArticleList`` +* ``zyte_common_items.ArticleNavigation`` + +The provider will make a request to Zyte API using the ``ZYTE_API_KEY`` and +``ZYTE_API_URL`` settings. + +The provider will ignore the transparent mode and parameter mapping settings. +To add extra parameters to all Zyte API requests sent by the provider, set them +as a dictionary through the ``ZYTE_API_PROVIDER_PARAMS`` setting, for example +in ``settings.py``:: + + ZYTE_API_PROVIDER_PARAMS = {"geolocation": "IE"} + +When the ``ZYTE_API_PROVIDER_PARAMS`` setting includes one of the Zyte API +extraction options (e.g. ``productOptions`` for ``product``), but the +final Zyte API request doesn't include the corresponding data type, the +unused options are automatically removed. 
So, it's safe to use +``ZYTE_API_PROVIDER_PARAMS`` to set the default options for various extraction +types, e.g.:: + + ZYTE_API_PROVIDER_PARAMS = { + "productOptions": {"extractFrom": "httpResponseBody"}, + "productNavigationOptions": {"extractFrom": "httpResponseBody"}, + } + +Note that the built-in ``scrapy_poet.page_input_providers.ItemProvider`` has a +priority of 2000, so when you have page objects producing +``zyte_common_items.Product`` items you should use higher values for +``ZyteApiProvider`` if you want these items to come from these page objects, +and lower values if you want them to come from Zyte API. + +Currently, when ``ItemProvider`` is used together with ``ZyteApiProvider``, +it may make more requests than is optimal: the normal Scrapy response will be +always requested even when using a ``DummyResponse`` annotation, and in some +dependency combinations two Zyte API requests will be made for the same page. +We are planning to solve these problems in the future releases of +``scrapy-poet`` and ``scrapy-zyte-api``. + +.. _scrapy-poet provider: https://scrapy-poet.readthedocs.io/en/stable/providers.html + + +Running behind a proxy +====================== + +If you require a proxy to access Zyte API (e.g. a corporate proxy), configure +the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and +set the ``ZYTE_API_USE_ENV_PROXY`` setting to ``True``. +======= +* Documentation: https://scrapy-zyte-api.readthedocs.io/en/latest/ +* License: BSD 3-clause +>>>>>>> scrapy-plugins/main diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py new file mode 100644 index 00000000..41814277 --- /dev/null +++ b/docs/_ext/__init__.py @@ -0,0 +1,66 @@ +import re + +from docutils import nodes +from docutils.parsers.rst.roles import set_classes + + +def http_api_reference_role( + name, rawtext, text, lineno, inliner, options={}, content=[] +): + match = re.search( + r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text + ) + if match: + display_text = match[1] + reference = match[2] + else: + display_text = None + reference = text + if reference.startswith("request:"): + request_or_response = "request" + elif reference.startswith("response:"): + request_or_response = "response/200" + else: + raise ValueError( + f":http: directive reference must start with request: or " + f"response:, got {reference} from {text!r}." 
+ ) + + field = reference.split(":", maxsplit=1)[1] + if not display_text: + display_text = field + refuri = ( + f"https://docs.zyte.com/zyte-api/usage/reference.html" + f"#operation/extract/{request_or_response}/{field}" + ) + set_classes(options) + node = nodes.reference(rawtext, display_text, refuri=refuri, **options) + return [node], [] + + +def setup(app): + app.add_role("http", http_api_reference_role) + # https://stackoverflow.com/a/13663325 + # + # Scrapy’s + # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 + app.add_crossref_type( + directivename="setting", + rolename="setting", + indextemplate="pair: %s; setting", + ) + app.add_crossref_type( + directivename="signal", + rolename="signal", + indextemplate="pair: %s; signal", + ) + app.add_crossref_type( + directivename="command", + rolename="command", + indextemplate="pair: %s; command", + ) + app.add_crossref_type( + directivename="reqmeta", + rolename="reqmeta", + indextemplate="pair: %s; reqmeta", + ) diff --git a/docs/changes.rst b/docs/changes.rst new file mode 100644 index 00000000..d9e113ec --- /dev/null +++ b/docs/changes.rst @@ -0,0 +1 @@ +.. 
include:: ../CHANGES.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..c91fd871 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,58 @@ +import sys +from pathlib import Path + +project = "scrapy-zyte-api" +copyright = "2023, Zyte Group Ltd" +author = "Zyte Group Ltd" +release = "0.12.2" + +sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext +extensions = [ + "_ext", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", +] + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +html_theme = "sphinx_rtd_theme" + +intersphinx_mapping = { + "python": ( + "https://docs.python.org/3", + None, + ), + "python-zyte-api": ( + "https://python-zyte-api.readthedocs.io/en/stable", + None, + ), + "scrapy": ( + "https://docs.scrapy.org/en/latest", + None, + ), + "scrapy-poet": ( + "https://scrapy-poet.readthedocs.io/en/stable", + None, + ), + "tenacity": ( + "https://tenacity.readthedocs.io/en/latest", + None, + ), + "w3lib": ( + "https://w3lib.readthedocs.io/en/latest", + None, + ), + "web-poet": ( + "https://web-poet.readthedocs.io/en/stable", + None, + ), + "zyte": ( + "https://docs.zyte.com", + None, + ), + "zyte-common-items": ( + "https://zyte-common-items.readthedocs.io/en/latest", + None, + ), +} diff --git a/docs/first-steps/scrapy-poet.rst b/docs/first-steps/scrapy-poet.rst new file mode 100644 index 00000000..2b4c6db7 --- /dev/null +++ b/docs/first-steps/scrapy-poet.rst @@ -0,0 +1,25 @@ +.. _scrapy-poet-setup: + +================= +scrapy-poet setup +================= + +For :ref:`scrapy-poet integration `: + +- Install or reinstall ``scrapy-zyte-api`` with the ``provider`` extra to + install additional required dependencies: + + .. code-block:: shell + + pip install scrapy-zyte-api[provider] + +- Add the following provider to the ``SCRAPY_POET_PROVIDERS`` setting: + + .. 
code-block:: python + + SCRAPY_POET_PROVIDERS = { + "scrapy_zyte_api.providers.ZyteApiProvider": 1100, + } + +You can now :ref:`use scrapy-poet ` to get data from Zyte API in +page objects. diff --git a/docs/first-steps/setup.rst b/docs/first-steps/setup.rst new file mode 100644 index 00000000..33e9f7c4 --- /dev/null +++ b/docs/first-steps/setup.rst @@ -0,0 +1,95 @@ +.. _setup: + +============= +Initial setup +============= + +Learn how to get scrapy-zyte-api installed and configured on an existing +:doc:`Scrapy ` project. + +.. tip:: :ref:`Zyte’s web scraping tutorial ` covers + scrapy-zyte-api setup as well. + +Requirements +============ + +You need at least: + +- A :ref:`Zyte API ` subscription (there’s a :ref:`free trial + `). + +- Python 3.7+ + +- Scrapy 2.0.1+ + +:doc:`scrapy-poet ` integration requires higher versions: + +- Python 3.8+ + +- Scrapy 2.6+ + + +Installation +============ + +.. code-block:: shell + + pip install scrapy-zyte-api + + +Configuration +============= + +Add your `Zyte API key`_, and add it to your project ``settings.py``: + +.. _Zyte API key: https://app.zyte.com/o/zyte-api/api-access + +.. code-block:: python + + ZYTE_API_KEY = "YOUR_API_KEY" + +Alternatively, you can set your API key in the ``ZYTE_API_KEY`` environment +variable instead. + +Then, set up scrapy-zyte-api integration in ``settings.py``: + +.. code-block:: python + + DOWNLOAD_HANDLERS = { + "http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", + "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", + } + DOWNLOADER_MIDDLEWARES = { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, + } + REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter" + TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" + +By default, scrapy-zyte-api doesn't change the spider behavior. To switch your +spider to use Zyte API for all requests, set the following setting as well: + +.. 
code-block:: python + + ZYTE_API_TRANSPARENT_MODE = True + +If you already had a custom value for :setting:`REQUEST_FINGERPRINTER_CLASS +`, set that value on +:ref:`ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS` instead. + +If you had a different value for :setting:`TWISTED_REACTOR +` or no value at all, you will be changing the Twisted +reactor that your Scrapy project uses, and your existing code may need changes, +such as: + +- :ref:`asyncio-preinstalled-reactor`. + + Some Twisted imports install the default, non-asyncio Twisted + reactor as a side effect. Once a reactor is installed, it cannot be + changed for the whole run time. + +- :ref:`asyncio-await-dfd`. + + Note that you might be using Deferreds without realizing it through + some Scrapy functions and methods. For example, when you yield the + return value of ``self.crawler.engine.download()`` from a spider + callback, you are yielding a Deferred. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..3834977f --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,49 @@ +============================= +scrapy-zyte-api documentation +============================= + +.. include:: ../README.rst + :start-after: .. description starts + :end-before: .. description ends + +After the :ref:`initial setup `, you can use Zyte API automatically, +either :ref:`globally ` or :ref:`per request `, or +:ref:`manually per request `. + +.. toctree:: + :caption: First steps + :hidden: + + first-steps/setup + first-steps/scrapy-poet + +.. toctree:: + :caption: Usage + :hidden: + + usage/transparent + usage/manual + usage/automap + usage/default + usage/retry + usage/scrapy-poet + usage/stats + usage/fingerprint + usage/proxy + +.. toctree:: + :caption: Reference + :hidden: + + reference/request + reference/response + reference/settings + reference/meta + reference/inputs + reference/fingerprint-params + +.. 
toctree:: + :caption: All the rest + :hidden: + + changes diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..954237b9 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/reference/fingerprint-params.rst b/docs/reference/fingerprint-params.rst new file mode 100644 index 00000000..83e8f428 --- /dev/null +++ b/docs/reference/fingerprint-params.rst @@ -0,0 +1,36 @@ +.. _fingerprint-params: + +================================= +Request fingerprinting parameters +================================= + +The request fingerprinter class of scrapy-zyte-api generates request +fingerprints for Zyte API requests based on the following Zyte API parameters: + +- :http:`request:url` (:func:`canonicalized `) + + For URLs that include a URL fragment, like ``https://example.com#foo``, URL + canonicalization keeps the URL fragment if :http:`request:browserHtml` or + :http:`request:screenshot` are enabled. 
+ +- Request attribute parameters (:http:`request:httpRequestBody`, + :http:`request:httpRequestMethod`) + +- Output parameters (:http:`request:browserHtml`, + :http:`request:httpResponseBody`, :http:`request:httpResponseHeaders`, + :http:`request:screenshot`) + +- Rendering option parameters (:http:`request:actions`, + :http:`request:javascript`, :http:`request:screenshotOptions`) + +- :http:`request:geolocation` + +The following Zyte API parameters are *not* taken into account for request +fingerprinting: + +- Request header parameters (:http:`request:customHttpRequestHeaders`, + :http:`request:requestHeaders`, :http:`request:requestCookies`) + +- Metadata parameters (:http:`request:echoData`, :http:`request:jobId`) + +- Experimental parameters (:http:`request:experimental`) diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst new file mode 100644 index 00000000..e5fbe3a4 --- /dev/null +++ b/docs/reference/inputs.rst @@ -0,0 +1,26 @@ +.. _inputs: + +====== +Inputs +====== + +:ref:`scrapy-poet integration `, once :ref:`set up +`, allows obtaining the following :ref:`inputs +` from :doc:`web-poet ` and +:doc:`zyte-common-items ` through Zyte API: + +- :class:`web_poet.BrowserHtml` + +- :class:`web_poet.BrowserResponse` + +- :class:`zyte_common_items.Article` + +- :class:`zyte_common_items.ArticleList` + +- :class:`zyte_common_items.ArticleNavigation` + +- :class:`zyte_common_items.Product` + +- :class:`zyte_common_items.ProductList` + +- :class:`zyte_common_items.ProductNavigation` diff --git a/docs/reference/meta.rst b/docs/reference/meta.rst new file mode 100644 index 00000000..3ba83b57 --- /dev/null +++ b/docs/reference/meta.rst @@ -0,0 +1,60 @@ +.. _meta: + +================= +Request.meta keys +================= + +Keys that can be defined in :attr:`Request.meta ` for +scrapy-zyte-api. + +.. _zyte_api: + +zyte_api +======== + +Default: ``False`` + +See :ref:`manual`. + + +.. 
_zyte_api_automap: + +zyte_api_automap +================ + +Default: :ref:`ZYTE_API_TRANSPARENT_MODE` (``False``) + +See :ref:`automap`. + + +.. _zyte_api_default_params_meta: + +zyte_api_default_params +======================= + +Default: ``True`` + +If set to ``False``, the values of :ref:`ZYTE_API_AUTOMAP_PARAMS` and +:ref:`ZYTE_API_DEFAULT_PARAMS` are ignored for this request. + + +.. _zyte_api_retry_policy_meta: + +zyte_api_retry_policy +===================== + +Default: :ref:`ZYTE_API_RETRY_POLICY` +(:data:`zyte_api.aio.retry.zyte_api_retrying`) + +Determines the retry policy for Zyte API requests used to fulfill this request. + +It must be a :class:`tenacity.AsyncRetrying` subclass or its import path as a +string. + +.. note:: If you need your request to be serializable, e.g. to use + :class:`~scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware`, you + must specify the import path of your retry policy class as a string, + because `retry policies are not serializable + `_. + +See :ref:`retry`. diff --git a/docs/reference/request.rst b/docs/reference/request.rst new file mode 100644 index 00000000..4f7847dd --- /dev/null +++ b/docs/reference/request.rst @@ -0,0 +1,180 @@ +.. _request: + +=============== +Request mapping +=============== + +When you enable automatic request parameter mapping, be it through +:ref:`transparent mode ` or :ref:`for a specific request +`, some Zyte API parameters are :ref:`chosen automatically for you +`, and you can then :ref:`change them further +` if you wish. + +.. _request-automatic: + +Automatic mapping +================= + +- :attr:`Request.url ` becomes :http:`request:url`, + same as in :ref:`requests with manual parameters `. + +- If :attr:`Request.method ` is something other + than ``"GET"``, it becomes :http:`request:httpRequestMethod`. + +- :attr:`Request.headers ` become + :http:`request:customHttpRequestHeaders`. + +- :attr:`Request.body ` becomes + :http:`request:httpRequestBody`. 
+ +- If the :setting:`COOKIES_ENABLED ` is ``True`` + (default), and :attr:`Request.meta ` does not set + :reqmeta:`dont_merge_cookies ` to ``True``: + + - :http:`request:responseCookies` becomes ``True``. + + - Cookies from the :reqmeta:`cookiejar ` become + :http:`request:requestCookies`. + + All cookies from the cookie jar are set, regardless of their cookie + domain. This is because Zyte API requests may involve requests to + different domains (e.g. when following cross-domain redirects, or + during browser rendering). + + See also: :ref:`ZYTE_API_MAX_COOKIES`, + :ref:`ZYTE_API_COOKIE_MIDDLEWARE`. + +- :http:`request:httpResponseBody` and :http:`request:httpResponseHeaders` + are set to ``True``. + + This is subject to change without prior notice in future versions of + scrapy-zyte-api, so please account for the following: + + - If you are requesting a binary resource, such as a PDF file or an + image file, set :http:`request:httpResponseBody` to ``True`` explicitly + in your requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/img/zyte.png", + meta={ + "zyte_api_automap": {"httpResponseBody": True}, + }, + ) + + In the future, we may stop setting :http:`request:httpResponseBody` to + ``True`` by default, and instead use a different, new Zyte API + parameter that only works for non-binary responses (e.g. HMTL, JSON, + plain text). + + - If you need to access response headers, be it through + :attr:`response.headers ` + or through + :attr:`response.raw_api_response["httpResponseHeaders"] `, + set :http:`request:httpResponseHeaders` to ``True`` explicitly in your + requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/", + meta={ + "zyte_api_automap": {"httpResponseHeaders": True}, + }, + ) + + At the moment scrapy-zyte-api requests response headers because some + response headers are necessary to properly decode the response body as + text. 
In the future, Zyte API may be able to handle this decoding
+      automatically, so scrapy-zyte-api would stop setting
+      :http:`request:httpResponseHeaders` to ``True`` by default.
+
+For example, the following Scrapy request:
+
+.. code-block:: python
+
+    Request(
+        method="POST",
+        url="https://httpbin.org/anything",
+        headers={"Content-Type": "application/json"},
+        body=b'{"foo": "bar"}',
+        cookies={"a": "b"},
+    )
+
+Results in a request to the Zyte API data extraction endpoint with the
+following parameters:
+
+.. code-block:: javascript
+
+    {
+        "customHttpRequestHeaders": [
+            {
+                "name": "Content-Type",
+                "value": "application/json"
+            }
+        ],
+        "httpResponseBody": true,
+        "httpResponseHeaders": true,
+        "httpRequestBody": "eyJmb28iOiAiYmFyIn0=",
+        "httpRequestMethod": "POST",
+        "requestCookies": [
+            {
+                "name": "a",
+                "value": "b",
+                "domain": ""
+            }
+        ],
+        "responseCookies": true,
+        "url": "https://httpbin.org/anything"
+    }
+
+Header mapping
+==============
+
+When mapping headers, headers not supported by Zyte API are excluded from the
+mapping by default.
+
+Use :ref:`ZYTE_API_SKIP_HEADERS` and :ref:`ZYTE_API_BROWSER_HEADERS` to change
+which headers are included or excluded from header mapping.
+
+
+.. _request-unsupported:
+
+Unsupported scenarios
+=====================
+
+To maximize support for potential future changes in Zyte API, automatic
+request parameter mapping allows some parameter values and parameter
+combinations that Zyte API does not currently support, and may never support:
+
+- :attr:`Request.method <scrapy.http.Request.method>` becomes
+  :http:`request:httpRequestMethod` even for unsupported
+  :http:`request:httpRequestMethod` values, and even if
+  :http:`request:httpResponseBody` is unset.
+
+- You can set :http:`request:customHttpRequestHeaders` or
+  :http:`request:requestHeaders` to ``True`` to force their mapping from
+  :attr:`Request.headers <scrapy.http.Request.headers>` in scenarios where
+  they would not be mapped otherwise. 
+ + Conversely, you can set :http:`request:customHttpRequestHeaders` or + :http:`request:requestHeaders` to ``False`` to prevent their mapping from + :attr:`Request.headers `. + +- :attr:`Request.body ` becomes + :http:`request:httpRequestBody` even if :http:`request:httpResponseBody` is + unset. + +- You can set :http:`request:httpResponseBody` to ``False`` (which unsets the + parameter), and not set :http:`request:browserHtml` or + :http:`request:screenshot` to ``True``. In this case, + :attr:`Request.headers ` is mapped as + :http:`request:requestHeaders`. + +- You can set :http:`request:httpResponseBody` to ``True`` and also set + :http:`request:browserHtml` or :http:`request:screenshot` to ``True``. In + this case, :attr:`Request.headers ` is mapped + both as :http:`request:customHttpRequestHeaders` and as + :http:`request:requestHeaders`, and :http:`request:browserHtml` is used as + :class:`response.body `. diff --git a/docs/reference/response.rst b/docs/reference/response.rst new file mode 100644 index 00000000..04c7db73 --- /dev/null +++ b/docs/reference/response.rst @@ -0,0 +1,150 @@ +.. _response: + +================ +Response mapping +================ + +.. _response-parameters: + +Parameters +========== + +Zyte API response parameters are mapped into :ref:`response class +` attributes where possible: + +- :http:`response:url` becomes :class:`response.url + `. + +- :http:`response:statusCode` becomes :class:`response.status + `. + +- :http:`response:httpResponseHeaders` and + :http:`response:responseCookies` become :class:`response.headers + `. + +- :http:`response:responseCookies` is also mapped into the request + :reqmeta:`cookiejar `. + +- :http:`response:browserHtml` and :http:`response:httpResponseBody` are + mapped into both + :class:`response.text ` + and + :class:`response.body `. + + If none of these parameters were present, e.g. 
if the only requested output + was :http:`response:screenshot`, + :class:`response.text ` + and + :class:`response.body ` + would be empty. + + If a future version of Zyte API supported requesting both outputs on the + same request, and both parameters were present, + :http:`response:browserHtml` would be the one mapped into + :class:`response.text ` + and + :class:`response.body `. + +Both :ref:`response classes ` have a +:class:`response.raw_api_response ` +attribute that contains a :class:`dict` with the complete, raw response from +Zyte API, where you can find all Zyte API response parameters, including those +that are not mapped into other response class attributes. + +For example, for a request for :http:`response:httpResponseBody` and +:http:`response:httpResponseHeaders`, you would get: + +.. code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {b"Content-Type": [b"text/html"], …} + print(response.text) + # "…" + print(response.body) + # b"…" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", + # "httpResponseHeaders": […], + # } + +For a request for :http:`response:screenshot`, on the other hand, the response +would look as follows: + +.. code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {} + print(response.text) + # "" + print(response.body) + # b"" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "screenshot": "iVBORw0KGgoAAAANSUh…", + # } + from base64 import b64decode + print(b64decode(response.raw_api_response["screenshot"])) + # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' + + +.. 
_response-classes: + +Classes +======= + +Zyte API responses are mapped with one of the following classes: + +- :class:`~scrapy_zyte_api.responses.ZyteAPITextResponse` is used to map text + responses, i.e. responses with :http:`response:browserHtml` or responses + with both :http:`response:httpResponseBody` and + :http:`response:httpResponseHeaders` with a text body (e.g. plain text, + HTML, JSON). + +- :class:`~scrapy_zyte_api.responses.ZyteAPIResponse` is used to map any + other response. + +.. autoclass:: scrapy_zyte_api.responses.ZyteAPIResponse + :show-inheritance: + + .. autoattribute:: url + + .. autoattribute:: status + + .. autoattribute:: headers + + .. attribute:: body + :type: bytes + + .. autoattribute:: raw_api_response + +.. autoclass:: scrapy_zyte_api.responses.ZyteAPITextResponse + :show-inheritance: + + .. autoattribute:: url + + .. autoattribute:: status + + .. autoattribute:: headers + + .. attribute:: body + :type: bytes + + .. attribute:: text + :type: str + + .. autoattribute:: raw_api_response diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst new file mode 100644 index 00000000..602cdfe9 --- /dev/null +++ b/docs/reference/settings.rst @@ -0,0 +1,256 @@ +.. _settings: + +======== +Settings +======== + +:ref:`Settings ` for scrapy-zyte-api. + +.. _ZYTE_API_AUTOMAP_PARAMS: + +ZYTE_API_AUTOMAP_PARAMS +======================= + +Default: ``{}`` + +:class:`dict` of parameters to be combined with :ref:`automatic request +parameters `. + +These parameters are merged with :ref:`zyte_api_automap` parameters. +:ref:`zyte_api_automap` parameters take precedence. + +This setting has no effect on requests with :ref:`manual request parameters +`. + +When using :ref:`transparent mode `, be careful of which +parameters you define in this setting. 
In transparent mode, all Scrapy requests +go through Zyte API, even requests that Scrapy sends automatically, such as +those for ``robots.txt`` files when :setting:`ROBOTSTXT_OBEY +` is ``True``, or those for sitemaps when using +:class:`~scrapy.spiders.SitemapSpider`. Certain parameters, like +:http:`request:browserHtml` or :http:`request:screenshot`, are not meant to be +used for every single request. + +If :ref:`zyte_api_default_params ` in +:attr:`Request.meta ` is set to ``False``, this +setting is ignored for that request. + +See :ref:`default`. + + +.. _ZYTE_API_BROWSER_HEADERS: + +ZYTE_API_BROWSER_HEADERS +======================== + +Default: ``{"Referer": "referer"}`` + +Determines headers that *can* be mapped as :http:`request:requestHeaders`. + +It is a :class:`dict`, where keys are header names and values are the key that +represents them in :http:`request:requestHeaders`. + + +.. _ZYTE_API_COOKIE_MIDDLEWARE: + +ZYTE_API_COOKIE_MIDDLEWARE +========================== + +Default: :class:`scrapy.downloadermiddlewares.cookies.CookiesMiddleware` + +If you are using a custom downloader middleware to handle request cookie jars, +you can point this setting to its import path to make scrapy-zyte-api work with +it. + +Your cookie downloader middleware must have a ``jars`` property with the same +signature as in the built-in Scrapy downloader middleware for cookie handling. + + +.. _ZYTE_API_DEFAULT_PARAMS: + +ZYTE_API_DEFAULT_PARAMS +======================= + +Default: ``{}`` + +:class:`dict` of parameters to be combined with :ref:`manual request parameters +`. + +You may set :ref:`zyte_api` to an empty :class:`dict` to only use the +parameters defined here for that request. + +These parameters are merged with :ref:`zyte_api` parameters. :ref:`zyte_api` +parameters take precedence. + +This setting has no effect on requests with :ref:`automatic request parameters +`. 
+ +If :ref:`zyte_api_default_params ` in +:attr:`Request.meta ` is set to ``False``, this +setting is ignored for that request. + +See :ref:`default`. + + +.. _ZYTE_API_ENABLED: + +ZYTE_API_ENABLED +================ + +Default: ``True`` + +Can be set to ``False`` to disable scrapy-zyte-api. + + +.. _ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS: + +ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS +============================================= + +Default: :class:`scrapy.utils.request.RequestFingerprinter` + +:ref:`Request fingerprinter ` to for requests that do not +go through Zyte API. See :ref:`fingerprint`. + + +.. _ZYTE_API_KEY: + +ZYTE_API_KEY +============ + +Default: ``None`` + +Your `Zyte API key`_. + +.. _Zyte API key: https://app.zyte.com/o/zyte-api/api-access + +You can alternatively define an environment variable with the same name. + +.. tip:: On :ref:`Scrapy Cloud `, this setting is defined + automatically. + + +.. _ZYTE_API_LOG_REQUESTS: + +ZYTE_API_LOG_REQUESTS +===================== + +Default: ``False`` + +Set this to ``True`` and :setting:`LOG_LEVEL ` to ``"DEBUG"`` +to enable the logging of debug messages that indicate the JSON object sent on +every Zyte API request. + +For example:: + + Sending Zyte API extract request: {"url": "https://example.com", "httpResponseBody": true} + +See also: :ref:`ZYTE_API_LOG_REQUESTS_TRUNCATE`. + + +.. _ZYTE_API_LOG_REQUESTS_TRUNCATE: + +ZYTE_API_LOG_REQUESTS_TRUNCATE +============================== + +Default: ``64`` + +Determines the maximum length of any string value in the JSON object logged +when :ref:`ZYTE_API_LOG_REQUESTS` is enabled, excluding object keys. + +To disable truncation, set this to ``0``. + + +.. _ZYTE_API_MAX_COOKIES: + +ZYTE_API_MAX_COOKIES +==================== + +Default: ``100`` + +If the cookies to be set during :ref:`request mapping ` exceed this +limit, a warning is logged, and only as many cookies as the limit allows are +set for the target request. 
+ +To silence this warning, set :http:`request:requestCookies` manually, e.g. to +an empty :class:`dict`. + +Alternatively, if :http:`request:requestCookies` starts supporting more than +100 cookies, update this setting accordingly. + + +.. _ZYTE_API_MAX_REQUESTS: + +ZYTE_API_MAX_REQUESTS +===================== + +Default: ``None`` + +When set to an integer value > 0, the spider will close when the number of Zyte +API requests reaches it. + +Note that requests with error responses that cannot be retried or exceed their +retry limit also count here. + + +.. _ZYTE_API_PROVIDER_PARAMS: + +ZYTE_API_PROVIDER_PARAMS +======================== + +Default: ``{}`` + +Defines additional request parameters to use in Zyte API requests sent by the +:ref:`scrapy-poet integration `. + + +.. _ZYTE_API_RETRY_POLICY: + +ZYTE_API_RETRY_POLICY +===================== + +Default: ``"zyte_api.aio.retry.zyte_api_retrying"`` + +Determines the retry policy for Zyte API requests. + +It must be a string with the import path of a :class:`tenacity.AsyncRetrying` +subclass. + +.. note:: :ref:`Settings ` must be :mod:`picklable `, + and `retry policies are not `_, + so you cannot assign a retry policy class directly to this setting, you + must use their import path as a string instead. + +See :ref:`retry`. + + +.. _ZYTE_API_SKIP_HEADERS: + +ZYTE_API_SKIP_HEADERS +===================== + +Default: ``["User-Agent"]`` + +Determines headers that must *not* be mapped as +:http:`request:customHttpRequestHeaders`. + + +.. _ZYTE_API_TRANSPARENT_MODE: + +ZYTE_API_TRANSPARENT_MODE +========================= + +Default: ``False`` + +See :ref:`transparent`. + + +.. _ZYTE_API_USE_ENV_PROXY: + +ZYTE_API_USE_ENV_PROXY +====================== + +Default: ``False`` + +Set to ``True`` to make Zyte API requests respect system proxy settings. See +:ref:`proxy`. 
diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..c71ccc8d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +Sphinx==7.2.6 +sphinx-rtd-theme==1.3.0 diff --git a/docs/usage/automap.rst b/docs/usage/automap.rst new file mode 100644 index 00000000..839e2570 --- /dev/null +++ b/docs/usage/automap.rst @@ -0,0 +1,78 @@ +.. _automap: + +============================ +Automatic request parameters +============================ + +To send a Scrapy request through Zyte API letting Zyte API request parameters +be automatically chosen based on the parameters of that Scrapy request, set the +:ref:`zyte_api_automap` key in :attr:`Request.meta ` +to ``True``. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api_automap": True, + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +In :ref:`transparent mode `, :ref:`zyte_api_automap` is ``True`` +by default. + +See :ref:`request` to learn how exactly request parameters are mapped when +using automatic request parameters. + + +.. _request-change: + +Changing parameters +=================== + +You may set :ref:`zyte_api_automap` in :attr:`Request.meta +` to a :class:`dict` of Zyte API parameters to add, +modify, or remove (by setting to ``False``) automatic request parameters. This +also works in :ref:`transparent mode `. + +Enabling :http:`request:browserHtml`, :http:`request:screenshot`, or an +automatic extraction property, unsets :http:`request:httpResponseBody` and +:http:`request:httpResponseHeaders`, and makes ``Request.headers`` become +:http:`request:requestHeaders` instead of +:http:`request:customHttpRequestHeaders`. For example, the following Scrapy +request: + +.. 
code-block:: python + + Request( + url="https://quotes.toscrape.com", + headers={"Referer": "https://example.com/"}, + meta={"zyte_api_automap": {"browserHtml": True}}, + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. code-block:: javascript + + { + "browserHtml": true, + "responseCookies": true, + "requestHeaders": {"referer": "https://example.com/"}, + "url": "https://quotes.toscrape.com" + } + +See also: :ref:`request-unsupported`. \ No newline at end of file diff --git a/docs/usage/default.rst b/docs/usage/default.rst new file mode 100644 index 00000000..582e3617 --- /dev/null +++ b/docs/usage/default.rst @@ -0,0 +1,22 @@ +.. _default: + +================== +Default parameters +================== + +Often the same configuration needs to be used for all Zyte API requests. For +example, all requests may need to set the same :http:`request:geolocation`, or +the spider only uses :http:`request:browserHtml` requests. + +The following settings allow you to define Zyte API parameters to be included +in all requests: + +- :ref:`ZYTE_API_AUTOMAP_PARAMS`, for :ref:`transparent mode ` + and :ref:`automatic request parameters `. + +- :ref:`ZYTE_API_DEFAULT_PARAMS`, for :ref:`manual request parameters + `. + +For example, if you set :ref:`ZYTE_API_DEFAULT_PARAMS` to +``{"geolocation": "US"}`` and :ref:`zyte_api` to ``{"browserHtml": True}``, +``{"url: "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. diff --git a/docs/usage/fingerprint.rst b/docs/usage/fingerprint.rst new file mode 100644 index 00000000..0a19ccd3 --- /dev/null +++ b/docs/usage/fingerprint.rst @@ -0,0 +1,57 @@ +.. _fingerprint: + +Request fingerprinting +====================== + +The request fingerprinter class of scrapy-zyte-api ensures that Scrapy 2.7 and +later generate unique :ref:`request fingerprints ` for +Zyte API requests :ref:`based on some of their parameters +`. 
+
+For example, a request for :http:`request:browserHtml` and a request for
+:http:`request:screenshot` with the same target URL are considered different
+requests. Similarly, requests with the same target URL but different
+:http:`request:actions` are also considered different requests.
+
+Use :ref:`ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS` to define custom
+request fingerprinting for requests that do not go through Zyte API.
+
+
+Request fingerprinting before Scrapy 2.7
+----------------------------------------
+
+If you have a Scrapy version older than Scrapy 2.7, Zyte API parameters are not
+taken into account for request fingerprinting. This can cause some Scrapy
+components, like the filter of duplicate requests or the HTTP cache extension,
+to interpret 2 different requests as being the same.
+
+To avoid most issues, use :ref:`automatic request parameters `, either
+through :ref:`transparent mode ` or setting
+:ref:`zyte_api_automap` to ``True`` in :attr:`Request.meta
+`, and then use :class:`~scrapy.http.Request`
+attributes instead of :attr:`Request.meta ` as much
+as possible. Unlike :attr:`Request.meta `,
+:class:`~scrapy.http.Request` attributes do affect request fingerprints in
+Scrapy versions older than Scrapy 2.7.
+
+For requests that must have the same :class:`~scrapy.http.Request` attributes
+but should still be considered different, such as browser-based requests with
+different URL fragments, you can set ``dont_filter=True`` when creating your
+request to prevent the duplicate filter of Scrapy from filtering any of them
+out. For example:
+
+.. code-block:: python
+
+    yield Request(
+        "https://toscrape.com#1",
+        meta={"zyte_api_automap": {"browserHtml": True}},
+        dont_filter=True,
+    )
+    yield Request(
+        "https://toscrape.com#2",
+        meta={"zyte_api_automap": {"browserHtml": True}},
+        dont_filter=True,
+    )
+
+Note, however, that for other Scrapy components, like the HTTP cache
+extension, these 2 requests would still be considered identical.
diff --git a/docs/usage/manual.rst b/docs/usage/manual.rst new file mode 100644 index 00000000..8293caf2 --- /dev/null +++ b/docs/usage/manual.rst @@ -0,0 +1,67 @@ +.. _manual: + +========================= +Manual request parameters +========================= + +To send a Scrapy request through Zyte API with manually-defined Zyte API +request parameters, define your parameters in the :ref:`zyte_api` key in +:attr:`Request.meta ` as a :class:`dict`. + +The only exception is the :http:`request:url` parameter, which should not be +defined as a Zyte API parameter. The value from :attr:`Request.url +` is used automatically. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "browserHtml": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +Note that response headers are necessary for raw response decoding. When +defining parameters manually and requesting :http:`request:httpResponseBody`, +remember to also request :http:`request:httpResponseHeaders`: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +To learn more about Zyte API parameters, see the upstream :ref:`usage +` and :ref:`API reference ` pages. diff --git a/docs/usage/proxy.rst b/docs/usage/proxy.rst new file mode 100644 index 00000000..a14731a9 --- /dev/null +++ b/docs/usage/proxy.rst @@ -0,0 +1,9 @@ +.. _proxy: + +============= +Using a proxy +============= + +If you need a proxy to access Zyte API (e.g. 
a corporate proxy), configure +the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and +set the :ref:`ZYTE_API_USE_ENV_PROXY` setting to ``True``. diff --git a/docs/usage/retry.rst b/docs/usage/retry.rst new file mode 100644 index 00000000..f497c82f --- /dev/null +++ b/docs/usage/retry.rst @@ -0,0 +1,69 @@ +.. _retry: + +Retries +======= + +API requests are retried automatically using the default retry policy of +:doc:`python-zyte-api `. + +API requests that exceed retries are dropped. You cannot manage API request +retries through :ref:`downloader middlewares `. + +Use the :ref:`ZYTE_API_RETRY_POLICY` setting or the :ref:`zyte_api_retry_policy +` :attr:`Request.meta ` +key to override the default retry policy with a custom retry policy. + +For example, to increase the maximum number of retries to 10 before dropping +the API request, you can subclass :class:`~zyte_api.aio.retry.RetryFactory` as +follows: + +.. code-block:: python + + # project/retry_policies.py + from tenacity import stop_after_attempt + from zyte_api.aio.retry import RetryFactory + + class CustomRetryFactory(RetryFactory): + temporary_download_error_stop = stop_after_attempt(10) + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + # project/settings.py + ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" + + +To extend this retry policy, so it will also retry HTTP 521 errors, the same +as HTTP 520 errors, you can implement: + +.. 
code-block:: python + + # project/retry_policies.py + from tenacity import retry_if_exception, RetryCallState, stop_after_attempt + from zyte_api.aio.errors import RequestError + from zyte_api.aio.retry import RetryFactory + + def is_http_521(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status == 521 + + class CustomRetryFactory(RetryFactory): + + retry_condition = ( + RetryFactory.retry_condition + | retry_if_exception(is_http_521) + ) + temporary_download_error_stop = stop_after_attempt(10) + + def wait(self, retry_state: RetryCallState) -> float: + if is_http_521(retry_state.outcome.exception()): + return self.temporary_download_error_wait(retry_state=retry_state) + return super().wait(retry_state) + + def stop(self, retry_state: RetryCallState) -> bool: + if is_http_521(retry_state.outcome.exception()): + return self.temporary_download_error_stop(retry_state) + return super().stop(retry_state) + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + # project/settings.py + ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" diff --git a/docs/usage/scrapy-poet.rst b/docs/usage/scrapy-poet.rst new file mode 100644 index 00000000..a043b7ea --- /dev/null +++ b/docs/usage/scrapy-poet.rst @@ -0,0 +1,67 @@ +.. _scrapy-poet: + +======================= +scrapy-poet integration +======================= + +After you :ref:`set up scrapy-poet integration `, you can +request :ref:`supported page inputs ` in your page objects:: + + @attrs.define + class ProductPage(BasePage): + response: BrowserResponse + product: Product + + + class ZyteApiSpider(scrapy.Spider): + ... + + def parse_page(self, response: DummyResponse, page: ProductPage): + ... + +Or request them directly in the callback:: + + class ZyteApiSpider(scrapy.Spider): + ... + + def parse_page(self, + response: DummyResponse, + browser_response: BrowserResponse, + product: Product, + ): + ... 
+
+Default parameters
+==================
+
+scrapy-poet integration ignores :ref:`default parameters `.
+
+To add extra parameters to all Zyte API requests sent by the provider, set them
+as a dictionary through the :ref:`ZYTE_API_PROVIDER_PARAMS` setting, for
+example in ``settings.py``::
+
+    ZYTE_API_PROVIDER_PARAMS = {"geolocation": "IE"}
+
+When :ref:`ZYTE_API_PROVIDER_PARAMS` includes one of the Zyte API extraction
+options (e.g. ``productOptions`` for ``product``), but the final Zyte API
+request doesn't include the corresponding data type, the unused options are
+automatically removed. So, it's safe to use :ref:`ZYTE_API_PROVIDER_PARAMS` to
+set the default options for various extraction types, e.g.::
+
+    ZYTE_API_PROVIDER_PARAMS = {
+        "productOptions": {"extractFrom": "httpResponseBody"},
+        "productNavigationOptions": {"extractFrom": "httpResponseBody"},
+    }
+
+Note that the built-in ``scrapy_poet.page_input_providers.ItemProvider`` has a
+priority of 2000, so when you have page objects producing
+:class:`zyte_common_items.Product` items you should use higher values for
+``ZyteApiProvider`` if you want these items to come from these page objects,
+and lower values if you want them to come from Zyte API.
+
+Currently, when ``ItemProvider`` is used together with ``ZyteApiProvider``,
+it may make more requests than is optimal: the normal Scrapy response will
+always be requested even when using a :class:`~scrapy_poet.DummyResponse`
+annotation, and in some dependency combinations two Zyte API requests will be
+made for the same page. We are planning to solve these problems in future
+releases of :doc:`scrapy-poet ` and scrapy-zyte-api.
diff --git a/docs/usage/stats.rst b/docs/usage/stats.rst
new file mode 100644
index 00000000..392838e4
--- /dev/null
+++ b/docs/usage/stats.rst
@@ -0,0 +1,19 @@
+.. _stats:
+
+=====
+Stats
+=====
+
+Stats from :doc:`python-zyte-api ` are exposed as
+:ref:`Scrapy stats ` with the ``scrapy-zyte-api`` prefix.
+ +For example, ``scrapy-zyte-api/status_codes/`` stats indicate the +status code of Zyte API responses (e.g. ``429`` for :ref:`rate limiting +` or ``520`` for :ref:`temporary download errors +`). + +.. note:: The actual status code that is received from the target website, i.e. + the :http:`response:statusCode` response field of a :ref:`Zyte API + successful response `, is accounted for in + the ``downloader/response_status_count/`` stat, as with any + other Scrapy response. diff --git a/docs/usage/transparent.rst b/docs/usage/transparent.rst new file mode 100644 index 00000000..354a710c --- /dev/null +++ b/docs/usage/transparent.rst @@ -0,0 +1,35 @@ +.. _transparent: + +================ +Transparent mode +================ + +Set :ref:`ZYTE_API_TRANSPARENT_MODE` to ``True`` to handle requests as follows: + +- By default, requests are sent with :ref:`automatic request + parameters `. + +- Requests with :ref:`zyte_api` set to a ``dict`` are sent with :ref:`manual + request parameters `. + +- Requests with :ref:`zyte_api_automap` set to ``False`` are *not* sent + through Zyte API. + +For example: + +.. 
code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + start_urls = ["https://quotes.toscrape.com/"] + + custom_settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + } + + def parse(self, response): + print(response.text) + # "…" diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 9386df03..077f2eb7 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -217,13 +217,8 @@ async def _download_request( session=self._session, retrying=retrying, ) - except RequestError as er: - error_detail = (er.parsed.data or {}).get("detail", er.message) - logger.error( - f"Got Zyte API error (status={er.status}, type={er.parsed.type!r}, " - f"request_id={er.request_id!r}) while processing URL ({request.url}): " - f"{error_detail}" - ) + except RequestError as error: + self._process_request_error(request, error) raise except Exception as er: logger.error( @@ -235,6 +230,21 @@ async def _download_request( return _process_response(api_response, request, self._cookie_jars) + def _process_request_error(self, request, error): + detail = (error.parsed.data or {}).get("detail", error.message) + logger.error( + f"Got Zyte API error (status={error.status}, " + f"type={error.parsed.type!r}, request_id={error.request_id!r}) " + f"while processing URL ({request.url}): {detail}" + ) + for status, error_type, close_reason in ( + (401, "/auth/key-not-found", "zyte_api_bad_key"), + (403, "/auth/account-suspended", "zyte_api_suspended_account"), + ): + if error.status == status and error.parsed.type == error_type: + self._crawler.engine.close_spider(self._crawler.spider, close_reason) + return + def _log_request(self, params): if not self._must_log_request: return diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index fbbf16e4..0f891d6c 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -56,8 +56,7 @@ def replace(self, *args, **kwargs): def 
raw_api_response(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. - To see the full list of parameters and their description, kindly refer to the - `Zyte API Specification `_. + For the full list of parameters, see :ref:`zyte-api-reference`. """ return self._raw_api_response diff --git a/tests/mockserver.py b/tests/mockserver.py index ccecc70c..5b5df1b2 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -8,6 +8,7 @@ from importlib import import_module from subprocess import PIPE, Popen from typing import Dict, Optional +from urllib.parse import urlparse from pytest_twisted import ensureDeferred from scrapy import Request @@ -78,6 +79,26 @@ def render_POST(self, request): return json.dumps(response_data).encode() response_data["url"] = request_data["url"] + domain = urlparse(request_data["url"]).netloc + if "bad-key" in domain: + request.setResponseCode(401) + response_data = { + "status": 401, + "type": "/auth/key-not-found", + "title": "Authentication Key Not Found", + "detail": "The authentication key is not valid or can't be matched.", + } + return json.dumps(response_data).encode() + if "suspended-account" in domain: + request.setResponseCode(403) + response_data = { + "status": 403, + "type": "/auth/account-suspended", + "title": "Account Suspended", + "detail": "Account is suspended, check billing details.", + } + return json.dumps(response_data).encode() + html = "Hello

World!

" if "browserHtml" in request_data: if "httpResponseBody" in request_data: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index a03594d8..a01dc2bf 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -703,7 +703,7 @@ def _test_automap( [], ), # httpResponseBody can be explicitly requested in meta, and should be - # in cases where a binary response is expected, since automated mapping + # in cases where a binary response is expected, since automatic mapping # may stop working for binary responses in the future. ( {"httpResponseBody": True}, @@ -2196,7 +2196,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca # # Setting requestCookies to [] disables automatic mapping, but logs a # a warning recommending to either use False to achieve the same or - # remove the parameter to let automated mapping work. + # remove the parameter to let automatic mapping work. *( ( settings, diff --git a/tests/test_handler.py b/tests/test_handler.py index 1ff0d6ab..cf985bea 100644 --- a/tests/test_handler.py +++ b/tests/test_handler.py @@ -8,7 +8,7 @@ import pytest from pytest_twisted import ensureDeferred -from scrapy import Request +from scrapy import Request, Spider from scrapy.exceptions import NotConfigured from scrapy.settings import Settings from scrapy.utils.misc import create_instance @@ -21,6 +21,7 @@ from scrapy_zyte_api.utils import USER_AGENT from . 
import DEFAULT_CLIENT_CONCURRENCY, SETTINGS, UNSET, make_handler, set_env +from .mockserver import MockServer @pytest.mark.parametrize( @@ -484,3 +485,74 @@ def test_user_agent_for_build_client(user_agent, expected): ) client = ScrapyZyteAPIDownloadHandler._build_client(settings) assert client.user_agent == expected + + +@ensureDeferred +async def test_bad_key(): + class TestSpider(Spider): + name = "test" + start_urls = ["https://bad-key.example"] + + def parse(self, response): + pass + + settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + **SETTINGS, + } + + with MockServer() as server: + settings["ZYTE_API_URL"] = server.urljoin("/") + crawler = get_crawler(TestSpider, settings_dict=settings) + await crawler.crawl() + + assert crawler.stats.get_value("finish_reason") == "zyte_api_bad_key" + + +# NOTE: Under the assumption that a case of bad key will happen since the +# beginning of a crawl, we only test the start_urls scenario, and not also the +# case of follow-up responses suddenly giving such an error. 
+ + +@ensureDeferred +async def test_suspended_account_start_urls(): + class TestSpider(Spider): + name = "test" + start_urls = ["https://suspended-account.example"] + + def parse(self, response): + pass + + settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + **SETTINGS, + } + + with MockServer() as server: + settings["ZYTE_API_URL"] = server.urljoin("/") + crawler = get_crawler(TestSpider, settings_dict=settings) + await crawler.crawl() + + assert crawler.stats.get_value("finish_reason") == "zyte_api_suspended_account" + + +@ensureDeferred +async def test_suspended_account_callback(): + class TestSpider(Spider): + name = "test" + start_urls = ["https://example.com"] + + def parse(self, response): + yield response.follow("https://suspended-account.example") + + settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + **SETTINGS, + } + + with MockServer() as server: + settings["ZYTE_API_URL"] = server.urljoin("/") + crawler = get_crawler(TestSpider, settings_dict=settings) + await crawler.crawl() + + assert crawler.stats.get_value("finish_reason") == "zyte_api_suspended_account" diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 17d6b356..4b11bbd4 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -415,3 +415,17 @@ def test_url_fragments(params, match): assert fingerprint1 == fingerprint2 else: assert fingerprint1 != fingerprint2 + + +def test_autoextract(): + crawler = get_crawler() + fingerprinter = create_instance( + ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler + ) + request1 = Request("https://toscrape.com", meta={"zyte_api": {"product": True}}) + fingerprint1 = fingerprinter.fingerprint(request1) + request2 = Request( + "https://toscrape.com", meta={"zyte_api": {"productNavigation": True}} + ) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 != fingerprint2 diff --git a/tox.ini b/tox.ini index a3cb6096..7470d226 100644 --- 
a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,py311,mypy,linters,twine-check +envlist = py37,py38,py39,py310,py311,mypy,linters,twine-check,docs [testenv] deps = @@ -19,7 +19,7 @@ commands = deps = {[testenv]deps} packaging==20.0 - zyte-api==0.4.0 + zyte-api==0.4.8 # https://stackoverflow.com/a/73046084 Twisted==21.7.0 @@ -108,3 +108,10 @@ deps = commands = python setup.py sdist twine check dist/* + +[testenv:docs] +changedir = docs +deps = + -rdocs/requirements.txt +commands = + sphinx-build -W -b html . {envtmpdir}/html